// RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu \
// RUN:  -target-cpu swift -fallow-half-arguments-and-returns -ffreestanding -emit-llvm -o - %s \
// RUN:  | opt -S -mem2reg | FileCheck %s

// REQUIRES: long-tests

#include <arm_neon.h>

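// This file checks the LLVM IR emitted for ARM NEON arithmetic and logical
// intrinsics on a v7s (Swift) target.
//
// vaba_* / vabaq_*: absolute difference and accumulate. Each lane computes
// a + |b - c| (wrapping add); the "q" variants operate on 128-bit vectors.
// In IR this is an @llvm.arm.neon.vabd[su].* call followed by a plain add.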
// CHECK-LABEL: define <8 x i8> @test_vaba_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]]
// CHECK:   ret <8 x i8> [[ADD_I]]
int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
  return vaba_s8(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vaba_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4
// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16>
// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[TMP2]]
// CHECK:   ret <4 x i16> [[ADD_I]]
int16x4_t test_vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  return vaba_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vaba_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4
// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32>
// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[TMP2]]
// CHECK:   ret <2 x i32> [[ADD_I]]
int32x2_t test_vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  return vaba_s32(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vaba_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]]
// CHECK:   ret <8 x i8> [[ADD_I]]
uint8x8_t test_vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vaba_u8(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vaba_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4
// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16>
// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[TMP2]]
// CHECK:   ret <4 x i16> [[ADD_I]]
uint16x4_t test_vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vaba_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vaba_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4
// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32>
// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[TMP2]]
// CHECK:   ret <2 x i32> [[ADD_I]]
uint32x2_t test_vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vaba_u32(a, b, c);
}

// CHECK-LABEL: define <16 x i8> @test_vabaq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %b, <16 x i8> %c) #4
// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]]
// CHECK:   ret <16 x i8> [[ADD_I]]
int8x16_t test_vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
  return vabaq_s8(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vabaq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK:   [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> [[VABDQ_V_I_I]], <8 x i16> [[VABDQ_V1_I_I]]) #4
// CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP2]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
  return vabaq_s16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vabaq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK:   [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> [[VABDQ_V_I_I]], <4 x i32> [[VABDQ_V1_I_I]]) #4
// CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
  return vabaq_s32(a, b, c);
}

// CHECK-LABEL: define <16 x i8> @test_vabaq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %b, <16 x i8> %c) #4
// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]]
// CHECK:   ret <16 x i8> [[ADD_I]]
uint8x16_t test_vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
  return vabaq_u8(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vabaq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK:   [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> [[VABDQ_V_I_I]], <8 x i16> [[VABDQ_V1_I_I]]) #4
// CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP2]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
  return vabaq_u16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vabaq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK:   [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> [[VABDQ_V_I_I]], <4 x i32> [[VABDQ_V1_I_I]]) #4
// CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
  return vabaq_u32(a, b, c);
}


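// vabal_*: absolute difference and accumulate, long. |b - c| is computed at
// the narrow width and zero-extended to double-width lanes (a zext even for
// the signed variants, since the difference is non-negative), then added to a.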
// CHECK-LABEL: define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
  return vabal_s8(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I_I_I]], <4 x i16> [[VABD_V1_I_I_I]]) #4
// CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vabal_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I_I_I]], <2 x i32> [[VABD_V1_I_I_I]]) #4
// CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vabal_s32(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
  return vabal_u8(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I_I_I]], <4 x i16> [[VABD_V1_I_I_I]]) #4
// CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vabal_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I_I_I]], <2 x i32> [[VABD_V1_I_I_I]]) #4
// CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vabal_u32(a, b, c);
}


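// vabd_* / vabdq_*: lane-wise absolute difference, |a - b|. Integer variants
// lower to @llvm.arm.neon.vabds/vabdu; the float variants also use the signed
// intrinsic (@llvm.arm.neon.vabds.v2f32 / .v4f32).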
// CHECK-LABEL: define <8 x i8> @test_vabd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VABD_V_I]]
int8x8_t test_vabd_s8(int8x8_t a, int8x8_t b) {
  return vabd_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vabd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I]], <4 x i16> [[VABD_V1_I]]) #4
// CHECK:   [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vabd_s16(int16x4_t a, int16x4_t b) {
  return vabd_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vabd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I]], <2 x i32> [[VABD_V1_I]]) #4
// CHECK:   [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vabd_s32(int32x2_t a, int32x2_t b) {
  return vabd_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vabd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VABD_V_I]]
uint8x8_t test_vabd_u8(uint8x8_t a, uint8x8_t b) {
  return vabd_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vabd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I]], <4 x i16> [[VABD_V1_I]]) #4
// CHECK:   [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vabd_u16(uint16x4_t a, uint16x4_t b) {
  return vabd_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vabd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I]], <2 x i32> [[VABD_V1_I]]) #4
// CHECK:   [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vabd_u32(uint32x2_t a, uint32x2_t b) {
  return vabd_u32(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vabd_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VABD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> [[VABD_V_I]], <2 x float> [[VABD_V1_I]]) #4
// CHECK:   [[VABD_V3_I:%.*]] = bitcast <2 x float> [[VABD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x float>
// CHECK:   ret <2 x float> [[TMP2]]
float32x2_t test_vabd_f32(float32x2_t a, float32x2_t b) {
  return vabd_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vabdq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VABDQ_V_I]]
int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b) {
  return vabdq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vabdq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> [[VABDQ_V_I]], <8 x i16> [[VABDQ_V1_I]]) #4
// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vabdq_s16(int16x8_t a, int16x8_t b) {
  return vabdq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vabdq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> [[VABDQ_V_I]], <4 x i32> [[VABDQ_V1_I]]) #4
// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vabdq_s32(int32x4_t a, int32x4_t b) {
  return vabdq_s32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vabdq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VABDQ_V_I]]
uint8x16_t test_vabdq_u8(uint8x16_t a, uint8x16_t b) {
  return vabdq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vabdq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> [[VABDQ_V_I]], <8 x i16> [[VABDQ_V1_I]]) #4
// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vabdq_u16(uint16x8_t a, uint16x8_t b) {
  return vabdq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vabdq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> [[VABDQ_V_I]], <4 x i32> [[VABDQ_V1_I]]) #4
// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b) {
  return vabdq_u32(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vabdq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VABDQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> [[VABDQ_V_I]], <4 x float> [[VABDQ_V1_I]]) #4
// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <4 x float> [[VABDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x float>
// CHECK:   ret <4 x float> [[TMP2]]
float32x4_t test_vabdq_f32(float32x4_t a, float32x4_t b) {
  return vabdq_f32(a, b);
}


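// vabdl_*: absolute difference, long. The same @llvm.arm.neon.vabd[su] call
// as vabd_*, with the result zero-extended to double-width lanes.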
// CHECK-LABEL: define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[VMOVL_I_I]]
int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) {
  return vabdl_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4
// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VMOVL_I_I]]
int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) {
  return vabdl_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4
// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
// CHECK:   ret <2 x i64> [[VMOVL_I_I]]
int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) {
  return vabdl_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[VMOVL_I_I]]
uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
  return vabdl_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4
// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VMOVL_I_I]]
uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) {
  return vabdl_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4
// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
// CHECK:   ret <2 x i64> [[VMOVL_I_I]]
uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
  return vabdl_u32(a, b);
}


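// vabs_* / vabsq_*: lane-wise absolute value. The integer variants lower to
// @llvm.arm.neon.vabs.*; the float variants map onto the target-independent
// @llvm.fabs.* intrinsic.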
// CHECK-LABEL: define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 {
// CHECK:   [[VABS_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %a) #4
// CHECK:   ret <8 x i8> [[VABS_I]]
int8x8_t test_vabs_s8(int8x8_t a) {
  return vabs_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABS1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> [[VABS_I]]) #4
// CHECK:   ret <4 x i16> [[VABS1_I]]
int16x4_t test_vabs_s16(int16x4_t a) {
  return vabs_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABS1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> [[VABS_I]]) #4
// CHECK:   ret <2 x i32> [[VABS1_I]]
int32x2_t test_vabs_s32(int32x2_t a) {
  return vabs_s32(a);
}

// CHECK-LABEL: define <2 x float> @test_vabs_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[VABS_I]]) #4
// CHECK:   ret <2 x float> [[VABS1_I]]
float32x2_t test_vabs_f32(float32x2_t a) {
  return vabs_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 {
// CHECK:   [[VABS_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %a) #4
// CHECK:   ret <16 x i8> [[VABS_I]]
int8x16_t test_vabsq_s8(int8x16_t a) {
  return vabsq_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VABS1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> [[VABS_I]]) #4
// CHECK:   ret <8 x i16> [[VABS1_I]]
int16x8_t test_vabsq_s16(int16x8_t a) {
  return vabsq_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VABS1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> [[VABS_I]]) #4
// CHECK:   ret <4 x i32> [[VABS1_I]]
int32x4_t test_vabsq_s32(int32x4_t a) {
  return vabsq_s32(a);
}

// CHECK-LABEL: define <4 x float> @test_vabsq_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VABS_I]]) #4
// CHECK:   ret <4 x float> [[VABS1_I]]
float32x4_t test_vabsq_f32(float32x4_t a) {
  return vabsq_f32(a);
}


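// vadd_* / vaddq_*: lane-wise addition. No intrinsic is needed: the integer
// variants lower to a plain IR add and the float variants to fadd.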
// CHECK-LABEL: define <8 x i8> @test_vadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[ADD_I]]
int8x8_t test_vadd_s8(int8x8_t a, int8x8_t b) {
  return vadd_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[ADD_I]]
int16x4_t test_vadd_s16(int16x4_t a, int16x4_t b) {
  return vadd_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[ADD_I]]
int32x2_t test_vadd_s32(int32x2_t a, int32x2_t b) {
  return vadd_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[ADD_I]]
int64x1_t test_vadd_s64(int64x1_t a, int64x1_t b) {
  return vadd_s64(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vadd_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, %b
// CHECK:   ret <2 x float> [[ADD_I]]
float32x2_t test_vadd_f32(float32x2_t a, float32x2_t b) {
  return vadd_f32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[ADD_I]]
uint8x8_t test_vadd_u8(uint8x8_t a, uint8x8_t b) {
  return vadd_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[ADD_I]]
uint16x4_t test_vadd_u16(uint16x4_t a, uint16x4_t b) {
  return vadd_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[ADD_I]]
uint32x2_t test_vadd_u32(uint32x2_t a, uint32x2_t b) {
  return vadd_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[ADD_I]]
uint64x1_t test_vadd_u64(uint64x1_t a, uint64x1_t b) {
  return vadd_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[ADD_I]]
int8x16_t test_vaddq_s8(int8x16_t a, int8x16_t b) {
  return vaddq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vaddq_s16(int16x8_t a, int16x8_t b) {
  return vaddq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vaddq_s32(int32x4_t a, int32x4_t b) {
  return vaddq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vaddq_s64(int64x2_t a, int64x2_t b) {
  return vaddq_s64(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vaddq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, %b
// CHECK:   ret <4 x float> [[ADD_I]]
float32x4_t test_vaddq_f32(float32x4_t a, float32x4_t b) {
  return vaddq_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[ADD_I]]
uint8x16_t test_vaddq_u8(uint8x16_t a, uint8x16_t b) {
  return vaddq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vaddq_u16(uint16x8_t a, uint16x8_t b) {
  return vaddq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vaddq_u32(uint32x4_t a, uint32x4_t b) {
  return vaddq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vaddq_u64(uint64x2_t a, uint64x2_t b) {
  return vaddq_u64(a, b);
}


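// vaddhn_*: add and narrow, returning the high half. Each double-width lane
// computes a + b, shifts right by half the lane width, and truncates; e.g.
// for s16 lanes the scalar model is (int8_t)((uint16_t)(a[i] + b[i]) >> 8).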
// CHECK-LABEL: define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
// CHECK:   [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VADDHN2_I]]
int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
  return vaddhn_s16(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
// CHECK:   [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
// CHECK:   [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VADDHN2_I]]
int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
  return vaddhn_s32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
// CHECK:   [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
// CHECK:   [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VADDHN2_I]]
int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
  return vaddhn_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
// CHECK:   [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VADDHN2_I]]
uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
  return vaddhn_u16(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
// CHECK:   [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
// CHECK:   [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VADDHN2_I]]
uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
  return vaddhn_u32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
// CHECK:   [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
// CHECK:   [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VADDHN2_I]]
uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
  return vaddhn_u64(a, b);
}


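// vaddl_*: add long. Both narrow operands are sign- or zero-extended to
// double-width lanes before a plain add.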
// CHECK-LABEL: define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) {
  return vaddl_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) {
  return vaddl_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) {
  return vaddl_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) {
  return vaddl_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) {
  return vaddl_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) {
  return vaddl_u32(a, b);
}


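// vaddw_*: add wide. Only the second (narrow) operand is extended; the first
// operand is already at the result width.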
// CHECK-LABEL: define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) {
  return vaddw_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) {
  return vaddw_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) {
  return vaddw_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) {
  return vaddw_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) {
  return vaddw_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) {
  return vaddw_u32(a, b);
}


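// vand_* / vandq_*: lane-wise bitwise AND, lowered to a plain IR and.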
// CHECK-LABEL: define <8 x i8> @test_vand_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[AND_I]]
int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) {
  return vand_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vand_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[AND_I]]
int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) {
  return vand_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vand_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[AND_I]]
int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) {
  return vand_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vand_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[AND_I]]
int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) {
  return vand_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vand_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[AND_I]]
uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) {
  return vand_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vand_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[AND_I]]
uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) {
  return vand_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vand_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[AND_I]]
uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) {
  return vand_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vand_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[AND_I]]
uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) {
  return vand_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vandq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[AND_I]]
int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) {
  return vandq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vandq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[AND_I]]
int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) {
  return vandq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vandq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[AND_I]]
int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) {
  return vandq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vandq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[AND_I]]
int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) {
  return vandq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vandq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[AND_I]]
uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) {
  return vandq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vandq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[AND_I]]
uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) {
  return vandq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vandq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[AND_I]]
uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) {
  return vandq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vandq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[AND_I]]
uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) {
  return vandq_u64(a, b);
}


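// vbic_* / vbicq_*: bit clear, a & ~b. Emitted as an xor of b with all-ones
// followed by an and.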
// CHECK-LABEL: define <8 x i8> @test_vbic_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
// CHECK:   ret <8 x i8> [[AND_I]]
int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) {
  return vbic_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vbic_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
// CHECK:   ret <4 x i16> [[AND_I]]
int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) {
  return vbic_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vbic_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
// CHECK:   ret <2 x i32> [[AND_I]]
int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) {
  return vbic_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vbic_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
// CHECK:   ret <1 x i64> [[AND_I]]
int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) {
  return vbic_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vbic_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
// CHECK:   ret <8 x i8> [[AND_I]]
uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) {
  return vbic_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vbic_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
// CHECK:   ret <4 x i16> [[AND_I]]
uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) {
  return vbic_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vbic_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
// CHECK:   ret <2 x i32> [[AND_I]]
uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) {
  return vbic_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vbic_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
// CHECK:   ret <1 x i64> [[AND_I]]
uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) {
  return vbic_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vbicq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
// CHECK:   ret <16 x i8> [[AND_I]]
int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) {
  return vbicq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vbicq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
// CHECK:   ret <8 x i16> [[AND_I]]
int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) {
  return vbicq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vbicq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   1078 // CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
   1079 // CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
   1080 // CHECK:   ret <4 x i32> [[AND_I]]
   1081 int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) {
   1082   return vbicq_s32(a, b);
   1083 }
   1084 
   1085 // CHECK-LABEL: define <2 x i64> @test_vbicq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
   1086 // CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
   1087 // CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
   1088 // CHECK:   ret <2 x i64> [[AND_I]]
   1089 int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) {
   1090   return vbicq_s64(a, b);
   1091 }
   1092 
   1093 // CHECK-LABEL: define <16 x i8> @test_vbicq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
   1094 // CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
   1095 // CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
   1096 // CHECK:   ret <16 x i8> [[AND_I]]
   1097 uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) {
   1098   return vbicq_u8(a, b);
   1099 }
   1100 
   1101 // CHECK-LABEL: define <8 x i16> @test_vbicq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
   1102 // CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
   1103 // CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
   1104 // CHECK:   ret <8 x i16> [[AND_I]]
   1105 uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) {
   1106   return vbicq_u16(a, b);
   1107 }
   1108 
   1109 // CHECK-LABEL: define <4 x i32> @test_vbicq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
   1110 // CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
   1111 // CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
   1112 // CHECK:   ret <4 x i32> [[AND_I]]
   1113 uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) {
   1114   return vbicq_u32(a, b);
   1115 }
   1116 
   1117 // CHECK-LABEL: define <2 x i64> @test_vbicq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
   1118 // CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
   1119 // CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
   1120 // CHECK:   ret <2 x i64> [[AND_I]]
   1121 uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) {
   1122   return vbicq_u64(a, b);
   1123 }
   1124 
   1125 
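// vbsl ("bitwise select") returns (a & b) | (~a & c): bits of b where the
// mask a is set, bits of c elsewhere. All element widths funnel through the
// byte-vector intrinsics @llvm.arm.neon.vbsl.v8i8/v16i8, so non-byte types
// are bitcast to <8 x i8>/<16 x i8> around the call. A rough scalar model
// of the same selection (bsl1 is a hypothetical helper, not part of this
// test):
//   uint8_t bsl1(uint8_t m, uint8_t b, uint8_t c) {
//     return (uint8_t)((m & b) | (~m & c));
//   }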
   1126 // CHECK-LABEL: define <8 x i8> @test_vbsl_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
   1127 // CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
   1128 // CHECK:   ret <8 x i8> [[VBSL_V_I]]
   1129 int8x8_t test_vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) {
   1130   return vbsl_s8(a, b, c);
   1131 }
   1132 
   1133 // CHECK-LABEL: define <4 x i16> @test_vbsl_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
   1134 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   1135 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   1136 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
   1137 // CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
   1138 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
   1139 // CHECK:   ret <4 x i16> [[TMP3]]
   1140 int16x4_t test_vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c) {
   1141   return vbsl_s16(a, b, c);
   1142 }
   1143 
   1144 // CHECK-LABEL: define <2 x i32> @test_vbsl_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
   1145 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   1146 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   1147 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
   1148 // CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
   1149 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
   1150 // CHECK:   ret <2 x i32> [[TMP3]]
   1151 int32x2_t test_vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c) {
   1152   return vbsl_s32(a, b, c);
   1153 }
   1154 
   1155 // CHECK-LABEL: define <1 x i64> @test_vbsl_s64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 {
   1156 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   1157 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   1158 // CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
   1159 // CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
   1160 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
   1161 // CHECK:   ret <1 x i64> [[TMP3]]
   1162 int64x1_t test_vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) {
   1163   return vbsl_s64(a, b, c);
   1164 }
   1165 
   1166 // CHECK-LABEL: define <8 x i8> @test_vbsl_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
   1167 // CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
   1168 // CHECK:   ret <8 x i8> [[VBSL_V_I]]
   1169 uint8x8_t test_vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
   1170   return vbsl_u8(a, b, c);
   1171 }
   1172 
   1173 // CHECK-LABEL: define <4 x i16> @test_vbsl_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
   1174 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   1175 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   1176 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
   1177 // CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
   1178 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
   1179 // CHECK:   ret <4 x i16> [[TMP3]]
   1180 uint16x4_t test_vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   1181   return vbsl_u16(a, b, c);
   1182 }
   1183 
   1184 // CHECK-LABEL: define <2 x i32> @test_vbsl_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
   1185 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   1186 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   1187 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
   1188 // CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
   1189 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
   1190 // CHECK:   ret <2 x i32> [[TMP3]]
   1191 uint32x2_t test_vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   1192   return vbsl_u32(a, b, c);
   1193 }
   1194 
   1195 // CHECK-LABEL: define <1 x i64> @test_vbsl_u64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 {
   1196 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   1197 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   1198 // CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
   1199 // CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
   1200 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
   1201 // CHECK:   ret <1 x i64> [[TMP3]]
   1202 uint64x1_t test_vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c) {
   1203   return vbsl_u64(a, b, c);
   1204 }
   1205 
   1206 // CHECK-LABEL: define <2 x float> @test_vbsl_f32(<2 x i32> %a, <2 x float> %b, <2 x float> %c) #0 {
   1207 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   1208 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
   1209 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
   1210 // CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
   1211 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x float>
   1212 // CHECK:   ret <2 x float> [[TMP3]]
   1213 float32x2_t test_vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) {
   1214   return vbsl_f32(a, b, c);
   1215 }
   1216 
   1217 // CHECK-LABEL: define <8 x i8> @test_vbsl_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
   1218 // CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
   1219 // CHECK:   ret <8 x i8> [[VBSL_V_I]]
   1220 poly8x8_t test_vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c) {
   1221   return vbsl_p8(a, b, c);
   1222 }
   1223 
   1224 // CHECK-LABEL: define <4 x i16> @test_vbsl_p16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
   1225 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   1226 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   1227 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
   1228 // CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
   1229 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
   1230 // CHECK:   ret <4 x i16> [[TMP3]]
   1231 poly16x4_t test_vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c) {
   1232   return vbsl_p16(a, b, c);
   1233 }
   1234 
   1235 // CHECK-LABEL: define <16 x i8> @test_vbslq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
   1236 // CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #4
   1237 // CHECK:   ret <16 x i8> [[VBSLQ_V_I]]
   1238 int8x16_t test_vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) {
   1239   return vbslq_s8(a, b, c);
   1240 }
   1241 
   1242 // CHECK-LABEL: define <8 x i16> @test_vbslq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
   1243 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   1244 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   1245 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
   1246 // CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
   1247 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
   1248 // CHECK:   ret <8 x i16> [[TMP3]]
   1249 int16x8_t test_vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) {
   1250   return vbslq_s16(a, b, c);
   1251 }
   1252 
   1253 // CHECK-LABEL: define <4 x i32> @test_vbslq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
   1254 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   1255 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   1256 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
   1257 // CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
   1258 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
   1259 // CHECK:   ret <4 x i32> [[TMP3]]
   1260 int32x4_t test_vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) {
   1261   return vbslq_s32(a, b, c);
   1262 }
   1263 
   1264 // CHECK-LABEL: define <2 x i64> @test_vbslq_s64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 {
   1265 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   1266 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   1267 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
   1268 // CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
   1269 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
   1270 // CHECK:   ret <2 x i64> [[TMP3]]
   1271 int64x2_t test_vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) {
   1272   return vbslq_s64(a, b, c);
   1273 }
   1274 
   1275 // CHECK-LABEL: define <16 x i8> @test_vbslq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
   1276 // CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #4
   1277 // CHECK:   ret <16 x i8> [[VBSLQ_V_I]]
   1278 uint8x16_t test_vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
   1279   return vbslq_u8(a, b, c);
   1280 }
   1281 
   1282 // CHECK-LABEL: define <8 x i16> @test_vbslq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
   1283 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   1284 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   1285 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
   1286 // CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
   1287 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
   1288 // CHECK:   ret <8 x i16> [[TMP3]]
   1289 uint16x8_t test_vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
   1290   return vbslq_u16(a, b, c);
   1291 }
   1292 
   1293 // CHECK-LABEL: define <4 x i32> @test_vbslq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
   1294 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   1295 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   1296 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
   1297 // CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
   1298 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
   1299 // CHECK:   ret <4 x i32> [[TMP3]]
   1300 uint32x4_t test_vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
   1301   return vbslq_u32(a, b, c);
   1302 }
   1303 
   1304 // CHECK-LABEL: define <2 x i64> @test_vbslq_u64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 {
   1305 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   1306 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   1307 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
   1308 // CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
   1309 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
   1310 // CHECK:   ret <2 x i64> [[TMP3]]
   1311 uint64x2_t test_vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
   1312   return vbslq_u64(a, b, c);
   1313 }
   1314 
   1315 // CHECK-LABEL: define <4 x float> @test_vbslq_f32(<4 x i32> %a, <4 x float> %b, <4 x float> %c) #0 {
   1316 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   1317 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
   1318 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
   1319 // CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
   1320 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x float>
   1321 // CHECK:   ret <4 x float> [[TMP3]]
   1322 float32x4_t test_vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) {
   1323   return vbslq_f32(a, b, c);
   1324 }
   1325 
   1326 // CHECK-LABEL: define <16 x i8> @test_vbslq_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
   1327 // CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #4
   1328 // CHECK:   ret <16 x i8> [[VBSLQ_V_I]]
   1329 poly8x16_t test_vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c) {
   1330   return vbslq_p8(a, b, c);
   1331 }
   1332 
   1333 // CHECK-LABEL: define <8 x i16> @test_vbslq_p16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
   1334 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   1335 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   1336 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
   1337 // CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
   1338 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
   1339 // CHECK:   ret <8 x i16> [[TMP3]]
   1340 poly16x8_t test_vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c) {
   1341   return vbslq_p16(a, b, c);
   1342 }
   1343 
   1344 
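// vcage is an absolute compare: each lane is all-ones when |a| >= |b| and
// all-zeros otherwise. It is computed by @llvm.arm.neon.vacge rather than
// an fcmp, since the magnitude comparison maps to a single VACGE.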
   1345 // CHECK-LABEL: define <2 x i32> @test_vcage_f32(<2 x float> %a, <2 x float> %b) #0 {
   1346 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   1347 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
   1348 // CHECK:   [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   1349 // CHECK:   [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   1350 // CHECK:   [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> [[VCAGE_V_I]], <2 x float> [[VCAGE_V1_I]]) #4
   1351 // CHECK:   ret <2 x i32> [[VCAGE_V2_I]]
   1352 uint32x2_t test_vcage_f32(float32x2_t a, float32x2_t b) {
   1353   return vcage_f32(a, b);
   1354 }
   1355 
   1356 // CHECK-LABEL: define <4 x i32> @test_vcageq_f32(<4 x float> %a, <4 x float> %b) #0 {
   1357 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   1358 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
   1359 // CHECK:   [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   1360 // CHECK:   [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   1361 // CHECK:   [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> [[VCAGEQ_V_I]], <4 x float> [[VCAGEQ_V1_I]]) #4
   1362 // CHECK:   ret <4 x i32> [[VCAGEQ_V2_I]]
   1363 uint32x4_t test_vcageq_f32(float32x4_t a, float32x4_t b) {
   1364   return vcageq_f32(a, b);
   1365 }
   1366 
   1367 
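// vcagt is the strict form of the absolute compare above: all-ones when
// |a| > |b|, via @llvm.arm.neon.vacgt. These magnitude compares are
// provided only for float element types.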
   1368 // CHECK-LABEL: define <2 x i32> @test_vcagt_f32(<2 x float> %a, <2 x float> %b) #0 {
   1369 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   1370 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
   1371 // CHECK:   [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   1372 // CHECK:   [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   1373 // CHECK:   [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> [[VCAGT_V_I]], <2 x float> [[VCAGT_V1_I]]) #4
   1374 // CHECK:   ret <2 x i32> [[VCAGT_V2_I]]
   1375 uint32x2_t test_vcagt_f32(float32x2_t a, float32x2_t b) {
   1376   return vcagt_f32(a, b);
   1377 }
   1378 
   1379 // CHECK-LABEL: define <4 x i32> @test_vcagtq_f32(<4 x float> %a, <4 x float> %b) #0 {
   1380 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   1381 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
   1382 // CHECK:   [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   1383 // CHECK:   [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   1384 // CHECK:   [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> [[VCAGTQ_V_I]], <4 x float> [[VCAGTQ_V1_I]]) #4
   1385 // CHECK:   ret <4 x i32> [[VCAGTQ_V2_I]]
   1386 uint32x4_t test_vcagtq_f32(float32x4_t a, float32x4_t b) {
   1387   return vcagtq_f32(a, b);
   1388 }
   1389 
   1390 
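// There is no dedicated "absolute less-or-equal" intrinsic; vcale reuses
// vacge with the operands swapped (|a| <= |b| iff |b| >= |a|), which is
// why [[TMP1]] (from %b) feeds the first call operand below.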
   1391 // CHECK-LABEL: define <2 x i32> @test_vcale_f32(<2 x float> %a, <2 x float> %b) #0 {
   1392 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   1393 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
   1394 // CHECK:   [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   1395 // CHECK:   [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   1396 // CHECK:   [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> [[VCALE_V_I]], <2 x float> [[VCALE_V1_I]]) #4
   1397 // CHECK:   ret <2 x i32> [[VCALE_V2_I]]
   1398 uint32x2_t test_vcale_f32(float32x2_t a, float32x2_t b) {
   1399   return vcale_f32(a, b);
   1400 }
   1401 
   1402 // CHECK-LABEL: define <4 x i32> @test_vcaleq_f32(<4 x float> %a, <4 x float> %b) #0 {
   1403 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   1404 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
   1405 // CHECK:   [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   1406 // CHECK:   [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   1407 // CHECK:   [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> [[VCALEQ_V_I]], <4 x float> [[VCALEQ_V1_I]]) #4
   1408 // CHECK:   ret <4 x i32> [[VCALEQ_V2_I]]
   1409 uint32x4_t test_vcaleq_f32(float32x4_t a, float32x4_t b) {
   1410   return vcaleq_f32(a, b);
   1411 }
   1412 
   1413 
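// Likewise vcalt swaps the operands of vacgt: |a| < |b| iff |b| > |a|.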
   1414 // CHECK-LABEL: define <2 x i32> @test_vcalt_f32(<2 x float> %a, <2 x float> %b) #0 {
   1415 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   1416 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
   1417 // CHECK:   [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   1418 // CHECK:   [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   1419 // CHECK:   [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> [[VCALT_V_I]], <2 x float> [[VCALT_V1_I]]) #4
   1420 // CHECK:   ret <2 x i32> [[VCALT_V2_I]]
   1421 uint32x2_t test_vcalt_f32(float32x2_t a, float32x2_t b) {
   1422   return vcalt_f32(a, b);
   1423 }
   1424 
   1425 // CHECK-LABEL: define <4 x i32> @test_vcaltq_f32(<4 x float> %a, <4 x float> %b) #0 {
   1426 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   1427 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
   1428 // CHECK:   [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   1429 // CHECK:   [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   1430 // CHECK:   [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> [[VCALTQ_V_I]], <4 x float> [[VCALTQ_V1_I]]) #4
   1431 // CHECK:   ret <4 x i32> [[VCALTQ_V2_I]]
   1432 uint32x4_t test_vcaltq_f32(float32x4_t a, float32x4_t b) {
   1433   return vcaltq_f32(a, b);
   1434 }
   1435 
   1436 
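// The lane-wise compares vceq/vcge/vcgt/vcle/vclt need no target
// intrinsic: clang emits a vector icmp (or fcmp for float) and
// sign-extends the <N x i1> result, so a true lane becomes all-ones and a
// false lane all-zeros. vceq uses icmp eq for integer and poly types and
// fcmp oeq for float.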
   1437 // CHECK-LABEL: define <8 x i8> @test_vceq_s8(<8 x i8> %a, <8 x i8> %b) #0 {
   1438 // CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
   1439 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
   1440 // CHECK:   ret <8 x i8> [[SEXT_I]]
   1441 uint8x8_t test_vceq_s8(int8x8_t a, int8x8_t b) {
   1442   return vceq_s8(a, b);
   1443 }
   1444 
   1445 // CHECK-LABEL: define <4 x i16> @test_vceq_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   1446 // CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
   1447 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
   1448 // CHECK:   ret <4 x i16> [[SEXT_I]]
   1449 uint16x4_t test_vceq_s16(int16x4_t a, int16x4_t b) {
   1450   return vceq_s16(a, b);
   1451 }
   1452 
   1453 // CHECK-LABEL: define <2 x i32> @test_vceq_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   1454 // CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
   1455 // CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
   1456 // CHECK:   ret <2 x i32> [[SEXT_I]]
   1457 uint32x2_t test_vceq_s32(int32x2_t a, int32x2_t b) {
   1458   return vceq_s32(a, b);
   1459 }
   1460 
   1461 // CHECK-LABEL: define <2 x i32> @test_vceq_f32(<2 x float> %a, <2 x float> %b) #0 {
   1462 // CHECK:   [[CMP_I:%.*]] = fcmp oeq <2 x float> %a, %b
   1463 // CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
   1464 // CHECK:   ret <2 x i32> [[SEXT_I]]
   1465 uint32x2_t test_vceq_f32(float32x2_t a, float32x2_t b) {
   1466   return vceq_f32(a, b);
   1467 }
   1468 
   1469 // CHECK-LABEL: define <8 x i8> @test_vceq_u8(<8 x i8> %a, <8 x i8> %b) #0 {
   1470 // CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
   1471 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
   1472 // CHECK:   ret <8 x i8> [[SEXT_I]]
   1473 uint8x8_t test_vceq_u8(uint8x8_t a, uint8x8_t b) {
   1474   return vceq_u8(a, b);
   1475 }
   1476 
   1477 // CHECK-LABEL: define <4 x i16> @test_vceq_u16(<4 x i16> %a, <4 x i16> %b) #0 {
   1478 // CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
   1479 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
   1480 // CHECK:   ret <4 x i16> [[SEXT_I]]
   1481 uint16x4_t test_vceq_u16(uint16x4_t a, uint16x4_t b) {
   1482   return vceq_u16(a, b);
   1483 }
   1484 
   1485 // CHECK-LABEL: define <2 x i32> @test_vceq_u32(<2 x i32> %a, <2 x i32> %b) #0 {
   1486 // CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
   1487 // CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
   1488 // CHECK:   ret <2 x i32> [[SEXT_I]]
   1489 uint32x2_t test_vceq_u32(uint32x2_t a, uint32x2_t b) {
   1490   return vceq_u32(a, b);
   1491 }
   1492 
   1493 // CHECK-LABEL: define <8 x i8> @test_vceq_p8(<8 x i8> %a, <8 x i8> %b) #0 {
   1494 // CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
   1495 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
   1496 // CHECK:   ret <8 x i8> [[SEXT_I]]
   1497 uint8x8_t test_vceq_p8(poly8x8_t a, poly8x8_t b) {
   1498   return vceq_p8(a, b);
   1499 }
   1500 
   1501 // CHECK-LABEL: define <16 x i8> @test_vceqq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
   1502 // CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
   1503 // CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
   1504 // CHECK:   ret <16 x i8> [[SEXT_I]]
   1505 uint8x16_t test_vceqq_s8(int8x16_t a, int8x16_t b) {
   1506   return vceqq_s8(a, b);
   1507 }
   1508 
   1509 // CHECK-LABEL: define <8 x i16> @test_vceqq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   1510 // CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
   1511 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
   1512 // CHECK:   ret <8 x i16> [[SEXT_I]]
   1513 uint16x8_t test_vceqq_s16(int16x8_t a, int16x8_t b) {
   1514   return vceqq_s16(a, b);
   1515 }
   1516 
   1517 // CHECK-LABEL: define <4 x i32> @test_vceqq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   1518 // CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
   1519 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
   1520 // CHECK:   ret <4 x i32> [[SEXT_I]]
   1521 uint32x4_t test_vceqq_s32(int32x4_t a, int32x4_t b) {
   1522   return vceqq_s32(a, b);
   1523 }
   1524 
   1525 // CHECK-LABEL: define <4 x i32> @test_vceqq_f32(<4 x float> %a, <4 x float> %b) #0 {
   1526 // CHECK:   [[CMP_I:%.*]] = fcmp oeq <4 x float> %a, %b
   1527 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
   1528 // CHECK:   ret <4 x i32> [[SEXT_I]]
   1529 uint32x4_t test_vceqq_f32(float32x4_t a, float32x4_t b) {
   1530   return vceqq_f32(a, b);
   1531 }
   1532 
   1533 // CHECK-LABEL: define <16 x i8> @test_vceqq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
   1534 // CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
   1535 // CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
   1536 // CHECK:   ret <16 x i8> [[SEXT_I]]
   1537 uint8x16_t test_vceqq_u8(uint8x16_t a, uint8x16_t b) {
   1538   return vceqq_u8(a, b);
   1539 }
   1540 
   1541 // CHECK-LABEL: define <8 x i16> @test_vceqq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
   1542 // CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
   1543 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
   1544 // CHECK:   ret <8 x i16> [[SEXT_I]]
   1545 uint16x8_t test_vceqq_u16(uint16x8_t a, uint16x8_t b) {
   1546   return vceqq_u16(a, b);
   1547 }
   1548 
   1549 // CHECK-LABEL: define <4 x i32> @test_vceqq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
   1550 // CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
   1551 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
   1552 // CHECK:   ret <4 x i32> [[SEXT_I]]
   1553 uint32x4_t test_vceqq_u32(uint32x4_t a, uint32x4_t b) {
   1554   return vceqq_u32(a, b);
   1555 }
   1556 
   1557 // CHECK-LABEL: define <16 x i8> @test_vceqq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
   1558 // CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
   1559 // CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
   1560 // CHECK:   ret <16 x i8> [[SEXT_I]]
   1561 uint8x16_t test_vceqq_p8(poly8x16_t a, poly8x16_t b) {
   1562   return vceqq_p8(a, b);
   1563 }
   1564 
   1565 
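// vcge picks the predicate by signedness: icmp sge for signed lanes,
// icmp uge for unsigned, fcmp oge for float.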
   1566 // CHECK-LABEL: define <8 x i8> @test_vcge_s8(<8 x i8> %a, <8 x i8> %b) #0 {
   1567 // CHECK:   [[CMP_I:%.*]] = icmp sge <8 x i8> %a, %b
   1568 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
   1569 // CHECK:   ret <8 x i8> [[SEXT_I]]
   1570 uint8x8_t test_vcge_s8(int8x8_t a, int8x8_t b) {
   1571   return vcge_s8(a, b);
   1572 }
   1573 
   1574 // CHECK-LABEL: define <4 x i16> @test_vcge_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   1575 // CHECK:   [[CMP_I:%.*]] = icmp sge <4 x i16> %a, %b
   1576 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
   1577 // CHECK:   ret <4 x i16> [[SEXT_I]]
   1578 uint16x4_t test_vcge_s16(int16x4_t a, int16x4_t b) {
   1579   return vcge_s16(a, b);
   1580 }
   1581 
   1582 // CHECK-LABEL: define <2 x i32> @test_vcge_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   1583 // CHECK:   [[CMP_I:%.*]] = icmp sge <2 x i32> %a, %b
   1584 // CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
   1585 // CHECK:   ret <2 x i32> [[SEXT_I]]
   1586 uint32x2_t test_vcge_s32(int32x2_t a, int32x2_t b) {
   1587   return vcge_s32(a, b);
   1588 }
   1589 
   1590 // CHECK-LABEL: define <2 x i32> @test_vcge_f32(<2 x float> %a, <2 x float> %b) #0 {
   1591 // CHECK:   [[CMP_I:%.*]] = fcmp oge <2 x float> %a, %b
   1592 // CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
   1593 // CHECK:   ret <2 x i32> [[SEXT_I]]
   1594 uint32x2_t test_vcge_f32(float32x2_t a, float32x2_t b) {
   1595   return vcge_f32(a, b);
   1596 }
   1597 
   1598 // CHECK-LABEL: define <8 x i8> @test_vcge_u8(<8 x i8> %a, <8 x i8> %b) #0 {
   1599 // CHECK:   [[CMP_I:%.*]] = icmp uge <8 x i8> %a, %b
   1600 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
   1601 // CHECK:   ret <8 x i8> [[SEXT_I]]
   1602 uint8x8_t test_vcge_u8(uint8x8_t a, uint8x8_t b) {
   1603   return vcge_u8(a, b);
   1604 }
   1605 
   1606 // CHECK-LABEL: define <4 x i16> @test_vcge_u16(<4 x i16> %a, <4 x i16> %b) #0 {
   1607 // CHECK:   [[CMP_I:%.*]] = icmp uge <4 x i16> %a, %b
   1608 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
   1609 // CHECK:   ret <4 x i16> [[SEXT_I]]
   1610 uint16x4_t test_vcge_u16(uint16x4_t a, uint16x4_t b) {
   1611   return vcge_u16(a, b);
   1612 }
   1613 
   1614 // CHECK-LABEL: define <2 x i32> @test_vcge_u32(<2 x i32> %a, <2 x i32> %b) #0 {
   1615 // CHECK:   [[CMP_I:%.*]] = icmp uge <2 x i32> %a, %b
   1616 // CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
   1617 // CHECK:   ret <2 x i32> [[SEXT_I]]
   1618 uint32x2_t test_vcge_u32(uint32x2_t a, uint32x2_t b) {
   1619   return vcge_u32(a, b);
   1620 }
   1621 
   1622 // CHECK-LABEL: define <16 x i8> @test_vcgeq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
   1623 // CHECK:   [[CMP_I:%.*]] = icmp sge <16 x i8> %a, %b
   1624 // CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
   1625 // CHECK:   ret <16 x i8> [[SEXT_I]]
   1626 uint8x16_t test_vcgeq_s8(int8x16_t a, int8x16_t b) {
   1627   return vcgeq_s8(a, b);
   1628 }
   1629 
   1630 // CHECK-LABEL: define <8 x i16> @test_vcgeq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   1631 // CHECK:   [[CMP_I:%.*]] = icmp sge <8 x i16> %a, %b
   1632 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
   1633 // CHECK:   ret <8 x i16> [[SEXT_I]]
   1634 uint16x8_t test_vcgeq_s16(int16x8_t a, int16x8_t b) {
   1635   return vcgeq_s16(a, b);
   1636 }
   1637 
   1638 // CHECK-LABEL: define <4 x i32> @test_vcgeq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   1639 // CHECK:   [[CMP_I:%.*]] = icmp sge <4 x i32> %a, %b
   1640 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
   1641 // CHECK:   ret <4 x i32> [[SEXT_I]]
   1642 uint32x4_t test_vcgeq_s32(int32x4_t a, int32x4_t b) {
   1643   return vcgeq_s32(a, b);
   1644 }
   1645 
   1646 // CHECK-LABEL: define <4 x i32> @test_vcgeq_f32(<4 x float> %a, <4 x float> %b) #0 {
   1647 // CHECK:   [[CMP_I:%.*]] = fcmp oge <4 x float> %a, %b
   1648 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
   1649 // CHECK:   ret <4 x i32> [[SEXT_I]]
   1650 uint32x4_t test_vcgeq_f32(float32x4_t a, float32x4_t b) {
   1651   return vcgeq_f32(a, b);
   1652 }
   1653 
   1654 // CHECK-LABEL: define <16 x i8> @test_vcgeq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
   1655 // CHECK:   [[CMP_I:%.*]] = icmp uge <16 x i8> %a, %b
   1656 // CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
   1657 // CHECK:   ret <16 x i8> [[SEXT_I]]
   1658 uint8x16_t test_vcgeq_u8(uint8x16_t a, uint8x16_t b) {
   1659   return vcgeq_u8(a, b);
   1660 }
   1661 
   1662 // CHECK-LABEL: define <8 x i16> @test_vcgeq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
   1663 // CHECK:   [[CMP_I:%.*]] = icmp uge <8 x i16> %a, %b
   1664 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
   1665 // CHECK:   ret <8 x i16> [[SEXT_I]]
   1666 uint16x8_t test_vcgeq_u16(uint16x8_t a, uint16x8_t b) {
   1667   return vcgeq_u16(a, b);
   1668 }
   1669 
   1670 // CHECK-LABEL: define <4 x i32> @test_vcgeq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
   1671 // CHECK:   [[CMP_I:%.*]] = icmp uge <4 x i32> %a, %b
   1672 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
   1673 // CHECK:   ret <4 x i32> [[SEXT_I]]
   1674 uint32x4_t test_vcgeq_u32(uint32x4_t a, uint32x4_t b) {
   1675   return vcgeq_u32(a, b);
   1676 }
   1677 
   1678 
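// vcgt follows the same pattern with the strict predicates sgt/ugt/ogt.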
   1679 // CHECK-LABEL: define <8 x i8> @test_vcgt_s8(<8 x i8> %a, <8 x i8> %b) #0 {
   1680 // CHECK:   [[CMP_I:%.*]] = icmp sgt <8 x i8> %a, %b
   1681 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
   1682 // CHECK:   ret <8 x i8> [[SEXT_I]]
   1683 uint8x8_t test_vcgt_s8(int8x8_t a, int8x8_t b) {
   1684   return vcgt_s8(a, b);
   1685 }
   1686 
   1687 // CHECK-LABEL: define <4 x i16> @test_vcgt_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   1688 // CHECK:   [[CMP_I:%.*]] = icmp sgt <4 x i16> %a, %b
   1689 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
   1690 // CHECK:   ret <4 x i16> [[SEXT_I]]
   1691 uint16x4_t test_vcgt_s16(int16x4_t a, int16x4_t b) {
   1692   return vcgt_s16(a, b);
   1693 }
   1694 
   1695 // CHECK-LABEL: define <2 x i32> @test_vcgt_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   1696 // CHECK:   [[CMP_I:%.*]] = icmp sgt <2 x i32> %a, %b
   1697 // CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
   1698 // CHECK:   ret <2 x i32> [[SEXT_I]]
   1699 uint32x2_t test_vcgt_s32(int32x2_t a, int32x2_t b) {
   1700   return vcgt_s32(a, b);
   1701 }
   1702 
   1703 // CHECK-LABEL: define <2 x i32> @test_vcgt_f32(<2 x float> %a, <2 x float> %b) #0 {
   1704 // CHECK:   [[CMP_I:%.*]] = fcmp ogt <2 x float> %a, %b
   1705 // CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
   1706 // CHECK:   ret <2 x i32> [[SEXT_I]]
   1707 uint32x2_t test_vcgt_f32(float32x2_t a, float32x2_t b) {
   1708   return vcgt_f32(a, b);
   1709 }
   1710 
   1711 // CHECK-LABEL: define <8 x i8> @test_vcgt_u8(<8 x i8> %a, <8 x i8> %b) #0 {
   1712 // CHECK:   [[CMP_I:%.*]] = icmp ugt <8 x i8> %a, %b
   1713 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
   1714 // CHECK:   ret <8 x i8> [[SEXT_I]]
   1715 uint8x8_t test_vcgt_u8(uint8x8_t a, uint8x8_t b) {
   1716   return vcgt_u8(a, b);
   1717 }
   1718 
   1719 // CHECK-LABEL: define <4 x i16> @test_vcgt_u16(<4 x i16> %a, <4 x i16> %b) #0 {
   1720 // CHECK:   [[CMP_I:%.*]] = icmp ugt <4 x i16> %a, %b
   1721 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
   1722 // CHECK:   ret <4 x i16> [[SEXT_I]]
   1723 uint16x4_t test_vcgt_u16(uint16x4_t a, uint16x4_t b) {
   1724   return vcgt_u16(a, b);
   1725 }
   1726 
   1727 // CHECK-LABEL: define <2 x i32> @test_vcgt_u32(<2 x i32> %a, <2 x i32> %b) #0 {
   1728 // CHECK:   [[CMP_I:%.*]] = icmp ugt <2 x i32> %a, %b
   1729 // CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
   1730 // CHECK:   ret <2 x i32> [[SEXT_I]]
   1731 uint32x2_t test_vcgt_u32(uint32x2_t a, uint32x2_t b) {
   1732   return vcgt_u32(a, b);
   1733 }
   1734 
   1735 // CHECK-LABEL: define <16 x i8> @test_vcgtq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
   1736 // CHECK:   [[CMP_I:%.*]] = icmp sgt <16 x i8> %a, %b
   1737 // CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
   1738 // CHECK:   ret <16 x i8> [[SEXT_I]]
   1739 uint8x16_t test_vcgtq_s8(int8x16_t a, int8x16_t b) {
   1740   return vcgtq_s8(a, b);
   1741 }
   1742 
   1743 // CHECK-LABEL: define <8 x i16> @test_vcgtq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   1744 // CHECK:   [[CMP_I:%.*]] = icmp sgt <8 x i16> %a, %b
   1745 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
   1746 // CHECK:   ret <8 x i16> [[SEXT_I]]
   1747 uint16x8_t test_vcgtq_s16(int16x8_t a, int16x8_t b) {
   1748   return vcgtq_s16(a, b);
   1749 }
   1750 
   1751 // CHECK-LABEL: define <4 x i32> @test_vcgtq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   1752 // CHECK:   [[CMP_I:%.*]] = icmp sgt <4 x i32> %a, %b
   1753 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
   1754 // CHECK:   ret <4 x i32> [[SEXT_I]]
   1755 uint32x4_t test_vcgtq_s32(int32x4_t a, int32x4_t b) {
   1756   return vcgtq_s32(a, b);
   1757 }
   1758 
   1759 // CHECK-LABEL: define <4 x i32> @test_vcgtq_f32(<4 x float> %a, <4 x float> %b) #0 {
   1760 // CHECK:   [[CMP_I:%.*]] = fcmp ogt <4 x float> %a, %b
   1761 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
   1762 // CHECK:   ret <4 x i32> [[SEXT_I]]
   1763 uint32x4_t test_vcgtq_f32(float32x4_t a, float32x4_t b) {
   1764   return vcgtq_f32(a, b);
   1765 }
   1766 
   1767 // CHECK-LABEL: define <16 x i8> @test_vcgtq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
   1768 // CHECK:   [[CMP_I:%.*]] = icmp ugt <16 x i8> %a, %b
   1769 // CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
   1770 // CHECK:   ret <16 x i8> [[SEXT_I]]
   1771 uint8x16_t test_vcgtq_u8(uint8x16_t a, uint8x16_t b) {
   1772   return vcgtq_u8(a, b);
   1773 }
   1774 
   1775 // CHECK-LABEL: define <8 x i16> @test_vcgtq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
   1776 // CHECK:   [[CMP_I:%.*]] = icmp ugt <8 x i16> %a, %b
   1777 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
   1778 // CHECK:   ret <8 x i16> [[SEXT_I]]
   1779 uint16x8_t test_vcgtq_u16(uint16x8_t a, uint16x8_t b) {
   1780   return vcgtq_u16(a, b);
   1781 }
   1782 
   1783 // CHECK-LABEL: define <4 x i32> @test_vcgtq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
   1784 // CHECK:   [[CMP_I:%.*]] = icmp ugt <4 x i32> %a, %b
   1785 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
   1786 // CHECK:   ret <4 x i32> [[SEXT_I]]
   1787 uint32x4_t test_vcgtq_u32(uint32x4_t a, uint32x4_t b) {
   1788   return vcgtq_u32(a, b);
   1789 }
   1790 
   1791 
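// vcle uses sle/ule/ole directly; presumably the backend later matches it
// as a VCGE with swapped operands, but at this level it is just a compare
// plus sign-extension.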
   1792 // CHECK-LABEL: define <8 x i8> @test_vcle_s8(<8 x i8> %a, <8 x i8> %b) #0 {
   1793 // CHECK:   [[CMP_I:%.*]] = icmp sle <8 x i8> %a, %b
   1794 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
   1795 // CHECK:   ret <8 x i8> [[SEXT_I]]
   1796 uint8x8_t test_vcle_s8(int8x8_t a, int8x8_t b) {
   1797   return vcle_s8(a, b);
   1798 }
   1799 
   1800 // CHECK-LABEL: define <4 x i16> @test_vcle_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   1801 // CHECK:   [[CMP_I:%.*]] = icmp sle <4 x i16> %a, %b
   1802 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
   1803 // CHECK:   ret <4 x i16> [[SEXT_I]]
   1804 uint16x4_t test_vcle_s16(int16x4_t a, int16x4_t b) {
   1805   return vcle_s16(a, b);
   1806 }
   1807 
   1808 // CHECK-LABEL: define <2 x i32> @test_vcle_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   1809 // CHECK:   [[CMP_I:%.*]] = icmp sle <2 x i32> %a, %b
   1810 // CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
   1811 // CHECK:   ret <2 x i32> [[SEXT_I]]
   1812 uint32x2_t test_vcle_s32(int32x2_t a, int32x2_t b) {
   1813   return vcle_s32(a, b);
   1814 }
   1815 
   1816 // CHECK-LABEL: define <2 x i32> @test_vcle_f32(<2 x float> %a, <2 x float> %b) #0 {
   1817 // CHECK:   [[CMP_I:%.*]] = fcmp ole <2 x float> %a, %b
   1818 // CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
   1819 // CHECK:   ret <2 x i32> [[SEXT_I]]
   1820 uint32x2_t test_vcle_f32(float32x2_t a, float32x2_t b) {
   1821   return vcle_f32(a, b);
   1822 }
   1823 
   1824 // CHECK-LABEL: define <8 x i8> @test_vcle_u8(<8 x i8> %a, <8 x i8> %b) #0 {
   1825 // CHECK:   [[CMP_I:%.*]] = icmp ule <8 x i8> %a, %b
   1826 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
   1827 // CHECK:   ret <8 x i8> [[SEXT_I]]
   1828 uint8x8_t test_vcle_u8(uint8x8_t a, uint8x8_t b) {
   1829   return vcle_u8(a, b);
   1830 }
   1831 
   1832 // CHECK-LABEL: define <4 x i16> @test_vcle_u16(<4 x i16> %a, <4 x i16> %b) #0 {
   1833 // CHECK:   [[CMP_I:%.*]] = icmp ule <4 x i16> %a, %b
   1834 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
   1835 // CHECK:   ret <4 x i16> [[SEXT_I]]
   1836 uint16x4_t test_vcle_u16(uint16x4_t a, uint16x4_t b) {
   1837   return vcle_u16(a, b);
   1838 }
   1839 
   1840 // CHECK-LABEL: define <2 x i32> @test_vcle_u32(<2 x i32> %a, <2 x i32> %b) #0 {
   1841 // CHECK:   [[CMP_I:%.*]] = icmp ule <2 x i32> %a, %b
   1842 // CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
   1843 // CHECK:   ret <2 x i32> [[SEXT_I]]
   1844 uint32x2_t test_vcle_u32(uint32x2_t a, uint32x2_t b) {
   1845   return vcle_u32(a, b);
   1846 }
   1847 
   1848 // CHECK-LABEL: define <16 x i8> @test_vcleq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
   1849 // CHECK:   [[CMP_I:%.*]] = icmp sle <16 x i8> %a, %b
   1850 // CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
   1851 // CHECK:   ret <16 x i8> [[SEXT_I]]
   1852 uint8x16_t test_vcleq_s8(int8x16_t a, int8x16_t b) {
   1853   return vcleq_s8(a, b);
   1854 }
   1855 
   1856 // CHECK-LABEL: define <8 x i16> @test_vcleq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   1857 // CHECK:   [[CMP_I:%.*]] = icmp sle <8 x i16> %a, %b
   1858 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
   1859 // CHECK:   ret <8 x i16> [[SEXT_I]]
   1860 uint16x8_t test_vcleq_s16(int16x8_t a, int16x8_t b) {
   1861   return vcleq_s16(a, b);
   1862 }
   1863 
   1864 // CHECK-LABEL: define <4 x i32> @test_vcleq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   1865 // CHECK:   [[CMP_I:%.*]] = icmp sle <4 x i32> %a, %b
   1866 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
   1867 // CHECK:   ret <4 x i32> [[SEXT_I]]
   1868 uint32x4_t test_vcleq_s32(int32x4_t a, int32x4_t b) {
   1869   return vcleq_s32(a, b);
   1870 }
   1871 
   1872 // CHECK-LABEL: define <4 x i32> @test_vcleq_f32(<4 x float> %a, <4 x float> %b) #0 {
   1873 // CHECK:   [[CMP_I:%.*]] = fcmp ole <4 x float> %a, %b
   1874 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
   1875 // CHECK:   ret <4 x i32> [[SEXT_I]]
   1876 uint32x4_t test_vcleq_f32(float32x4_t a, float32x4_t b) {
   1877   return vcleq_f32(a, b);
   1878 }
   1879 
   1880 // CHECK-LABEL: define <16 x i8> @test_vcleq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
   1881 // CHECK:   [[CMP_I:%.*]] = icmp ule <16 x i8> %a, %b
   1882 // CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
   1883 // CHECK:   ret <16 x i8> [[SEXT_I]]
   1884 uint8x16_t test_vcleq_u8(uint8x16_t a, uint8x16_t b) {
   1885   return vcleq_u8(a, b);
   1886 }
   1887 
   1888 // CHECK-LABEL: define <8 x i16> @test_vcleq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
   1889 // CHECK:   [[CMP_I:%.*]] = icmp ule <8 x i16> %a, %b
   1890 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
   1891 // CHECK:   ret <8 x i16> [[SEXT_I]]
   1892 uint16x8_t test_vcleq_u16(uint16x8_t a, uint16x8_t b) {
   1893   return vcleq_u16(a, b);
   1894 }
   1895 
   1896 // CHECK-LABEL: define <4 x i32> @test_vcleq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
   1897 // CHECK:   [[CMP_I:%.*]] = icmp ule <4 x i32> %a, %b
   1898 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
   1899 // CHECK:   ret <4 x i32> [[SEXT_I]]
   1900 uint32x4_t test_vcleq_u32(uint32x4_t a, uint32x4_t b) {
   1901   return vcleq_u32(a, b);
   1902 }
   1903 
   1904 
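// vcls ("count leading sign bits") counts, per lane, the consecutive bits
// after the sign bit that are equal to it; a rough scalar model gives
// cls8(0x03) == 5 and cls8(0xFC) == 5. It is exposed here only for signed
// element types, hence no _u variants below.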
   1905 // CHECK-LABEL: define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 {
   1906 // CHECK:   [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a) #4
   1907 // CHECK:   ret <8 x i8> [[VCLS_V_I]]
   1908 int8x8_t test_vcls_s8(int8x8_t a) {
   1909   return vcls_s8(a);
   1910 }
   1911 
   1912 // CHECK-LABEL: define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 {
   1913 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   1914 // CHECK:   [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   1915 // CHECK:   [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> [[VCLS_V_I]]) #4
   1916 // CHECK:   [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
   1917 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <4 x i16>
   1918 // CHECK:   ret <4 x i16> [[TMP1]]
   1919 int16x4_t test_vcls_s16(int16x4_t a) {
   1920   return vcls_s16(a);
   1921 }
   1922 
   1923 // CHECK-LABEL: define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 {
   1924 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   1925 // CHECK:   [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   1926 // CHECK:   [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> [[VCLS_V_I]]) #4
   1927 // CHECK:   [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
   1928 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <2 x i32>
   1929 // CHECK:   ret <2 x i32> [[TMP1]]
   1930 int32x2_t test_vcls_s32(int32x2_t a) {
   1931   return vcls_s32(a);
   1932 }
   1933 
   1934 // CHECK-LABEL: define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 {
   1935 // CHECK:   [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a) #4
   1936 // CHECK:   ret <16 x i8> [[VCLSQ_V_I]]
   1937 int8x16_t test_vclsq_s8(int8x16_t a) {
   1938   return vclsq_s8(a);
   1939 }
   1940 
   1941 // CHECK-LABEL: define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 {
   1942 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   1943 // CHECK:   [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   1944 // CHECK:   [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> [[VCLSQ_V_I]]) #4
   1945 // CHECK:   [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
   1946 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <8 x i16>
   1947 // CHECK:   ret <8 x i16> [[TMP1]]
   1948 int16x8_t test_vclsq_s16(int16x8_t a) {
   1949   return vclsq_s16(a);
   1950 }
   1951 
   1952 // CHECK-LABEL: define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 {
   1953 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   1954 // CHECK:   [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   1955 // CHECK:   [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> [[VCLSQ_V_I]]) #4
   1956 // CHECK:   [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
   1957 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <4 x i32>
   1958 // CHECK:   ret <4 x i32> [[TMP1]]
   1959 int32x4_t test_vclsq_s32(int32x4_t a) {
   1960   return vclsq_s32(a);
   1961 }
   1962 
   1963 
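// vclt completes the set with the predicates slt/ult/olt.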
   1964 // CHECK-LABEL: define <8 x i8> @test_vclt_s8(<8 x i8> %a, <8 x i8> %b) #0 {
   1965 // CHECK:   [[CMP_I:%.*]] = icmp slt <8 x i8> %a, %b
   1966 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
   1967 // CHECK:   ret <8 x i8> [[SEXT_I]]
   1968 uint8x8_t test_vclt_s8(int8x8_t a, int8x8_t b) {
   1969   return vclt_s8(a, b);
   1970 }
   1971 
   1972 // CHECK-LABEL: define <4 x i16> @test_vclt_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   1973 // CHECK:   [[CMP_I:%.*]] = icmp slt <4 x i16> %a, %b
   1974 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
   1975 // CHECK:   ret <4 x i16> [[SEXT_I]]
   1976 uint16x4_t test_vclt_s16(int16x4_t a, int16x4_t b) {
   1977   return vclt_s16(a, b);
   1978 }
   1979 
   1980 // CHECK-LABEL: define <2 x i32> @test_vclt_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   1981 // CHECK:   [[CMP_I:%.*]] = icmp slt <2 x i32> %a, %b
   1982 // CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
   1983 // CHECK:   ret <2 x i32> [[SEXT_I]]
   1984 uint32x2_t test_vclt_s32(int32x2_t a, int32x2_t b) {
   1985   return vclt_s32(a, b);
   1986 }
   1987 
   1988 // CHECK-LABEL: define <2 x i32> @test_vclt_f32(<2 x float> %a, <2 x float> %b) #0 {
   1989 // CHECK:   [[CMP_I:%.*]] = fcmp olt <2 x float> %a, %b
   1990 // CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
   1991 // CHECK:   ret <2 x i32> [[SEXT_I]]
   1992 uint32x2_t test_vclt_f32(float32x2_t a, float32x2_t b) {
   1993   return vclt_f32(a, b);
   1994 }
   1995 
   1996 // CHECK-LABEL: define <8 x i8> @test_vclt_u8(<8 x i8> %a, <8 x i8> %b) #0 {
   1997 // CHECK:   [[CMP_I:%.*]] = icmp ult <8 x i8> %a, %b
   1998 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
   1999 // CHECK:   ret <8 x i8> [[SEXT_I]]
   2000 uint8x8_t test_vclt_u8(uint8x8_t a, uint8x8_t b) {
   2001   return vclt_u8(a, b);
   2002 }
   2003 
   2004 // CHECK-LABEL: define <4 x i16> @test_vclt_u16(<4 x i16> %a, <4 x i16> %b) #0 {
   2005 // CHECK:   [[CMP_I:%.*]] = icmp ult <4 x i16> %a, %b
   2006 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
   2007 // CHECK:   ret <4 x i16> [[SEXT_I]]
   2008 uint16x4_t test_vclt_u16(uint16x4_t a, uint16x4_t b) {
   2009   return vclt_u16(a, b);
   2010 }
   2011 
   2012 // CHECK-LABEL: define <2 x i32> @test_vclt_u32(<2 x i32> %a, <2 x i32> %b) #0 {
   2013 // CHECK:   [[CMP_I:%.*]] = icmp ult <2 x i32> %a, %b
   2014 // CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
   2015 // CHECK:   ret <2 x i32> [[SEXT_I]]
   2016 uint32x2_t test_vclt_u32(uint32x2_t a, uint32x2_t b) {
   2017   return vclt_u32(a, b);
   2018 }
   2019 
   2020 // CHECK-LABEL: define <16 x i8> @test_vcltq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
   2021 // CHECK:   [[CMP_I:%.*]] = icmp slt <16 x i8> %a, %b
   2022 // CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
   2023 // CHECK:   ret <16 x i8> [[SEXT_I]]
   2024 uint8x16_t test_vcltq_s8(int8x16_t a, int8x16_t b) {
   2025   return vcltq_s8(a, b);
   2026 }
   2027 
   2028 // CHECK-LABEL: define <8 x i16> @test_vcltq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   2029 // CHECK:   [[CMP_I:%.*]] = icmp slt <8 x i16> %a, %b
   2030 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
   2031 // CHECK:   ret <8 x i16> [[SEXT_I]]
   2032 uint16x8_t test_vcltq_s16(int16x8_t a, int16x8_t b) {
   2033   return vcltq_s16(a, b);
   2034 }
   2035 
   2036 // CHECK-LABEL: define <4 x i32> @test_vcltq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   2037 // CHECK:   [[CMP_I:%.*]] = icmp slt <4 x i32> %a, %b
   2038 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
   2039 // CHECK:   ret <4 x i32> [[SEXT_I]]
   2040 uint32x4_t test_vcltq_s32(int32x4_t a, int32x4_t b) {
   2041   return vcltq_s32(a, b);
   2042 }
   2043 
   2044 // CHECK-LABEL: define <4 x i32> @test_vcltq_f32(<4 x float> %a, <4 x float> %b) #0 {
   2045 // CHECK:   [[CMP_I:%.*]] = fcmp olt <4 x float> %a, %b
   2046 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
   2047 // CHECK:   ret <4 x i32> [[SEXT_I]]
   2048 uint32x4_t test_vcltq_f32(float32x4_t a, float32x4_t b) {
   2049   return vcltq_f32(a, b);
   2050 }
   2051 
   2052 // CHECK-LABEL: define <16 x i8> @test_vcltq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
   2053 // CHECK:   [[CMP_I:%.*]] = icmp ult <16 x i8> %a, %b
   2054 // CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
   2055 // CHECK:   ret <16 x i8> [[SEXT_I]]
   2056 uint8x16_t test_vcltq_u8(uint8x16_t a, uint8x16_t b) {
   2057   return vcltq_u8(a, b);
   2058 }
   2059 
   2060 // CHECK-LABEL: define <8 x i16> @test_vcltq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
   2061 // CHECK:   [[CMP_I:%.*]] = icmp ult <8 x i16> %a, %b
   2062 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
   2063 // CHECK:   ret <8 x i16> [[SEXT_I]]
   2064 uint16x8_t test_vcltq_u16(uint16x8_t a, uint16x8_t b) {
   2065   return vcltq_u16(a, b);
   2066 }
   2067 
   2068 // CHECK-LABEL: define <4 x i32> @test_vcltq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
   2069 // CHECK:   [[CMP_I:%.*]] = icmp ult <4 x i32> %a, %b
   2070 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
   2071 // CHECK:   ret <4 x i32> [[SEXT_I]]
   2072 uint32x4_t test_vcltq_u32(uint32x4_t a, uint32x4_t b) {
   2073   return vcltq_u32(a, b);
   2074 }
   2075 
   2076 
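// vclz lowers to the generic @llvm.ctlz intrinsic; the trailing i1 false
// argument means a zero input is defined and yields the element width. A
// rough scalar model (clz8 is a hypothetical helper, not part of this
// test):
//   int clz8(uint8_t x) {
//     int n = 0;
//     while (n < 8 && !(x & (0x80u >> n))) ++n;
//     return n;   /* clz8(0) == 8 */
//   }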
// CHECK-LABEL: define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 {
// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4
// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
int8x8_t test_vclz_s8(int8x8_t a) {
  return vclz_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP1]]
int16x4_t test_vclz_s16(int16x4_t a) {
  return vclz_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP1]]
int32x2_t test_vclz_s32(int32x2_t a) {
  return vclz_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vclz_u8(<8 x i8> %a) #0 {
// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4
// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
uint8x8_t test_vclz_u8(uint8x8_t a) {
  return vclz_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vclz_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP1]]
uint16x4_t test_vclz_u16(uint16x4_t a) {
  return vclz_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vclz_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP1]]
uint32x2_t test_vclz_u32(uint32x2_t a) {
  return vclz_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 {
// CHECK:   [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4
// CHECK:   ret <16 x i8> [[VCLZQ_V_I]]
int8x16_t test_vclzq_s8(int8x16_t a) {
  return vclzq_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #4
// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP1]]
int16x8_t test_vclzq_s16(int16x8_t a) {
  return vclzq_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #4
// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP1]]
int32x4_t test_vclzq_s32(int32x4_t a) {
  return vclzq_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vclzq_u8(<16 x i8> %a) #0 {
// CHECK:   [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4
// CHECK:   ret <16 x i8> [[VCLZQ_V_I]]
uint8x16_t test_vclzq_u8(uint8x16_t a) {
  return vclzq_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vclzq_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #4
// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP1]]
uint16x8_t test_vclzq_u16(uint16x8_t a) {
  return vclzq_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vclzq_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #4
// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP1]]
uint32x4_t test_vclzq_u32(uint32x4_t a) {
  return vclzq_u32(a);
}

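// vcnt is a per-byte population count and lowers to llvm.ctpop.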
// CHECK-LABEL: define <8 x i8> @test_vcnt_u8(<8 x i8> %a) #0 {
// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
// CHECK:   ret <8 x i8> [[VCNT_V_I]]
uint8x8_t test_vcnt_u8(uint8x8_t a) {
  return vcnt_u8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 {
// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
// CHECK:   ret <8 x i8> [[VCNT_V_I]]
int8x8_t test_vcnt_s8(int8x8_t a) {
  return vcnt_s8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vcnt_p8(<8 x i8> %a) #0 {
// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
// CHECK:   ret <8 x i8> [[VCNT_V_I]]
poly8x8_t test_vcnt_p8(poly8x8_t a) {
  return vcnt_p8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vcntq_u8(<16 x i8> %a) #0 {
// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
uint8x16_t test_vcntq_u8(uint8x16_t a) {
  return vcntq_u8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 {
// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
int8x16_t test_vcntq_s8(int8x16_t a) {
  return vcntq_s8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vcntq_p8(<16 x i8> %a) #0 {
// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
poly8x16_t test_vcntq_p8(poly8x16_t a) {
  return vcntq_p8(a);
}

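// vcombine concatenates two 64-bit vectors into a single 128-bit vector via
// one shufflevector whose mask enumerates all lanes in order.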
// CHECK-LABEL: define <16 x i8> @test_vcombine_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vcombine_s8(int8x8_t a, int8x8_t b) {
  return vcombine_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcombine_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vcombine_s16(int16x4_t a, int16x4_t b) {
  return vcombine_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcombine_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
int32x4_t test_vcombine_s32(int32x2_t a, int32x2_t b) {
  return vcombine_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vcombine_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
int64x2_t test_vcombine_s64(int64x1_t a, int64x1_t b) {
  return vcombine_s64(a, b);
}

// CHECK-LABEL: define <8 x half> @test_vcombine_f16(<4 x half> %a, <4 x half> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x half> [[SHUFFLE_I]]
float16x8_t test_vcombine_f16(float16x4_t a, float16x4_t b) {
  return vcombine_f16(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vcombine_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x float> [[SHUFFLE_I]]
float32x4_t test_vcombine_f32(float32x2_t a, float32x2_t b) {
  return vcombine_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcombine_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vcombine_u8(uint8x8_t a, uint8x8_t b) {
  return vcombine_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcombine_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vcombine_u16(uint16x4_t a, uint16x4_t b) {
  return vcombine_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcombine_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
uint32x4_t test_vcombine_u32(uint32x2_t a, uint32x2_t b) {
  return vcombine_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vcombine_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
uint64x2_t test_vcombine_u64(uint64x1_t a, uint64x1_t b) {
  return vcombine_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcombine_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vcombine_p8(poly8x8_t a, poly8x8_t b) {
  return vcombine_p8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcombine_p16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vcombine_p16(poly16x4_t a, poly16x4_t b) {
  return vcombine_p16(a, b);
}

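// vcreate reinterprets a 64-bit scalar as a 64-bit vector, i.e. a plain
// bitcast in IR. These tests feed the result through another intrinsic so
// the conversion has an observable use.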
// CHECK-LABEL: define <8 x i8> @test_vcreate_s8(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) #4
// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
int8x8_t test_vcreate_s8(uint64_t a) {
  return vclz_s8(vcreate_s8(a));
}

// CHECK-LABEL: define <4 x i16> @test_vcreate_s16(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vcreate_s16(uint64_t a) {
  return vclz_s16(vcreate_s16(a));
}

// CHECK-LABEL: define <2 x i32> @test_vcreate_s32(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vcreate_s32(uint64_t a) {
  return vclz_s32(vcreate_s32(a));
}

// CHECK-LABEL: define <4 x half> @test_vcreate_f16(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vcreate_f16(uint64_t a) {
  return vcreate_f16(a);
}

// CHECK-LABEL: define <2 x float> @test_vcreate_f32(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vcreate_f32(uint64_t a) {
  return vcreate_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vcreate_u8(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) #4
// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
uint8x8_t test_vcreate_u8(uint64_t a) {
  return vclz_u8(vcreate_u8(a));
}

// CHECK-LABEL: define <4 x i16> @test_vcreate_u16(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vcreate_u16(uint64_t a) {
  return vclz_u16(vcreate_u16(a));
}

// CHECK-LABEL: define <2 x i32> @test_vcreate_u32(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vcreate_u32(uint64_t a) {
  return vclz_u32(vcreate_u32(a));
}


// There are two ways of lowering this: either with one 'vmov d, r, r' or
// with two 'vmov d[], r'. LLVM does the latter. We may want to be less
// strict about the matching pattern if it starts causing problems.
// CHECK-LABEL: define <1 x i64> @test_vcreate_u64(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
// CHECK:   ret <1 x i64> [[ADD_I]]
uint64x1_t test_vcreate_u64(uint64_t a) {
  uint64x1_t tmp = vcreate_u64(a);
  return vadd_u64(tmp, tmp);
}

// CHECK-LABEL: define <8 x i8> @test_vcreate_p8(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]]) #4
// CHECK:   ret <8 x i8> [[VCNT_V_I]]
poly8x8_t test_vcreate_p8(uint64_t a) {
  return vcnt_p8(vcreate_p8(a));
}

// CHECK-LABEL: define <4 x i16> @test_vcreate_p16(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]]) #4
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP4]]
poly16x4_t test_vcreate_p16(uint64_t a) {
  poly16x4_t tmp = vcreate_p16(a);
  return vbsl_p16(tmp, tmp, tmp);
}

// CHECK-LABEL: define <1 x i64> @test_vcreate_s64(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
// CHECK:   ret <1 x i64> [[ADD_I]]
int64x1_t test_vcreate_s64(uint64_t a) {
  int64x1_t tmp = vcreate_s64(a);
  return vadd_s64(tmp, tmp);
}

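// The narrowing f32 -> f16 conversion maps to the vcvtfp2hf intrinsic.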
// CHECK-LABEL: define <4 x half> @test_vcvt_f16_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[VCVT_F16_F32_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I]]) #4
// CHECK:   [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
// CHECK:   ret <4 x half> [[TMP1]]
float16x4_t test_vcvt_f16_f32(float32x4_t a) {
  return vcvt_f16_f32(a);
}

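// Integer-to-float conversions lower to plain sitofp/uitofp instructions.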
// CHECK-LABEL: define <2 x float> @test_vcvt_f32_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCVT_I:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
// CHECK:   ret <2 x float> [[VCVT_I]]
float32x2_t test_vcvt_f32_s32(int32x2_t a) {
  return vcvt_f32_s32(a);
}

// CHECK-LABEL: define <2 x float> @test_vcvt_f32_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCVT_I:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x float>
// CHECK:   ret <2 x float> [[VCVT_I]]
float32x2_t test_vcvt_f32_u32(uint32x2_t a) {
  return vcvt_f32_u32(a);
}

// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCVT_I:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
// CHECK:   ret <4 x float> [[VCVT_I]]
float32x4_t test_vcvtq_f32_s32(int32x4_t a) {
  return vcvtq_f32_s32(a);
}

// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCVT_I:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
// CHECK:   ret <4 x float> [[VCVT_I]]
float32x4_t test_vcvtq_f32_u32(uint32x4_t a) {
  return vcvtq_f32_u32(a);
}

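// The widening f16 -> f32 conversion maps to the vcvthf2fp intrinsic.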
// CHECK-LABEL: define <4 x float> @test_vcvt_f32_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK:   [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) #4
// CHECK:   [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I]] to <4 x float>
// CHECK:   ret <4 x float> [[TMP1]]
float32x4_t test_vcvt_f32_f16(float16x4_t a) {
  return vcvt_f32_f16(a);
}

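// The fixed-point conversions (vcvt_n_*) pass the fraction-bits immediate
// through unchanged as the second argument of the fixed-point vcvt
// intrinsics.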
// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
// CHECK:   ret <2 x float> [[VCVT_N1]]
float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
  return vcvt_n_f32_s32(a, 1);
}

// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
// CHECK:   ret <2 x float> [[VCVT_N1]]
float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) {
  return vcvt_n_f32_u32(a, 1);
}

// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
// CHECK:   ret <4 x float> [[VCVT_N1]]
float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) {
  return vcvtq_n_f32_s32(a, 3);
}

// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
// CHECK:   ret <4 x float> [[VCVT_N1]]
float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) {
  return vcvtq_n_f32_u32(a, 3);
}


// CHECK-LABEL: define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
// CHECK:   ret <2 x i32> [[VCVT_N1]]
int32x2_t test_vcvt_n_s32_f32(float32x2_t a) {
  return vcvt_n_s32_f32(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
// CHECK:   ret <4 x i32> [[VCVT_N1]]
int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) {
  return vcvtq_n_s32_f32(a, 3);
}


// CHECK-LABEL: define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
// CHECK:   ret <2 x i32> [[VCVT_N1]]
uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) {
  return vcvt_n_u32_f32(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
// CHECK:   ret <4 x i32> [[VCVT_N1]]
uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) {
  return vcvtq_n_u32_f32(a, 3);
}

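// Float-to-integer conversions lower to plain fptosi/fptoui instructions.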
// CHECK-LABEL: define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCVT_I:%.*]] = fptosi <2 x float> [[TMP1]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VCVT_I]]
int32x2_t test_vcvt_s32_f32(float32x2_t a) {
  return vcvt_s32_f32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCVT_I:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VCVT_I]]
int32x4_t test_vcvtq_s32_f32(float32x4_t a) {
  return vcvtq_s32_f32(a);
}


// CHECK-LABEL: define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCVT_I:%.*]] = fptoui <2 x float> [[TMP1]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VCVT_I]]
uint32x2_t test_vcvt_u32_f32(float32x2_t a) {
  return vcvt_u32_f32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCVT_I:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VCVT_I]]
uint32x4_t test_vcvtq_u32_f32(float32x4_t a) {
  return vcvtq_u32_f32(a);
}

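// vdup_lane/vdupq_lane splat one lane via a shufflevector whose mask repeats
// the lane index (zeroinitializer for lane 0).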
// CHECK-LABEL: define <8 x i8> @test_vdup_lane_u8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   ret <8 x i8> [[SHUFFLE]]
uint8x8_t test_vdup_lane_u8(uint8x8_t a) {
  return vdup_lane_u8(a, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vdup_lane_u16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   ret <4 x i16> [[SHUFFLE]]
uint16x4_t test_vdup_lane_u16(uint16x4_t a) {
  return vdup_lane_u16(a, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vdup_lane_u32(<2 x i32> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1>
// CHECK:   ret <2 x i32> [[SHUFFLE]]
uint32x2_t test_vdup_lane_u32(uint32x2_t a) {
  return vdup_lane_u32(a, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vdup_lane_s8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   ret <8 x i8> [[SHUFFLE]]
int8x8_t test_vdup_lane_s8(int8x8_t a) {
  return vdup_lane_s8(a, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vdup_lane_s16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   ret <4 x i16> [[SHUFFLE]]
int16x4_t test_vdup_lane_s16(int16x4_t a) {
  return vdup_lane_s16(a, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vdup_lane_s32(<2 x i32> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1>
// CHECK:   ret <2 x i32> [[SHUFFLE]]
int32x2_t test_vdup_lane_s32(int32x2_t a) {
  return vdup_lane_s32(a, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vdup_lane_p8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   ret <8 x i8> [[SHUFFLE]]
poly8x8_t test_vdup_lane_p8(poly8x8_t a) {
  return vdup_lane_p8(a, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vdup_lane_p16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   ret <4 x i16> [[SHUFFLE]]
poly16x4_t test_vdup_lane_p16(poly16x4_t a) {
  return vdup_lane_p16(a, 3);
}

// CHECK-LABEL: define <2 x float> @test_vdup_lane_f32(<2 x float> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 1>
// CHECK:   ret <2 x float> [[SHUFFLE]]
float32x2_t test_vdup_lane_f32(float32x2_t a) {
  return vdup_lane_f32(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_u8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   ret <16 x i8> [[SHUFFLE]]
uint8x16_t test_vdupq_lane_u8(uint8x8_t a) {
  return vdupq_lane_u8(a, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_u16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   ret <8 x i16> [[SHUFFLE]]
uint16x8_t test_vdupq_lane_u16(uint16x4_t a) {
  return vdupq_lane_u16(a, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_u32(<2 x i32> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   ret <4 x i32> [[SHUFFLE]]
uint32x4_t test_vdupq_lane_u32(uint32x2_t a) {
  return vdupq_lane_u32(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   ret <16 x i8> [[SHUFFLE]]
int8x16_t test_vdupq_lane_s8(int8x8_t a) {
  return vdupq_lane_s8(a, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   ret <8 x i16> [[SHUFFLE]]
int16x8_t test_vdupq_lane_s16(int16x4_t a) {
  return vdupq_lane_s16(a, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   ret <4 x i32> [[SHUFFLE]]
int32x4_t test_vdupq_lane_s32(int32x2_t a) {
  return vdupq_lane_s32(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_p8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   ret <16 x i8> [[SHUFFLE]]
poly8x16_t test_vdupq_lane_p8(poly8x8_t a) {
  return vdupq_lane_p8(a, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_p16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   ret <8 x i16> [[SHUFFLE]]
poly16x8_t test_vdupq_lane_p16(poly16x4_t a) {
  return vdupq_lane_p16(a, 3);
}

// CHECK-LABEL: define <4 x float> @test_vdupq_lane_f32(<2 x float> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   ret <4 x float> [[SHUFFLE]]
float32x4_t test_vdupq_lane_f32(float32x2_t a) {
  return vdupq_lane_f32(a, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vdup_lane_s64(<1 x i64> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[SHUFFLE]]
int64x1_t test_vdup_lane_s64(int64x1_t a) {
  return vdup_lane_s64(a, 0);
}

// CHECK-LABEL: define <1 x i64> @test_vdup_lane_u64(<1 x i64> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[SHUFFLE]]
uint64x1_t test_vdup_lane_u64(uint64x1_t a) {
  return vdup_lane_u64(a, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer
// CHECK:   ret <2 x i64> [[SHUFFLE]]
int64x2_t test_vdupq_lane_s64(int64x1_t a) {
  return vdupq_lane_s64(a, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_u64(<1 x i64> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer
// CHECK:   ret <2 x i64> [[SHUFFLE]]
uint64x2_t test_vdupq_lane_u64(uint64x1_t a) {
  return vdupq_lane_u64(a, 0);
}

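// vdup_n/vdupq_n splat a scalar with a chain of insertelement instructions,
// one per lane.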
// CHECK-LABEL: define <8 x i8> @test_vdup_n_u8(i8 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VECINIT7_I]]
uint8x8_t test_vdup_n_u8(uint8_t a) {
  return vdup_n_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vdup_n_u16(i16 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VECINIT3_I]]
uint16x4_t test_vdup_n_u16(uint16_t a) {
  return vdup_n_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vdup_n_u32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   ret <2 x i32> [[VECINIT1_I]]
uint32x2_t test_vdup_n_u32(uint32_t a) {
  return vdup_n_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vdup_n_s8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VECINIT7_I]]
int8x8_t test_vdup_n_s8(int8_t a) {
  return vdup_n_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vdup_n_s16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VECINIT3_I]]
int16x4_t test_vdup_n_s16(int16_t a) {
  return vdup_n_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vdup_n_s32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   ret <2 x i32> [[VECINIT1_I]]
int32x2_t test_vdup_n_s32(int32_t a) {
  return vdup_n_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vdup_n_p8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VECINIT7_I]]
poly8x8_t test_vdup_n_p8(poly8_t a) {
  return vdup_n_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vdup_n_p16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VECINIT3_I]]
poly16x4_t test_vdup_n_p16(poly16_t a) {
  return vdup_n_p16(a);
}

// CHECK-LABEL: define <4 x half> @test_vdup_n_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
// CHECK:   [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
// CHECK:   [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
// CHECK:   [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
// CHECK:   [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
// CHECK:   ret <4 x half> [[VECINIT3]]
float16x4_t test_vdup_n_f16(float16_t *a) {
  return vdup_n_f16(*a);
}

// CHECK-LABEL: define <2 x float> @test_vdup_n_f32(float %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
// CHECK:   ret <2 x float> [[VECINIT1_I]]
float32x2_t test_vdup_n_f32(float32_t a) {
  return vdup_n_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vdupq_n_u8(i8 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VECINIT15_I]]
uint8x16_t test_vdupq_n_u8(uint8_t a) {
  return vdupq_n_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vdupq_n_u16(i16 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VECINIT7_I]]
uint16x8_t test_vdupq_n_u16(uint16_t a) {
  return vdupq_n_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vdupq_n_u32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
// CHECK:   ret <4 x i32> [[VECINIT3_I]]
uint32x4_t test_vdupq_n_u32(uint32_t a) {
  return vdupq_n_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vdupq_n_s8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VECINIT15_I]]
int8x16_t test_vdupq_n_s8(int8_t a) {
  return vdupq_n_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vdupq_n_s16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VECINIT7_I]]
int16x8_t test_vdupq_n_s16(int16_t a) {
  return vdupq_n_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vdupq_n_s32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
// CHECK:   ret <4 x i32> [[VECINIT3_I]]
int32x4_t test_vdupq_n_s32(int32_t a) {
  return vdupq_n_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vdupq_n_p8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VECINIT15_I]]
poly8x16_t test_vdupq_n_p8(poly8_t a) {
  return vdupq_n_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vdupq_n_p16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VECINIT7_I]]
poly16x8_t test_vdupq_n_p16(poly16_t a) {
  return vdupq_n_p16(a);
}

// CHECK-LABEL: define <8 x half> @test_vdupq_n_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
// CHECK:   [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
// CHECK:   [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
// CHECK:   [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
// CHECK:   [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
// CHECK:   [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
// CHECK:   [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
// CHECK:   [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
// CHECK:   [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
// CHECK:   ret <8 x half> [[VECINIT7]]
float16x8_t test_vdupq_n_f16(float16_t *a) {
  return vdupq_n_f16(*a);
}

// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32(float %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
// CHECK:   ret <4 x float> [[VECINIT3_I]]
float32x4_t test_vdupq_n_f32(float32_t a) {
  return vdupq_n_f32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vdup_n_s64(i64 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
// CHECK:   ret <1 x i64> [[ADD_I]]
int64x1_t test_vdup_n_s64(int64_t a) {
  int64x1_t tmp = vdup_n_s64(a);
  return vadd_s64(tmp, tmp);
}

// CHECK-LABEL: define <1 x i64> @test_vdup_n_u64(i64 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
// CHECK:   ret <1 x i64> [[ADD_I]]
uint64x1_t test_vdup_n_u64(uint64_t a) {
  uint64x1_t tmp = vdup_n_u64(a);
  return vadd_u64(tmp, tmp);
}
   3052 
   3053 // CHECK-LABEL: define <2 x i64> @test_vdupq_n_s64(i64 %a) #0 {
   3054 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
   3055 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
   3056 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
   3057 // CHECK:   ret <2 x i64> [[ADD_I]]
   3058 int64x2_t test_vdupq_n_s64(int64_t a) {
   3059   int64x2_t tmp = vdupq_n_s64(a);
   3060   return vaddq_s64(tmp, tmp);
   3061 }
   3062 
   3063 // CHECK-LABEL: define <2 x i64> @test_vdupq_n_u64(i64 %a) #0 {
   3064 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
   3065 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
   3066 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
   3067 // CHECK:   ret <2 x i64> [[ADD_I]]
   3068 uint64x2_t test_vdupq_n_u64(uint64_t a) {
  uint64x2_t tmp = vdupq_n_u64(a);
  return vaddq_u64(tmp, tmp);
}

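// veor lowers to a plain IR xor.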
// CHECK-LABEL: define <8 x i8> @test_veor_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[XOR_I]]
int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) {
  return veor_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_veor_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[XOR_I]]
int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) {
  return veor_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_veor_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[XOR_I]]
int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) {
  return veor_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_veor_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[XOR_I]]
int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) {
  return veor_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_veor_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[XOR_I]]
uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) {
  return veor_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_veor_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[XOR_I]]
uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) {
  return veor_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_veor_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[XOR_I]]
uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) {
  return veor_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_veor_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[XOR_I]]
uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) {
  return veor_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_veorq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[XOR_I]]
int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) {
  return veorq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_veorq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[XOR_I]]
int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) {
  return veorq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_veorq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[XOR_I]]
int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) {
  return veorq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_veorq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[XOR_I]]
int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) {
  return veorq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_veorq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[XOR_I]]
uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) {
  return veorq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_veorq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[XOR_I]]
uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) {
  return veorq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_veorq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[XOR_I]]
uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) {
  return veorq_u32(a, b);
}

   3179 // CHECK-LABEL: define <2 x i64> @test_veorq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
   3180 // CHECK:   [[XOR_I:%.*]] = xor <2 x i64> %a, %b
   3181 // CHECK:   ret <2 x i64> [[XOR_I]]
   3182 uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) {
   3183   return veorq_u64(a, b);
   3184 }
   3185 
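// Editorial note (illustrative only): veor/veorq carry no NEON-specific
// semantics; they lower directly to the LLVM xor instruction, as the checks
// above show. The per-lane scalar equivalent, for reference:
static inline uint8_t sketch_eor(uint8_t x, uint8_t y) {
  return x ^ y; // lane-wise exclusive OR
}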

// CHECK-LABEL: define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK:   ret <8 x i8> [[VEXT]]
int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) {
  return vext_s8(a, b, 7);
}

// CHECK-LABEL: define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK:   ret <8 x i8> [[VEXT]]
uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) {
  return vext_u8(a, b, 7);
}

// CHECK-LABEL: define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK:   ret <8 x i8> [[VEXT]]
poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) {
  return vext_p8(a, b, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK:   ret <4 x i16> [[VEXT]]
int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
  return vext_s16(a, b, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK:   ret <4 x i16> [[VEXT]]
uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
  return vext_u16(a, b, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK:   ret <4 x i16> [[VEXT]]
poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
  return vext_p16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK:   ret <2 x i32> [[VEXT]]
int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
  return vext_s32(a, b, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK:   ret <2 x i32> [[VEXT]]
uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
  return vext_u32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[VEXT]]
int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
  return vext_s64(a, b, 0);
}

// CHECK-LABEL: define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[VEXT]]
uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
  return vext_u64(a, b, 0);
}

// CHECK-LABEL: define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK:   ret <2 x float> [[VEXT]]
float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) {
  return vext_f32(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
// CHECK:   ret <16 x i8> [[VEXT]]
int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
  return vextq_s8(a, b, 15);
}

// CHECK-LABEL: define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
// CHECK:   ret <16 x i8> [[VEXT]]
uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) {
  return vextq_u8(a, b, 15);
}

// CHECK-LABEL: define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
// CHECK:   ret <16 x i8> [[VEXT]]
poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) {
  return vextq_p8(a, b, 15);
}

// CHECK-LABEL: define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK:   ret <8 x i16> [[VEXT]]
int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
  return vextq_s16(a, b, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK:   ret <8 x i16> [[VEXT]]
uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
  return vextq_u16(a, b, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK:   ret <8 x i16> [[VEXT]]
poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
  return vextq_p16(a, b, 7);
}

// CHECK-LABEL: define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK:   ret <4 x i32> [[VEXT]]
int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
  return vextq_s32(a, b, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK:   ret <4 x i32> [[VEXT]]
uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
  return vextq_u32(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK:   ret <2 x i64> [[VEXT]]
int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
  return vextq_s64(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK:   ret <2 x i64> [[VEXT]]
uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
  return vextq_u64(a, b, 1);
}

// CHECK-LABEL: define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK:   ret <4 x float> [[VEXT]]
float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
  return vextq_f32(a, b, 3);
}

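// Editorial note (illustrative only): vext(a, b, n) conceptually concatenates
// a and b and extracts lanes n..n+L-1, which is exactly what the shufflevector
// masks above encode. A scalar model, assuming eight lanes:
static inline void sketch_ext8(const uint8_t a[8], const uint8_t b[8], int n,
                               uint8_t out[8]) {
  for (int i = 0; i != 8; ++i)
    out[i] = (i + n < 8) ? a[i + n] : b[i + n - 8]; // mask index i+n
}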

// CHECK-LABEL: define <2 x float> @test_vfma_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4
// CHECK:   ret <2 x float> [[TMP6]]
float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vfma_f32(a, b, c);
}

// CHECK-LABEL: define <4 x float> @test_vfmaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4
// CHECK:   ret <4 x float> [[TMP6]]
float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vfmaq_f32(a, b, c);
}

// CHECK-LABEL: define <2 x float> @test_vfms_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4
// CHECK:   ret <2 x float> [[TMP6]]
float32x2_t test_vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vfms_f32(a, b, c);
}

// CHECK-LABEL: define <4 x float> @test_vfmsq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4
// CHECK:   ret <4 x float> [[TMP6]]
float32x4_t test_vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vfmsq_f32(a, b, c);
}

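// Editorial note (illustrative only): vfma maps onto @llvm.fma with the
// accumulator as the last operand, and vfms is implemented by negating the
// first multiplicand (the fsub from -0.0 above) before the same fma call.
// A per-lane scalar sketch using the standard builtin:
static inline float sketch_fma_f32(float acc, float x, float y) {
  return __builtin_fmaf(x, y, acc);  // acc + x*y, single rounding
}
static inline float sketch_fms_f32(float acc, float x, float y) {
  return __builtin_fmaf(-x, y, acc); // acc - x*y, matching the fsub above
}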

// CHECK-LABEL: define <8 x i8> @test_vget_high_s8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vget_high_s8(int8x16_t a) {
  return vget_high_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vget_high_s16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vget_high_s16(int16x8_t a) {
  return vget_high_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vget_high_s32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
int32x2_t test_vget_high_s32(int32x4_t a) {
  return vget_high_s32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vget_high_s64(<2 x i64> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
int64x1_t test_vget_high_s64(int64x2_t a) {
  return vget_high_s64(a);
}

// CHECK-LABEL: define <4 x half> @test_vget_high_f16(<8 x half> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <4 x half> [[SHUFFLE_I]]
float16x4_t test_vget_high_f16(float16x8_t a) {
  return vget_high_f16(a);
}

// CHECK-LABEL: define <2 x float> @test_vget_high_f32(<4 x float> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   ret <2 x float> [[SHUFFLE_I]]
float32x2_t test_vget_high_f32(float32x4_t a) {
  return vget_high_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vget_high_u8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vget_high_u8(uint8x16_t a) {
  return vget_high_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vget_high_u16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vget_high_u16(uint16x8_t a) {
  return vget_high_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vget_high_u32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vget_high_u32(uint32x4_t a) {
  return vget_high_u32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(<2 x i64> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
uint64x1_t test_vget_high_u64(uint64x2_t a) {
  return vget_high_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vget_high_p8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vget_high_p8(poly8x16_t a) {
  return vget_high_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vget_high_p16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vget_high_p16(poly16x8_t a) {
  return vget_high_p16(a);
}

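// Editorial note (illustrative only): vget_high_* is a pure shuffle selecting
// the top half of the q-register (lanes L/2..L-1); no data moves beyond the
// shufflevector. Scalar sketch for the 8x16 -> 4x16 case:
static inline void sketch_get_high4(const int16_t v[8], int16_t out[4]) {
  for (int i = 0; i != 4; ++i)
    out[i] = v[4 + i]; // lanes 4..7
}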

// CHECK-LABEL: define zeroext i8 @test_vget_lane_u8(<8 x i8> %a) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
// CHECK:   ret i8 [[VGET_LANE]]
uint8_t test_vget_lane_u8(uint8x8_t a) {
  return vget_lane_u8(a, 7);
}

// CHECK-LABEL: define zeroext i16 @test_vget_lane_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK:   ret i16 [[VGET_LANE]]
uint16_t test_vget_lane_u16(uint16x4_t a) {
  return vget_lane_u16(a, 3);
}

// CHECK-LABEL: define i32 @test_vget_lane_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK:   ret i32 [[VGET_LANE]]
uint32_t test_vget_lane_u32(uint32x2_t a) {
  return vget_lane_u32(a, 1);
}

// CHECK-LABEL: define signext i8 @test_vget_lane_s8(<8 x i8> %a) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
// CHECK:   ret i8 [[VGET_LANE]]
int8_t test_vget_lane_s8(int8x8_t a) {
  return vget_lane_s8(a, 7);
}

// CHECK-LABEL: define signext i16 @test_vget_lane_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK:   ret i16 [[VGET_LANE]]
int16_t test_vget_lane_s16(int16x4_t a) {
  return vget_lane_s16(a, 3);
}

// CHECK-LABEL: define i32 @test_vget_lane_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK:   ret i32 [[VGET_LANE]]
int32_t test_vget_lane_s32(int32x2_t a) {
  return vget_lane_s32(a, 1);
}

// CHECK-LABEL: define signext i8 @test_vget_lane_p8(<8 x i8> %a) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
// CHECK:   ret i8 [[VGET_LANE]]
poly8_t test_vget_lane_p8(poly8x8_t a) {
  return vget_lane_p8(a, 7);
}

// CHECK-LABEL: define signext i16 @test_vget_lane_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK:   ret i16 [[VGET_LANE]]
poly16_t test_vget_lane_p16(poly16x4_t a) {
  return vget_lane_p16(a, 3);
}

// CHECK-LABEL: define float @test_vget_lane_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
// CHECK:   ret float [[VGET_LANE]]
float32_t test_vget_lane_f32(float32x2_t a) {
  return vget_lane_f32(a, 1);
}

// CHECK-LABEL: define float @test_vget_lane_f16(<4 x half> %a) #0 {
// CHECK:   [[__REINT_242:%.*]] = alloca <4 x half>, align 8
// CHECK:   [[__REINT1_242:%.*]] = alloca i16, align 2
// CHECK:   store <4 x half> %a, <4 x half>* [[__REINT_242]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_242]] to <4 x i16>*
// CHECK:   [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1
// CHECK:   store i16 [[VGET_LANE]], i16* [[__REINT1_242]], align 2
// CHECK:   [[TMP4:%.*]] = bitcast i16* [[__REINT1_242]] to half*
// CHECK:   [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
// CHECK:   [[CONV:%.*]] = fpext half [[TMP5]] to float
// CHECK:   ret float [[CONV]]
float32_t test_vget_lane_f16(float16x4_t a) {
  return vget_lane_f16(a, 1);
}

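// Editorial note (illustrative only): the integer and f32 lane reads above are
// a single extractelement, while the f16 case round-trips through memory
// (alloca/store/load) because the lane is reinterpreted as i16 before being
// widened to float. A rough model of that path, assuming ARM's __fp16 storage
// type is available on this target:
static inline float sketch_lane_f16(const uint16_t lanes[4], int i) {
  uint16_t bits = lanes[i];              // extractelement as i16
  __fp16 h;
  __builtin_memcpy(&h, &bits, sizeof h); // reinterpret i16 as half
  return (float)h;                       // fpext half -> float
}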
// CHECK-LABEL: define zeroext i8 @test_vgetq_lane_u8(<16 x i8> %a) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
// CHECK:   ret i8 [[VGET_LANE]]
uint8_t test_vgetq_lane_u8(uint8x16_t a) {
  return vgetq_lane_u8(a, 15);
}

// CHECK-LABEL: define zeroext i16 @test_vgetq_lane_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK:   ret i16 [[VGET_LANE]]
uint16_t test_vgetq_lane_u16(uint16x8_t a) {
  return vgetq_lane_u16(a, 7);
}

// CHECK-LABEL: define i32 @test_vgetq_lane_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK:   ret i32 [[VGET_LANE]]
uint32_t test_vgetq_lane_u32(uint32x4_t a) {
  return vgetq_lane_u32(a, 3);
}

// CHECK-LABEL: define signext i8 @test_vgetq_lane_s8(<16 x i8> %a) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
// CHECK:   ret i8 [[VGET_LANE]]
int8_t test_vgetq_lane_s8(int8x16_t a) {
  return vgetq_lane_s8(a, 15);
}

// CHECK-LABEL: define signext i16 @test_vgetq_lane_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK:   ret i16 [[VGET_LANE]]
int16_t test_vgetq_lane_s16(int16x8_t a) {
  return vgetq_lane_s16(a, 7);
}

// CHECK-LABEL: define i32 @test_vgetq_lane_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK:   ret i32 [[VGET_LANE]]
int32_t test_vgetq_lane_s32(int32x4_t a) {
  return vgetq_lane_s32(a, 3);
}

// CHECK-LABEL: define signext i8 @test_vgetq_lane_p8(<16 x i8> %a) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
// CHECK:   ret i8 [[VGET_LANE]]
poly8_t test_vgetq_lane_p8(poly8x16_t a) {
  return vgetq_lane_p8(a, 15);
}

// CHECK-LABEL: define signext i16 @test_vgetq_lane_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK:   ret i16 [[VGET_LANE]]
poly16_t test_vgetq_lane_p16(poly16x8_t a) {
  return vgetq_lane_p16(a, 7);
}

// CHECK-LABEL: define float @test_vgetq_lane_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
// CHECK:   ret float [[VGET_LANE]]
float32_t test_vgetq_lane_f32(float32x4_t a) {
  return vgetq_lane_f32(a, 3);
}

// CHECK-LABEL: define float @test_vgetq_lane_f16(<8 x half> %a) #0 {
// CHECK:   [[__REINT_244:%.*]] = alloca <8 x half>, align 16
// CHECK:   [[__REINT1_244:%.*]] = alloca i16, align 2
// CHECK:   store <8 x half> %a, <8 x half>* [[__REINT_244]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_244]] to <8 x i16>*
// CHECK:   [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
// CHECK:   store i16 [[VGET_LANE]], i16* [[__REINT1_244]], align 2
// CHECK:   [[TMP4:%.*]] = bitcast i16* [[__REINT1_244]] to half*
// CHECK:   [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
// CHECK:   [[CONV:%.*]] = fpext half [[TMP5]] to float
// CHECK:   ret float [[CONV]]
float32_t test_vgetq_lane_f16(float16x8_t a) {
  return vgetq_lane_f16(a, 3);
}

// The optimizer is able to remove all moves now.
// CHECK-LABEL: define i64 @test_vget_lane_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
// CHECK:   ret i64 [[VGET_LANE]]
int64_t test_vget_lane_s64(int64x1_t a) {
  return vget_lane_s64(a, 0);
}

// The optimizer is able to remove all moves now.
// CHECK-LABEL: define i64 @test_vget_lane_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
// CHECK:   ret i64 [[VGET_LANE]]
uint64_t test_vget_lane_u64(uint64x1_t a) {
  return vget_lane_u64(a, 0);
}

// CHECK-LABEL: define i64 @test_vgetq_lane_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
// CHECK:   ret i64 [[VGET_LANE]]
int64_t test_vgetq_lane_s64(int64x2_t a) {
  return vgetq_lane_s64(a, 1);
}

// CHECK-LABEL: define i64 @test_vgetq_lane_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
// CHECK:   ret i64 [[VGET_LANE]]
uint64_t test_vgetq_lane_u64(uint64x2_t a) {
  return vgetq_lane_u64(a, 1);
}

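// Editorial note (illustrative only): the 64-bit lane reads above still show
// the defensive bitcast round-trips through <8 x i8>/<16 x i8>; those pairs
// cancel, leaving only the extractelement. The net scalar effect:
static inline int64_t sketch_lane_s64(const int64_t v[2], int i) {
  return v[i]; // extractelement <2 x i64> %v, i32 i
}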

// CHECK-LABEL: define <8 x i8> @test_vget_low_s8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vget_low_s8(int8x16_t a) {
  return vget_low_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vget_low_s16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vget_low_s16(int16x8_t a) {
  return vget_low_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vget_low_s32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
int32x2_t test_vget_low_s32(int32x4_t a) {
  return vget_low_s32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vget_low_s64(<2 x i64> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
int64x1_t test_vget_low_s64(int64x2_t a) {
  return vget_low_s64(a);
}

// CHECK-LABEL: define <4 x half> @test_vget_low_f16(<8 x half> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x half> [[SHUFFLE_I]]
float16x4_t test_vget_low_f16(float16x8_t a) {
  return vget_low_f16(a);
}

// CHECK-LABEL: define <2 x float> @test_vget_low_f32(<4 x float> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x float> [[SHUFFLE_I]]
float32x2_t test_vget_low_f32(float32x4_t a) {
  return vget_low_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vget_low_u8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vget_low_u8(uint8x16_t a) {
  return vget_low_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vget_low_u16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vget_low_u16(uint16x8_t a) {
  return vget_low_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vget_low_u32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vget_low_u32(uint32x4_t a) {
  return vget_low_u32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vget_low_u64(<2 x i64> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
uint64x1_t test_vget_low_u64(uint64x2_t a) {
  return vget_low_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vget_low_p8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vget_low_p8(poly8x16_t a) {
  return vget_low_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vget_low_p16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vget_low_p16(poly16x8_t a) {
  return vget_low_p16(a);
}

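// Editorial note (illustrative only): vget_low_* mirrors vget_high_* but
// selects lanes 0..L/2-1 (the zeroinitializer mask in the <1 x i64> case is
// just lane 0). Scalar sketch:
static inline void sketch_get_low4(const int16_t v[8], int16_t out[4]) {
  for (int i = 0; i != 4; ++i)
    out[i] = v[i]; // lanes 0..3
}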

// CHECK-LABEL: define <8 x i8> @test_vhadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VHADD_V_I]]
int8x8_t test_vhadd_s8(int8x8_t a, int8x8_t b) {
  return vhadd_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vhadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4
// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vhadd_s16(int16x4_t a, int16x4_t b) {
  return vhadd_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vhadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4
// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vhadd_s32(int32x2_t a, int32x2_t b) {
  return vhadd_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vhadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VHADD_V_I]]
uint8x8_t test_vhadd_u8(uint8x8_t a, uint8x8_t b) {
  return vhadd_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vhadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4
// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vhadd_u16(uint16x4_t a, uint16x4_t b) {
  return vhadd_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vhadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4
// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vhadd_u32(uint32x2_t a, uint32x2_t b) {
  return vhadd_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vhaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VHADDQ_V_I]]
int8x16_t test_vhaddq_s8(int8x16_t a, int8x16_t b) {
  return vhaddq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vhaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4
// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b) {
  return vhaddq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vhaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4
// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vhaddq_s32(int32x4_t a, int32x4_t b) {
  return vhaddq_s32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vhaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VHADDQ_V_I]]
uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b) {
  return vhaddq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vhaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4
// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vhaddq_u16(uint16x8_t a, uint16x8_t b) {
  return vhaddq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vhaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4
// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b) {
  return vhaddq_u32(a, b);
}

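// Editorial note (illustrative only): vhadd is a halving add: the sum is
// formed without losing the carry and then shifted right by one, so it never
// wraps. A scalar sketch in double-width arithmetic:
static inline int16_t sketch_hadd_s16(int16_t x, int16_t y) {
  return (int16_t)(((int32_t)x + (int32_t)y) >> 1);
}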
   3988 
   3989 // CHECK-LABEL: define <8 x i8> @test_vhsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
   3990 // CHECK:   [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
   3991 // CHECK:   ret <8 x i8> [[VHSUB_V_I]]
   3992 int8x8_t test_vhsub_s8(int8x8_t a, int8x8_t b) {
   3993   return vhsub_s8(a, b);
   3994 }
   3995 
   3996 // CHECK-LABEL: define <4 x i16> @test_vhsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   3997 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   3998 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   3999 // CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   4000 // CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4001 // CHECK:   [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4
   4002 // CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
   4003 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
   4004 // CHECK:   ret <4 x i16> [[TMP2]]
   4005 int16x4_t test_vhsub_s16(int16x4_t a, int16x4_t b) {
   4006   return vhsub_s16(a, b);
   4007 }
   4008 
   4009 // CHECK-LABEL: define <2 x i32> @test_vhsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   4010 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   4011 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   4012 // CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   4013 // CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4014 // CHECK:   [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
   4015 // CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
   4016 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
   4017 // CHECK:   ret <2 x i32> [[TMP2]]
   4018 int32x2_t test_vhsub_s32(int32x2_t a, int32x2_t b) {
   4019   return vhsub_s32(a, b);
   4020 }
   4021 
   4022 // CHECK-LABEL: define <8 x i8> @test_vhsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
   4023 // CHECK:   [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
   4024 // CHECK:   ret <8 x i8> [[VHSUB_V_I]]
   4025 uint8x8_t test_vhsub_u8(uint8x8_t a, uint8x8_t b) {
   4026   return vhsub_u8(a, b);
   4027 }
   4028 
   4029 // CHECK-LABEL: define <4 x i16> @test_vhsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
   4030 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   4031 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   4032 // CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   4033 // CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4034 // CHECK:   [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4
   4035 // CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
   4036 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
   4037 // CHECK:   ret <4 x i16> [[TMP2]]
   4038 uint16x4_t test_vhsub_u16(uint16x4_t a, uint16x4_t b) {
   4039   return vhsub_u16(a, b);
   4040 }
   4041 
   4042 // CHECK-LABEL: define <2 x i32> @test_vhsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
   4043 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   4044 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   4045 // CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   4046 // CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4047 // CHECK:   [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
   4048 // CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
   4049 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
   4050 // CHECK:   ret <2 x i32> [[TMP2]]
   4051 uint32x2_t test_vhsub_u32(uint32x2_t a, uint32x2_t b) {
   4052   return vhsub_u32(a, b);
   4053 }
   4054 
// CHECK-LABEL: define <16 x i8> @test_vhsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VHSUBQ_V_I]]
int8x16_t test_vhsubq_s8(int8x16_t a, int8x16_t b) {
  return vhsubq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vhsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b) {
  return vhsubq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vhsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vhsubq_s32(int32x4_t a, int32x4_t b) {
  return vhsubq_s32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vhsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VHSUBQ_V_I]]
uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b) {
  return vhsubq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vhsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vhsubq_u16(uint16x8_t a, uint16x8_t b) {
  return vhsubq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vhsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b) {
  return vhsubq_u32(a, b);
}
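
// Not part of the FileCheck-verified tests: a minimal usage sketch of the
// vhsub family exercised above (the helper name is illustrative). vhsub
// computes the halved difference (a[i] - b[i]) >> 1 per lane, using a
// widened intermediate so the subtraction itself cannot overflow.
static inline int8x8_t vhsub_example_s8(int8x8_t a, int8x8_t b) {
  return vhsub_s8(a, b); // each lane: (a[i] - b[i]) arithmetically shifted right by 1
}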


// CHECK-LABEL: define <16 x i8> @test_vld1q_u8(i8* %a) #0 {
// CHECK:   [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   ret <16 x i8> [[VLD1]]
uint8x16_t test_vld1q_u8(uint8_t const * a) {
  return vld1q_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_u16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   ret <8 x i16> [[VLD1]]
uint16x8_t test_vld1q_u16(uint16_t const * a) {
  return vld1q_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vld1q_u32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <4 x i32> [[VLD1]]
uint32x4_t test_vld1q_u32(uint32_t const * a) {
  return vld1q_u32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_u64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <2 x i64> [[VLD1]]
uint64x2_t test_vld1q_u64(uint64_t const * a) {
  return vld1q_u64(a);
}

// CHECK-LABEL: define <16 x i8> @test_vld1q_s8(i8* %a) #0 {
// CHECK:   [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   ret <16 x i8> [[VLD1]]
int8x16_t test_vld1q_s8(int8_t const * a) {
  return vld1q_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_s16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   ret <8 x i16> [[VLD1]]
int16x8_t test_vld1q_s16(int16_t const * a) {
  return vld1q_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vld1q_s32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <4 x i32> [[VLD1]]
int32x4_t test_vld1q_s32(int32_t const * a) {
  return vld1q_s32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_s64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <2 x i64> [[VLD1]]
int64x2_t test_vld1q_s64(int64_t const * a) {
  return vld1q_s64(a);
}

// CHECK-LABEL: define <8 x half> @test_vld1q_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VLD1]] to <8 x half>
// CHECK:   ret <8 x half> [[TMP1]]
float16x8_t test_vld1q_f16(float16_t const * a) {
  return vld1q_f16(a);
}

// CHECK-LABEL: define <4 x float> @test_vld1q_f32(float* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <4 x float> [[VLD1]]
float32x4_t test_vld1q_f32(float32_t const * a) {
  return vld1q_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vld1q_p8(i8* %a) #0 {
// CHECK:   [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   ret <16 x i8> [[VLD1]]
poly8x16_t test_vld1q_p8(poly8_t const * a) {
  return vld1q_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_p16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   ret <8 x i16> [[VLD1]]
poly16x8_t test_vld1q_p16(poly16_t const * a) {
  return vld1q_p16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_u8(i8* %a) #0 {
// CHECK:   [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   ret <8 x i8> [[VLD1]]
uint8x8_t test_vld1_u8(uint8_t const * a) {
  return vld1_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_u16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   ret <4 x i16> [[VLD1]]
uint16x4_t test_vld1_u16(uint16_t const * a) {
  return vld1_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vld1_u32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <2 x i32> [[VLD1]]
uint32x2_t test_vld1_u32(uint32_t const * a) {
  return vld1_u32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vld1_u64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <1 x i64> [[VLD1]]
uint64x1_t test_vld1_u64(uint64_t const * a) {
  return vld1_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_s8(i8* %a) #0 {
// CHECK:   [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   ret <8 x i8> [[VLD1]]
int8x8_t test_vld1_s8(int8_t const * a) {
  return vld1_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_s16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   ret <4 x i16> [[VLD1]]
int16x4_t test_vld1_s16(int16_t const * a) {
  return vld1_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vld1_s32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <2 x i32> [[VLD1]]
int32x2_t test_vld1_s32(int32_t const * a) {
  return vld1_s32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vld1_s64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <1 x i64> [[VLD1]]
int64x1_t test_vld1_s64(int64_t const * a) {
  return vld1_s64(a);
}

// CHECK-LABEL: define <4 x half> @test_vld1_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VLD1]] to <4 x half>
// CHECK:   ret <4 x half> [[TMP1]]
float16x4_t test_vld1_f16(float16_t const * a) {
  return vld1_f16(a);
}

// CHECK-LABEL: define <2 x float> @test_vld1_f32(float* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <2 x float> @llvm.arm.neon.vld1.v2f32.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <2 x float> [[VLD1]]
float32x2_t test_vld1_f32(float32_t const * a) {
  return vld1_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_p8(i8* %a) #0 {
// CHECK:   [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   ret <8 x i8> [[VLD1]]
poly8x8_t test_vld1_p8(poly8_t const * a) {
  return vld1_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_p16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   ret <4 x i16> [[VLD1]]
poly16x4_t test_vld1_p16(poly16_t const * a) {
  return vld1_p16(a);
}
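
// Not part of the FileCheck-verified tests: a minimal sketch of the vld1
// family exercised above (the helper name is illustrative). vld1/vld1q load
// consecutive elements from memory into a single vector register.
static inline int32x4_t vld1_example_s32(const int32_t *p) {
  return vld1q_s32(p); // lanes 0..3 <- p[0..3]
}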


// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_u8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
// CHECK:   ret <16 x i8> [[LANE]]
uint8x16_t test_vld1q_dup_u8(uint8_t const * a) {
  return vld1q_dup_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_u16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   ret <8 x i16> [[LANE]]
uint16x8_t test_vld1q_dup_u16(uint16_t const * a) {
  return vld1q_dup_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_u32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
// CHECK:   ret <4 x i32> [[LANE]]
uint32x4_t test_vld1q_dup_u32(uint32_t const * a) {
  return vld1q_dup_u32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_u64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
// CHECK:   ret <2 x i64> [[LANE]]
uint64x2_t test_vld1q_dup_u64(uint64_t const * a) {
  return vld1q_dup_u64(a);
}

// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_s8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
// CHECK:   ret <16 x i8> [[LANE]]
int8x16_t test_vld1q_dup_s8(int8_t const * a) {
  return vld1q_dup_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_s16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   ret <8 x i16> [[LANE]]
int16x8_t test_vld1q_dup_s16(int16_t const * a) {
  return vld1q_dup_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_s32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
// CHECK:   ret <4 x i32> [[LANE]]
int32x4_t test_vld1q_dup_s32(int32_t const * a) {
  return vld1q_dup_s32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_s64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
// CHECK:   ret <2 x i64> [[LANE]]
int64x2_t test_vld1q_dup_s64(int64_t const * a) {
  return vld1q_dup_s64(a);
}

// CHECK-LABEL: define <8 x half> @test_vld1q_dup_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half>
// CHECK:   ret <8 x half> [[TMP4]]
float16x8_t test_vld1q_dup_f16(float16_t const * a) {
  return vld1q_dup_f16(a);
}

// CHECK-LABEL: define <4 x float> @test_vld1q_dup_f32(float* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
// CHECK:   ret <4 x float> [[LANE]]
float32x4_t test_vld1q_dup_f32(float32_t const * a) {
  return vld1q_dup_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_p8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
// CHECK:   ret <16 x i8> [[LANE]]
poly8x16_t test_vld1q_dup_p8(poly8_t const * a) {
  return vld1q_dup_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_p16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   ret <8 x i16> [[LANE]]
poly16x8_t test_vld1q_dup_p16(poly16_t const * a) {
  return vld1q_dup_p16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_dup_u8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   ret <8 x i8> [[LANE]]
uint8x8_t test_vld1_dup_u8(uint8_t const * a) {
  return vld1_dup_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_dup_u16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
// CHECK:   ret <4 x i16> [[LANE]]
uint16x4_t test_vld1_dup_u16(uint16_t const * a) {
  return vld1_dup_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vld1_dup_u32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
// CHECK:   ret <2 x i32> [[LANE]]
uint32x2_t test_vld1_dup_u32(uint32_t const * a) {
  return vld1_dup_u32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vld1_dup_u64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[LANE]]
uint64x1_t test_vld1_dup_u64(uint64_t const * a) {
  return vld1_dup_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_dup_s8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   ret <8 x i8> [[LANE]]
int8x8_t test_vld1_dup_s8(int8_t const * a) {
  return vld1_dup_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_dup_s16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
// CHECK:   ret <4 x i16> [[LANE]]
int16x4_t test_vld1_dup_s16(int16_t const * a) {
  return vld1_dup_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vld1_dup_s32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
// CHECK:   ret <2 x i32> [[LANE]]
int32x2_t test_vld1_dup_s32(int32_t const * a) {
  return vld1_dup_s32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vld1_dup_s64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[LANE]]
int64x1_t test_vld1_dup_s64(int64_t const * a) {
  return vld1_dup_s64(a);
}

// CHECK-LABEL: define <4 x half> @test_vld1_dup_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
// CHECK:   ret <4 x half> [[TMP4]]
float16x4_t test_vld1_dup_f16(float16_t const * a) {
  return vld1_dup_f16(a);
}

// CHECK-LABEL: define <2 x float> @test_vld1_dup_f32(float* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
// CHECK:   ret <2 x float> [[LANE]]
float32x2_t test_vld1_dup_f32(float32_t const * a) {
  return vld1_dup_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_dup_p8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   ret <8 x i8> [[LANE]]
poly8x8_t test_vld1_dup_p8(poly8_t const * a) {
  return vld1_dup_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_dup_p16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
// CHECK:   ret <4 x i16> [[LANE]]
poly16x4_t test_vld1_dup_p16(poly16_t const * a) {
  return vld1_dup_p16(a);
}
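
// Not part of the FileCheck-verified tests: a minimal sketch (illustrative
// helper name). vld1_dup/vld1q_dup load a single element and broadcast it to
// every lane, which is exactly the load/insertelement/shufflevector pattern
// checked above.
static inline uint32x4_t vld1_dup_example_u32(const uint32_t *p) {
  return vld1q_dup_u32(p); // all four lanes equal *p
}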


// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
// CHECK:   ret <16 x i8> [[VLD1_LANE]]
uint8x16_t test_vld1q_lane_u8(uint8_t const * a, uint8x16_t b) {
  return vld1q_lane_u8(a, b, 15);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
// CHECK:   ret <8 x i16> [[VLD1_LANE]]
uint16x8_t test_vld1q_lane_u16(uint16_t const * a, uint16x8_t b) {
  return vld1q_lane_u16(a, b, 7);
}

// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
// CHECK:   ret <4 x i32> [[VLD1_LANE]]
uint32x4_t test_vld1q_lane_u32(uint32_t const * a, uint32x4_t b) {
  return vld1q_lane_u32(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x i64> [[VLD1Q_LANE]]
uint64x2_t test_vld1q_lane_u64(uint64_t const * a, uint64x2_t b) {
  return vld1q_lane_u64(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
// CHECK:   ret <16 x i8> [[VLD1_LANE]]
int8x16_t test_vld1q_lane_s8(int8_t const * a, int8x16_t b) {
  return vld1q_lane_s8(a, b, 15);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
// CHECK:   ret <8 x i16> [[VLD1_LANE]]
int16x8_t test_vld1q_lane_s16(int16_t const * a, int16x8_t b) {
  return vld1q_lane_s16(a, b, 7);
}

// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
// CHECK:   ret <4 x i32> [[VLD1_LANE]]
int32x4_t test_vld1q_lane_s32(int32_t const * a, int32x4_t b) {
  return vld1q_lane_s32(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x i64> [[VLD1Q_LANE]]
int64x2_t test_vld1q_lane_s64(int64_t const * a, int64x2_t b) {
  return vld1q_lane_s64(a, b, 1);
}

// CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(half* %a, <8 x half> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[VLD1_LANE]] to <8 x half>
// CHECK:   ret <8 x half> [[TMP5]]
float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) {
  return vld1q_lane_f16(a, b, 7);
}

// CHECK-LABEL: define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]], align 4
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
// CHECK:   ret <4 x float> [[VLD1_LANE]]
float32x4_t test_vld1q_lane_f32(float32_t const * a, float32x4_t b) {
  return vld1q_lane_f32(a, b, 3);
}

// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
// CHECK:   ret <16 x i8> [[VLD1_LANE]]
poly8x16_t test_vld1q_lane_p8(poly8_t const * a, poly8x16_t b) {
  return vld1q_lane_p8(a, b, 15);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
// CHECK:   ret <8 x i16> [[VLD1_LANE]]
poly16x8_t test_vld1q_lane_p16(poly16_t const * a, poly16x8_t b) {
  return vld1q_lane_p16(a, b, 7);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_lane_u8(i8* %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
// CHECK:   ret <8 x i8> [[VLD1_LANE]]
uint8x8_t test_vld1_lane_u8(uint8_t const * a, uint8x8_t b) {
  return vld1_lane_u8(a, b, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_lane_u16(i16* %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
// CHECK:   ret <4 x i16> [[VLD1_LANE]]
uint16x4_t test_vld1_lane_u16(uint16_t const * a, uint16x4_t b) {
  return vld1_lane_u16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vld1_lane_u32(i32* %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
// CHECK:   ret <2 x i32> [[VLD1_LANE]]
uint32x2_t test_vld1_lane_u32(uint32_t const * a, uint32x2_t b) {
  return vld1_lane_u32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vld1_lane_u64(i64* %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 4
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
// CHECK:   ret <1 x i64> [[VLD1_LANE]]
uint64x1_t test_vld1_lane_u64(uint64_t const * a, uint64x1_t b) {
  return vld1_lane_u64(a, b, 0);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
// CHECK:   ret <8 x i8> [[VLD1_LANE]]
int8x8_t test_vld1_lane_s8(int8_t const * a, int8x8_t b) {
  return vld1_lane_s8(a, b, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
// CHECK:   ret <4 x i16> [[VLD1_LANE]]
int16x4_t test_vld1_lane_s16(int16_t const * a, int16x4_t b) {
  return vld1_lane_s16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
// CHECK:   ret <2 x i32> [[VLD1_LANE]]
int32x2_t test_vld1_lane_s32(int32_t const * a, int32x2_t b) {
  return vld1_lane_s32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 4
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
// CHECK:   ret <1 x i64> [[VLD1_LANE]]
int64x1_t test_vld1_lane_s64(int64_t const * a, int64x1_t b) {
  return vld1_lane_s64(a, b, 0);
}

// CHECK-LABEL: define <4 x half> @test_vld1_lane_f16(half* %a, <4 x half> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[VLD1_LANE]] to <4 x half>
// CHECK:   ret <4 x half> [[TMP5]]
float16x4_t test_vld1_lane_f16(float16_t const * a, float16x4_t b) {
  return vld1_lane_f16(a, b, 3);
}

// CHECK-LABEL: define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]], align 4
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
// CHECK:   ret <2 x float> [[VLD1_LANE]]
float32x2_t test_vld1_lane_f32(float32_t const * a, float32x2_t b) {
  return vld1_lane_f32(a, b, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_lane_p8(i8* %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
// CHECK:   ret <8 x i8> [[VLD1_LANE]]
poly8x8_t test_vld1_lane_p8(poly8_t const * a, poly8x8_t b) {
  return vld1_lane_p8(a, b, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_lane_p16(i16* %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
// CHECK:   ret <4 x i16> [[VLD1_LANE]]
poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) {
  return vld1_lane_p16(a, b, 3);
}
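
// Not part of the FileCheck-verified tests: a minimal sketch (illustrative
// helper name). vld1_lane/vld1q_lane load one element into the given lane
// (which must be a compile-time constant) and leave the remaining lanes of
// the input vector unchanged.
static inline float32x2_t vld1_lane_example_f32(const float32_t *p, float32x2_t v) {
  return vld1_lane_f32(p, v, 1); // lane 1 <- *p, lane 0 preserved
}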


// CHECK-LABEL: define void @test_vld2q_u8(%struct.uint8x16x2_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 16, i1 false)
// CHECK:   ret void
uint8x16x2_t test_vld2q_u8(uint8_t const * a) {
  return vld2q_u8(a);
}

// CHECK-LABEL: define void @test_vld2q_u16(%struct.uint16x8x2_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
// CHECK:   ret void
uint16x8x2_t test_vld2q_u16(uint16_t const * a) {
  return vld2q_u16(a);
}

// CHECK-LABEL: define void @test_vld2q_u32(%struct.uint32x4x2_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2Q_V]], { <4 x i32>, <4 x i32> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
// CHECK:   ret void
uint32x4x2_t test_vld2q_u32(uint32_t const * a) {
  return vld2q_u32(a);
}

// CHECK-LABEL: define void @test_vld2q_s8(%struct.int8x16x2_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 16, i1 false)
// CHECK:   ret void
int8x16x2_t test_vld2q_s8(int8_t const * a) {
  return vld2q_s8(a);
}

// CHECK-LABEL: define void @test_vld2q_s16(%struct.int16x8x2_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
// CHECK:   ret void
int16x8x2_t test_vld2q_s16(int16_t const * a) {
  return vld2q_s16(a);
}

// CHECK-LABEL: define void @test_vld2q_s32(%struct.int32x4x2_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2Q_V]], { <4 x i32>, <4 x i32> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
// CHECK:   ret void
int32x4x2_t test_vld2q_s32(int32_t const * a) {
  return vld2q_s32(a);
}

// CHECK-LABEL: define void @test_vld2q_f16(%struct.float16x8x2_t* noalias sret %agg.result, half* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
// CHECK:   ret void
float16x8x2_t test_vld2q_f16(float16_t const * a) {
  return vld2q_f16(a);
}

// CHECK-LABEL: define void @test_vld2q_f32(%struct.float32x4x2_t* noalias sret %agg.result, float* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
// CHECK:   store { <4 x float>, <4 x float> } [[VLD2Q_V]], { <4 x float>, <4 x float> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
// CHECK:   ret void
float32x4x2_t test_vld2q_f32(float32_t const * a) {
  return vld2q_f32(a);
}

// CHECK-LABEL: define void @test_vld2q_p8(%struct.poly8x16x2_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 16, i1 false)
// CHECK:   ret void
poly8x16x2_t test_vld2q_p8(poly8_t const * a) {
  return vld2q_p8(a);
}

// CHECK-LABEL: define void @test_vld2q_p16(%struct.poly16x8x2_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
// CHECK:   ret void
poly16x8x2_t test_vld2q_p16(poly16_t const * a) {
  return vld2q_p16(a);
}

// CHECK-LABEL: define void @test_vld2_u8(%struct.uint8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_V]], { <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 16, i32 8, i1 false)
// CHECK:   ret void
uint8x8x2_t test_vld2_u8(uint8_t const * a) {
  return vld2_u8(a);
}

// CHECK-LABEL: define void @test_vld2_u16(%struct.uint16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
uint16x4x2_t test_vld2_u16(uint16_t const * a) {
  return vld2_u16(a);
}

// CHECK-LABEL: define void @test_vld2_u32(%struct.uint32x2x2_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_V]], { <2 x i32>, <2 x i32> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
uint32x2x2_t test_vld2_u32(uint32_t const * a) {
  return vld2_u32(a);
}

// CHECK-LABEL: define void @test_vld2_u64(%struct.uint64x1x2_t* noalias sret %agg.result, i64* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_V]], { <1 x i64>, <1 x i64> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
uint64x1x2_t test_vld2_u64(uint64_t const * a) {
  return vld2_u64(a);
}

// CHECK-LABEL: define void @test_vld2_s8(%struct.int8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_V]], { <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int8x8x2_t test_vld2_s8(int8_t const * a) {
  return vld2_s8(a);
}

// CHECK-LABEL: define void @test_vld2_s16(%struct.int16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int16x4x2_t test_vld2_s16(int16_t const * a) {
  return vld2_s16(a);
}

// CHECK-LABEL: define void @test_vld2_s32(%struct.int32x2x2_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_V]], { <2 x i32>, <2 x i32> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int32x2x2_t test_vld2_s32(int32_t const * a) {
  return vld2_s32(a);
}

// CHECK-LABEL: define void @test_vld2_s64(%struct.int64x1x2_t* noalias sret %agg.result, i64* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_V]], { <1 x i64>, <1 x i64> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int64x1x2_t test_vld2_s64(int64_t const * a) {
  return vld2_s64(a);
}

// CHECK-LABEL: define void @test_vld2_f16(%struct.float16x4x2_t* noalias sret %agg.result, half* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
float16x4x2_t test_vld2_f16(float16_t const * a) {
  return vld2_f16(a);
}

// CHECK-LABEL: define void @test_vld2_f32(%struct.float32x2x2_t* noalias sret %agg.result, float* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
// CHECK:   store { <2 x float>, <2 x float> } [[VLD2_V]], { <2 x float>, <2 x float> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
float32x2x2_t test_vld2_f32(float32_t const * a) {
  return vld2_f32(a);
}
   5127 
   5128 // CHECK-LABEL: define void @test_vld2_p8(%struct.poly8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
   5129 // CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
   5130 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
   5131 // CHECK:   [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %a, i32 1)
   5132 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
   5133 // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_V]], { <8 x i8>, <8 x i8> }* [[TMP1]]
   5134 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
   5135 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
   5136 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 16, i32 8, i1 false)
   5137 // CHECK:   ret void
   5138 poly8x8x2_t test_vld2_p8(poly8_t const * a) {
   5139   return vld2_p8(a);
   5140 }
   5141 
   5142 // CHECK-LABEL: define void @test_vld2_p16(%struct.poly16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
   5143 // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
   5144 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
   5145 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   5146 // CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2)
   5147 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
   5148 // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]]
   5149 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
   5150 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
   5151 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
   5152 // CHECK:   ret void
   5153 poly16x4x2_t test_vld2_p16(poly16_t const * a) {
   5154   return vld2_p16(a);
   5155 }
   5156 
   5157 
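// The vld2_dup tests load a single 2-element structure and broadcast it to
// every lane: a lane-0 @llvm.arm.neon.vld2lane on undef inputs followed by a
// zero-mask shufflevector per result vector. Roughly (sketch only):
//   uint8x8x2_t r;
//   r.val[0] = vdup_n_u8(a[0]);
//   r.val[1] = vdup_n_u8(a[1]);
// The 64x1 variants have only one lane, so a plain @llvm.arm.neon.vld2 is
// emitted with no shuffles.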
// CHECK-LABEL: define void @test_vld2_dup_u8(%struct.uint8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP2]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8> } [[TMP4]], { <8 x i8>, <8 x i8> }* [[TMP5]]
// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP6]], i8* [[TMP7]], i32 16, i32 8, i1 false)
// CHECK:   ret void
uint8x8x2_t test_vld2_dup_u8(uint8_t const * a) {
  return vld2_dup_u8(a);
}

// CHECK-LABEL: define void @test_vld2_dup_u16(%struct.uint16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
// CHECK:   ret void
uint16x4x2_t test_vld2_dup_u16(uint16_t const * a) {
  return vld2_dup_u16(a);
}

// CHECK-LABEL: define void @test_vld2_dup_u32(%struct.uint32x2x2_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32> } [[TMP5]], { <2 x i32>, <2 x i32> }* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
// CHECK:   ret void
uint32x2x2_t test_vld2_dup_u32(uint32_t const * a) {
  return vld2_dup_u32(a);
}

// CHECK-LABEL: define void @test_vld2_dup_u64(%struct.uint64x1x2_t* noalias sret %agg.result, i64* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
uint64x1x2_t test_vld2_dup_u64(uint64_t const * a) {
  return vld2_dup_u64(a);
}

// CHECK-LABEL: define void @test_vld2_dup_s8(%struct.int8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP2]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8> } [[TMP4]], { <8 x i8>, <8 x i8> }* [[TMP5]]
// CHECK:   [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP6]], i8* [[TMP7]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int8x8x2_t test_vld2_dup_s8(int8_t const * a) {
  return vld2_dup_s8(a);
}

// CHECK-LABEL: define void @test_vld2_dup_s16(%struct.int16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int16x4x2_t test_vld2_dup_s16(int16_t const * a) {
  return vld2_dup_s16(a);
}

// CHECK-LABEL: define void @test_vld2_dup_s32(%struct.int32x2x2_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32> } [[TMP5]], { <2 x i32>, <2 x i32> }* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int32x2x2_t test_vld2_dup_s32(int32_t const * a) {
  return vld2_dup_s32(a);
}

// CHECK-LABEL: define void @test_vld2_dup_s64(%struct.int64x1x2_t* noalias sret %agg.result, i64* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int64x1x2_t test_vld2_dup_s64(int64_t const * a) {
  return vld2_dup_s64(a);
}

// CHECK-LABEL: define void @test_vld2_dup_f16(%struct.float16x4x2_t* noalias sret %agg.result, half* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.float16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
// CHECK:   ret void
float16x4x2_t test_vld2_dup_f16(float16_t const * a) {
  return vld2_dup_f16(a);
}

// CHECK-LABEL: define void @test_vld2_dup_f32(%struct.float32x2x2_t* noalias sret %agg.result, float* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32.p0i8(i8* [[TMP1]], <2 x float> undef, <2 x float> undef, i32 0, i32 4)
// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x float>, <2 x float> } [[VLD_DUP]], <2 x float> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <2 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP3]], <2 x float> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
// CHECK:   store { <2 x float>, <2 x float> } [[TMP5]], { <2 x float>, <2 x float> }* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
// CHECK:   ret void
float32x2x2_t test_vld2_dup_f32(float32_t const * a) {
  return vld2_dup_f32(a);
}

// CHECK-LABEL: define void @test_vld2_dup_p8(%struct.poly8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP2]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8> } [[TMP4]], { <8 x i8>, <8 x i8> }* [[TMP5]]
// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP6]], i8* [[TMP7]], i32 16, i32 8, i1 false)
// CHECK:   ret void
poly8x8x2_t test_vld2_dup_p8(poly8_t const * a) {
  return vld2_dup_p8(a);
}

// CHECK-LABEL: define void @test_vld2_dup_p16(%struct.poly16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
// CHECK:   ret void
poly16x4x2_t test_vld2_dup_p16(poly16_t const * a) {
  return vld2_dup_p16(a);
}


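// The vld2q_lane tests overwrite one lane of an existing q-register pair with
// a loaded 2-element structure; every other lane passes through from %b. The
// 32-byte x2 argument arrives coerced as [4 x i64] %b.coerce and is staged
// through allocas before the @llvm.arm.neon.vld2lane call. Roughly, for
// lane 7 (sketch only):
//   b.val[0] = vsetq_lane_u16(a[0], b.val[0], 7);
//   b.val[1] = vsetq_lane_u16(a[1], b.val[1], 7);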
// CHECK-LABEL: define void @test_vld2q_lane_u16(%struct.uint16x8x2_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
// CHECK:   ret void
uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {
  return vld2q_lane_u16(a, b, 7);
}

// CHECK-LABEL: define void @test_vld2q_lane_u32(%struct.uint32x4x2_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 3, i32 4)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], { <4 x i32>, <4 x i32> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
// CHECK:   ret void
uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {
  return vld2q_lane_u32(a, b, 3);
}

// CHECK-LABEL: define void @test_vld2q_lane_s16(%struct.int16x8x2_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
// CHECK:   ret void
int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {
  return vld2q_lane_s16(a, b, 7);
}

// CHECK-LABEL: define void @test_vld2q_lane_s32(%struct.int32x4x2_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 3, i32 4)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], { <4 x i32>, <4 x i32> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
// CHECK:   ret void
int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
  return vld2q_lane_s32(a, b, 3);
}

// CHECK-LABEL: define void @test_vld2q_lane_f16(%struct.float16x8x2_t* noalias sret %agg.result, half* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
// CHECK:   ret void
float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
  return vld2q_lane_f16(a, b, 7);
}

// CHECK-LABEL: define void @test_vld2q_lane_f32(%struct.float32x4x2_t* noalias sret %agg.result, float* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32.p0i8(i8* [[TMP4]], <4 x float> [[TMP9]], <4 x float> [[TMP10]], i32 3, i32 4)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x float>, <4 x float> }*
// CHECK:   store { <4 x float>, <4 x float> } [[VLD2Q_LANE_V]], { <4 x float>, <4 x float> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
// CHECK:   ret void
float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {
  return vld2q_lane_f32(a, b, 3);
}

// CHECK-LABEL: define void @test_vld2q_lane_p16(%struct.poly16x8x2_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
// CHECK:   ret void
poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) {
  return vld2q_lane_p16(a, b, 7);
}

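// The vld2_lane tests are the d-register counterparts: the 16-byte x2
// argument is coerced to [2 x i64] %b.coerce, and the lane immediate must fit
// the narrower vectors (0-7 for the 8x8 types, 0-3 for 16x4, 0-1 for 32x2).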
   5627 // CHECK-LABEL: define void @test_vld2_lane_u8(%struct.uint8x8x2_t* noalias sret %agg.result, i8* %a, [2 x i64] %b.coerce) #0 {
   5628 // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
   5629 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
   5630 // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
   5631 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
   5632 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
   5633 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   5634 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
   5635 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
   5636 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   5637 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
   5638 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
   5639 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
   5640 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
   5641 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
   5642 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
   5643 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
   5644 // CHECK:   [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
   5645 // CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8> }*
   5646 // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], { <8 x i8>, <8 x i8> }* [[TMP6]]
   5647 // CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
   5648 // CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
   5649 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
   5650 // CHECK:   ret void
   5651 uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) {
   5652   return vld2_lane_u8(a, b, 7);
   5653 }
   5654 
   5655 // CHECK-LABEL: define void @test_vld2_lane_u16(%struct.uint16x4x2_t* noalias sret %agg.result, i16* %a, [2 x i64] %b.coerce) #0 {
   5656 // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
   5657 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
   5658 // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
   5659 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
   5660 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
   5661 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   5662 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
   5663 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
   5664 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   5665 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
   5666 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
   5667 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
   5668 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
   5669 // CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
   5670 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
   5671 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
   5672 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
   5673 // CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
   5674 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
   5675 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
   5676 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
   5677 // CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
   5678 // CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
   5679 // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
   5680 // CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
   5681 // CHECK:   [[TMP13:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
   5682 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
   5683 // CHECK:   ret void
   5684 uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) {
   5685   return vld2_lane_u16(a, b, 3);
   5686 }
   5687 
   5688 // CHECK-LABEL: define void @test_vld2_lane_u32(%struct.uint32x2x2_t* noalias sret %agg.result, i32* %a, [2 x i64] %b.coerce) #0 {
   5689 // CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
   5690 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
   5691 // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
   5692 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
   5693 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
   5694 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   5695 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
   5696 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
   5697 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   5698 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
   5699 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
   5700 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
   5701 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
   5702 // CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
   5703 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
   5704 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
   5705 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
   5706 // CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
   5707 // CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
   5708 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
   5709 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
   5710 // CHECK:   [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], i32 1, i32 4)
   5711 // CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32> }*
   5712 // CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], { <2 x i32>, <2 x i32> }* [[TMP11]]
   5713 // CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
   5714 // CHECK:   [[TMP13:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
   5715 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
   5716 // CHECK:   ret void
   5717 uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) {
   5718   return vld2_lane_u32(a, b, 1);
   5719 }
   5720 
   5721 // CHECK-LABEL: define void @test_vld2_lane_s8(%struct.int8x8x2_t* noalias sret %agg.result, i8* %a, [2 x i64] %b.coerce) #0 {
   5722 // CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
   5723 // CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
   5724 // CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
   5725 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
   5726 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
   5727 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   5728 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
   5729 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
   5730 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   5731 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
   5732 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
   5733 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
   5734 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
   5735 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
   5736 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
   5737 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
   5738 // CHECK:   [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
   5739 // CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8> }*
   5740 // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], { <8 x i8>, <8 x i8> }* [[TMP6]]
   5741 // CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
   5742 // CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
   5743 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
   5744 // CHECK:   ret void
   5745 int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) {
   5746   return vld2_lane_s8(a, b, 7);
   5747 }
   5748 
   5749 // CHECK-LABEL: define void @test_vld2_lane_s16(%struct.int16x4x2_t* noalias sret %agg.result, i16* %a, [2 x i64] %b.coerce) #0 {
   5750 // CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
   5751 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
   5752 // CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
   5753 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
   5754 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
   5755 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   5756 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
   5757 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
   5758 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   5759 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
   5760 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
   5761 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
   5762 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
   5763 // CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
   5764 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
   5765 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
   5766 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
   5767 // CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
   5768 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
   5769 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
   5770 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
   5771 // CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
   5772 // CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
   5773 // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
   5774 // CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
   5775 // CHECK:   [[TMP13:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
   5776 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
   5777 // CHECK:   ret void
   5778 int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) {
   5779   return vld2_lane_s16(a, b, 3);
   5780 }
   5781 
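// Most of each 16- and 32-bit test is argument plumbing rather than the load
// itself: the two-vector struct arrives coerced to [2 x i64], is spilled to
// the alloca [[B]], copied into the wrapper's local [[__S1]], and the result
// in [[__RET]] is memcpy'd out through the sret pointer %agg.result. Only
// the vld2lane call in the middle does the real work; the aggregate memcpys
// are not scalar loads and stores, so mem2reg leaves them in place.
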
   5782 // CHECK-LABEL: define void @test_vld2_lane_s32(%struct.int32x2x2_t* noalias sret %agg.result, i32* %a, [2 x i64] %b.coerce) #0 {
   5783 // CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
   5784 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
   5785 // CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
   5786 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
   5787 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
   5788 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   5789 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
   5790 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
   5791 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   5792 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
   5793 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
   5794 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
   5795 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
   5796 // CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
   5797 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
   5798 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
   5799 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
   5800 // CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
   5801 // CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
   5802 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
   5803 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
   5804 // CHECK:   [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], i32 1, i32 4)
   5805 // CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32> }*
   5806 // CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], { <2 x i32>, <2 x i32> }* [[TMP11]]
   5807 // CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
   5808 // CHECK:   [[TMP13:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
   5809 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
   5810 // CHECK:   ret void
   5811 int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
   5812   return vld2_lane_s32(a, b, 1);
   5813 }
   5814 
   5815 // CHECK-LABEL: define void @test_vld2_lane_f16(%struct.float16x4x2_t* noalias sret %agg.result, half* %a, [2 x i64] %b.coerce) #0 {
   5816 // CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
   5817 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
   5818 // CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
   5819 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
   5820 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x half>]* [[COERCE_DIVE]] to [2 x i64]*
   5821 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   5822 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
   5823 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
   5824 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   5825 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
   5826 // CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
   5827 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
   5828 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
   5829 // CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
   5830 // CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
   5831 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
   5832 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
   5833 // CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
   5834 // CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
   5835 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
   5836 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
   5837 // CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
   5838 // CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
   5839 // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
   5840 // CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* %agg.result to i8*
   5841 // CHECK:   [[TMP13:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
   5842 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
   5843 // CHECK:   ret void
   5844 float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
   5845   return vld2_lane_f16(a, b, 3);
   5846 }
   5847 
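// Note on the f16 variant above: no half-precision form of the intrinsic is
// used here, so the <4 x half> payloads are reinterpreted as <4 x i16> (via
// the <8 x i8> round trip) and @llvm.arm.neon.vld2lane.v4i16 is reused with
// the same lane index and 2-byte element alignment as the s16/u16 cases.
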
   5848 // CHECK-LABEL: define void @test_vld2_lane_f32(%struct.float32x2x2_t* noalias sret %agg.result, float* %a, [2 x i64] %b.coerce) #0 {
   5849 // CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
   5850 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
   5851 // CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
   5852 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
   5853 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x float>]* [[COERCE_DIVE]] to [2 x i64]*
   5854 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   5855 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
   5856 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
   5857 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   5858 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
   5859 // CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
   5860 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
   5861 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
   5862 // CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
   5863 // CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
   5864 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
   5865 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i32 0, i32 1
   5866 // CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
   5867 // CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
   5868 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
   5869 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
   5870 // CHECK:   [[VLD2_LANE_V:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32.p0i8(i8* [[TMP4]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], i32 1, i32 4)
   5871 // CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <2 x float>, <2 x float> }*
   5872 // CHECK:   store { <2 x float>, <2 x float> } [[VLD2_LANE_V]], { <2 x float>, <2 x float> }* [[TMP11]]
   5873 // CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
   5874 // CHECK:   [[TMP13:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
   5875 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
   5876 // CHECK:   ret void
   5877 float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) {
   5878   return vld2_lane_f32(a, b, 1);
   5879 }
   5880 
   5881 // CHECK-LABEL: define void @test_vld2_lane_p8(%struct.poly8x8x2_t* noalias sret %agg.result, i8* %a, [2 x i64] %b.coerce) #0 {
   5882 // CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
   5883 // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
   5884 // CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
   5885 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
   5886 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
   5887 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   5888 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
   5889 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
   5890 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   5891 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
   5892 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
   5893 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
   5894 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
   5895 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
   5896 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
   5897 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
   5898 // CHECK:   [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
   5899 // CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8> }*
   5900 // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], { <8 x i8>, <8 x i8> }* [[TMP6]]
   5901 // CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
   5902 // CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
   5903 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
   5904 // CHECK:   ret void
   5905 poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) {
   5906   return vld2_lane_p8(a, b, 7);
   5907 }
   5908 
   5909 // CHECK-LABEL: define void @test_vld2_lane_p16(%struct.poly16x4x2_t* noalias sret %agg.result, i16* %a, [2 x i64] %b.coerce) #0 {
   5910 // CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
   5911 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
   5912 // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
   5913 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
   5914 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
   5915 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   5916 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
   5917 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
   5918 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   5919 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
   5920 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
   5921 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
   5922 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
   5923 // CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
   5924 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
   5925 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
   5926 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
   5927 // CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
   5928 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
   5929 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
   5930 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
   5931 // CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
   5932 // CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
   5933 // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
   5934 // CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
   5935 // CHECK:   [[TMP13:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
   5936 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
   5937 // CHECK:   ret void
   5938 poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) {
   5939   return vld2_lane_p16(a, b, 3);
   5940 }
   5941 
   5942 
   5943 // CHECK-LABEL: define void @test_vld3q_u8(%struct.uint8x16x3_t* noalias sret %agg.result, i8* %a) #0 {
   5944 // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
   5945 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
   5946 // CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %a, i32 1)
   5947 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
   5948 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
   5949 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* %agg.result to i8*
   5950 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
   5951 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 48, i32 16, i1 false)
   5952 // CHECK:   ret void
   5953 uint8x16x3_t test_vld3q_u8(uint8_t const * a) {
   5954   return vld3q_u8(a);
   5955 }
   5956 
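// The vld3q family de-interleaves 3*N consecutive elements into three q
// registers. A sketch of the effect (hypothetical values, not checked):
//
//   uint8_t mem[48];                    // e.g. 16 r,g,b byte triples
//   uint8x16x3_t v = vld3q_u8(mem);
//   // v.val[0] = {mem[0], mem[3], mem[6], ...}
//   // v.val[1] = {mem[1], mem[4], mem[7], ...}
//   // v.val[2] = {mem[2], mem[5], mem[8], ...}
//
// The three <16 x i8> results come back as one aggregate from
// @llvm.arm.neon.vld3, are stored to [[__RET]], and the 48-byte struct is
// memcpy'd out through the sret pointer.
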
   5957 // CHECK-LABEL: define void @test_vld3q_u16(%struct.uint16x8x3_t* noalias sret %agg.result, i16* %a) #0 {
   5958 // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
   5959 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
   5960 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   5961 // CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
   5962 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
   5963 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
   5964 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x3_t* %agg.result to i8*
   5965 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
   5966 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
   5967 // CHECK:   ret void
   5968 uint16x8x3_t test_vld3q_u16(uint16_t const * a) {
   5969   return vld3q_u16(a);
   5970 }
   5971 
   5972 // CHECK-LABEL: define void @test_vld3q_u32(%struct.uint32x4x3_t* noalias sret %agg.result, i32* %a) #0 {
   5973 // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
   5974 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
   5975 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   5976 // CHECK:   [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP1]], i32 4)
   5977 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
   5978 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]]
   5979 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x3_t* %agg.result to i8*
   5980 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
   5981 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
   5982 // CHECK:   ret void
   5983 uint32x4x3_t test_vld3q_u32(uint32_t const * a) {
   5984   return vld3q_u32(a);
   5985 }
   5986 
   5987 // CHECK-LABEL: define void @test_vld3q_s8(%struct.int8x16x3_t* noalias sret %agg.result, i8* %a) #0 {
   5988 // CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
   5989 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
   5990 // CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %a, i32 1)
   5991 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
   5992 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
   5993 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* %agg.result to i8*
   5994 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
   5995 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 48, i32 16, i1 false)
   5996 // CHECK:   ret void
   5997 int8x16x3_t test_vld3q_s8(int8_t const * a) {
   5998   return vld3q_s8(a);
   5999 }
   6000 
   6001 // CHECK-LABEL: define void @test_vld3q_s16(%struct.int16x8x3_t* noalias sret %agg.result, i16* %a) #0 {
   6002 // CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
   6003 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
   6004 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   6005 // CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
   6006 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
   6007 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
   6008 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x3_t* %agg.result to i8*
   6009 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
   6010 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
   6011 // CHECK:   ret void
   6012 int16x8x3_t test_vld3q_s16(int16_t const * a) {
   6013   return vld3q_s16(a);
   6014 }
   6015 
   6016 // CHECK-LABEL: define void @test_vld3q_s32(%struct.int32x4x3_t* noalias sret %agg.result, i32* %a) #0 {
   6017 // CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
   6018 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
   6019 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   6020 // CHECK:   [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP1]], i32 4)
   6021 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
   6022 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]]
   6023 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x3_t* %agg.result to i8*
   6024 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
   6025 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
   6026 // CHECK:   ret void
   6027 int32x4x3_t test_vld3q_s32(int32_t const * a) {
   6028   return vld3q_s32(a);
   6029 }
   6030 
   6031 // CHECK-LABEL: define void @test_vld3q_f16(%struct.float16x8x3_t* noalias sret %agg.result, half* %a) #0 {
   6032 // CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
   6033 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
   6034 // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
   6035 // CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
   6036 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
   6037 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
   6038 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x3_t* %agg.result to i8*
   6039 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
   6040 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
   6041 // CHECK:   ret void
   6042 float16x8x3_t test_vld3q_f16(float16_t const * a) {
   6043   return vld3q_f16(a);
   6044 }
   6045 
   6046 // CHECK-LABEL: define void @test_vld3q_f32(%struct.float32x4x3_t* noalias sret %agg.result, float* %a) #0 {
   6047 // CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
   6048 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
   6049 // CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
   6050 // CHECK:   [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32.p0i8(i8* [[TMP1]], i32 4)
   6051 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
   6052 // CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_V]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP2]]
   6053 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x3_t* %agg.result to i8*
   6054 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
   6055 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
   6056 // CHECK:   ret void
   6057 float32x4x3_t test_vld3q_f32(float32_t const * a) {
   6058   return vld3q_f32(a);
   6059 }
   6060 
   6061 // CHECK-LABEL: define void @test_vld3q_p8(%struct.poly8x16x3_t* noalias sret %agg.result, i8* %a) #0 {
   6062 // CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
   6063 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
   6064 // CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %a, i32 1)
   6065 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
   6066 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
   6067 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* %agg.result to i8*
   6068 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
   6069 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 48, i32 16, i1 false)
   6070 // CHECK:   ret void
   6071 poly8x16x3_t test_vld3q_p8(poly8_t const * a) {
   6072   return vld3q_p8(a);
   6073 }
   6074 
   6075 // CHECK-LABEL: define void @test_vld3q_p16(%struct.poly16x8x3_t* noalias sret %agg.result, i16* %a) #0 {
   6076 // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
   6077 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
   6078 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   6079 // CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
   6080 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
   6081 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
   6082 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x3_t* %agg.result to i8*
   6083 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
   6084 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
   6085 // CHECK:   ret void
   6086 poly16x8x3_t test_vld3q_p16(poly16_t const * a) {
   6087   return vld3q_p16(a);
   6088 }
   6089 
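// The tests that follow are the 64-bit d-register versions of the same
// pattern: three <8 x i8>/<4 x i16>/<2 x i32>/<1 x i64> results, a 24-byte
// result struct, and an 8-byte-aligned memcpy to the sret slot.
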
   6090 // CHECK-LABEL: define void @test_vld3_u8(%struct.uint8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
   6091 // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
   6092 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
   6093 // CHECK:   [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0i8(i8* %a, i32 1)
   6094 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
   6095 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
   6096 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* %agg.result to i8*
   6097 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
   6098 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 24, i32 8, i1 false)
   6099 // CHECK:   ret void
   6100 uint8x8x3_t test_vld3_u8(uint8_t const * a) {
   6101   return vld3_u8(a);
   6102 }
   6103 
   6104 // CHECK-LABEL: define void @test_vld3_u16(%struct.uint16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
   6105 // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
   6106 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
   6107 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   6108 // CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
   6109 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
   6110 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
   6111 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x3_t* %agg.result to i8*
   6112 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
   6113 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
   6114 // CHECK:   ret void
   6115 uint16x4x3_t test_vld3_u16(uint16_t const * a) {
   6116   return vld3_u16(a);
   6117 }
   6118 
   6119 // CHECK-LABEL: define void @test_vld3_u32(%struct.uint32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
   6120 // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
   6121 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
   6122 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   6123 // CHECK:   [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
   6124 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
   6125 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
   6126 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x3_t* %agg.result to i8*
   6127 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
   6128 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
   6129 // CHECK:   ret void
   6130 uint32x2x3_t test_vld3_u32(uint32_t const * a) {
   6131   return vld3_u32(a);
   6132 }
   6133 
   6134 // CHECK-LABEL: define void @test_vld3_u64(%struct.uint64x1x3_t* noalias sret %agg.result, i64* %a) #0 {
   6135 // CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
   6136 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
   6137 // CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
   6138 // CHECK:   [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4)
   6139 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
   6140 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
   6141 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x3_t* %agg.result to i8*
   6142 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
   6143 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
   6144 // CHECK:   ret void
   6145 uint64x1x3_t test_vld3_u64(uint64_t const * a) {
   6146   return vld3_u64(a);
   6147 }
   6148 
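// The trailing i32 argument of each @llvm.arm.neon.vld3 call is the element
// alignment Clang derives from the pointee type: 1 for i8, 2 for i16/half,
// 4 for i32/float. vld3_u64 above (and vld3_s64 below) also pass i32 4
// rather than 8, since the apcs-gnu ABI this test targets gives 64-bit
// types only 4-byte alignment.
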
   6149 // CHECK-LABEL: define void @test_vld3_s8(%struct.int8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
   6150 // CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
   6151 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
   6152 // CHECK:   [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0i8(i8* %a, i32 1)
   6153 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
   6154 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
   6155 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* %agg.result to i8*
   6156 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
   6157 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 24, i32 8, i1 false)
   6158 // CHECK:   ret void
   6159 int8x8x3_t test_vld3_s8(int8_t const * a) {
   6160   return vld3_s8(a);
   6161 }
   6162 
   6163 // CHECK-LABEL: define void @test_vld3_s16(%struct.int16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
   6164 // CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
   6165 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
   6166 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   6167 // CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
   6168 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
   6169 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
   6170 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x3_t* %agg.result to i8*
   6171 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
   6172 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
   6173 // CHECK:   ret void
   6174 int16x4x3_t test_vld3_s16(int16_t const * a) {
   6175   return vld3_s16(a);
   6176 }
   6177 
   6178 // CHECK-LABEL: define void @test_vld3_s32(%struct.int32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
   6179 // CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
   6180 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
   6181 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   6182 // CHECK:   [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
   6183 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
   6184 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
   6185 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x3_t* %agg.result to i8*
   6186 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
   6187 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
   6188 // CHECK:   ret void
   6189 int32x2x3_t test_vld3_s32(int32_t const * a) {
   6190   return vld3_s32(a);
   6191 }
   6192 
   6193 // CHECK-LABEL: define void @test_vld3_s64(%struct.int64x1x3_t* noalias sret %agg.result, i64* %a) #0 {
   6194 // CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
   6195 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
   6196 // CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
   6197 // CHECK:   [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4)
   6198 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
   6199 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
   6200 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x3_t* %agg.result to i8*
   6201 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
   6202 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
   6203 // CHECK:   ret void
   6204 int64x1x3_t test_vld3_s64(int64_t const * a) {
   6205   return vld3_s64(a);
   6206 }
   6207 
   6208 // CHECK-LABEL: define void @test_vld3_f16(%struct.float16x4x3_t* noalias sret %agg.result, half* %a) #0 {
   6209 // CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
   6210 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
   6211 // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
   6212 // CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
   6213 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
   6214 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
   6215 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x3_t* %agg.result to i8*
   6216 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
   6217 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
   6218 // CHECK:   ret void
   6219 float16x4x3_t test_vld3_f16(float16_t const * a) {
   6220   return vld3_f16(a);
   6221 }
   6222 
   6223 // CHECK-LABEL: define void @test_vld3_f32(%struct.float32x2x3_t* noalias sret %agg.result, float* %a) #0 {
   6224 // CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
   6225 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
   6226 // CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
   6227 // CHECK:   [[VLD3_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32.p0i8(i8* [[TMP1]], i32 4)
   6228 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
   6229 // CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_V]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP2]]
   6230 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x3_t* %agg.result to i8*
   6231 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
   6232 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
   6233 // CHECK:   ret void
   6234 float32x2x3_t test_vld3_f32(float32_t const * a) {
   6235   return vld3_f32(a);
   6236 }
   6237 
   6238 // CHECK-LABEL: define void @test_vld3_p8(%struct.poly8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
   6239 // CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
   6240 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
   6241 // CHECK:   [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0i8(i8* %a, i32 1)
   6242 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
   6243 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
   6244 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* %agg.result to i8*
   6245 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
   6246 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 24, i32 8, i1 false)
   6247 // CHECK:   ret void
   6248 poly8x8x3_t test_vld3_p8(poly8_t const * a) {
   6249   return vld3_p8(a);
   6250 }
   6251 
   6252 // CHECK-LABEL: define void @test_vld3_p16(%struct.poly16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
   6253 // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
   6254 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
   6255 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   6256 // CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
   6257 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
   6258 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
   6259 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x3_t* %agg.result to i8*
   6260 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
   6261 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
   6262 // CHECK:   ret void
   6263 poly16x4x3_t test_vld3_p16(poly16_t const * a) {
   6264   return vld3_p16(a);
   6265 }
   6266 
   6267 
   6268 // CHECK-LABEL: define void @test_vld3_dup_u8(%struct.uint8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
   6269 // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
   6270 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
   6271 // CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
   6272 // CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
   6273 // CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
   6274 // CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
   6275 // CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
   6276 // CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
   6277 // CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
   6278 // CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
   6279 // CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
   6280 // CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
   6281 // CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
   6282 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
   6283 // CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* %agg.result to i8*
   6284 // CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
   6285 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
   6286 // CHECK:   ret void
   6287 uint8x8x3_t test_vld3_dup_u8(uint8_t const * a) {
   6288   return vld3_dup_u8(a);
   6289 }
   6290 
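// vld3_dup reads one 3-element group and broadcasts it. The IR above does
// this in two steps: a vld3lane load into lane 0 of three undef vectors,
// then one zeroinitializer-mask shufflevector splat per result. Sketch with
// hypothetical values:
//
//   uint8_t mem[3] = {1, 2, 3};
//   uint8x8x3_t v = vld3_dup_u8(mem);
//   // v.val[0] = {1,1,1,1,1,1,1,1}
//   // v.val[1] = {2,2,2,2,2,2,2,2}
//   // v.val[2] = {3,3,3,3,3,3,3,3}
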
   6291 // CHECK-LABEL: define void @test_vld3_dup_u16(%struct.uint16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
   6292 // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
   6293 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
   6294 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   6295 // CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
   6296 // CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
   6297 // CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
   6298 // CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
   6299 // CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
   6300 // CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
   6301 // CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
   6302 // CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
   6303 // CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
   6304 // CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
   6305 // CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
   6306 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]]
   6307 // CHECK:   [[TMP9:%.*]] = bitcast %struct.uint16x4x3_t* %agg.result to i8*
   6308 // CHECK:   [[TMP10:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
   6309 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
   6310 // CHECK:   ret void
   6311 uint16x4x3_t test_vld3_dup_u16(uint16_t const * a) {
   6312   return vld3_dup_u16(a);
   6313 }
   6314 
   6315 // CHECK-LABEL: define void @test_vld3_dup_u32(%struct.uint32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
   6316 // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
   6317 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
   6318 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   6319 // CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
   6320 // CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
   6321 // CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
   6322 // CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
   6323 // CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1
   6324 // CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
   6325 // CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
   6326 // CHECK:   [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2
   6327 // CHECK:   [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer
   6328 // CHECK:   [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2
   6329 // CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
   6330 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP8]]
   6331 // CHECK:   [[TMP9:%.*]] = bitcast %struct.uint32x2x3_t* %agg.result to i8*
   6332 // CHECK:   [[TMP10:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
   6333 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
   6334 // CHECK:   ret void
   6335 uint32x2x3_t test_vld3_dup_u32(uint32_t const * a) {
   6336   return vld3_dup_u32(a);
   6337 }
   6338 
   6339 // CHECK-LABEL: define void @test_vld3_dup_u64(%struct.uint64x1x3_t* noalias sret %agg.result, i64* %a) #0 {
   6340 // CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
   6341 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
   6342 // CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
   6343 // CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4)
   6344 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
   6345 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
   6346 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x3_t* %agg.result to i8*
   6347 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
   6348 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
   6349 // CHECK:   ret void
   6350 uint64x1x3_t test_vld3_dup_u64(uint64_t const * a) {
   6351   return vld3_dup_u64(a);
   6352 }
   6353 
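// For the 64x1 variants (u64 above, s64 below) there is nothing to splat:
// each result vector has a single lane, so a plain @llvm.arm.neon.vld3.v1i64
// is emitted instead of the vld3lane-plus-shufflevector sequence used for
// the other element types.
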
   6354 // CHECK-LABEL: define void @test_vld3_dup_s8(%struct.int8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
   6355 // CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
   6356 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
   6357 // CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
   6358 // CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
   6359 // CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
   6360 // CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
   6361 // CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
   6362 // CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
   6363 // CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
   6364 // CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
   6365 // CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
   6366 // CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
   6367 // CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
   6368 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
   6369 // CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* %agg.result to i8*
   6370 // CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
   6371 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
   6372 // CHECK:   ret void
   6373 int8x8x3_t test_vld3_dup_s8(int8_t const * a) {
   6374   return vld3_dup_s8(a);
   6375 }
   6376 
   6377 // CHECK-LABEL: define void @test_vld3_dup_s16(%struct.int16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
   6378 // CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
   6379 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
   6380 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   6381 // CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
   6382 // CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
   6383 // CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
   6384 // CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
   6385 // CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
   6386 // CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
   6387 // CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
   6388 // CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
   6389 // CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
   6390 // CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
   6391 // CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
   6392 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]]
   6393 // CHECK:   [[TMP9:%.*]] = bitcast %struct.int16x4x3_t* %agg.result to i8*
   6394 // CHECK:   [[TMP10:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
   6395 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
   6396 // CHECK:   ret void
   6397 int16x4x3_t test_vld3_dup_s16(int16_t const * a) {
   6398   return vld3_dup_s16(a);
   6399 }
   6400 
   6401 // CHECK-LABEL: define void @test_vld3_dup_s32(%struct.int32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
   6402 // CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
   6403 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
   6404 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   6405 // CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
   6406 // CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
   6407 // CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
   6408 // CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
   6409 // CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1
   6410 // CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
   6411 // CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
   6412 // CHECK:   [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2
   6413 // CHECK:   [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer
   6414 // CHECK:   [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2
   6415 // CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
   6416 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP8]]
   6417 // CHECK:   [[TMP9:%.*]] = bitcast %struct.int32x2x3_t* %agg.result to i8*
   6418 // CHECK:   [[TMP10:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
   6419 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
   6420 // CHECK:   ret void
   6421 int32x2x3_t test_vld3_dup_s32(int32_t const * a) {
   6422   return vld3_dup_s32(a);
   6423 }
   6424 
   6425 // CHECK-LABEL: define void @test_vld3_dup_s64(%struct.int64x1x3_t* noalias sret %agg.result, i64* %a) #0 {
   6426 // CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
   6427 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
   6428 // CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
   6429 // CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4)
   6430 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
   6431 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
   6432 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x3_t* %agg.result to i8*
   6433 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
   6434 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
   6435 // CHECK:   ret void
   6436 int64x1x3_t test_vld3_dup_s64(int64_t const * a) {
   6437   return vld3_dup_s64(a);
   6438 }
   6439 
   6440 // CHECK-LABEL: define void @test_vld3_dup_f16(%struct.float16x4x3_t* noalias sret %agg.result, half* %a) #0 {
   6441 // CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
   6442 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
   6443 // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
   6444 // CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
   6445 // CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
   6446 // CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
   6447 // CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
   6448 // CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
   6449 // CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
   6450 // CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
   6451 // CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
   6452 // CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
   6453 // CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
   6454 // CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
   6455 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]]
   6456 // CHECK:   [[TMP9:%.*]] = bitcast %struct.float16x4x3_t* %agg.result to i8*
   6457 // CHECK:   [[TMP10:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
   6458 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
   6459 // CHECK:   ret void
   6460 float16x4x3_t test_vld3_dup_f16(float16_t const * a) {
   6461   return vld3_dup_f16(a);
   6462 }
   6463 
// CHECK-LABEL: define void @test_vld3_dup_f32(%struct.float32x2x3_t* noalias sret %agg.result, float* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32.p0i8(i8* [[TMP1]], <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], <2 x float> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <2 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP3]], <2 x float> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP5]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <2 x i32> zeroinitializer
// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP5]], <2 x float> [[LANE2]], 2
// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[TMP7]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP8]]
// CHECK:   [[TMP9:%.*]] = bitcast %struct.float32x2x3_t* %agg.result to i8*
// CHECK:   [[TMP10:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
// CHECK:   ret void
float32x2x3_t test_vld3_dup_f32(float32_t const * a) {
  return vld3_dup_f32(a);
}

// CHECK-LABEL: define void @test_vld3_dup_p8(%struct.poly8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
// CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
// CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* %agg.result to i8*
// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
// CHECK:   ret void
poly8x8x3_t test_vld3_dup_p8(poly8_t const * a) {
  return vld3_dup_p8(a);
}

// CHECK-LABEL: define void @test_vld3_dup_p16(%struct.poly16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]]
// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly16x4x3_t* %agg.result to i8*
// CHECK:   [[TMP10:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
// CHECK:   ret void
poly16x4x3_t test_vld3_dup_p16(poly16_t const * a) {
  return vld3_dup_p16(a);
}


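// The vld3q_lane_* tests below exercise the q-register (128-bit) lane loads.
// The three-vector argument arrives coerced as [6 x i64], is spilled to an
// alloca and memcpy'd into a local copy, and each member vector is
// round-tripped through <16 x i8> before feeding @llvm.arm.neon.vld3lane.
// The trailing two i32 operands of that call are the lane index and the
// alignment in bytes of the loaded element.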
// CHECK-LABEL: define void @test_vld3q_lane_u16(%struct.uint16x8x3_t* noalias sret %agg.result, i16* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
// CHECK:   ret void
uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) {
  return vld3q_lane_u16(a, b, 7);
}

// CHECK-LABEL: define void @test_vld3q_lane_u32(%struct.uint32x4x3_t* noalias sret %agg.result, i32* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], i32 3, i32 4)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
// CHECK:   ret void
uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) {
  return vld3q_lane_u32(a, b, 3);
}

// CHECK-LABEL: define void @test_vld3q_lane_s16(%struct.int16x8x3_t* noalias sret %agg.result, i16* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
// CHECK:   ret void
int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) {
  return vld3q_lane_s16(a, b, 7);
}

// CHECK-LABEL: define void @test_vld3q_lane_s32(%struct.int32x4x3_t* noalias sret %agg.result, i32* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], i32 3, i32 4)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
// CHECK:   ret void
int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) {
  return vld3q_lane_s32(a, b, 3);
}

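// For float16x8x3_t there is no half-precision vld3lane overload used here:
// the checks below expect the <8 x half> members to be bitcast to <8 x i16>
// and loaded with the same v8i16 intrinsic as the integer cases.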
// CHECK-LABEL: define void @test_vld3q_lane_f16(%struct.float16x8x3_t* noalias sret %agg.result, half* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
// CHECK:   ret void
float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) {
  return vld3q_lane_f16(a, b, 7);
}

// CHECK-LABEL: define void @test_vld3q_lane_f32(%struct.float32x4x3_t* noalias sret %agg.result, float* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32.p0i8(i8* [[TMP4]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], i32 3, i32 4)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x float>, <4 x float>, <4 x float> }*
// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_LANE_V]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
// CHECK:   ret void
float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) {
  return vld3q_lane_f32(a, b, 3);
}

// CHECK-LABEL: define void @test_vld3q_lane_p16(%struct.poly16x8x3_t* noalias sret %agg.result, i16* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
// CHECK:   ret void
poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) {
  return vld3q_lane_p16(a, b, 7);
}

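// The vld3_lane_* tests below cover the d-register (64-bit) variants: the
// aggregate is coerced as [3 x i64] and the allocas drop to 8-byte alignment.
// The <8 x i8> case needs no bitcasts, so its IR is the shortest of the group.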
// CHECK-LABEL: define void @test_vld3_lane_u8(%struct.uint8x8x3_t* noalias sret %agg.result, i8* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* %agg.result to i8*
// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
// CHECK:   ret void
uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) {
  return vld3_lane_u8(a, b, 7);
}

// CHECK-LABEL: define void @test_vld3_lane_u16(%struct.uint16x4x3_t* noalias sret %agg.result, i16* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
// CHECK:   ret void
uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) {
  return vld3_lane_u16(a, b, 3);
}

// CHECK-LABEL: define void @test_vld3_lane_u32(%struct.uint32x2x3_t* noalias sret %agg.result, i32* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK:   [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], i32 1, i32 4)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
// CHECK:   ret void
uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) {
  return vld3_lane_u32(a, b, 1);
}

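// The signed variants below are expected to produce IR identical to the
// unsigned checks above: lane loads carry no signedness, so s8/s16/s32 map
// to the same vld3lane calls with the same lane and alignment operands.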
// CHECK-LABEL: define void @test_vld3_lane_s8(%struct.int8x8x3_t* noalias sret %agg.result, i8* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* %agg.result to i8*
// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
// CHECK:   ret void
int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) {
  return vld3_lane_s8(a, b, 7);
}

// CHECK-LABEL: define void @test_vld3_lane_s16(%struct.int16x4x3_t* noalias sret %agg.result, i16* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x4x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
// CHECK:   ret void
int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) {
  return vld3_lane_s16(a, b, 3);
}

// CHECK-LABEL: define void @test_vld3_lane_s32(%struct.int32x2x3_t* noalias sret %agg.result, i32* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK:   [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], i32 1, i32 4)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
// CHECK:   ret void
int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) {
  return vld3_lane_s32(a, b, 1);
}

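// As in the q-register case, the <4 x half> members below are bitcast to
// <4 x i16> and loaded through the integer v4i16 vld3lane; only the struct
// type and the pointer element type distinguish the f16 checks from u16.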
   7016 // CHECK-LABEL: define void @test_vld3_lane_f16(%struct.float16x4x3_t* noalias sret %agg.result, half* %a, [3 x i64] %b.coerce) #0 {
   7017 // CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
   7018 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
   7019 // CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
   7020 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
   7021 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x half>]* [[COERCE_DIVE]] to [3 x i64]*
   7022 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
   7023 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
   7024 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
   7025 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
   7026 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
   7027 // CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
   7028 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
   7029 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
   7030 // CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
   7031 // CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
   7032 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
   7033 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i32 0, i32 1
   7034 // CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
   7035 // CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
   7036 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
   7037 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
   7038 // CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
   7039 // CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
   7040 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
   7041 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
   7042 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
   7043 // CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2)
   7044 // CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
   7045 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]]
   7046 // CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* %agg.result to i8*
   7047 // CHECK:   [[TMP16:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
   7048 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
   7049 // CHECK:   ret void
   7050 float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) {
   7051   return vld3_lane_f16(a, b, 3);
   7052 }
   7053 
   7054 // CHECK-LABEL: define void @test_vld3_lane_f32(%struct.float32x2x3_t* noalias sret %agg.result, float* %a, [3 x i64] %b.coerce) #0 {
   7055 // CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
   7056 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
   7057 // CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
   7058 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
   7059 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x float>]* [[COERCE_DIVE]] to [3 x i64]*
   7060 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
   7061 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
   7062 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
   7063 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
   7064 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
   7065 // CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
   7066 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
   7067 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
   7068 // CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
   7069 // CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
   7070 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
   7071 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i32 0, i32 1
   7072 // CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
   7073 // CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
   7074 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
   7075 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i32 0, i32 2
   7076 // CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
   7077 // CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
   7078 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
   7079 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
   7080 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
   7081 // CHECK:   [[VLD3_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32.p0i8(i8* [[TMP4]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], i32 1, i32 4)
   7082 // CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <2 x float>, <2 x float>, <2 x float> }*
   7083 // CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE_V]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP14]]
   7084 // CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* %agg.result to i8*
   7085 // CHECK:   [[TMP16:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
   7086 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
   7087 // CHECK:   ret void
   7088 float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) {
   7089   return vld3_lane_f32(a, b, 1);
   7090 }
   7091 
   7092 // CHECK-LABEL: define void @test_vld3_lane_p8(%struct.poly8x8x3_t* noalias sret %agg.result, i8* %a, [3 x i64] %b.coerce) #0 {
   7093 // CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
   7094 // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
   7095 // CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
   7096 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
   7097 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
   7098 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
   7099 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
   7100 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
   7101 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
   7102 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
   7103 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
   7104 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
   7105 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
   7106 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
   7107 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
   7108 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
   7109 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
   7110 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
   7111 // CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
   7112 // CHECK:   [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
   7113 // CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
   7114 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
   7115 // CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* %agg.result to i8*
   7116 // CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
   7117 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
   7118 // CHECK:   ret void
   7119 poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) {
   7120   return vld3_lane_p8(a, b, 7);
   7121 }
   7122 
   7123 // CHECK-LABEL: define void @test_vld3_lane_p16(%struct.poly16x4x3_t* noalias sret %agg.result, i16* %a, [3 x i64] %b.coerce) #0 {
   7124 // CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
   7125 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
   7126 // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
   7127 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
   7128 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
   7129 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
   7130 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
   7131 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
   7132 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
   7133 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
   7134 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
   7135 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
   7136 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
   7137 // CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
   7138 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
   7139 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
   7140 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
   7141 // CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
   7142 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
   7143 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
   7144 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
   7145 // CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
   7146 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
   7147 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
   7148 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
   7149 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
   7150 // CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2)
   7151 // CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
   7152 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]]
   7153 // CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* %agg.result to i8*
   7154 // CHECK:   [[TMP16:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
   7155 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
   7156 // CHECK:   ret void
   7157 poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) {
   7158   return vld3_lane_p16(a, b, 3);
   7159 }
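
// Note on the paired bitcasts above: the builtin lowering appears to round-trip
// each <4 x i16> operand through <8 x i8> before the @llvm.arm.neon.vld3lane
// call, and mem2reg leaves those casts in place, so the CHECK lines track them
// explicitly rather than pattern-matching around them.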
   7160 
   7161 
   7162 // CHECK-LABEL: define void @test_vld4q_u8(%struct.uint8x16x4_t* noalias sret %agg.result, i8* %a) #0 {
   7163 // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
   7164 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
   7165 // CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0i8(i8* %a, i32 1)
   7166 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
   7167 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
   7168 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* %agg.result to i8*
   7169 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
   7170 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 64, i32 16, i1 false)
   7171 // CHECK:   ret void
   7172 uint8x16x4_t test_vld4q_u8(uint8_t const * a) {
   7173   return vld4q_u8(a);
   7174 }
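
// A hedged usage sketch (illustrative only, not CHECK-verified; the unused
// 'static inline' helper emits no IR): vld4q_* de-interleaves 4-way data, so
// loading 16 interleaved RGBA pixels yields one Q register per channel. The
// pixel layout is an assumption invented for this example.
static inline uint8x16_t red_channel_q(uint8_t const * rgba) {
  uint8x16x4_t px = vld4q_u8(rgba); // px.val[0]=R, [1]=G, [2]=B, [3]=A
  return px.val[0];
}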
   7175 
   7176 // CHECK-LABEL: define void @test_vld4q_u16(%struct.uint16x8x4_t* noalias sret %agg.result, i16* %a) #0 {
   7177 // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
   7178 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
   7179 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   7180 // CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2)
   7181 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
   7182 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
   7183 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x4_t* %agg.result to i8*
   7184 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
   7185 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
   7186 // CHECK:   ret void
   7187 uint16x8x4_t test_vld4q_u16(uint16_t const * a) {
   7188   return vld4q_u16(a);
   7189 }
   7190 
   7191 // CHECK-LABEL: define void @test_vld4q_u32(%struct.uint32x4x4_t* noalias sret %agg.result, i32* %a) #0 {
   7192 // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
   7193 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
   7194 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   7195 // CHECK:   [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
   7196 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
   7197 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]]
   7198 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x4_t* %agg.result to i8*
   7199 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
   7200 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
   7201 // CHECK:   ret void
   7202 uint32x4x4_t test_vld4q_u32(uint32_t const * a) {
   7203   return vld4q_u32(a);
   7204 }
   7205 
   7206 // CHECK-LABEL: define void @test_vld4q_s8(%struct.int8x16x4_t* noalias sret %agg.result, i8* %a) #0 {
   7207 // CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
   7208 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
   7209 // CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0i8(i8* %a, i32 1)
   7210 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
   7211 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
   7212 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* %agg.result to i8*
   7213 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
   7214 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 64, i32 16, i1 false)
   7215 // CHECK:   ret void
   7216 int8x16x4_t test_vld4q_s8(int8_t const * a) {
   7217   return vld4q_s8(a);
   7218 }
   7219 
   7220 // CHECK-LABEL: define void @test_vld4q_s16(%struct.int16x8x4_t* noalias sret %agg.result, i16* %a) #0 {
   7221 // CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
   7222 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
   7223 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   7224 // CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2)
   7225 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
   7226 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
   7227 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x4_t* %agg.result to i8*
   7228 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
   7229 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
   7230 // CHECK:   ret void
   7231 int16x8x4_t test_vld4q_s16(int16_t const * a) {
   7232   return vld4q_s16(a);
   7233 }
   7234 
   7235 // CHECK-LABEL: define void @test_vld4q_s32(%struct.int32x4x4_t* noalias sret %agg.result, i32* %a) #0 {
   7236 // CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
   7237 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
   7238 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   7239 // CHECK:   [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
   7240 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
   7241 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]]
   7242 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x4_t* %agg.result to i8*
   7243 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
   7244 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
   7245 // CHECK:   ret void
   7246 int32x4x4_t test_vld4q_s32(int32_t const * a) {
   7247   return vld4q_s32(a);
   7248 }
   7249 
   7250 // CHECK-LABEL: define void @test_vld4q_f16(%struct.float16x8x4_t* noalias sret %agg.result, half* %a) #0 {
   7251 // CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
   7252 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
   7253 // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
   7254 // CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2)
   7255 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
   7256 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
   7257 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x4_t* %agg.result to i8*
   7258 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
   7259 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
   7260 // CHECK:   ret void
   7261 float16x8x4_t test_vld4q_f16(float16_t const * a) {
   7262   return vld4q_f16(a);
   7263 }
   7264 
   7265 // CHECK-LABEL: define void @test_vld4q_f32(%struct.float32x4x4_t* noalias sret %agg.result, float* %a) #0 {
   7266 // CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
   7267 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
   7268 // CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
   7269 // CHECK:   [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32.p0i8(i8* [[TMP1]], i32 4)
   7270 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
   7271 // CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_V]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP2]]
   7272 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x4_t* %agg.result to i8*
   7273 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
   7274 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
   7275 // CHECK:   ret void
   7276 float32x4x4_t test_vld4q_f32(float32_t const * a) {
   7277   return vld4q_f32(a);
   7278 }
   7279 
   7280 // CHECK-LABEL: define void @test_vld4q_p8(%struct.poly8x16x4_t* noalias sret %agg.result, i8* %a) #0 {
   7281 // CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
   7282 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
   7283 // CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0i8(i8* %a, i32 1)
   7284 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
   7285 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
   7286 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* %agg.result to i8*
   7287 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
   7288 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 64, i32 16, i1 false)
   7289 // CHECK:   ret void
   7290 poly8x16x4_t test_vld4q_p8(poly8_t const * a) {
   7291   return vld4q_p8(a);
   7292 }
   7293 
   7294 // CHECK-LABEL: define void @test_vld4q_p16(%struct.poly16x8x4_t* noalias sret %agg.result, i16* %a) #0 {
   7295 // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
   7296 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
   7297 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   7298 // CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2)
   7299 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
   7300 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
   7301 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x4_t* %agg.result to i8*
   7302 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
   7303 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
   7304 // CHECK:   ret void
   7305 poly16x8x4_t test_vld4q_p16(poly16_t const * a) {
   7306   return vld4q_p16(a);
   7307 }
   7308 
   7309 // CHECK-LABEL: define void @test_vld4_u8(%struct.uint8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
   7310 // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
   7311 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
   7312 // CHECK:   [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0i8(i8* %a, i32 1)
   7313 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
   7314 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
   7315 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* %agg.result to i8*
   7316 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
   7317 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 8, i1 false)
   7318 // CHECK:   ret void
   7319 uint8x8x4_t test_vld4_u8(uint8_t const * a) {
   7320   return vld4_u8(a);
   7321 }
   7322 
   7323 // CHECK-LABEL: define void @test_vld4_u16(%struct.uint16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
   7324 // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
   7325 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
   7326 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   7327 // CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
   7328 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
   7329 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
   7330 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x4_t* %agg.result to i8*
   7331 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
   7332 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
   7333 // CHECK:   ret void
   7334 uint16x4x4_t test_vld4_u16(uint16_t const * a) {
   7335   return vld4_u16(a);
   7336 }
   7337 
   7338 // CHECK-LABEL: define void @test_vld4_u32(%struct.uint32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
   7339 // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
   7340 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
   7341 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   7342 // CHECK:   [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* [[TMP1]], i32 4)
   7343 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
   7344 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
   7345 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x4_t* %agg.result to i8*
   7346 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
   7347 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
   7348 // CHECK:   ret void
   7349 uint32x2x4_t test_vld4_u32(uint32_t const * a) {
   7350   return vld4_u32(a);
   7351 }
   7352 
   7353 // CHECK-LABEL: define void @test_vld4_u64(%struct.uint64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
   7354 // CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
   7355 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
   7356 // CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
   7357 // CHECK:   [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
   7358 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
   7359 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
   7360 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x4_t* %agg.result to i8*
   7361 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
   7362 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
   7363 // CHECK:   ret void
   7364 uint64x1x4_t test_vld4_u64(uint64_t const * a) {
   7365   return vld4_u64(a);
   7366 }
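
// Note: the alignment operand of the vld4.v1i64 call above is 4 rather than 8
// because the RUN line selects the apcs-gnu ABI, which aligns 64-bit types to
// 4 bytes.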
   7367 
   7368 // CHECK-LABEL: define void @test_vld4_s8(%struct.int8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
   7369 // CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
   7370 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
   7371 // CHECK:   [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0i8(i8* %a, i32 1)
   7372 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
   7373 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
   7374 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* %agg.result to i8*
   7375 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
   7376 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 8, i1 false)
   7377 // CHECK:   ret void
   7378 int8x8x4_t test_vld4_s8(int8_t const * a) {
   7379   return vld4_s8(a);
   7380 }
   7381 
   7382 // CHECK-LABEL: define void @test_vld4_s16(%struct.int16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
   7383 // CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
   7384 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
   7385 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   7386 // CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
   7387 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
   7388 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
   7389 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x4_t* %agg.result to i8*
   7390 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
   7391 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
   7392 // CHECK:   ret void
   7393 int16x4x4_t test_vld4_s16(int16_t const * a) {
   7394   return vld4_s16(a);
   7395 }
   7396 
   7397 // CHECK-LABEL: define void @test_vld4_s32(%struct.int32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
   7398 // CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
   7399 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
   7400 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   7401 // CHECK:   [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* [[TMP1]], i32 4)
   7402 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
   7403 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
   7404 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x4_t* %agg.result to i8*
   7405 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
   7406 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
   7407 // CHECK:   ret void
   7408 int32x2x4_t test_vld4_s32(int32_t const * a) {
   7409   return vld4_s32(a);
   7410 }
   7411 
   7412 // CHECK-LABEL: define void @test_vld4_s64(%struct.int64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
   7413 // CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
   7414 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
   7415 // CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
   7416 // CHECK:   [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
   7417 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
   7418 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
   7419 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x4_t* %agg.result to i8*
   7420 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
   7421 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
   7422 // CHECK:   ret void
   7423 int64x1x4_t test_vld4_s64(int64_t const * a) {
   7424   return vld4_s64(a);
   7425 }
   7426 
   7427 // CHECK-LABEL: define void @test_vld4_f16(%struct.float16x4x4_t* noalias sret %agg.result, half* %a) #0 {
   7428 // CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
   7429 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
   7430 // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
   7431 // CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
   7432 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
   7433 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
   7434 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x4_t* %agg.result to i8*
   7435 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
   7436 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
   7437 // CHECK:   ret void
   7438 float16x4x4_t test_vld4_f16(float16_t const * a) {
   7439   return vld4_f16(a);
   7440 }
   7441 
   7442 // CHECK-LABEL: define void @test_vld4_f32(%struct.float32x2x4_t* noalias sret %agg.result, float* %a) #0 {
   7443 // CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
   7444 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
   7445 // CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
   7446 // CHECK:   [[VLD4_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32.p0i8(i8* [[TMP1]], i32 4)
   7447 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
   7448 // CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP2]]
   7449 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x4_t* %agg.result to i8*
   7450 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
   7451 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
   7452 // CHECK:   ret void
   7453 float32x2x4_t test_vld4_f32(float32_t const * a) {
   7454   return vld4_f32(a);
   7455 }
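
// A hedged usage sketch (illustrative only, not CHECK-verified; the unused
// 'static inline' helper emits no IR): the non-q forms work on 64-bit D
// registers, so vld4_f32 de-interleaves eight floats into four float32x2_t
// values. The plane-wise reduction is invented for this example.
static inline float32x2_t sum_four_planes_f32(float32_t const * p) {
  float32x2x4_t v = vld4_f32(p);
  return vadd_f32(vadd_f32(v.val[0], v.val[1]),
                  vadd_f32(v.val[2], v.val[3]));
}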
   7456 
   7457 // CHECK-LABEL: define void @test_vld4_p8(%struct.poly8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
   7458 // CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
   7459 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
   7460 // CHECK:   [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0i8(i8* %a, i32 1)
   7461 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
   7462 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
   7463 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* %agg.result to i8*
   7464 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
   7465 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 8, i1 false)
   7466 // CHECK:   ret void
   7467 poly8x8x4_t test_vld4_p8(poly8_t const * a) {
   7468   return vld4_p8(a);
   7469 }
   7470 
   7471 // CHECK-LABEL: define void @test_vld4_p16(%struct.poly16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
   7472 // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
   7473 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
   7474 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   7475 // CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
   7476 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
   7477 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
   7478 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x4_t* %agg.result to i8*
   7479 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
   7480 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
   7481 // CHECK:   ret void
   7482 poly16x4x4_t test_vld4_p16(poly16_t const * a) {
   7483   return vld4_p16(a);
   7484 }
   7485 
   7486 
   7487 // CHECK-LABEL: define void @test_vld4_dup_u8(%struct.uint8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
   7488 // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
   7489 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
   7490 // CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
   7491 // CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
   7492 // CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
   7493 // CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
   7494 // CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
   7495 // CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
   7496 // CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
   7497 // CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
   7498 // CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
   7499 // CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
   7500 // CHECK:   [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], 3
   7501 // CHECK:   [[LANE3:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> [[TMP7]], <8 x i32> zeroinitializer
   7502 // CHECK:   [[TMP8:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], <8 x i8> [[LANE3]], 3
   7503 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
   7504 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP8]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP9]]
   7505 // CHECK:   [[TMP10:%.*]] = bitcast %struct.uint8x8x4_t* %agg.result to i8*
   7506 // CHECK:   [[TMP11:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
   7507 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP10]], i8* [[TMP11]], i32 32, i32 8, i1 false)
   7508 // CHECK:   ret void
   7509 uint8x8x4_t test_vld4_dup_u8(uint8_t const * a) {
   7510   return vld4_dup_u8(a);
   7511 }
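
// Note on the expected lowering above: there is no dedicated dup intrinsic in
// this IR, so the builtin loads lane 0 of four undef vectors with
// @llvm.arm.neon.vld4lane and then splats each result with a zero-mask
// shufflevector, which is exactly what the CHECK lines pin down.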
   7512 
   7513 // CHECK-LABEL: define void @test_vld4_dup_u16(%struct.uint16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
   7514 // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
   7515 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
   7516 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   7517 // CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
   7518 // CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
   7519 // CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
   7520 // CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
   7521 // CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
   7522 // CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
   7523 // CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
   7524 // CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
   7525 // CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
   7526 // CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
   7527 // CHECK:   [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
   7528 // CHECK:   [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
   7529 // CHECK:   [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
   7530 // CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
   7531 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
   7532 // CHECK:   [[TMP11:%.*]] = bitcast %struct.uint16x4x4_t* %agg.result to i8*
   7533 // CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
   7534 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
   7535 // CHECK:   ret void
   7536 uint16x4x4_t test_vld4_dup_u16(uint16_t const * a) {
   7537   return vld4_dup_u16(a);
   7538 }
   7539 
   7540 // CHECK-LABEL: define void @test_vld4_dup_u32(%struct.uint32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
   7541 // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
   7542 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
   7543 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   7544 // CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
   7545 // CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
   7546 // CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
   7547 // CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
   7548 // CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1
   7549 // CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
   7550 // CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
   7551 // CHECK:   [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2
   7552 // CHECK:   [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer
   7553 // CHECK:   [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2
   7554 // CHECK:   [[TMP8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], 3
   7555 // CHECK:   [[LANE3:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP8]], <2 x i32> zeroinitializer
   7556 // CHECK:   [[TMP9:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], <2 x i32> [[LANE3]], 3
   7557 // CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
   7558 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP10]]
   7559 // CHECK:   [[TMP11:%.*]] = bitcast %struct.uint32x2x4_t* %agg.result to i8*
   7560 // CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
   7561 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
   7562 // CHECK:   ret void
   7563 uint32x2x4_t test_vld4_dup_u32(uint32_t const * a) {
   7564   return vld4_dup_u32(a);
   7565 }
   7566 
   7567 // CHECK-LABEL: define void @test_vld4_dup_u64(%struct.uint64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
   7568 // CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
   7569 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
   7570 // CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
   7571 // CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
   7572 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
   7573 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
   7574 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x4_t* %agg.result to i8*
   7575 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
   7576 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
   7577 // CHECK:   ret void
   7578 uint64x1x4_t test_vld4_dup_u64(uint64_t const * a) {
   7579   return vld4_dup_u64(a);
   7580 }
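
// Note: a <1 x i64> vector has only one lane, so there is nothing to
// duplicate and vld4_dup_u64 degenerates to the plain @llvm.arm.neon.vld4.v1i64
// load checked above.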
   7581 
   7582 // CHECK-LABEL: define void @test_vld4_dup_s8(%struct.int8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
   7583 // CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
   7584 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
   7585 // CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
   7586 // CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
   7587 // CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
   7588 // CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
   7589 // CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
   7590 // CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
   7591 // CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
   7592 // CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
   7593 // CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
   7594 // CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
   7595 // CHECK:   [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], 3
   7596 // CHECK:   [[LANE3:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> [[TMP7]], <8 x i32> zeroinitializer
   7597 // CHECK:   [[TMP8:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], <8 x i8> [[LANE3]], 3
   7598 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
   7599 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP8]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP9]]
   7600 // CHECK:   [[TMP10:%.*]] = bitcast %struct.int8x8x4_t* %agg.result to i8*
   7601 // CHECK:   [[TMP11:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
   7602 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP10]], i8* [[TMP11]], i32 32, i32 8, i1 false)
   7603 // CHECK:   ret void
   7604 int8x8x4_t test_vld4_dup_s8(int8_t const * a) {
   7605   return vld4_dup_s8(a);
   7606 }
   7607 
   7608 // CHECK-LABEL: define void @test_vld4_dup_s16(%struct.int16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
   7609 // CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
   7610 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
   7611 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   7612 // CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
   7613 // CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
   7614 // CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
   7615 // CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
   7616 // CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
   7617 // CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
   7618 // CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
   7619 // CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
   7620 // CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
   7621 // CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
   7622 // CHECK:   [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
   7623 // CHECK:   [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
   7624 // CHECK:   [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
   7625 // CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
   7626 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
   7627 // CHECK:   [[TMP11:%.*]] = bitcast %struct.int16x4x4_t* %agg.result to i8*
   7628 // CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
   7629 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
   7630 // CHECK:   ret void
   7631 int16x4x4_t test_vld4_dup_s16(int16_t const * a) {
   7632   return vld4_dup_s16(a);
   7633 }
   7634 
   7635 // CHECK-LABEL: define void @test_vld4_dup_s32(%struct.int32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
   7636 // CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
   7637 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
   7638 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   7639 // CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
   7640 // CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
   7641 // CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
   7642 // CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
   7643 // CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1
   7644 // CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
   7645 // CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
   7646 // CHECK:   [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2
   7647 // CHECK:   [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer
   7648 // CHECK:   [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2
   7649 // CHECK:   [[TMP8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], 3
   7650 // CHECK:   [[LANE3:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP8]], <2 x i32> zeroinitializer
   7651 // CHECK:   [[TMP9:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], <2 x i32> [[LANE3]], 3
   7652 // CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
   7653 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP10]]
   7654 // CHECK:   [[TMP11:%.*]] = bitcast %struct.int32x2x4_t* %agg.result to i8*
   7655 // CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
   7656 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
   7657 // CHECK:   ret void
   7658 int32x2x4_t test_vld4_dup_s32(int32_t const * a) {
   7659   return vld4_dup_s32(a);
   7660 }
   7661 
   7662 // CHECK-LABEL: define void @test_vld4_dup_s64(%struct.int64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
   7663 // CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
   7664 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
   7665 // CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
   7666 // CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
   7667 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
   7668 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
   7669 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x4_t* %agg.result to i8*
   7670 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
   7671 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
   7672 // CHECK:   ret void
   7673 int64x1x4_t test_vld4_dup_s64(int64_t const * a) {
   7674   return vld4_dup_s64(a);
   7675 }
   7676 
   7677 // CHECK-LABEL: define void @test_vld4_dup_f16(%struct.float16x4x4_t* noalias sret %agg.result, half* %a) #0 {
   7678 // CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
   7679 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
   7680 // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
   7681 // CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
   7682 // CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
   7683 // CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
   7684 // CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
   7685 // CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
   7686 // CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
   7687 // CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
   7688 // CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
   7689 // CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
   7690 // CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
   7691 // CHECK:   [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
   7692 // CHECK:   [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
   7693 // CHECK:   [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
   7694 // CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
   7695 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
   7696 // CHECK:   [[TMP11:%.*]] = bitcast %struct.float16x4x4_t* %agg.result to i8*
   7697 // CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
   7698 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
   7699 // CHECK:   ret void
   7700 float16x4x4_t test_vld4_dup_f16(float16_t const * a) {
   7701   return vld4_dup_f16(a);
   7702 }
   7703 
   7704 // CHECK-LABEL: define void @test_vld4_dup_f32(%struct.float32x2x4_t* noalias sret %agg.result, float* %a) #0 {
   7705 // CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
   7706 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
   7707 // CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
   7708 // CHECK:   [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32.p0i8(i8* [[TMP1]], <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
   7709 // CHECK:   [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], 0
   7710 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
   7711 // CHECK:   [[TMP3:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], <2 x float> [[LANE]], 0
   7712 // CHECK:   [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP3]], 1
   7713 // CHECK:   [[LANE1:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <2 x i32> zeroinitializer
   7714 // CHECK:   [[TMP5:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP3]], <2 x float> [[LANE1]], 1
   7715 // CHECK:   [[TMP6:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP5]], 2
   7716 // CHECK:   [[LANE2:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <2 x i32> zeroinitializer
   7717 // CHECK:   [[TMP7:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP5]], <2 x float> [[LANE2]], 2
   7718 // CHECK:   [[TMP8:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP7]], 3
   7719 // CHECK:   [[LANE3:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP8]], <2 x i32> zeroinitializer
   7720 // CHECK:   [[TMP9:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP7]], <2 x float> [[LANE3]], 3
   7721 // CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
   7722 // CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP9]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP10]]
   7723 // CHECK:   [[TMP11:%.*]] = bitcast %struct.float32x2x4_t* %agg.result to i8*
   7724 // CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
   7725 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
   7726 // CHECK:   ret void
   7727 float32x2x4_t test_vld4_dup_f32(float32_t const * a) {
   7728   return vld4_dup_f32(a);
   7729 }
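
// A hedged usage sketch (illustrative only, not CHECK-verified; the unused
// 'static inline' helper emits no IR): vld4_dup_f32 reads four consecutive
// floats and broadcasts each across both lanes of its own vector. The helper
// is invented for this example.
static inline float32x2_t splat_third_f32(float32_t const * quad) {
  float32x2x4_t v = vld4_dup_f32(quad); // v.val[i] = { quad[i], quad[i] }
  return v.val[2];                      // both lanes hold quad[2]
}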
   7730 
   7731 // CHECK-LABEL: define void @test_vld4_dup_p8(%struct.poly8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
   7732 // CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
   7733 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
   7734 // CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
   7735 // CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
   7736 // CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
   7737 // CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
   7738 // CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
   7739 // CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
   7740 // CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
   7741 // CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
   7742 // CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
   7743 // CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
   7744 // CHECK:   [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], 3
   7745 // CHECK:   [[LANE3:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> [[TMP7]], <8 x i32> zeroinitializer
   7746 // CHECK:   [[TMP8:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], <8 x i8> [[LANE3]], 3
   7747 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
   7748 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP8]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP9]]
   7749 // CHECK:   [[TMP10:%.*]] = bitcast %struct.poly8x8x4_t* %agg.result to i8*
   7750 // CHECK:   [[TMP11:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
   7751 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP10]], i8* [[TMP11]], i32 32, i32 8, i1 false)
   7752 // CHECK:   ret void
   7753 poly8x8x4_t test_vld4_dup_p8(poly8_t const * a) {
   7754   return vld4_dup_p8(a);
   7755 }
   7756 
   7757 // CHECK-LABEL: define void @test_vld4_dup_p16(%struct.poly16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
   7758 // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
   7759 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
   7760 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   7761 // CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
   7762 // CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
   7763 // CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
   7764 // CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
   7765 // CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
   7766 // CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
   7767 // CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
   7768 // CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
   7769 // CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
   7770 // CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
   7771 // CHECK:   [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
   7772 // CHECK:   [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
   7773 // CHECK:   [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
   7774 // CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
   7775 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
   7776 // CHECK:   [[TMP11:%.*]] = bitcast %struct.poly16x4x4_t* %agg.result to i8*
   7777 // CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
   7778 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
   7779 // CHECK:   ret void
   7780 poly16x4x4_t test_vld4_dup_p16(poly16_t const * a) {
   7781   return vld4_dup_p16(a);
   7782 }
   7783 
   7784 
// CHECK-LABEL: define void @test_vld4q_lane_u16(%struct.uint16x8x4_t* noalias sret %agg.result, i16* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* %agg.result to i8*
// CHECK:   [[TMP19:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
// CHECK:   ret void
uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {
  return vld4q_lane_u16(a, b, 7);
}

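// Usage sketch for the q-register lane loads tested in this block
// (illustrative only; reload_lane7_u16 is a hypothetical name). The
// intrinsic refills exactly one lane of each of the four vectors from four
// consecutive u16 values in memory; all other lanes pass through unchanged,
// which is why the IR above threads every element of b into the
// @llvm.arm.neon.vld4lane call.
static inline uint16x8x4_t reload_lane7_u16(const uint16_t *p,
                                            uint16x8x4_t acc) {
  // Overwrites lane 7 of acc.val[0..3] with p[0..3]; lanes 0-6 are kept.
  return vld4q_lane_u16(p, acc, 7);
}
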
// CHECK-LABEL: define void @test_vld4q_lane_u32(%struct.uint32x4x4_t* noalias sret %agg.result, i32* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], i32 3, i32 4)
// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP17]]
// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* %agg.result to i8*
// CHECK:   [[TMP19:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
// CHECK:   ret void
uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {
  return vld4q_lane_u32(a, b, 3);
}

// CHECK-LABEL: define void @test_vld4q_lane_s16(%struct.int16x8x4_t* noalias sret %agg.result, i16* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
// CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* %agg.result to i8*
// CHECK:   [[TMP19:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
// CHECK:   ret void
int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {
  return vld4q_lane_s16(a, b, 7);
}

// CHECK-LABEL: define void @test_vld4q_lane_s32(%struct.int32x4x4_t* noalias sret %agg.result, i32* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], i32 3, i32 4)
// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP17]]
// CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* %agg.result to i8*
// CHECK:   [[TMP19:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
// CHECK:   ret void
int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
  return vld4q_lane_s32(a, b, 3);
}

// CHECK-LABEL: define void @test_vld4q_lane_f16(%struct.float16x8x4_t* noalias sret %agg.result, half* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP11:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <16 x i8>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
// CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* %agg.result to i8*
// CHECK:   [[TMP19:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
// CHECK:   ret void
float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
  return vld4q_lane_f16(a, b, 7);
}

// CHECK-LABEL: define void @test_vld4q_lane_f32(%struct.float32x4x4_t* noalias sret %agg.result, float* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP11:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <16 x i8>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x float>
// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32.p0i8(i8* [[TMP4]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], <4 x float> [[TMP16]], i32 3, i32 4)
// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP17]]
// CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* %agg.result to i8*
// CHECK:   [[TMP19:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
// CHECK:   ret void
float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {
  return vld4q_lane_f32(a, b, 3);
}

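// Note on the calls checked above: the two trailing i32 operands of
// @llvm.arm.neon.vld4lane.* are the lane index and the load alignment in
// bytes. The front end derives the alignment from the element type, which
// is why the <8 x i16> variants pass i32 2, the <4 x i32> and <4 x float>
// variants pass i32 4, and the <8 x i8> variants further down pass i32 1.
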
// CHECK-LABEL: define void @test_vld4q_lane_p16(%struct.poly16x8x4_t* noalias sret %agg.result, i16* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* %agg.result to i8*
// CHECK:   [[TMP19:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
// CHECK:   ret void
poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) {
  return vld4q_lane_p16(a, b, 7);
}

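// The remaining tests exercise the 64-bit d-register forms (vld4_lane_*).
// They mirror the q-register tests above, except that the four vectors are
// coerced as [4 x i64] rather than [8 x i64], the allocas drop from 16- to
// 8-byte alignment, and the struct copies shrink from 64 to 32 bytes.
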
// CHECK-LABEL: define void @test_vld4_lane_u8(%struct.uint8x8x4_t* noalias sret %agg.result, i8* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
// CHECK:   [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], i32 7, i32 1)
// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP8]]
// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* %agg.result to i8*
// CHECK:   [[TMP10:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 32, i32 8, i1 false)
// CHECK:   ret void
uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) {
  return vld4_lane_u8(a, b, 7);
}

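// Usage sketch for the byte-sized d-register form just tested (illustrative
// only; refresh_pixel3_rgba is our own name). With four deinterleaved
// 8-pixel channel vectors, vld4_lane_u8 replaces a single pixel from one
// interleaved RGBA quad in memory.
static inline uint8x8x4_t refresh_pixel3_rgba(const uint8_t *pixel,
                                              uint8x8x4_t tile) {
  // pixel[0..3] are R, G, B, A of one pixel; lane 3 of each channel vector
  // is replaced, and the other seven pixels are untouched.
  return vld4_lane_u8(pixel, tile, 3);
}
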
// CHECK-LABEL: define void @test_vld4_lane_u16(%struct.uint16x4x4_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
// CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* %agg.result to i8*
// CHECK:   [[TMP19:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
// CHECK:   ret void
uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) {
  return vld4_lane_u16(a, b, 3);
}

// CHECK-LABEL: define void @test_vld4_lane_u32(%struct.uint32x2x4_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP11:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
// CHECK:   [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <2 x i32> [[TMP16]], i32 1, i32 4)
// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP17]]
// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* %agg.result to i8*
// CHECK:   [[TMP19:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
// CHECK:   ret void
uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) {
  return vld4_lane_u32(a, b, 1);
}

// CHECK-LABEL: define void @test_vld4_lane_s8(%struct.int8x8x4_t* noalias sret %agg.result, i8* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
// CHECK:   [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], i32 7, i32 1)
// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP8]]
// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* %agg.result to i8*
// CHECK:   [[TMP10:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 32, i32 8, i1 false)
// CHECK:   ret void
int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) {
  return vld4_lane_s8(a, b, 7);
}

// CHECK-LABEL: define void @test_vld4_lane_s16(%struct.int16x4x4_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
// CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
// CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* %agg.result to i8*
// CHECK:   [[TMP19:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
// CHECK:   ret void
int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) {
  return vld4_lane_s16(a, b, 3);
}

// CHECK-LABEL: define void @test_vld4_lane_s32(%struct.int32x2x4_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP11:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
// CHECK:   [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <2 x i32> [[TMP16]], i32 1, i32 4)
// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP17]]
// CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* %agg.result to i8*
// CHECK:   [[TMP19:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
// CHECK:   ret void
int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) {
  return vld4_lane_s32(a, b, 1);
}

// CHECK-LABEL: define void @test_vld4_lane_f16(%struct.float16x4x4_t* noalias sret %agg.result, half* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x half>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP11:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP12:%.*]] = bitcast <4 x half> [[TMP11]] to <8 x i8>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
// CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
// CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* %agg.result to i8*
// CHECK:   [[TMP19:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
// CHECK:   ret void
float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) {
  return vld4_lane_f16(a, b, 3);
}

// CHECK-LABEL: define void @test_vld4_lane_f32(%struct.float32x2x4_t* noalias sret %agg.result, float* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x float>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP11:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP12:%.*]] = bitcast <2 x float> [[TMP11]] to <8 x i8>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x float>
// CHECK:   [[VLD4_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32.p0i8(i8* [[TMP4]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], <2 x float> [[TMP16]], i32 1, i32 4)
// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP17]]
// CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* %agg.result to i8*
// CHECK:   [[TMP19:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
// CHECK:   ret void
float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) {
  return vld4_lane_f32(a, b, 1);
}

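// A final usage sketch for the float d-register form (illustrative only;
// reload_frame1_f32 and the four-channel framing are our own assumptions).
// One interleaved frame of four float samples refreshes lane 1 of four
// deinterleaved channel accumulators.
static inline float32x2x4_t reload_frame1_f32(const float32_t *frame,
                                              float32x2x4_t chans) {
  // frame[0..3] are the four channel samples of one frame.
  return vld4_lane_f32(frame, chans, 1);
}
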
// CHECK-LABEL: define void @test_vld4_lane_p8(%struct.poly8x8x4_t* noalias sret %agg.result, i8* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
// CHECK:   [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], i32 7, i32 1)
// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP8]]
// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* %agg.result to i8*
// CHECK:   [[TMP10:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 32, i32 8, i1 false)
// CHECK:   ret void
poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) {
  return vld4_lane_p8(a, b, 7);
}

// CHECK-LABEL: define void @test_vld4_lane_p16(%struct.poly16x4x4_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
   8458 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
   8459 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
   8460 // CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
   8461 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
   8462 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
   8463 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
   8464 // CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
   8465 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
   8466 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
   8467 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
   8468 // CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
   8469 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
   8470 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
   8471 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
   8472 // CHECK:   [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
   8473 // CHECK:   [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
   8474 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
   8475 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
   8476 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
   8477 // CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
   8478 // CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
   8479 // CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
   8480 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
   8481 // CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* %agg.result to i8*
   8482 // CHECK:   [[TMP19:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
   8483 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
   8484 // CHECK:   ret void
   8485 poly16x4x4_t test_vld4_lane_p16(poly16_t const * a, poly16x4x4_t b) {
   8486   return vld4_lane_p16(a, b, 3);
   8487 }
   8488 
   8489 
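// vmax/vmaxq: each variant is expected to lower to a single call to
// llvm.arm.neon.vmaxs (signed and float) or llvm.arm.neon.vmaxu (unsigned).
// The <8 x i8>/<16 x i8> bitcasts around the wider element types appear to
// be byproducts of the type-generic builtin lowering and carry no semantics.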
   8490 // CHECK-LABEL: define <8 x i8> @test_vmax_s8(<8 x i8> %a, <8 x i8> %b) #0 {
   8491 // CHECK:   [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
   8492 // CHECK:   ret <8 x i8> [[VMAX_V_I]]
   8493 int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) {
   8494   return vmax_s8(a, b);
   8495 }
   8496 
   8497 // CHECK-LABEL: define <4 x i16> @test_vmax_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   8498 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   8499 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   8500 // CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   8501 // CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   8502 // CHECK:   [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> [[VMAX_V_I]], <4 x i16> [[VMAX_V1_I]]) #4
   8503 // CHECK:   [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
   8504 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <4 x i16>
   8505 // CHECK:   ret <4 x i16> [[TMP2]]
   8506 int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
   8507   return vmax_s16(a, b);
   8508 }
   8509 
   8510 // CHECK-LABEL: define <2 x i32> @test_vmax_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   8511 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   8512 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   8513 // CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   8514 // CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   8515 // CHECK:   [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> [[VMAX_V_I]], <2 x i32> [[VMAX_V1_I]]) #4
   8516 // CHECK:   [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
   8517 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <2 x i32>
   8518 // CHECK:   ret <2 x i32> [[TMP2]]
   8519 int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) {
   8520   return vmax_s32(a, b);
   8521 }
   8522 
   8523 // CHECK-LABEL: define <8 x i8> @test_vmax_u8(<8 x i8> %a, <8 x i8> %b) #0 {
   8524 // CHECK:   [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
   8525 // CHECK:   ret <8 x i8> [[VMAX_V_I]]
   8526 uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) {
   8527   return vmax_u8(a, b);
   8528 }
   8529 
   8530 // CHECK-LABEL: define <4 x i16> @test_vmax_u16(<4 x i16> %a, <4 x i16> %b) #0 {
   8531 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   8532 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   8533 // CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   8534 // CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   8535 // CHECK:   [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> [[VMAX_V_I]], <4 x i16> [[VMAX_V1_I]]) #4
   8536 // CHECK:   [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
   8537 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <4 x i16>
   8538 // CHECK:   ret <4 x i16> [[TMP2]]
   8539 uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) {
   8540   return vmax_u16(a, b);
   8541 }
   8542 
   8543 // CHECK-LABEL: define <2 x i32> @test_vmax_u32(<2 x i32> %a, <2 x i32> %b) #0 {
   8544 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   8545 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   8546 // CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   8547 // CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   8548 // CHECK:   [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> [[VMAX_V_I]], <2 x i32> [[VMAX_V1_I]]) #4
   8549 // CHECK:   [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
   8550 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <2 x i32>
   8551 // CHECK:   ret <2 x i32> [[TMP2]]
   8552 uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) {
   8553   return vmax_u32(a, b);
   8554 }
   8555 
   8556 // CHECK-LABEL: define <2 x float> @test_vmax_f32(<2 x float> %a, <2 x float> %b) #0 {
   8557 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   8558 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
   8559 // CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   8560 // CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   8561 // CHECK:   [[VMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> [[VMAX_V_I]], <2 x float> [[VMAX_V1_I]]) #4
   8562 // CHECK:   [[VMAX_V3_I:%.*]] = bitcast <2 x float> [[VMAX_V2_I]] to <8 x i8>
   8563 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <2 x float>
   8564 // CHECK:   ret <2 x float> [[TMP2]]
   8565 float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) {
   8566   return vmax_f32(a, b);
   8567 }
   8568 
   8569 // CHECK-LABEL: define <16 x i8> @test_vmaxq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
   8570 // CHECK:   [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %a, <16 x i8> %b) #4
   8571 // CHECK:   ret <16 x i8> [[VMAXQ_V_I]]
   8572 int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) {
   8573   return vmaxq_s8(a, b);
   8574 }
   8575 
   8576 // CHECK-LABEL: define <8 x i16> @test_vmaxq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   8577 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   8578 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   8579 // CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   8580 // CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   8581 // CHECK:   [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> [[VMAXQ_V_I]], <8 x i16> [[VMAXQ_V1_I]]) #4
   8582 // CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
   8583 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <8 x i16>
   8584 // CHECK:   ret <8 x i16> [[TMP2]]
   8585 int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) {
   8586   return vmaxq_s16(a, b);
   8587 }
   8588 
   8589 // CHECK-LABEL: define <4 x i32> @test_vmaxq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   8590 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   8591 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   8592 // CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   8593 // CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   8594 // CHECK:   [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> [[VMAXQ_V_I]], <4 x i32> [[VMAXQ_V1_I]]) #4
   8595 // CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
   8596 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <4 x i32>
   8597 // CHECK:   ret <4 x i32> [[TMP2]]
   8598 int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) {
   8599   return vmaxq_s32(a, b);
   8600 }
   8601 
   8602 // CHECK-LABEL: define <16 x i8> @test_vmaxq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
   8603 // CHECK:   [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
   8604 // CHECK:   ret <16 x i8> [[VMAXQ_V_I]]
   8605 uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) {
   8606   return vmaxq_u8(a, b);
   8607 }
   8608 
   8609 // CHECK-LABEL: define <8 x i16> @test_vmaxq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
   8610 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   8611 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   8612 // CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   8613 // CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   8614 // CHECK:   [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> [[VMAXQ_V_I]], <8 x i16> [[VMAXQ_V1_I]]) #4
   8615 // CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
   8616 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <8 x i16>
   8617 // CHECK:   ret <8 x i16> [[TMP2]]
   8618 uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) {
   8619   return vmaxq_u16(a, b);
   8620 }
   8621 
   8622 // CHECK-LABEL: define <4 x i32> @test_vmaxq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
   8623 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   8624 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   8625 // CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   8626 // CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   8627 // CHECK:   [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> [[VMAXQ_V_I]], <4 x i32> [[VMAXQ_V1_I]]) #4
   8628 // CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
   8629 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <4 x i32>
   8630 // CHECK:   ret <4 x i32> [[TMP2]]
   8631 uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) {
   8632   return vmaxq_u32(a, b);
   8633 }
   8634 
   8635 // CHECK-LABEL: define <4 x float> @test_vmaxq_f32(<4 x float> %a, <4 x float> %b) #0 {
   8636 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   8637 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
   8638 // CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   8639 // CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   8640 // CHECK:   [[VMAXQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> [[VMAXQ_V_I]], <4 x float> [[VMAXQ_V1_I]]) #4
   8641 // CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXQ_V2_I]] to <16 x i8>
   8642 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <4 x float>
   8643 // CHECK:   ret <4 x float> [[TMP2]]
   8644 float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) {
   8645   return vmaxq_f32(a, b);
   8646 }
   8647 
   8648 
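// vmin/vminq mirror the vmax tests above, selecting llvm.arm.neon.vmins /
// llvm.arm.neon.vminu per element type and signedness.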
   8649 // CHECK-LABEL: define <8 x i8> @test_vmin_s8(<8 x i8> %a, <8 x i8> %b) #0 {
   8650 // CHECK:   [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %a, <8 x i8> %b) #4
   8651 // CHECK:   ret <8 x i8> [[VMIN_V_I]]
   8652 int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) {
   8653   return vmin_s8(a, b);
   8654 }
   8655 
   8656 // CHECK-LABEL: define <4 x i16> @test_vmin_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   8657 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   8658 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   8659 // CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   8660 // CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   8661 // CHECK:   [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> [[VMIN_V_I]], <4 x i16> [[VMIN_V1_I]]) #4
   8662 // CHECK:   [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
   8663 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <4 x i16>
   8664 // CHECK:   ret <4 x i16> [[TMP2]]
   8665 int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) {
   8666   return vmin_s16(a, b);
   8667 }
   8668 
   8669 // CHECK-LABEL: define <2 x i32> @test_vmin_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   8670 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   8671 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   8672 // CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   8673 // CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   8674 // CHECK:   [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> [[VMIN_V_I]], <2 x i32> [[VMIN_V1_I]]) #4
   8675 // CHECK:   [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
   8676 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <2 x i32>
   8677 // CHECK:   ret <2 x i32> [[TMP2]]
   8678 int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) {
   8679   return vmin_s32(a, b);
   8680 }
   8681 
   8682 // CHECK-LABEL: define <8 x i8> @test_vmin_u8(<8 x i8> %a, <8 x i8> %b) #0 {
   8683 // CHECK:   [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
   8684 // CHECK:   ret <8 x i8> [[VMIN_V_I]]
   8685 uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) {
   8686   return vmin_u8(a, b);
   8687 }
   8688 
   8689 // CHECK-LABEL: define <4 x i16> @test_vmin_u16(<4 x i16> %a, <4 x i16> %b) #0 {
   8690 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   8691 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   8692 // CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   8693 // CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   8694 // CHECK:   [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> [[VMIN_V_I]], <4 x i16> [[VMIN_V1_I]]) #4
   8695 // CHECK:   [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
   8696 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <4 x i16>
   8697 // CHECK:   ret <4 x i16> [[TMP2]]
   8698 uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) {
   8699   return vmin_u16(a, b);
   8700 }
   8701 
   8702 // CHECK-LABEL: define <2 x i32> @test_vmin_u32(<2 x i32> %a, <2 x i32> %b) #0 {
   8703 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   8704 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   8705 // CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   8706 // CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   8707 // CHECK:   [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> [[VMIN_V_I]], <2 x i32> [[VMIN_V1_I]]) #4
   8708 // CHECK:   [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
   8709 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <2 x i32>
   8710 // CHECK:   ret <2 x i32> [[TMP2]]
   8711 uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) {
   8712   return vmin_u32(a, b);
   8713 }
   8714 
   8715 // CHECK-LABEL: define <2 x float> @test_vmin_f32(<2 x float> %a, <2 x float> %b) #0 {
   8716 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   8717 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
   8718 // CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   8719 // CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   8720 // CHECK:   [[VMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> [[VMIN_V_I]], <2 x float> [[VMIN_V1_I]]) #4
   8721 // CHECK:   [[VMIN_V3_I:%.*]] = bitcast <2 x float> [[VMIN_V2_I]] to <8 x i8>
   8722 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <2 x float>
   8723 // CHECK:   ret <2 x float> [[TMP2]]
   8724 float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) {
   8725   return vmin_f32(a, b);
   8726 }
   8727 
   8728 // CHECK-LABEL: define <16 x i8> @test_vminq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
   8729 // CHECK:   [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %a, <16 x i8> %b) #4
   8730 // CHECK:   ret <16 x i8> [[VMINQ_V_I]]
   8731 int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) {
   8732   return vminq_s8(a, b);
   8733 }
   8734 
   8735 // CHECK-LABEL: define <8 x i16> @test_vminq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   8736 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   8737 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   8738 // CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   8739 // CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   8740 // CHECK:   [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> [[VMINQ_V_I]], <8 x i16> [[VMINQ_V1_I]]) #4
   8741 // CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
   8742 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <8 x i16>
   8743 // CHECK:   ret <8 x i16> [[TMP2]]
   8744 int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) {
   8745   return vminq_s16(a, b);
   8746 }
   8747 
   8748 // CHECK-LABEL: define <4 x i32> @test_vminq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   8749 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   8750 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   8751 // CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   8752 // CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   8753 // CHECK:   [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> [[VMINQ_V_I]], <4 x i32> [[VMINQ_V1_I]]) #4
   8754 // CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
   8755 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <4 x i32>
   8756 // CHECK:   ret <4 x i32> [[TMP2]]
   8757 int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) {
   8758   return vminq_s32(a, b);
   8759 }
   8760 
   8761 // CHECK-LABEL: define <16 x i8> @test_vminq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
   8762 // CHECK:   [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
   8763 // CHECK:   ret <16 x i8> [[VMINQ_V_I]]
   8764 uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) {
   8765   return vminq_u8(a, b);
   8766 }
   8767 
   8768 // CHECK-LABEL: define <8 x i16> @test_vminq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
   8769 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   8770 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   8771 // CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   8772 // CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   8773 // CHECK:   [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> [[VMINQ_V_I]], <8 x i16> [[VMINQ_V1_I]]) #4
   8774 // CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
   8775 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <8 x i16>
   8776 // CHECK:   ret <8 x i16> [[TMP2]]
   8777 uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) {
   8778   return vminq_u16(a, b);
   8779 }
   8780 
   8781 // CHECK-LABEL: define <4 x i32> @test_vminq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
   8782 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   8783 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   8784 // CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   8785 // CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   8786 // CHECK:   [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> [[VMINQ_V_I]], <4 x i32> [[VMINQ_V1_I]]) #4
   8787 // CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
   8788 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <4 x i32>
   8789 // CHECK:   ret <4 x i32> [[TMP2]]
   8790 uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) {
   8791   return vminq_u32(a, b);
   8792 }
   8793 
   8794 // CHECK-LABEL: define <4 x float> @test_vminq_f32(<4 x float> %a, <4 x float> %b) #0 {
   8795 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   8796 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
   8797 // CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   8798 // CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   8799 // CHECK:   [[VMINQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> [[VMINQ_V_I]], <4 x float> [[VMINQ_V1_I]]) #4
   8800 // CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <4 x float> [[VMINQ_V2_I]] to <16 x i8>
   8801 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <4 x float>
   8802 // CHECK:   ret <4 x float> [[TMP2]]
   8803 float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) {
   8804   return vminq_f32(a, b);
   8805 }
   8806 
   8807 
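// vmla/vmlaq use no target intrinsic: the multiply-accumulate is plain IR,
// mul followed by add (fmul/fadd for float), i.e. vmla(a, b, c) == a + b*c
// per lane; forming an actual VMLA is left to instruction selection.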
   8808 // CHECK-LABEL: define <8 x i8> @test_vmla_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
   8809 // CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
   8810 // CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]]
   8811 // CHECK:   ret <8 x i8> [[ADD_I]]
   8812 int8x8_t test_vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
   8813   return vmla_s8(a, b, c);
   8814 }
   8815 
   8816 // CHECK-LABEL: define <4 x i16> @test_vmla_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
   8817 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
   8818 // CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
   8819 // CHECK:   ret <4 x i16> [[ADD_I]]
   8820 int16x4_t test_vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   8821   return vmla_s16(a, b, c);
   8822 }
   8823 
   8824 // CHECK-LABEL: define <2 x i32> @test_vmla_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
   8825 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
   8826 // CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
   8827 // CHECK:   ret <2 x i32> [[ADD_I]]
   8828 int32x2_t test_vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   8829   return vmla_s32(a, b, c);
   8830 }
   8831 
   8832 // CHECK-LABEL: define <2 x float> @test_vmla_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
   8833 // CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, %c
   8834 // CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
   8835 // CHECK:   ret <2 x float> [[ADD_I]]
   8836 float32x2_t test_vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   8837   return vmla_f32(a, b, c);
   8838 }
   8839 
   8840 // CHECK-LABEL: define <8 x i8> @test_vmla_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
   8841 // CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
   8842 // CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]]
   8843 // CHECK:   ret <8 x i8> [[ADD_I]]
   8844 uint8x8_t test_vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
   8845   return vmla_u8(a, b, c);
   8846 }
   8847 
   8848 // CHECK-LABEL: define <4 x i16> @test_vmla_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
   8849 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
   8850 // CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
   8851 // CHECK:   ret <4 x i16> [[ADD_I]]
   8852 uint16x4_t test_vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   8853   return vmla_u16(a, b, c);
   8854 }
   8855 
   8856 // CHECK-LABEL: define <2 x i32> @test_vmla_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
   8857 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
   8858 // CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
   8859 // CHECK:   ret <2 x i32> [[ADD_I]]
   8860 uint32x2_t test_vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   8861   return vmla_u32(a, b, c);
   8862 }
   8863 
   8864 // CHECK-LABEL: define <16 x i8> @test_vmlaq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
   8865 // CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
   8866 // CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]]
   8867 // CHECK:   ret <16 x i8> [[ADD_I]]
   8868 int8x16_t test_vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
   8869   return vmlaq_s8(a, b, c);
   8870 }
   8871 
   8872 // CHECK-LABEL: define <8 x i16> @test_vmlaq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
   8873 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
   8874 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
   8875 // CHECK:   ret <8 x i16> [[ADD_I]]
   8876 int16x8_t test_vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
   8877   return vmlaq_s16(a, b, c);
   8878 }
   8879 
   8880 // CHECK-LABEL: define <4 x i32> @test_vmlaq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
   8881 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
   8882 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
   8883 // CHECK:   ret <4 x i32> [[ADD_I]]
   8884 int32x4_t test_vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
   8885   return vmlaq_s32(a, b, c);
   8886 }
   8887 
   8888 // CHECK-LABEL: define <4 x float> @test_vmlaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
   8889 // CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, %c
   8890 // CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
   8891 // CHECK:   ret <4 x float> [[ADD_I]]
   8892 float32x4_t test_vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
   8893   return vmlaq_f32(a, b, c);
   8894 }
   8895 
   8896 // CHECK-LABEL: define <16 x i8> @test_vmlaq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
   8897 // CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
   8898 // CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]]
   8899 // CHECK:   ret <16 x i8> [[ADD_I]]
   8900 uint8x16_t test_vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
   8901   return vmlaq_u8(a, b, c);
   8902 }
   8903 
   8904 // CHECK-LABEL: define <8 x i16> @test_vmlaq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
   8905 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
   8906 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
   8907 // CHECK:   ret <8 x i16> [[ADD_I]]
   8908 uint16x8_t test_vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
   8909   return vmlaq_u16(a, b, c);
   8910 }
   8911 
   8912 // CHECK-LABEL: define <4 x i32> @test_vmlaq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
   8913 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
   8914 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
   8915 // CHECK:   ret <4 x i32> [[ADD_I]]
   8916 uint32x4_t test_vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
   8917   return vmlaq_u32(a, b, c);
   8918 }
   8919 
   8920 
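// vmlal: widening multiply-accumulate, checked as a call to
// llvm.arm.neon.vmulls/vmullu followed by an ordinary add of the wide
// accumulator.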
   8921 // CHECK-LABEL: define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
   8922 // CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) #4
   8923 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
   8924 // CHECK:   ret <8 x i16> [[ADD_I]]
   8925 int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
   8926   return vmlal_s8(a, b, c);
   8927 }
   8928 
   8929 // CHECK-LABEL: define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
   8930 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   8931 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
   8932 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   8933 // CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   8934 // CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
   8935 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
   8936 // CHECK:   ret <4 x i32> [[ADD_I]]
   8937 int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   8938   return vmlal_s16(a, b, c);
   8939 }
   8940 
   8941 // CHECK-LABEL: define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
   8942 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   8943 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
   8944 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   8945 // CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   8946 // CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
   8947 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
   8948 // CHECK:   ret <2 x i64> [[ADD_I]]
   8949 int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   8950   return vmlal_s32(a, b, c);
   8951 }
   8952 
   8953 // CHECK-LABEL: define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
   8954 // CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) #4
   8955 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
   8956 // CHECK:   ret <8 x i16> [[ADD_I]]
   8957 uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
   8958   return vmlal_u8(a, b, c);
   8959 }
   8960 
   8961 // CHECK-LABEL: define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
   8962 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   8963 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
   8964 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   8965 // CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   8966 // CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
   8967 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
   8968 // CHECK:   ret <4 x i32> [[ADD_I]]
   8969 uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   8970   return vmlal_u16(a, b, c);
   8971 }
   8972 
   8973 // CHECK-LABEL: define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
   8974 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   8975 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
   8976 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   8977 // CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   8978 // CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
   8979 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
   8980 // CHECK:   ret <2 x i64> [[ADD_I]]
   8981 uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   8982   return vmlal_u32(a, b, c);
   8983 }
   8984 
   8985 
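// The lane variants first splat the requested lane of %c with a
// shufflevector (lane 3 for 16-bit, lane 1 for 32-bit elements) and then
// feed the same vmull+add pattern.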
   8986 // CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
   8987 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   8988 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   8989 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   8990 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   8991 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   8992 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
   8993 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
   8994 // CHECK:   ret <4 x i32> [[ADD]]
   8995 int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   8996   return vmlal_lane_s16(a, b, c, 3);
   8997 }
   8998 
   8999 // CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
   9000 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
   9001 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   9002 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   9003 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   9004 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   9005 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
   9006 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
   9007 // CHECK:   ret <2 x i64> [[ADD]]
   9008 int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   9009   return vmlal_lane_s32(a, b, c, 1);
   9010 }
   9011 
   9012 // CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
   9013 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   9014 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   9015 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   9016 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   9017 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   9018 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
   9019 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
   9020 // CHECK:   ret <4 x i32> [[ADD]]
   9021 uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   9022   return vmlal_lane_u16(a, b, c, 3);
   9023 }
   9024 
   9025 // CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
   9026 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
   9027 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   9028 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   9029 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   9030 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   9031 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
   9032 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
   9033 // CHECK:   ret <2 x i64> [[ADD]]
   9034 uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   9035   return vmlal_lane_u32(a, b, c, 1);
   9036 }
   9037 
   9038 
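// The _n variants broadcast the scalar %c with an insertelement chain
// instead of a shuffle before the same vmull+add sequence.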
   9039 // CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 {
   9040 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   9041 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
   9042 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
   9043 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
   9044 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   9045 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
   9046 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   9047 // CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   9048 // CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
   9049 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
   9050 // CHECK:   ret <4 x i32> [[ADD_I]]
   9051 int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   9052   return vmlal_n_s16(a, b, c);
   9053 }
   9054 
   9055 // CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
   9056 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   9057 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
   9058 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   9059 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
   9060 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   9061 // CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   9062 // CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
   9063 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
   9064 // CHECK:   ret <2 x i64> [[ADD_I]]
   9065 int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   9066   return vmlal_n_s32(a, b, c);
   9067 }
   9068 
   9069 // CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(<4 x i32> %a, <4 x i16> %b, i16 zeroext %c) #0 {
   9070 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   9071 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
   9072 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
   9073 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
   9074 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   9075 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
   9076 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   9077 // CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   9078 // CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
   9079 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
   9080 // CHECK:   ret <4 x i32> [[ADD_I]]
   9081 uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
   9082   return vmlal_n_u16(a, b, c);
   9083 }
   9084 
   9085 // CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
   9086 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   9087 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
   9088 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   9089 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
   9090 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   9091 // CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   9092 // CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
   9093 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
   9094 // CHECK:   ret <2 x i64> [[ADD_I]]
   9095 uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
   9096   return vmlal_n_u32(a, b, c);
   9097 }
   9098 
   9099 
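// vmla_lane/vmlaq_lane: shufflevector lane splat of %c feeding the plain
// mul/add (fmul/fadd) form of vmla above.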
   9100 // CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
   9101 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   9102 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
   9103 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
   9104 // CHECK:   ret <4 x i16> [[ADD]]
   9105 int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   9106   return vmla_lane_s16(a, b, c, 3);
   9107 }
   9108 
   9109 // CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
   9110 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
   9111 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
   9112 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
   9113 // CHECK:   ret <2 x i32> [[ADD]]
   9114 int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   9115   return vmla_lane_s32(a, b, c, 1);
   9116 }
   9117 
   9118 // CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
   9119 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   9120 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
   9121 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
   9122 // CHECK:   ret <4 x i16> [[ADD]]
   9123 uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   9124   return vmla_lane_u16(a, b, c, 3);
   9125 }
   9126 
   9127 // CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
   9128 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
   9129 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
   9130 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
   9131 // CHECK:   ret <2 x i32> [[ADD]]
   9132 uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   9133   return vmla_lane_u32(a, b, c, 1);
   9134 }
   9135 
   9136 // CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
   9137 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1>
   9138 // CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
   9139 // CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
   9140 // CHECK:   ret <2 x float> [[ADD]]
   9141 float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   9142   return vmla_lane_f32(a, b, c, 1);
   9143 }
   9144 
   9145 // CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
   9146 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   9147 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
   9148 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
   9149 // CHECK:   ret <8 x i16> [[ADD]]
   9150 int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
   9151   return vmlaq_lane_s16(a, b, c, 3);
   9152 }
   9153 
   9154 // CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
   9155 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   9156 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
   9157 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
   9158 // CHECK:   ret <4 x i32> [[ADD]]
   9159 int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
   9160   return vmlaq_lane_s32(a, b, c, 1);
   9161 }
   9162 
   9163 // CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
   9164 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   9165 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
   9166 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
   9167 // CHECK:   ret <8 x i16> [[ADD]]
   9168 uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
   9169   return vmlaq_lane_u16(a, b, c, 3);
   9170 }
   9171 
   9172 // CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
   9173 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   9174 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
   9175 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
   9176 // CHECK:   ret <4 x i32> [[ADD]]
   9177 uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
   9178   return vmlaq_lane_u32(a, b, c, 1);
   9179 }
   9180 
   9181 // CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %c) #0 {
   9182 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   9183 // CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
   9184 // CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
   9185 // CHECK:   ret <4 x float> [[ADD]]
   9186 float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
   9187   return vmlaq_lane_f32(a, b, c, 1);
   9188 }
   9189 
   9190 
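// vmla_n/vmlaq_n: insertelement broadcast of the scalar operand, then
// mul/add (fmul/fadd) as in vmla.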
   9191 // CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(<4 x i16> %a, <4 x i16> %b, i16 signext %c) #0 {
   9192 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   9193 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
   9194 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
   9195 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
   9196 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
   9197 // CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
   9198 // CHECK:   ret <4 x i16> [[ADD_I]]
   9199 int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
   9200   return vmla_n_s16(a, b, c);
   9201 }
   9202 
   9203 // CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
   9204 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   9205 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
   9206 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
   9207 // CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
   9208 // CHECK:   ret <2 x i32> [[ADD_I]]
   9209 int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
   9210   return vmla_n_s32(a, b, c);
   9211 }
   9212 
   9213 // CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(<4 x i16> %a, <4 x i16> %b, i16 zeroext %c) #0 {
   9214 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   9215 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
   9216 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
   9217 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
   9218 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
   9219 // CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
   9220 // CHECK:   ret <4 x i16> [[ADD_I]]
   9221 uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
   9222   return vmla_n_u16(a, b, c);
   9223 }
   9224 
   9225 // CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
   9226 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   9227 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
   9228 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
   9229 // CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
   9230 // CHECK:   ret <2 x i32> [[ADD_I]]
   9231 uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
   9232   return vmla_n_u32(a, b, c);
   9233 }
   9234 
   9235 // CHECK-LABEL: define <2 x float> @test_vmla_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 {
   9236 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
   9237 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
   9238 // CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
   9239 // CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
   9240 // CHECK:   ret <2 x float> [[ADD_I]]
   9241 float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
   9242   return vmla_n_f32(a, b, c);
   9243 }
   9244 
   9245 // CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) #0 {
   9246 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
   9247 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
   9248 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
   9249 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
   9250 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
  return vmlaq_n_s16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
  return vmlaq_n_s32(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
  return vmlaq_n_u16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
  return vmlaq_n_u32(a, b, c);
}

// CHECK-LABEL: define <4 x float> @test_vmlaq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
// CHECK:   ret <4 x float> [[ADD_I]]
float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
  return vmlaq_n_f32(a, b, c);
}

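// vmls_<t>/vmlsq_<t>(a, b, c) are lane-wise multiply-subtract operations.
// The tests below check that they lower to a plain mul/sub pair (fmul/fsub
// for float) with no intrinsic call; semantically, vmls_s8 is roughly:
//   for (int i = 0; i < 8; ++i) r[i] = a[i] - b[i] * c[i];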
// CHECK-LABEL: define <8 x i8> @test_vmls_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
// CHECK:   ret <8 x i8> [[SUB_I]]
int8x8_t test_vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
  return vmls_s8(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vmls_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
// CHECK:   ret <4 x i16> [[SUB_I]]
int16x4_t test_vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  return vmls_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
// CHECK:   ret <2 x i32> [[SUB_I]]
int32x2_t test_vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  return vmls_s32(a, b, c);
}

// CHECK-LABEL: define <2 x float> @test_vmls_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, %c
// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
// CHECK:   ret <2 x float> [[SUB_I]]
float32x2_t test_vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vmls_f32(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vmls_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
// CHECK:   ret <8 x i8> [[SUB_I]]
uint8x8_t test_vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vmls_u8(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vmls_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
// CHECK:   ret <4 x i16> [[SUB_I]]
uint16x4_t test_vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmls_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
// CHECK:   ret <2 x i32> [[SUB_I]]
uint32x2_t test_vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmls_u32(a, b, c);
}

// CHECK-LABEL: define <16 x i8> @test_vmlsq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
// CHECK:   ret <16 x i8> [[SUB_I]]
int8x16_t test_vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
  return vmlsq_s8(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
int16x8_t test_vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
  return vmlsq_s16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
  return vmlsq_s32(a, b, c);
}

// CHECK-LABEL: define <4 x float> @test_vmlsq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, %c
// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
// CHECK:   ret <4 x float> [[SUB_I]]
float32x4_t test_vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vmlsq_f32(a, b, c);
}

// CHECK-LABEL: define <16 x i8> @test_vmlsq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
// CHECK:   ret <16 x i8> [[SUB_I]]
uint8x16_t test_vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
  return vmlsq_u8(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
uint16x8_t test_vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
  return vmlsq_u16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
uint32x4_t test_vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
  return vmlsq_u32(a, b, c);
}

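// vmlsl_<t>(a, b, c) is the widening variant: b and c are multiplied at
// double width through the @llvm.arm.neon.vmulls (signed) or vmullu
// (unsigned) intrinsic, and the product is subtracted from the wide
// accumulator a.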
// CHECK-LABEL: define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
  return vmlsl_s8(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vmlsl_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vmlsl_s32(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
  return vmlsl_u8(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmlsl_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlsl_u32(a, b, c);
}

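// The _lane forms first broadcast the selected lane of c with a
// shufflevector, then feed the splat into the same widening
// multiply-subtract pattern checked above.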
// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
// CHECK:   ret <4 x i32> [[SUB]]
int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vmlsl_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
// CHECK:   ret <2 x i64> [[SUB]]
int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vmlsl_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
// CHECK:   ret <4 x i32> [[SUB]]
uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmlsl_lane_u16(a, b, c, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
// CHECK:   ret <2 x i64> [[SUB]]
uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlsl_lane_u32(a, b, c, 1);
}

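// The _n forms splat the scalar c across a vector with a chain of
// insertelement instructions before the widening multiply-subtract.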
// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vmlsl_n_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vmlsl_n_s32(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(<4 x i32> %a, <4 x i16> %b, i16 zeroext %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
  return vmlsl_n_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
  return vmlsl_n_u32(a, b, c);
}

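// Non-widening _lane forms: broadcast the selected lane of c, then
// mul/sub (fmul/fsub for float) at the original element width.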
// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
// CHECK:   ret <4 x i16> [[SUB]]
int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  return vmls_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
// CHECK:   ret <2 x i32> [[SUB]]
int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  return vmls_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
// CHECK:   ret <4 x i16> [[SUB]]
uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmls_lane_u16(a, b, c, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
// CHECK:   ret <2 x i32> [[SUB]]
uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmls_lane_u32(a, b, c, 1);
}

// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
// CHECK:   ret <2 x float> [[SUB]]
float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vmls_lane_f32(a, b, c, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
// CHECK:   ret <8 x i16> [[SUB]]
int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
  return vmlsq_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
// CHECK:   ret <4 x i32> [[SUB]]
int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
  return vmlsq_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
// CHECK:   ret <8 x i16> [[SUB]]
uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
  return vmlsq_lane_u16(a, b, c, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
// CHECK:   ret <4 x i32> [[SUB]]
uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
  return vmlsq_lane_u32(a, b, c, 1);
}

// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
// CHECK:   ret <4 x float> [[SUB]]
float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
  return vmlsq_lane_f32(a, b, c, 1);
}

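// Non-widening _n forms: splat the scalar c via insertelement, multiply
// by b, and subtract the product from a at the original element width.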
// CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(<4 x i16> %a, <4 x i16> %b, i16 signext %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
// CHECK:   ret <4 x i16> [[SUB_I]]
int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
  return vmls_n_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
// CHECK:   ret <2 x i32> [[SUB_I]]
int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
  return vmls_n_s32(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(<4 x i16> %a, <4 x i16> %b, i16 zeroext %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
// CHECK:   ret <4 x i16> [[SUB_I]]
uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
  return vmls_n_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
// CHECK:   ret <2 x i32> [[SUB_I]]
uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
  return vmls_n_u32(a, b, c);
}

// CHECK-LABEL: define <2 x float> @test_vmls_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
// CHECK:   ret <2 x float> [[SUB_I]]
float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
  return vmls_n_f32(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
  return vmlsq_n_s16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
  return vmlsq_n_s32(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
  return vmlsq_n_u16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
  return vmlsq_n_u32(a, b, c);
}

// CHECK-LABEL: define <4 x float> @test_vmlsq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
// CHECK:   ret <4 x float> [[SUB_I]]
float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
  return vmlsq_n_f32(a, b, c);
}

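// vmovl_<t> widens every lane to double width; the signed variants lower
// to sext and the unsigned variants to zext, with no intrinsic call.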
// CHECK-LABEL: define <8 x i16> @test_vmovl_s8(<8 x i8> %a) #0 {
// CHECK:   [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[VMOVL_I]]
int16x8_t test_vmovl_s8(int8x8_t a) {
  return vmovl_s8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vmovl_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VMOVL_I]]
int32x4_t test_vmovl_s16(int16x4_t a) {
  return vmovl_s16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vmovl_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   ret <2 x i64> [[VMOVL_I]]
int64x2_t test_vmovl_s32(int32x2_t a) {
  return vmovl_s32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vmovl_u8(<8 x i8> %a) #0 {
// CHECK:   [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[VMOVL_I]]
uint16x8_t test_vmovl_u8(uint8x8_t a) {
  return vmovl_u8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vmovl_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VMOVL_I]]
uint32x4_t test_vmovl_u16(uint16x4_t a) {
  return vmovl_u16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vmovl_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   ret <2 x i64> [[VMOVL_I]]
uint64x2_t test_vmovl_u32(uint32x2_t a) {
  return vmovl_u32(a);
}

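// vmovn_<t> narrows every lane to half width; both signed and unsigned
// variants lower to a plain trunc.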
// CHECK-LABEL: define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VMOVN_I]]
int8x8_t test_vmovn_s16(int16x8_t a) {
  return vmovn_s16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VMOVN_I]]
int16x4_t test_vmovn_s32(int32x4_t a) {
  return vmovn_s32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VMOVN_I]]
int32x2_t test_vmovn_s64(int64x2_t a) {
  return vmovn_s64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vmovn_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VMOVN_I]]
uint8x8_t test_vmovn_u16(uint16x8_t a) {
  return vmovn_u16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vmovn_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VMOVN_I]]
uint16x4_t test_vmovn_u32(uint32x4_t a) {
  return vmovn_u32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vmovn_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VMOVN_I]]
uint32x2_t test_vmovn_u64(uint64x2_t a) {
  return vmovn_u64(a);
}

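// vmov_n_<t>/vmovq_n_<t> duplicate a scalar into every lane, lowered as a
// chain of insertelement instructions starting from undef. Note the f16
// tests take the scalar through a pointer and load it first, and the
// 64-bit D-register tests run the splat through vadd so the single-lane
// result is actually consumed.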
// CHECK-LABEL: define <8 x i8> @test_vmov_n_u8(i8 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VECINIT7_I]]
uint8x8_t test_vmov_n_u8(uint8_t a) {
  return vmov_n_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vmov_n_u16(i16 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VECINIT3_I]]
uint16x4_t test_vmov_n_u16(uint16_t a) {
  return vmov_n_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vmov_n_u32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   ret <2 x i32> [[VECINIT1_I]]
uint32x2_t test_vmov_n_u32(uint32_t a) {
  return vmov_n_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vmov_n_s8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VECINIT7_I]]
int8x8_t test_vmov_n_s8(int8_t a) {
  return vmov_n_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vmov_n_s16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VECINIT3_I]]
int16x4_t test_vmov_n_s16(int16_t a) {
  return vmov_n_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vmov_n_s32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   ret <2 x i32> [[VECINIT1_I]]
int32x2_t test_vmov_n_s32(int32_t a) {
  return vmov_n_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vmov_n_p8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VECINIT7_I]]
poly8x8_t test_vmov_n_p8(poly8_t a) {
  return vmov_n_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vmov_n_p16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VECINIT3_I]]
poly16x4_t test_vmov_n_p16(poly16_t a) {
  return vmov_n_p16(a);
}

// CHECK-LABEL: define <4 x half> @test_vmov_n_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
// CHECK:   [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
// CHECK:   [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
// CHECK:   [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
// CHECK:   [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
// CHECK:   ret <4 x half> [[VECINIT3]]
float16x4_t test_vmov_n_f16(float16_t *a) {
  return vmov_n_f16(*a);
}

// CHECK-LABEL: define <2 x float> @test_vmov_n_f32(float %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
// CHECK:   ret <2 x float> [[VECINIT1_I]]
float32x2_t test_vmov_n_f32(float32_t a) {
  return vmov_n_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vmovq_n_u8(i8 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VECINIT15_I]]
uint8x16_t test_vmovq_n_u8(uint8_t a) {
  return vmovq_n_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vmovq_n_u16(i16 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VECINIT7_I]]
uint16x8_t test_vmovq_n_u16(uint16_t a) {
  return vmovq_n_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vmovq_n_u32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
// CHECK:   ret <4 x i32> [[VECINIT3_I]]
uint32x4_t test_vmovq_n_u32(uint32_t a) {
  return vmovq_n_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vmovq_n_s8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VECINIT15_I]]
int8x16_t test_vmovq_n_s8(int8_t a) {
  return vmovq_n_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vmovq_n_s16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VECINIT7_I]]
int16x8_t test_vmovq_n_s16(int16_t a) {
  return vmovq_n_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vmovq_n_s32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
// CHECK:   ret <4 x i32> [[VECINIT3_I]]
int32x4_t test_vmovq_n_s32(int32_t a) {
  return vmovq_n_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vmovq_n_p8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VECINIT15_I]]
poly8x16_t test_vmovq_n_p8(poly8_t a) {
  return vmovq_n_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vmovq_n_p16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VECINIT7_I]]
poly16x8_t test_vmovq_n_p16(poly16_t a) {
  return vmovq_n_p16(a);
}

// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
// CHECK:   [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
// CHECK:   [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
// CHECK:   [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
// CHECK:   [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
// CHECK:   [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
// CHECK:   [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
// CHECK:   [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
// CHECK:   [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
// CHECK:   ret <8 x half> [[VECINIT7]]
float16x8_t test_vmovq_n_f16(float16_t *a) {
  return vmovq_n_f16(*a);
}

// CHECK-LABEL: define <4 x float> @test_vmovq_n_f32(float %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
// CHECK:   ret <4 x float> [[VECINIT3_I]]
float32x4_t test_vmovq_n_f32(float32_t a) {
  return vmovq_n_f32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vmov_n_s64(i64 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
// CHECK:   ret <1 x i64> [[ADD_I]]
int64x1_t test_vmov_n_s64(int64_t a) {
  int64x1_t tmp = vmov_n_s64(a);
  return vadd_s64(tmp, tmp);
}

// CHECK-LABEL: define <1 x i64> @test_vmov_n_u64(i64 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
// CHECK:   ret <1 x i64> [[ADD_I]]
uint64x1_t test_vmov_n_u64(uint64_t a) {
  uint64x1_t tmp = vmov_n_u64(a);
  return vadd_u64(tmp, tmp);
}

// CHECK-LABEL: define <2 x i64> @test_vmovq_n_s64(i64 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
// CHECK:   ret <2 x i64> [[VECINIT1_I]]
int64x2_t test_vmovq_n_s64(int64_t a) {
  return vmovq_n_s64(a);
}

// CHECK-LABEL: define <2 x i64> @test_vmovq_n_u64(i64 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
// CHECK:   ret <2 x i64> [[VECINIT1_I]]
uint64x2_t test_vmovq_n_u64(uint64_t a) {
  return vmovq_n_u64(a);
}

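// vmul_<t>/vmulq_<t> are lane-wise multiplies, lowered to a single mul
// (fmul for float) instruction.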
// CHECK-LABEL: define <8 x i8> @test_vmul_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[MUL_I]]
int8x8_t test_vmul_s8(int8x8_t a, int8x8_t b) {
  return vmul_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vmul_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[MUL_I]]
int16x4_t test_vmul_s16(int16x4_t a, int16x4_t b) {
  return vmul_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[MUL_I]]
int32x2_t test_vmul_s32(int32x2_t a, int32x2_t b) {
  return vmul_s32(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vmul_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, %b
// CHECK:   ret <2 x float> [[MUL_I]]
float32x2_t test_vmul_f32(float32x2_t a, float32x2_t b) {
  return vmul_f32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vmul_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[MUL_I]]
uint8x8_t test_vmul_u8(uint8x8_t a, uint8x8_t b) {
  return vmul_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vmul_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[MUL_I]]
uint16x4_t test_vmul_u16(uint16x4_t a, uint16x4_t b) {
  return vmul_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[MUL_I]]
uint32x2_t test_vmul_u32(uint32x2_t a, uint32x2_t b) {
  return vmul_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vmulq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[MUL_I]]
int8x16_t test_vmulq_s8(int8x16_t a, int8x16_t b) {
  return vmulq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[MUL_I]]
int16x8_t test_vmulq_s16(int16x8_t a, int16x8_t b) {
  return vmulq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[MUL_I]]
int32x4_t test_vmulq_s32(int32x4_t a, int32x4_t b) {
  return vmulq_s32(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, %b
// CHECK:   ret <4 x float> [[MUL_I]]
float32x4_t test_vmulq_f32(float32x4_t a, float32x4_t b) {
  return vmulq_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vmulq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[MUL_I]]
uint8x16_t test_vmulq_u8(uint8x16_t a, uint8x16_t b) {
  return vmulq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[MUL_I]]
uint16x8_t test_vmulq_u16(uint16x8_t a, uint16x8_t b) {
  return vmulq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[MUL_I]]
uint32x4_t test_vmulq_u32(uint32x4_t a, uint32x4_t b) {
  return vmulq_u32(a, b);
}


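// vmull is a widening multiply (result lanes are twice the input width), so
// it cannot be expressed as a plain IR mul; it stays a call to the target
// intrinsics llvm.arm.neon.vmulls (signed), vmullu (unsigned), and vmullp
// (polynomial).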
// CHECK-LABEL: define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i16> [[VMULL_I]]
int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) {
  return vmull_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL2_I]]
int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) {
  return vmull_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL2_I]]
int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) {
  return vmull_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i16> [[VMULL_I]]
uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) {
  return vmull_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL2_I]]
uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) {
  return vmull_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL2_I]]
uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) {
  return vmull_u32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i16> [[VMULL_I]]
poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) {
  return vmull_p8(a, b);
}


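// The _lane variants first broadcast the selected lane of %b with a
// shufflevector (constant mask, every index equal to the lane number), then
// feed the splat into the same widening-multiply intrinsic; e.g.
// vmull_lane_s16(a, b, 3) multiplies every lane of a by b[3].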
// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL2_I]]
int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) {
  return vmull_lane_s16(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL2_I]]
int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) {
  return vmull_lane_s32(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL2_I]]
uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) {
  return vmull_lane_u16(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL2_I]]
uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) {
  return vmull_lane_u32(a, b, 1);
}


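// The _n variants build the splat from a scalar instead: an insertelement
// chain materialises the vector <b, b, ...> before the widening multiply.
// Semantically, vmull_n_s16(a, b) == vmull_s16(a, vmov_n_s16(b)).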
// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 signext %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL5_I]]
int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
  return vmull_n_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(<2 x i32> %a, i32 %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL3_I]]
int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
  return vmull_n_s32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(<4 x i16> %a, i16 zeroext %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL5_I]]
uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
  return vmull_n_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(<2 x i32> %a, i32 %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL3_I]]
uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
  return vmull_n_u32(a, b);
}


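// Polynomial (carry-less, GF(2)) multiplication has no generic IR form, so
// vmul_p8/vmulq_p8 remain calls to llvm.arm.neon.vmulp, unlike the integer
// vmul tests above.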
// CHECK-LABEL: define <8 x i8> @test_vmul_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VMUL_V_I]]
poly8x8_t test_vmul_p8(poly8x8_t a, poly8x8_t b) {
  return vmul_p8(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vmulq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VMULQ_V_I]]
poly8x16_t test_vmulq_p8(poly8x16_t a, poly8x16_t b) {
  return vmulq_p8(a, b);
}


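// Non-widening _lane multiplies: broadcast one lane of %b via shufflevector,
// then an ordinary mul/fmul, e.g. vmul_lane_f32(a, b, 1) == a * {b[1], b[1]}.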
// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
// CHECK:   ret <4 x i16> [[MUL]]
int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) {
  return vmul_lane_s16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
// CHECK:   ret <2 x i32> [[MUL]]
int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) {
  return vmul_lane_s32(a, b, 1);
}

// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
// CHECK:   ret <2 x float> [[MUL]]
float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) {
  return vmul_lane_f32(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
// CHECK:   ret <4 x i16> [[MUL]]
uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) {
  return vmul_lane_u16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
// CHECK:   ret <2 x i32> [[MUL]]
uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) {
  return vmul_lane_u32(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
// CHECK:   ret <8 x i16> [[MUL]]
int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t b) {
  return vmulq_lane_s16(a, b, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
// CHECK:   ret <4 x i32> [[MUL]]
int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) {
  return vmulq_lane_s32(a, b, 1);
}

// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
// CHECK:   ret <4 x float> [[MUL]]
float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) {
  return vmulq_lane_f32(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
// CHECK:   ret <8 x i16> [[MUL]]
uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) {
  return vmulq_lane_u16(a, b, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
// CHECK:   ret <4 x i32> [[MUL]]
uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) {
  return vmulq_lane_u32(a, b, 1);
}


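// Non-widening _n multiplies: splat the scalar with an insertelement chain,
// then an ordinary mul/fmul; vmul_n_s16(a, b) == vmul_s16(a, vmov_n_s16(b)).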
// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(<4 x i16> %a, i16 signext %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
// CHECK:   ret <4 x i16> [[MUL_I]]
int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
  return vmul_n_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(<2 x i32> %a, i32 %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
// CHECK:   ret <2 x i32> [[MUL_I]]
int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
  return vmul_n_s32(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
// CHECK:   ret <2 x float> [[MUL_I]]
float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
  return vmul_n_f32(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(<4 x i16> %a, i16 zeroext %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
// CHECK:   ret <4 x i16> [[MUL_I]]
uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
  return vmul_n_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(<2 x i32> %a, i32 %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
// CHECK:   ret <2 x i32> [[MUL_I]]
uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
  return vmul_n_u32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 signext %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
// CHECK:   ret <8 x i16> [[MUL_I]]
int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
  return vmulq_n_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(<4 x i32> %a, i32 %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
// CHECK:   ret <4 x i32> [[MUL_I]]
int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
  return vmulq_n_s32(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
// CHECK:   ret <4 x float> [[MUL_I]]
float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
  return vmulq_n_f32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16(<8 x i16> %a, i16 zeroext %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
// CHECK:   ret <8 x i16> [[MUL_I]]
uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
  return vmulq_n_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
// CHECK:   ret <4 x i32> [[MUL_I]]
uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
  return vmulq_n_u32(a, b);
}


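// vmvn (bitwise NOT) is emitted as xor with an all-ones vector, since IR has
// no dedicated "not" instruction: ~a == a ^ -1 in every lane.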
// CHECK-LABEL: define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   ret <8 x i8> [[NEG_I]]
int8x8_t test_vmvn_s8(int8x8_t a) {
  return vmvn_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   ret <4 x i16> [[NEG_I]]
int16x4_t test_vmvn_s16(int16x4_t a) {
  return vmvn_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
// CHECK:   ret <2 x i32> [[NEG_I]]
int32x2_t test_vmvn_s32(int32x2_t a) {
  return vmvn_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vmvn_u8(<8 x i8> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   ret <8 x i8> [[NEG_I]]
uint8x8_t test_vmvn_u8(uint8x8_t a) {
  return vmvn_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vmvn_u16(<4 x i16> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   ret <4 x i16> [[NEG_I]]
uint16x4_t test_vmvn_u16(uint16x4_t a) {
  return vmvn_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vmvn_u32(<2 x i32> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
// CHECK:   ret <2 x i32> [[NEG_I]]
uint32x2_t test_vmvn_u32(uint32x2_t a) {
  return vmvn_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vmvn_p8(<8 x i8> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   ret <8 x i8> [[NEG_I]]
poly8x8_t test_vmvn_p8(poly8x8_t a) {
  return vmvn_p8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   ret <16 x i8> [[NEG_I]]
int8x16_t test_vmvnq_s8(int8x16_t a) {
  return vmvnq_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   ret <8 x i16> [[NEG_I]]
int16x8_t test_vmvnq_s16(int16x8_t a) {
  return vmvnq_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK:   ret <4 x i32> [[NEG_I]]
int32x4_t test_vmvnq_s32(int32x4_t a) {
  return vmvnq_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vmvnq_u8(<16 x i8> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   ret <16 x i8> [[NEG_I]]
uint8x16_t test_vmvnq_u8(uint8x16_t a) {
  return vmvnq_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vmvnq_u16(<8 x i16> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   ret <8 x i16> [[NEG_I]]
uint16x8_t test_vmvnq_u16(uint16x8_t a) {
  return vmvnq_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vmvnq_u32(<4 x i32> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK:   ret <4 x i32> [[NEG_I]]
uint32x4_t test_vmvnq_u32(uint32x4_t a) {
  return vmvnq_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vmvnq_p8(<16 x i8> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   ret <16 x i8> [[NEG_I]]
poly8x16_t test_vmvnq_p8(poly8x16_t a) {
  return vmvnq_p8(a);
}


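// Integer vneg is subtraction from zero (0 - a). The float variant instead
// subtracts from -0.0, the canonical fneg idiom in IR: it flips only the
// sign bit, so vneg_f32 applied to 0.0 yields -0.0 as IEEE 754 requires.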
// CHECK-LABEL: define <8 x i8> @test_vneg_s8(<8 x i8> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a
// CHECK:   ret <8 x i8> [[SUB_I]]
int8x8_t test_vneg_s8(int8x8_t a) {
  return vneg_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a
// CHECK:   ret <4 x i16> [[SUB_I]]
int16x4_t test_vneg_s16(int16x4_t a) {
  return vneg_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a
// CHECK:   ret <2 x i32> [[SUB_I]]
int32x2_t test_vneg_s32(int32x2_t a) {
  return vneg_s32(a);
}

// CHECK-LABEL: define <2 x float> @test_vneg_f32(<2 x float> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
// CHECK:   ret <2 x float> [[SUB_I]]
float32x2_t test_vneg_f32(float32x2_t a) {
  return vneg_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a
// CHECK:   ret <16 x i8> [[SUB_I]]
int8x16_t test_vnegq_s8(int8x16_t a) {
  return vnegq_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a
// CHECK:   ret <8 x i16> [[SUB_I]]
int16x8_t test_vnegq_s16(int16x8_t a) {
  return vnegq_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vnegq_s32(int32x4_t a) {
  return vnegq_s32(a);
}

// CHECK-LABEL: define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
// CHECK:   ret <4 x float> [[SUB_I]]
float32x4_t test_vnegq_f32(float32x4_t a) {
  return vnegq_f32(a);
}


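// vorn computes a | ~b: %b is inverted with an xor-with-ones first, then
// or'd in, i.e. vorn_u8(a, b) == vorr_u8(a, vmvn_u8(b)).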
// CHECK-LABEL: define <8 x i8> @test_vorn_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
// CHECK:   ret <8 x i8> [[OR_I]]
int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) {
  return vorn_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vorn_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
// CHECK:   ret <4 x i16> [[OR_I]]
int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) {
  return vorn_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vorn_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
// CHECK:   ret <2 x i32> [[OR_I]]
int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) {
  return vorn_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vorn_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
// CHECK:   ret <1 x i64> [[OR_I]]
int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) {
  return vorn_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vorn_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
// CHECK:   ret <8 x i8> [[OR_I]]
uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) {
  return vorn_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vorn_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
// CHECK:   ret <4 x i16> [[OR_I]]
uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) {
  return vorn_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vorn_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
// CHECK:   ret <2 x i32> [[OR_I]]
uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) {
  return vorn_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vorn_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
// CHECK:   ret <1 x i64> [[OR_I]]
uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) {
  return vorn_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vornq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
// CHECK:   ret <16 x i8> [[OR_I]]
int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) {
  return vornq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vornq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
// CHECK:   ret <8 x i16> [[OR_I]]
int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) {
  return vornq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vornq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
// CHECK:   ret <4 x i32> [[OR_I]]
int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) {
  return vornq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vornq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
// CHECK:   ret <2 x i64> [[OR_I]]
int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) {
  return vornq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vornq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
// CHECK:   ret <16 x i8> [[OR_I]]
uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) {
  return vornq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vornq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
// CHECK:   ret <8 x i16> [[OR_I]]
uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) {
  return vornq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vornq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
// CHECK:   ret <4 x i32> [[OR_I]]
uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) {
  return vornq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vornq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
// CHECK:   ret <2 x i64> [[OR_I]]
uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) {
  return vornq_u64(a, b);
}


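// vorr is a plain bitwise OR and lowers to the generic IR or instruction.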
// CHECK-LABEL: define <8 x i8> @test_vorr_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[OR_I]]
int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) {
  return vorr_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vorr_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[OR_I]]
int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) {
  return vorr_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vorr_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[OR_I]]
int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) {
  return vorr_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vorr_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[OR_I]]
int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) {
  return vorr_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vorr_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[OR_I]]
uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) {
  return vorr_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vorr_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[OR_I]]
uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) {
  return vorr_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vorr_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[OR_I]]
uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) {
  return vorr_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vorr_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[OR_I]]
uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) {
  return vorr_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vorrq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[OR_I]]
int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) {
  return vorrq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vorrq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[OR_I]]
int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) {
  return vorrq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vorrq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[OR_I]]
int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) {
  return vorrq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vorrq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[OR_I]]
int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) {
  return vorrq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vorrq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[OR_I]]
uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) {
  return vorrq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vorrq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[OR_I]]
uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) {
  return vorrq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vorrq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[OR_I]]
uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) {
  return vorrq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vorrq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[OR_I]]
uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) {
  return vorrq_u64(a, b);
}


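// vpadal (pairwise add and accumulate long) widens %b by summing adjacent
// lane pairs, then adds the result into the wider accumulator %a; e.g. for
// int8x8_t b: a[i] += (int16_t)b[2*i] + (int16_t)b[2*i+1]. It stays a call
// to llvm.arm.neon.vpadals/vpadalu.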
// CHECK-LABEL: define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> [[VPADAL_V_I]], <8 x i8> %b) #4
// CHECK:   ret <4 x i16> [[VPADAL_V1_I]]
int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
  return vpadal_s8(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> [[VPADAL_V_I]], <4 x i16> [[VPADAL_V1_I]]) #4
// CHECK:   ret <2 x i32> [[VPADAL_V2_I]]
int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
  return vpadal_s16(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> [[VPADAL_V_I]], <2 x i32> [[VPADAL_V1_I]]) #4
// CHECK:   ret <1 x i64> [[VPADAL_V2_I]]
int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
  return vpadal_s32(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> [[VPADAL_V_I]], <8 x i8> %b) #4
// CHECK:   ret <4 x i16> [[VPADAL_V1_I]]
uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
  return vpadal_u8(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> [[VPADAL_V_I]], <4 x i16> [[VPADAL_V1_I]]) #4
// CHECK:   ret <2 x i32> [[VPADAL_V2_I]]
uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
  return vpadal_u16(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> [[VPADAL_V_I]], <2 x i32> [[VPADAL_V1_I]]) #4
// CHECK:   ret <1 x i64> [[VPADAL_V2_I]]
uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
  return vpadal_u32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> [[VPADALQ_V_I]], <16 x i8> %b) #4
// CHECK:   ret <8 x i16> [[VPADALQ_V1_I]]
int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
  return vpadalq_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> [[VPADALQ_V_I]], <8 x i16> [[VPADALQ_V1_I]]) #4
// CHECK:   ret <4 x i32> [[VPADALQ_V2_I]]
int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
  return vpadalq_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> [[VPADALQ_V_I]], <4 x i32> [[VPADALQ_V1_I]]) #4
// CHECK:   ret <2 x i64> [[VPADALQ_V2_I]]
int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
  return vpadalq_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> [[VPADALQ_V_I]], <16 x i8> %b) #4
// CHECK:   ret <8 x i16> [[VPADALQ_V1_I]]
uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
  return vpadalq_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> [[VPADALQ_V_I]], <8 x i16> [[VPADALQ_V1_I]]) #4
// CHECK:   ret <4 x i32> [[VPADALQ_V2_I]]
uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
  return vpadalq_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> [[VPADALQ_V_I]], <4 x i32> [[VPADALQ_V1_I]]) #4
// CHECK:   ret <2 x i64> [[VPADALQ_V2_I]]
uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
  return vpadalq_u32(a, b);
}


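// vpadd sums adjacent element pairs across the concatenation of the two
// operands, keeping the element width, e.g.
//   vpadd_s32({a0,a1}, {b0,b1}) == {a0+a1, b0+b1}.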
// CHECK-LABEL: define <8 x i8> @test_vpadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VPADD_V_I]]
int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
  return vpadd_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vpadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4
// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
  return vpadd_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4
// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
  return vpadd_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vpadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VPADD_V_I]]
uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
  return vpadd_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vpadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4
// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
  return vpadd_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4
// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
  return vpadd_u32(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vpadd_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> [[VPADD_V_I]], <2 x float> [[VPADD_V1_I]]) #4
// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x float>
// CHECK:   ret <2 x float> [[TMP2]]
float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
  return vpadd_f32(a, b);
}


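// vpaddl, by contrast, pairs adjacent lanes of a single vector and widens,
// halving the lane count (e.g. 8 x i8 becomes 4 x i16), so the sum cannot
// wrap. Illustrative sketch only (hypothetical static helper, unreferenced so
// it should emit no IR):
static int16x4_t pairwise_widening_sum_example(void) {
  int8x8_t x = vdup_n_s8(100);
  // 100 + 100 = 200 does not fit in i8 but fits in i16: each lane is 200.
  return vpaddl_s8(x);
}
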
// CHECK-LABEL: define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 {
// CHECK:   [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a) #4
// CHECK:   ret <4 x i16> [[VPADDL_I]]
int16x4_t test_vpaddl_s8(int8x8_t a) {
  return vpaddl_s8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #4
// CHECK:   ret <2 x i32> [[VPADDL1_I]]
int32x2_t test_vpaddl_s16(int16x4_t a) {
  return vpaddl_s16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #4
// CHECK:   ret <1 x i64> [[VPADDL1_I]]
int64x1_t test_vpaddl_s32(int32x2_t a) {
  return vpaddl_s32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vpaddl_u8(<8 x i8> %a) #0 {
// CHECK:   [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %a) #4
// CHECK:   ret <4 x i16> [[VPADDL_I]]
uint16x4_t test_vpaddl_u8(uint8x8_t a) {
  return vpaddl_u8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #4
// CHECK:   ret <2 x i32> [[VPADDL1_I]]
uint32x2_t test_vpaddl_u16(uint16x4_t a) {
  return vpaddl_u16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #4
// CHECK:   ret <1 x i64> [[VPADDL1_I]]
uint64x1_t test_vpaddl_u32(uint32x2_t a) {
  return vpaddl_u32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 {
// CHECK:   [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %a) #4
// CHECK:   ret <8 x i16> [[VPADDL_I]]
int16x8_t test_vpaddlq_s8(int8x16_t a) {
  return vpaddlq_s8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #4
// CHECK:   ret <4 x i32> [[VPADDL1_I]]
int32x4_t test_vpaddlq_s16(int16x8_t a) {
  return vpaddlq_s16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #4
// CHECK:   ret <2 x i64> [[VPADDL1_I]]
int64x2_t test_vpaddlq_s32(int32x4_t a) {
  return vpaddlq_s32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 {
// CHECK:   [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %a) #4
// CHECK:   ret <8 x i16> [[VPADDL_I]]
uint16x8_t test_vpaddlq_u8(uint8x16_t a) {
  return vpaddlq_u8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #4
// CHECK:   ret <4 x i32> [[VPADDL1_I]]
uint32x4_t test_vpaddlq_u16(uint16x8_t a) {
  return vpaddlq_u16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #4
// CHECK:   ret <2 x i64> [[VPADDL1_I]]
uint64x2_t test_vpaddlq_u32(uint32x4_t a) {
  return vpaddlq_u32(a);
}


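// vpmax applies the same adjacent-pair scheme as vpadd but reduces each pair
// with max: lanes 0-3 come from a, lanes 4-7 from b. Illustrative sketch only
// (hypothetical static helper, unreferenced so it should emit no IR):
static int8x8_t pairwise_max_example(void) {
  // Every pair in the first operand is (-5, -5) and in the second (7, 7),
  // so lanes 0-3 of the result are -5 and lanes 4-7 are 7.
  return vpmax_s8(vdup_n_s8(-5), vdup_n_s8(7));
}
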
// CHECK-LABEL: define <8 x i8> @test_vpmax_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VPMAX_V_I]]
int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) {
  return vpmax_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vpmax_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> [[VPMAX_V_I]], <4 x i16> [[VPMAX_V1_I]]) #4
// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) {
  return vpmax_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpmax_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> [[VPMAX_V_I]], <2 x i32> [[VPMAX_V1_I]]) #4
// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) {
  return vpmax_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vpmax_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VPMAX_V_I]]
uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) {
  return vpmax_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vpmax_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> [[VPMAX_V_I]], <4 x i16> [[VPMAX_V1_I]]) #4
// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) {
  return vpmax_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpmax_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> [[VPMAX_V_I]], <2 x i32> [[VPMAX_V1_I]]) #4
// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) {
  return vpmax_u32(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vpmax_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VPMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> [[VPMAX_V_I]], <2 x float> [[VPMAX_V1_I]]) #4
// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <2 x float> [[VPMAX_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <2 x float>
// CHECK:   ret <2 x float> [[TMP2]]
float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) {
  return vpmax_f32(a, b);
}


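// vpmin is the mirror image, reducing each adjacent pair with min.
// Illustrative sketch only (hypothetical static helper, unreferenced so it
// should emit no IR):
static int8x8_t pairwise_min_example(void) {
  // Lanes 0-3 are min(7, 7) = 7; lanes 4-7 are min(-5, -5) = -5.
  return vpmin_s8(vdup_n_s8(7), vdup_n_s8(-5));
}
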
// CHECK-LABEL: define <8 x i8> @test_vpmin_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VPMIN_V_I]]
int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) {
  return vpmin_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vpmin_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> [[VPMIN_V_I]], <4 x i16> [[VPMIN_V1_I]]) #4
// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) {
  return vpmin_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpmin_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> [[VPMIN_V_I]], <2 x i32> [[VPMIN_V1_I]]) #4
// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) {
  return vpmin_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vpmin_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VPMIN_V_I]]
uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) {
  return vpmin_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vpmin_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> [[VPMIN_V_I]], <4 x i16> [[VPMIN_V1_I]]) #4
// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) {
  return vpmin_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpmin_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> [[VPMIN_V_I]], <2 x i32> [[VPMIN_V1_I]]) #4
// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) {
  return vpmin_u32(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vpmin_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VPMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> [[VPMIN_V_I]], <2 x float> [[VPMIN_V1_I]]) #4
// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <2 x float> [[VPMIN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <2 x float>
// CHECK:   ret <2 x float> [[TMP2]]
float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) {
  return vpmin_f32(a, b);
}


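// vqabs is a saturating absolute value: the one input where plain abs
// overflows, INT_MIN, clamps to INT_MAX instead of wrapping back to INT_MIN.
// Illustrative sketch only (hypothetical static helper, unreferenced so it
// should emit no IR):
static int8x8_t saturating_abs_example(void) {
  // A wrapping abs would map -128 back to -128; vqabs yields 127 per lane.
  return vqabs_s8(vdup_n_s8(-128));
}
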
// CHECK-LABEL: define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 {
// CHECK:   [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %a) #4
// CHECK:   ret <8 x i8> [[VQABS_V_I]]
int8x8_t test_vqabs_s8(int8x8_t a) {
  return vqabs_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> [[VQABS_V_I]]) #4
// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP1]]
int16x4_t test_vqabs_s16(int16x4_t a) {
  return vqabs_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> [[VQABS_V_I]]) #4
// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP1]]
int32x2_t test_vqabs_s32(int32x2_t a) {
  return vqabs_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 {
// CHECK:   [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %a) #4
// CHECK:   ret <16 x i8> [[VQABSQ_V_I]]
int8x16_t test_vqabsq_s8(int8x16_t a) {
  return vqabsq_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> [[VQABSQ_V_I]]) #4
// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP1]]
int16x8_t test_vqabsq_s16(int16x8_t a) {
  return vqabsq_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> [[VQABSQ_V_I]]) #4
// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP1]]
int32x4_t test_vqabsq_s32(int32x4_t a) {
  return vqabsq_s32(a);
}


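// vqadd adds with saturation rather than modular wrap-around, in signed and
// unsigned flavors (the vqadds/vqaddu intrinsics matched below).
// Illustrative sketch only (hypothetical static helper, unreferenced so it
// should emit no IR):
static int8x8_t saturating_add_example(void) {
  int8x8_t x = vdup_n_s8(100);
  // 100 + 100 overflows i8: each lane clamps to 127 instead of wrapping to -56.
  return vqadd_s8(x, x);
}
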
// CHECK-LABEL: define <8 x i8> @test_vqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VQADD_V_I]]
int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
  return vqadd_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
  return vqadd_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
  return vqadd_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vqadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
  return vqadd_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vqadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VQADD_V_I]]
uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
  return vqadd_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vqadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
  return vqadd_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
  return vqadd_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vqadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
  return vqadd_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQADDQ_V_I]]
int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
  return vqaddq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
  return vqaddq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
  return vqaddq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
  return vqaddq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vqaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQADDQ_V_I]]
uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
  return vqaddq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
  return vqaddq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
  return vqaddq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
  return vqaddq_u64(a, b);
}


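// vqdmlal composes two saturating steps, exactly as the matched IR shows: a
// widening doubling multiply (vqdmull) followed by a saturating add (vqadds),
// roughly a[i] +sat sat(2 * b[i] * c[i]). Illustrative sketch only
// (hypothetical static helper, unreferenced so it should emit no IR):
static int32x4_t doubling_mla_example(int32x4_t acc) {
  // Adds sat(2 * 3 * 4) = 24 to every i32 lane of the accumulator.
  return vqdmlal_s16(acc, vdup_n_s16(3), vdup_n_s16(4));
}
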
// CHECK-LABEL: define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqdmlal_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vqdmlal_s32(a, b, c);
}


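// The _lane variants first splat one lane of the final operand (the
// shufflevector matched below), then perform the same computation. Sketch of
// that equivalence, illustrative only (hypothetical static helper):
static int32x4_t doubling_mla_lane_example(int32x4_t a, int16x4_t b,
                                           int16x4_t c) {
  // Should compute the same value as vqdmlal_lane_s16(a, b, c, 3).
  return vqdmlal_s16(a, b, vdup_lane_s16(c, 3));
}
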
// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqdmlal_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vqdmlal_lane_s32(a, b, c, 1);
}


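// The _n variants broadcast a scalar instead (the insertelement chain matched
// below), so vqdmlal_n_s16(a, b, c) should agree with the splat written out
// by hand. Illustrative sketch only (hypothetical static helper):
static int32x4_t doubling_mla_n_example(int32x4_t a, int16x4_t b, int16_t c) {
  return vqdmlal_s16(a, b, vdup_n_s16(c));
}
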
// CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #4
// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #4
// CHECK:   ret <4 x i32> [[VQDMLAL_V6_I]]
int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vqdmlal_n_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #4
// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #4
// CHECK:   ret <2 x i64> [[VQDMLAL_V4_I]]
int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vqdmlal_n_s32(a, b, c);
}


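// vqdmlsl mirrors vqdmlal with a saturating subtract: the same vqdmull result
// feeds vqsubs instead of vqadds, roughly a[i] -sat sat(2 * b[i] * c[i]).
// Illustrative sketch only (hypothetical static helper, unreferenced so it
// should emit no IR):
static int32x4_t doubling_mls_example(int32x4_t acc) {
  // Subtracts sat(2 * 3 * 4) = 24 from every i32 lane of the accumulator.
  return vqdmlsl_s16(acc, vdup_n_s16(3), vdup_n_s16(4));
}
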
// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqdmlsl_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vqdmlsl_s32(a, b, c);
}


// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqdmlsl_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vqdmlsl_lane_s32(a, b, c, 1);
}


// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #4
// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #4
// CHECK:   ret <4 x i32> [[VQDMLSL_V6_I]]
int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vqdmlsl_n_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #4
// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #4
// CHECK:   ret <2 x i64> [[VQDMLSL_V4_I]]
int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vqdmlsl_n_s32(a, b, c);
}


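// vqdmulh keeps only the high half of the doubled product, roughly
// sat(2 * a[i] * b[i]) >> 16 for i16 lanes, which is the classic Q15
// fixed-point multiply; only INT16_MIN * INT16_MIN actually saturates.
// Illustrative sketch only (hypothetical static helper, unreferenced so it
// should emit no IR):
static int16x4_t q15_multiply_example(void) {
  int16x4_t half = vdup_n_s16(0x4000); // 0.5 in Q15
  // (2 * 0x4000 * 0x4000) >> 16 = 0x2000, i.e. 0.5 * 0.5 = 0.25 in Q15.
  return vqdmulh_s16(half, half);
}
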
   11999 // CHECK-LABEL: define <4 x i16> @test_vqdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   12000 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   12001 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   12002 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   12003 // CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   12004 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4
   12005 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
   12006 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
   12007 // CHECK:   ret <4 x i16> [[TMP2]]
   12008 int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
   12009   return vqdmulh_s16(a, b);
   12010 }
   12011 
   12012 // CHECK-LABEL: define <2 x i32> @test_vqdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   12013 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   12014 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   12015 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   12016 // CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   12017 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
   12018 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
   12019 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
   12020 // CHECK:   ret <2 x i32> [[TMP2]]
   12021 int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
   12022   return vqdmulh_s32(a, b);
   12023 }
   12024 
   12025 // CHECK-LABEL: define <8 x i16> @test_vqdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   12026 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   12027 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   12028 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   12029 // CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   12030 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
   12031 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
   12032 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
   12033 // CHECK:   ret <8 x i16> [[TMP2]]
   12034 int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
   12035   return vqdmulhq_s16(a, b);
   12036 }
   12037 
   12038 // CHECK-LABEL: define <4 x i32> @test_vqdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   12039 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   12040 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   12041 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   12042 // CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   12043 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
   12044 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
   12045 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
   12046 // CHECK:   ret <4 x i32> [[TMP2]]
   12047 int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
   12048   return vqdmulhq_s32(a, b);
   12049 }
   12050 
   12051 
   12052 // CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   12053 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   12054 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   12055 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   12056 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   12057 // CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   12058 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4
   12059 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
   12060 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
   12061 // CHECK:   ret <4 x i16> [[TMP2]]
   12062 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) {
   12063   return vqdmulh_lane_s16(a, b, 3);
   12064 }
   12065 
   12066 // CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   12067 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
   12068 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   12069 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   12070 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   12071 // CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   12072 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
   12073 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
   12074 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
   12075 // CHECK:   ret <2 x i32> [[TMP2]]
   12076 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) {
   12077   return vqdmulh_lane_s32(a, b, 1);
   12078 }
   12079 
   12080 // CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
   12081 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   12082 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   12083 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
   12084 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   12085 // CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   12086 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
   12087 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
   12088 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
   12089 // CHECK:   ret <8 x i16> [[TMP2]]
   12090 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
   12091   return vqdmulhq_lane_s16(a, b, 3);
   12092 }
   12093 
   12094 // CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
   12095 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   12096 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   12097 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
   12098 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   12099 // CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   12100 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
   12101 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
   12102 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
   12103 // CHECK:   ret <4 x i32> [[TMP2]]
   12104 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
   12105   return vqdmulhq_lane_s32(a, b, 1);
   12106 }
   12107 
   12108 
// CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(<4 x i16> %a, i16 signext %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V4_I]]) #4
// CHECK:   [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V6_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
  return vqdmulh_n_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V2_I]]) #4
// CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V4_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
  return vqdmulh_n_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 signext %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V8_I]]) #4
// CHECK:   [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V10_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
  return vqdmulhq_n_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V4_I]]) #4
// CHECK:   [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V6_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
  return vqdmulhq_n_s32(a, b);
}


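// vqdmull_*: saturating doubling multiply long; each product widens to twice
// the element width, so the results below are <4 x i32> and <2 x i64>.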
// CHECK-LABEL: define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
  return vqdmull_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
  return vqdmull_s32(a, b);
}


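// vqdmull_lane_*: one lane of %b is duplicated via shufflevector before the
// widening multiply.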
// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) {
  return vqdmull_lane_s16(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) {
  return vqdmull_lane_s32(a, b, 1);
}


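// vqdmull_n_*: the scalar operand is splatted with an insertelement chain
// before the widening saturating doubling multiply.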
// CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(<4 x i16> %a, i16 signext %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V4_I]]) #4
// CHECK:   [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
  return vqdmull_n_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(<2 x i32> %a, i32 %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQDMULL_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V2_I]]) #4
// CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
  return vqdmull_n_s32(a, b);
}


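// vqmovn_*: saturating narrow to half the element width; the signed and
// unsigned variants lower to @llvm.arm.neon.vqmovns and @llvm.arm.neon.vqmovnu.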
// CHECK-LABEL: define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> [[VQMOVN_V_I]]) #4
// CHECK:   ret <8 x i8> [[VQMOVN_V1_I]]
int8x8_t test_vqmovn_s16(int16x8_t a) {
  return vqmovn_s16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> [[VQMOVN_V_I]]) #4
// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP1]]
int16x4_t test_vqmovn_s32(int32x4_t a) {
  return vqmovn_s32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> [[VQMOVN_V_I]]) #4
// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP1]]
int32x2_t test_vqmovn_s64(int64x2_t a) {
  return vqmovn_s64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> [[VQMOVN_V_I]]) #4
// CHECK:   ret <8 x i8> [[VQMOVN_V1_I]]
uint8x8_t test_vqmovn_u16(uint16x8_t a) {
  return vqmovn_u16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> [[VQMOVN_V_I]]) #4
// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP1]]
uint16x4_t test_vqmovn_u32(uint32x4_t a) {
  return vqmovn_u32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> [[VQMOVN_V_I]]) #4
// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP1]]
uint32x2_t test_vqmovn_u64(uint64x2_t a) {
  return vqmovn_u64(a);
}


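// vqmovun_*: signed input narrowed with unsigned saturation
// (@llvm.arm.neon.vqmovnsu), so negative lanes clamp to zero.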
// CHECK-LABEL: define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> [[VQMOVUN_V_I]]) #4
// CHECK:   ret <8 x i8> [[VQMOVUN_V1_I]]
uint8x8_t test_vqmovun_s16(int16x8_t a) {
  return vqmovun_s16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> [[VQMOVUN_V_I]]) #4
// CHECK:   [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP1]]
uint16x4_t test_vqmovun_s32(int32x4_t a) {
  return vqmovun_s32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> [[VQMOVUN_V_I]]) #4
// CHECK:   [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP1]]
uint32x2_t test_vqmovun_s64(int64x2_t a) {
  return vqmovun_s64(a);
}


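// vqneg_*/vqnegq_*: saturating negate; INT_MIN saturates to INT_MAX instead of
// wrapping.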
// CHECK-LABEL: define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 {
// CHECK:   [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %a) #4
// CHECK:   ret <8 x i8> [[VQNEG_V_I]]
int8x8_t test_vqneg_s8(int8x8_t a) {
  return vqneg_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> [[VQNEG_V_I]]) #4
// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP1]]
int16x4_t test_vqneg_s16(int16x4_t a) {
  return vqneg_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> [[VQNEG_V_I]]) #4
// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP1]]
int32x2_t test_vqneg_s32(int32x2_t a) {
  return vqneg_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 {
// CHECK:   [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %a) #4
// CHECK:   ret <16 x i8> [[VQNEGQ_V_I]]
int8x16_t test_vqnegq_s8(int8x16_t a) {
  return vqnegq_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> [[VQNEGQ_V_I]]) #4
// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP1]]
int16x8_t test_vqnegq_s16(int16x8_t a) {
  return vqnegq_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> [[VQNEGQ_V_I]]) #4
// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP1]]
int32x4_t test_vqnegq_s32(int32x4_t a) {
  return vqnegq_s32(a);
}


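// vqrdmulh_*: like vqdmulh but rounding; per s16 lane the result is roughly
// sat((2*a[i]*b[i] + (1 << 15)) >> 16).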
// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4
// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
  return vqrdmulh_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4
// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
  return vqrdmulh_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4
// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) {
  return vqrdmulhq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4
// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
  return vqrdmulhq_s32(a, b);
}


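// vqrdmulh_lane_*: the rounding variant with one lane of %b splatted by
// shufflevector before the multiply.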
// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4
// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) {
  return vqrdmulh_lane_s16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4
// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) {
  return vqrdmulh_lane_s32(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4
// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
  return vqrdmulhq_lane_s16(a, b, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4
// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
  return vqrdmulhq_lane_s32(a, b, 1);
}


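// vqrdmulh_n_*: the rounding variant with the scalar operand splatted via an
// insertelement chain, as the checks below expect.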
// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(<4 x i16> %a, i16 signext %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V4_I]]) #4
// CHECK:   [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V6_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
  return vqrdmulh_n_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQRDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V2_I]]) #4
// CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V4_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
  return vqrdmulh_n_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 signext %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQRDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V8_I]]) #4
// CHECK:   [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V10_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
  return vqrdmulhq_n_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQRDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V4_I]]) #4
// CHECK:   [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V6_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
  return vqrdmulhq_n_s32(a, b);
}


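// vqrshl_*: saturating rounding shift left; the per-lane count in %b is signed,
// so negative counts shift right. Signed vs. unsigned saturation selects
// @llvm.arm.neon.vqrshifts vs. @llvm.arm.neon.vqrshiftu.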
// CHECK-LABEL: define <8 x i8> @test_vqrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VQRSHL_V_I]]
int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) {
  return vqrshl_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vqrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4
// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) {
  return vqrshl_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4
// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) {
  return vqrshl_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vqrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4
// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) {
  return vqrshl_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vqrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VQRSHL_V_I]]
uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) {
  return vqrshl_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vqrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4
// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) {
  return vqrshl_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4
// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) {
  return vqrshl_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vqrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4
// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) {
  return vqrshl_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vqrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQRSHLQ_V_I]]
int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) {
  return vqrshlq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4
// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) {
  return vqrshlq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4
// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) {
  return vqrshlq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4
// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) {
  return vqrshlq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vqrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQRSHLQ_V_I]]
uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) {
  return vqrshlq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4
// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) {
  return vqrshlq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4
// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) {
  return vqrshlq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4
// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) {
  return vqrshlq_u64(a, b);
}


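// vqrshrn_n_*: saturating rounding shift right and narrow by an immediate; the
// checks expect the count folded into the intrinsic as a splat of -n (here -1).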
// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i8> [[VQRSHRN_N1]]
int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
  return vqrshrn_n_s16(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i16> [[VQRSHRN_N1]]
int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
  return vqrshrn_n_s32(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i32> [[VQRSHRN_N1]]
int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
  return vqrshrn_n_s64(a, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i8> [[VQRSHRN_N1]]
uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
  return vqrshrn_n_u16(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i16> [[VQRSHRN_N1]]
uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
  return vqrshrn_n_u32(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i32> [[VQRSHRN_N1]]
uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
  return vqrshrn_n_u64(a, 1);
}


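// vqrshrun_n_*: signed input, unsigned saturating rounding narrow
// (@llvm.arm.neon.vqrshiftnsu), again with the immediate encoded as a -n splat.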
// CHECK-LABEL: define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> [[VQRSHRUN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i8> [[VQRSHRUN_N1]]
uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
  return vqrshrun_n_s16(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> [[VQRSHRUN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i16> [[VQRSHRUN_N1]]
uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
  return vqrshrun_n_s32(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> [[VQRSHRUN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i32> [[VQRSHRUN_N1]]
uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
  return vqrshrun_n_s64(a, 1);
}


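// vqshl_*: saturating shift left by a signed per-lane count (no rounding),
// lowering to @llvm.arm.neon.vqshifts/@llvm.arm.neon.vqshiftu.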
   12872 // CHECK-LABEL: define <8 x i8> @test_vqshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
   12873 // CHECK:   [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
   12874 // CHECK:   ret <8 x i8> [[VQSHL_V_I]]
   12875 int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
   12876   return vqshl_s8(a, b);
   12877 }
   12878 
   12879 // CHECK-LABEL: define <4 x i16> @test_vqshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   12880 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   12881 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   12882 // CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   12883 // CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   12884 // CHECK:   [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
   12885 // CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
   12886 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
   12887 // CHECK:   ret <4 x i16> [[TMP2]]
   12888 int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
   12889   return vqshl_s16(a, b);
   12890 }
   12891 
   12892 // CHECK-LABEL: define <2 x i32> @test_vqshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   12893 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   12894 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   12895 // CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   12896 // CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   12897 // CHECK:   [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
   12898 // CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
   12899 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
   12900 // CHECK:   ret <2 x i32> [[TMP2]]
   12901 int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
   12902   return vqshl_s32(a, b);
   12903 }
   12904 
   12905 // CHECK-LABEL: define <1 x i64> @test_vqshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
   12906 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   12907 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   12908 // CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   12909 // CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   12910 // CHECK:   [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
   12911 // CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
   12912 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
   12913 // CHECK:   ret <1 x i64> [[TMP2]]
   12914 int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
   12915   return vqshl_s64(a, b);
   12916 }
   12917 
   12918 // CHECK-LABEL: define <8 x i8> @test_vqshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
   12919 // CHECK:   [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
   12920 // CHECK:   ret <8 x i8> [[VQSHL_V_I]]
   12921 uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
   12922   return vqshl_u8(a, b);
   12923 }
   12924 
   12925 // CHECK-LABEL: define <4 x i16> @test_vqshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
   12926 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   12927 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   12928 // CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   12929 // CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   12930 // CHECK:   [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
   12931 // CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
   12932 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
   12933 // CHECK:   ret <4 x i16> [[TMP2]]
   12934 uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
   12935   return vqshl_u16(a, b);
   12936 }
   12937 
   12938 // CHECK-LABEL: define <2 x i32> @test_vqshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
   12939 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   12940 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   12941 // CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   12942 // CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   12943 // CHECK:   [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
   12944 // CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
   12945 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
   12946 // CHECK:   ret <2 x i32> [[TMP2]]
   12947 uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
   12948   return vqshl_u32(a, b);
   12949 }
   12950 
   12951 // CHECK-LABEL: define <1 x i64> @test_vqshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
   12952 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   12953 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   12954 // CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   12955 // CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   12956 // CHECK:   [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
   12957 // CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
   12958 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
   12959 // CHECK:   ret <1 x i64> [[TMP2]]
   12960 uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
   12961   return vqshl_u64(a, b);
   12962 }

// CHECK-LABEL: define <16 x i8> @test_vqshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQSHLQ_V_I]]
int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) {
  return vqshlq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
  return vqshlq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
  return vqshlq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
  return vqshlq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vqshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQSHLQ_V_I]]
uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
  return vqshlq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
  return vqshlq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
  return vqshlq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
  return vqshlq_u64(a, b);
}


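// The vqshlu_n tests exercise the signed-to-unsigned saturating shift left
// by an immediate (llvm.arm.neon.vqshiftsu.*, with the immediate splatted
// into a constant vector). One-lane scalar sketch (illustrative model only,
// name invented; an unused static inline should emit no IR):
static inline uint8_t model_vqshlub_n_s8(int8_t x, int n) {
  if (x < 0)
    return 0;                            // negative inputs saturate to 0
  int64_t r = (int64_t)x << n;           // widen so the shift cannot overflow
  return r > UINT8_MAX ? UINT8_MAX : (uint8_t)r;
}
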
// CHECK-LABEL: define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) #0 {
// CHECK:   [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <8 x i8> [[VQSHLU_N]]
uint8x8_t test_vqshlu_n_s8(int8x8_t a) {
  return vqshlu_n_s8(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <4 x i16> [[VQSHLU_N1]]
uint16x4_t test_vqshlu_n_s16(int16x4_t a) {
  return vqshlu_n_s16(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> <i32 1, i32 1>)
// CHECK:   ret <2 x i32> [[VQSHLU_N1]]
uint32x2_t test_vqshlu_n_s32(int32x2_t a) {
  return vqshlu_n_s32(a, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vqshlu_n_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> <i64 1>)
// CHECK:   ret <1 x i64> [[VQSHLU_N1]]
uint64x1_t test_vqshlu_n_s64(int64x1_t a) {
  return vqshlu_n_s64(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) #0 {
// CHECK:   [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <16 x i8> [[VQSHLU_N]]
uint8x16_t test_vqshluq_n_s8(int8x16_t a) {
  return vqshluq_n_s8(a, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <8 x i16> [[VQSHLU_N1]]
uint16x8_t test_vqshluq_n_s16(int16x8_t a) {
  return vqshluq_n_s16(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
// CHECK:   ret <4 x i32> [[VQSHLU_N1]]
uint32x4_t test_vqshluq_n_s32(int32x4_t a) {
  return vqshluq_n_s32(a, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> <i64 1, i64 1>)
// CHECK:   ret <2 x i64> [[VQSHLU_N1]]
uint64x2_t test_vqshluq_n_s64(int64x2_t a) {
  return vqshluq_n_s64(a, 1);
}


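// The vqshl_n tests below reuse the variable-shift intrinsics checked above:
// the immediate is splatted into a constant shift vector (the
// <i8 1, i8 1, ...> operands in the CHECK lines) rather than lowering to a
// distinct by-immediate intrinsic.
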
// CHECK-LABEL: define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) #0 {
// CHECK:   [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <8 x i8> [[VQSHL_N]]
int8x8_t test_vqshl_n_s8(int8x8_t a) {
  return vqshl_n_s8(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <4 x i16> [[VQSHL_N1]]
int16x4_t test_vqshl_n_s16(int16x4_t a) {
  return vqshl_n_s16(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>)
// CHECK:   ret <2 x i32> [[VQSHL_N1]]
int32x2_t test_vqshl_n_s32(int32x2_t a) {
  return vqshl_n_s32(a, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vqshl_n_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
// CHECK:   ret <1 x i64> [[VQSHL_N1]]
int64x1_t test_vqshl_n_s64(int64x1_t a) {
  return vqshl_n_s64(a, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) #0 {
// CHECK:   [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <8 x i8> [[VQSHL_N]]
uint8x8_t test_vqshl_n_u8(uint8x8_t a) {
  return vqshl_n_u8(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <4 x i16> [[VQSHL_N1]]
uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
  return vqshl_n_u16(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>)
// CHECK:   ret <2 x i32> [[VQSHL_N1]]
uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
  return vqshl_n_u32(a, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vqshl_n_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
// CHECK:   ret <1 x i64> [[VQSHL_N1]]
uint64x1_t test_vqshl_n_u64(uint64x1_t a) {
  return vqshl_n_u64(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) #0 {
// CHECK:   [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <16 x i8> [[VQSHL_N]]
int8x16_t test_vqshlq_n_s8(int8x16_t a) {
  return vqshlq_n_s8(a, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <8 x i16> [[VQSHL_N1]]
int16x8_t test_vqshlq_n_s16(int16x8_t a) {
  return vqshlq_n_s16(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
// CHECK:   ret <4 x i32> [[VQSHL_N1]]
int32x4_t test_vqshlq_n_s32(int32x4_t a) {
  return vqshlq_n_s32(a, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>)
// CHECK:   ret <2 x i64> [[VQSHL_N1]]
int64x2_t test_vqshlq_n_s64(int64x2_t a) {
  return vqshlq_n_s64(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) #0 {
// CHECK:   [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <16 x i8> [[VQSHL_N]]
uint8x16_t test_vqshlq_n_u8(uint8x16_t a) {
  return vqshlq_n_u8(a, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <8 x i16> [[VQSHL_N1]]
uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
  return vqshlq_n_u16(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
// CHECK:   ret <4 x i32> [[VQSHL_N1]]
uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
  return vqshlq_n_u32(a, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>)
// CHECK:   ret <2 x i64> [[VQSHL_N1]]
uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
  return vqshlq_n_u64(a, 1);
}


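// vqshrn_n (saturating shift right and narrow) has no dedicated intrinsic
// here: it lowers to llvm.arm.neon.vqshiftns/vqshiftnu with the *negated*
// immediate splatted into the shift vector, which is why a shift of 1 shows
// up as <i16 -1, ...> in the CHECK lines (negative counts mean right shifts).
// One-lane scalar sketch of the signed 16->8-bit case (illustrative model
// only, name invented; an unused static inline should emit no IR):
static inline int8_t model_vqshrnh_n_s16(int16_t x, int n) {
  int r = x >> n;                        // arithmetic shift right first
  if (r > INT8_MAX) return INT8_MAX;     // then saturate to the narrow type
  if (r < INT8_MIN) return INT8_MIN;
  return (int8_t)r;
}
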
// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i8> [[VQSHRN_N1]]
int8x8_t test_vqshrn_n_s16(int16x8_t a) {
  return vqshrn_n_s16(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i16> [[VQSHRN_N1]]
int16x4_t test_vqshrn_n_s32(int32x4_t a) {
  return vqshrn_n_s32(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i32> [[VQSHRN_N1]]
int32x2_t test_vqshrn_n_s64(int64x2_t a) {
  return vqshrn_n_s64(a, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i8> [[VQSHRN_N1]]
uint8x8_t test_vqshrn_n_u16(uint16x8_t a) {
  return vqshrn_n_u16(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i16> [[VQSHRN_N1]]
uint16x4_t test_vqshrn_n_u32(uint32x4_t a) {
  return vqshrn_n_u32(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i32> [[VQSHRN_N1]]
uint32x2_t test_vqshrn_n_u64(uint64x2_t a) {
  return vqshrn_n_u64(a, 1);
}


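// vqshrun_n is the signed-to-unsigned variant of the narrowing shift above
// (llvm.arm.neon.vqshiftnsu), using the same negated-immediate encoding;
// lanes that are still negative after the shift saturate to 0.
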
// CHECK-LABEL: define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> [[VQSHRUN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i8> [[VQSHRUN_N1]]
uint8x8_t test_vqshrun_n_s16(int16x8_t a) {
  return vqshrun_n_s16(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> [[VQSHRUN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i16> [[VQSHRUN_N1]]
uint16x4_t test_vqshrun_n_s32(int32x4_t a) {
  return vqshrun_n_s32(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> [[VQSHRUN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i32> [[VQSHRUN_N1]]
uint32x2_t test_vqshrun_n_s64(int64x2_t a) {
  return vqshrun_n_s64(a, 1);
}


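// The vqsub tests check lane-wise saturating subtraction
// (llvm.arm.neon.vqsubs.* / vqsubu.*). One-lane scalar sketch of the signed
// 8-bit case (illustrative model only, name invented; an unused static
// inline should emit no IR):
static inline int8_t model_vqsubb_s8(int8_t a, int8_t b) {
  int r = (int)a - (int)b;               // compute in a wider type
  if (r > INT8_MAX) r = INT8_MAX;        // clamp instead of wrapping
  if (r < INT8_MIN) r = INT8_MIN;
  return (int8_t)r;
}
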
// CHECK-LABEL: define <8 x i8> @test_vqsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VQSUB_V_I]]
int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) {
  return vqsub_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vqsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
  return vqsub_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
  return vqsub_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vqsub_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) {
  return vqsub_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vqsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VQSUB_V_I]]
uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) {
  return vqsub_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vqsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
  return vqsub_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
  return vqsub_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vqsub_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) {
  return vqsub_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vqsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQSUBQ_V_I]]
int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) {
  return vqsubq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
  return vqsubq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
  return vqsubq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) {
  return vqsubq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vqsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQSUBQ_V_I]]
uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) {
  return vqsubq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
  return vqsubq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
  return vqsubq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
  return vqsubq_u64(a, b);
}


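// vraddhn (rounding add, narrow to high half) adds two wide vectors, adds a
// rounding constant of half the discarded range, and keeps the top half of
// each lane; note that the 16->8-bit tests return <8 x i8> directly, so they
// lack the final pair of bitcasts seen in the wider cases. One-lane scalar
// sketch for 16->8 bits (illustrative model only, name invented; an unused
// static inline should emit no IR):
static inline int8_t model_vraddhnh_s16(int16_t a, int16_t b) {
  return (int8_t)(((int32_t)a + (int32_t)b + (1 << 7)) >> 8);
}
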
// CHECK-LABEL: define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4
// CHECK:   ret <8 x i8> [[VRADDHN_V2_I]]
int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) {
  return vraddhn_s16(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4
// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
  return vraddhn_s32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4
// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
  return vraddhn_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4
// CHECK:   ret <8 x i8> [[VRADDHN_V2_I]]
uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) {
  return vraddhn_u16(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4
// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
  return vraddhn_u32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4
// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
  return vraddhn_u64(a, b);
}


// CHECK-LABEL: define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> [[VRECPE_V_I]]) #4
// CHECK:   ret <2 x float> [[VRECPE_V1_I]]
float32x2_t test_vrecpe_f32(float32x2_t a) {
  return vrecpe_f32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> [[VRECPE_V_I]]) #4
// CHECK:   ret <2 x i32> [[VRECPE_V1_I]]
uint32x2_t test_vrecpe_u32(uint32x2_t a) {
  return vrecpe_u32(a);
}

// CHECK-LABEL: define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> [[VRECPEQ_V_I]]) #4
// CHECK:   ret <4 x float> [[VRECPEQ_V1_I]]
float32x4_t test_vrecpeq_f32(float32x4_t a) {
  return vrecpeq_f32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> [[VRECPEQ_V_I]]) #4
// CHECK:   ret <4 x i32> [[VRECPEQ_V1_I]]
uint32x4_t test_vrecpeq_u32(uint32x4_t a) {
  return vrecpeq_u32(a);
}


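// vrecps computes the Newton-Raphson correction term 2.0 - a*b used to
// refine vrecpe's low-precision reciprocal estimate. A typical refinement
// idiom built from the intrinsics under test (illustrative usage only, name
// invented; an unused static inline should emit no IR):
static inline float32x2_t model_recip_f32(float32x2_t d) {
  float32x2_t x = vrecpe_f32(d);        // initial estimate (roughly 8 bits)
  x = vmul_f32(x, vrecps_f32(d, x));    // one Newton-Raphson step
  x = vmul_f32(x, vrecps_f32(d, x));    // second step for near-full precision
  return x;
}
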
// CHECK-LABEL: define <2 x float> @test_vrecps_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> [[VRECPS_V_I]], <2 x float> [[VRECPS_V1_I]]) #4
// CHECK:   [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <2 x float>
// CHECK:   ret <2 x float> [[TMP2]]
float32x2_t test_vrecps_f32(float32x2_t a, float32x2_t b) {
  return vrecps_f32(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vrecpsq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> [[VRECPSQ_V_I]], <4 x float> [[VRECPSQ_V1_I]]) #4
// CHECK:   [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <4 x float>
// CHECK:   ret <4 x float> [[TMP2]]
float32x4_t test_vrecpsq_f32(float32x4_t a, float32x4_t b) {
  return vrecpsq_f32(a, b);
}


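// The vreinterpret tests that follow are pure reinterpretations: each one is
// at most a single IR bitcast, and when source and destination share the same
// LLVM vector type (e.g. s8 <-> u8/p8) no cast is needed at all and the value
// is returned unchanged, as the "ret <8 x i8> %a" checks show.
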
   13670 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s16(<4 x i16> %a) #0 {
   13671 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   13672 // CHECK:   ret <8 x i8> [[TMP0]]
   13673 int8x8_t test_vreinterpret_s8_s16(int16x4_t a) {
   13674   return vreinterpret_s8_s16(a);
   13675 }
   13676 
   13677 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s32(<2 x i32> %a) #0 {
   13678 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   13679 // CHECK:   ret <8 x i8> [[TMP0]]
   13680 int8x8_t test_vreinterpret_s8_s32(int32x2_t a) {
   13681   return vreinterpret_s8_s32(a);
   13682 }
   13683 
   13684 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s64(<1 x i64> %a) #0 {
   13685 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   13686 // CHECK:   ret <8 x i8> [[TMP0]]
   13687 int8x8_t test_vreinterpret_s8_s64(int64x1_t a) {
   13688   return vreinterpret_s8_s64(a);
   13689 }
   13690 
   13691 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u8(<8 x i8> %a) #0 {
   13692 // CHECK:   ret <8 x i8> %a
   13693 int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) {
   13694   return vreinterpret_s8_u8(a);
   13695 }
   13696 
   13697 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u16(<4 x i16> %a) #0 {
   13698 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   13699 // CHECK:   ret <8 x i8> [[TMP0]]
   13700 int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) {
   13701   return vreinterpret_s8_u16(a);
   13702 }
   13703 
   13704 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u32(<2 x i32> %a) #0 {
   13705 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   13706 // CHECK:   ret <8 x i8> [[TMP0]]
   13707 int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) {
   13708   return vreinterpret_s8_u32(a);
   13709 }
   13710 
   13711 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u64(<1 x i64> %a) #0 {
   13712 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   13713 // CHECK:   ret <8 x i8> [[TMP0]]
   13714 int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) {
   13715   return vreinterpret_s8_u64(a);
   13716 }
   13717 
   13718 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f16(<4 x half> %a) #0 {
   13719 // CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
   13720 // CHECK:   ret <8 x i8> [[TMP0]]
   13721 int8x8_t test_vreinterpret_s8_f16(float16x4_t a) {
   13722   return vreinterpret_s8_f16(a);
   13723 }
   13724 
   13725 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f32(<2 x float> %a) #0 {
   13726 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   13727 // CHECK:   ret <8 x i8> [[TMP0]]
   13728 int8x8_t test_vreinterpret_s8_f32(float32x2_t a) {
   13729   return vreinterpret_s8_f32(a);
   13730 }
   13731 
   13732 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p8(<8 x i8> %a) #0 {
   13733 // CHECK:   ret <8 x i8> %a
   13734 int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) {
   13735   return vreinterpret_s8_p8(a);
   13736 }
   13737 
   13738 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p16(<4 x i16> %a) #0 {
   13739 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   13740 // CHECK:   ret <8 x i8> [[TMP0]]
   13741 int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) {
   13742   return vreinterpret_s8_p16(a);
   13743 }
   13744 
   13745 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s8(<8 x i8> %a) #0 {
   13746 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
   13747 // CHECK:   ret <4 x i16> [[TMP0]]
   13748 int16x4_t test_vreinterpret_s16_s8(int8x8_t a) {
   13749   return vreinterpret_s16_s8(a);
   13750 }
   13751 
   13752 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s32(<2 x i32> %a) #0 {
   13753 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
   13754 // CHECK:   ret <4 x i16> [[TMP0]]
   13755 int16x4_t test_vreinterpret_s16_s32(int32x2_t a) {
   13756   return vreinterpret_s16_s32(a);
   13757 }
   13758 
   13759 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s64(<1 x i64> %a) #0 {
   13760 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
   13761 // CHECK:   ret <4 x i16> [[TMP0]]
   13762 int16x4_t test_vreinterpret_s16_s64(int64x1_t a) {
   13763   return vreinterpret_s16_s64(a);
   13764 }
   13765 
   13766 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u8(<8 x i8> %a) #0 {
   13767 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
   13768 // CHECK:   ret <4 x i16> [[TMP0]]
   13769 int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) {
   13770   return vreinterpret_s16_u8(a);
   13771 }
   13772 
   13773 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u16(<4 x i16> %a) #0 {
   13774 // CHECK:   ret <4 x i16> %a
   13775 int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) {
   13776   return vreinterpret_s16_u16(a);
   13777 }
   13778 
   13779 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u32(<2 x i32> %a) #0 {
   13780 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
   13781 // CHECK:   ret <4 x i16> [[TMP0]]
   13782 int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) {
   13783   return vreinterpret_s16_u32(a);
   13784 }
   13785 
   13786 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u64(<1 x i64> %a) #0 {
   13787 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
   13788 // CHECK:   ret <4 x i16> [[TMP0]]
   13789 int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) {
   13790   return vreinterpret_s16_u64(a);
   13791 }
   13792 
   13793 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f16(<4 x half> %a) #0 {
   13794 // CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
   13795 // CHECK:   ret <4 x i16> [[TMP0]]
   13796 int16x4_t test_vreinterpret_s16_f16(float16x4_t a) {
   13797   return vreinterpret_s16_f16(a);
   13798 }
   13799 
   13800 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f32(<2 x float> %a) #0 {
   13801 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
   13802 // CHECK:   ret <4 x i16> [[TMP0]]
   13803 int16x4_t test_vreinterpret_s16_f32(float32x2_t a) {
   13804   return vreinterpret_s16_f32(a);
   13805 }
   13806 
   13807 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p8(<8 x i8> %a) #0 {
   13808 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
   13809 // CHECK:   ret <4 x i16> [[TMP0]]
   13810 int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) {
   13811   return vreinterpret_s16_p8(a);
   13812 }
   13813 
   13814 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p16(<4 x i16> %a) #0 {
   13815 // CHECK:   ret <4 x i16> %a
   13816 int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) {
   13817   return vreinterpret_s16_p16(a);
   13818 }
   13819 
   13820 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s8(<8 x i8> %a) #0 {
   13821 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
   13822 // CHECK:   ret <2 x i32> [[TMP0]]
   13823 int32x2_t test_vreinterpret_s32_s8(int8x8_t a) {
   13824   return vreinterpret_s32_s8(a);
   13825 }
   13826 
   13827 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s16(<4 x i16> %a) #0 {
   13828 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
   13829 // CHECK:   ret <2 x i32> [[TMP0]]
   13830 int32x2_t test_vreinterpret_s32_s16(int16x4_t a) {
   13831   return vreinterpret_s32_s16(a);
   13832 }
   13833 
   13834 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s64(<1 x i64> %a) #0 {
   13835 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
   13836 // CHECK:   ret <2 x i32> [[TMP0]]
   13837 int32x2_t test_vreinterpret_s32_s64(int64x1_t a) {
   13838   return vreinterpret_s32_s64(a);
   13839 }
   13840 
   13841 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u8(<8 x i8> %a) #0 {
   13842 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
   13843 // CHECK:   ret <2 x i32> [[TMP0]]
   13844 int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) {
   13845   return vreinterpret_s32_u8(a);
   13846 }
   13847 
   13848 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u16(<4 x i16> %a) #0 {
   13849 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
   13850 // CHECK:   ret <2 x i32> [[TMP0]]
   13851 int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) {
   13852   return vreinterpret_s32_u16(a);
   13853 }
   13854 
   13855 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u32(<2 x i32> %a) #0 {
   13856 // CHECK:   ret <2 x i32> %a
   13857 int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) {
   13858   return vreinterpret_s32_u32(a);
   13859 }
   13860 
   13861 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u64(<1 x i64> %a) #0 {
   13862 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
   13863 // CHECK:   ret <2 x i32> [[TMP0]]
   13864 int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) {
   13865   return vreinterpret_s32_u64(a);
   13866 }
   13867 
   13868 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f16(<4 x half> %a) #0 {
   13869 // CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
   13870 // CHECK:   ret <2 x i32> [[TMP0]]
   13871 int32x2_t test_vreinterpret_s32_f16(float16x4_t a) {
   13872   return vreinterpret_s32_f16(a);
   13873 }
   13874 
   13875 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f32(<2 x float> %a) #0 {
   13876 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
   13877 // CHECK:   ret <2 x i32> [[TMP0]]
   13878 int32x2_t test_vreinterpret_s32_f32(float32x2_t a) {
   13879   return vreinterpret_s32_f32(a);
   13880 }
   13881 
   13882 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p8(<8 x i8> %a) #0 {
   13883 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
   13884 // CHECK:   ret <2 x i32> [[TMP0]]
   13885 int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) {
   13886   return vreinterpret_s32_p8(a);
   13887 }
   13888 
   13889 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p16(<4 x i16> %a) #0 {
   13890 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
   13891 // CHECK:   ret <2 x i32> [[TMP0]]
   13892 int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) {
   13893   return vreinterpret_s32_p16(a);
   13894 }
   13895 
   13896 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s8(<8 x i8> %a) #0 {
   13897 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
   13898 // CHECK:   ret <1 x i64> [[TMP0]]
   13899 int64x1_t test_vreinterpret_s64_s8(int8x8_t a) {
   13900   return vreinterpret_s64_s8(a);
   13901 }
   13902 
   13903 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s16(<4 x i16> %a) #0 {
   13904 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
   13905 // CHECK:   ret <1 x i64> [[TMP0]]
   13906 int64x1_t test_vreinterpret_s64_s16(int16x4_t a) {
   13907   return vreinterpret_s64_s16(a);
   13908 }
   13909 
   13910 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s32(<2 x i32> %a) #0 {
   13911 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
   13912 // CHECK:   ret <1 x i64> [[TMP0]]
   13913 int64x1_t test_vreinterpret_s64_s32(int32x2_t a) {
   13914   return vreinterpret_s64_s32(a);
   13915 }
   13916 
   13917 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u8(<8 x i8> %a) #0 {
   13918 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
   13919 // CHECK:   ret <1 x i64> [[TMP0]]
   13920 int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) {
   13921   return vreinterpret_s64_u8(a);
   13922 }
   13923 
   13924 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u16(<4 x i16> %a) #0 {
   13925 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
   13926 // CHECK:   ret <1 x i64> [[TMP0]]
   13927 int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) {
   13928   return vreinterpret_s64_u16(a);
   13929 }
   13930 
   13931 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u32(<2 x i32> %a) #0 {
   13932 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
   13933 // CHECK:   ret <1 x i64> [[TMP0]]
   13934 int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) {
   13935   return vreinterpret_s64_u32(a);
   13936 }
   13937 
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u64(<1 x i64> %a) #0 {
// CHECK:   ret <1 x i64> %a
int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) {
  return vreinterpret_s64_u64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_f16(float16x4_t a) {
  return vreinterpret_s64_f16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_f32(float32x2_t a) {
  return vreinterpret_s64_f32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) {
  return vreinterpret_s64_p8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) {
  return vreinterpret_s64_p16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s8(<8 x i8> %a) #0 {
// CHECK:   ret <8 x i8> %a
uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) {
  return vreinterpret_u8_s8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) {
  return vreinterpret_u8_s16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) {
  return vreinterpret_u8_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) {
  return vreinterpret_u8_s64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) {
  return vreinterpret_u8_u16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) {
  return vreinterpret_u8_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) {
  return vreinterpret_u8_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) {
  return vreinterpret_u8_f16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) {
  return vreinterpret_u8_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p8(<8 x i8> %a) #0 {
// CHECK:   ret <8 x i8> %a
uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) {
  return vreinterpret_u8_p8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) {
  return vreinterpret_u8_p16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) {
  return vreinterpret_u16_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s16(<4 x i16> %a) #0 {
// CHECK:   ret <4 x i16> %a
uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) {
  return vreinterpret_u16_s16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) {
  return vreinterpret_u16_s32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) {
  return vreinterpret_u16_s64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) {
  return vreinterpret_u16_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) {
  return vreinterpret_u16_u32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) {
  return vreinterpret_u16_u64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) {
  return vreinterpret_u16_f16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) {
  return vreinterpret_u16_f32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) {
  return vreinterpret_u16_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p16(<4 x i16> %a) #0 {
// CHECK:   ret <4 x i16> %a
uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) {
  return vreinterpret_u16_p16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) {
  return vreinterpret_u32_s8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) {
  return vreinterpret_u32_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s32(<2 x i32> %a) #0 {
// CHECK:   ret <2 x i32> %a
uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) {
  return vreinterpret_u32_s32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) {
  return vreinterpret_u32_s64(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) {
  return vreinterpret_u32_u8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) {
  return vreinterpret_u32_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) {
  return vreinterpret_u32_u64(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) {
  return vreinterpret_u32_f16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) {
  return vreinterpret_u32_f32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) {
  return vreinterpret_u32_p8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) {
  return vreinterpret_u32_p16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) {
  return vreinterpret_u64_s8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) {
  return vreinterpret_u64_s16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) {
  return vreinterpret_u64_s32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s64(<1 x i64> %a) #0 {
// CHECK:   ret <1 x i64> %a
uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) {
  return vreinterpret_u64_s64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) {
  return vreinterpret_u64_u8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) {
  return vreinterpret_u64_u16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) {
  return vreinterpret_u64_u32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) {
  return vreinterpret_u64_f16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) {
  return vreinterpret_u64_f32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) {
  return vreinterpret_u64_p8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) {
  return vreinterpret_u64_p16(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s8(int8x8_t a) {
  return vreinterpret_f16_s8(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s16(int16x4_t a) {
  return vreinterpret_f16_s16(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s32(int32x2_t a) {
  return vreinterpret_f16_s32(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s64(int64x1_t a) {
  return vreinterpret_f16_s64(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) {
  return vreinterpret_f16_u8(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) {
  return vreinterpret_f16_u16(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) {
  return vreinterpret_f16_u32(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) {
  return vreinterpret_f16_u64(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_f32(float32x2_t a) {
  return vreinterpret_f16_f32(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) {
  return vreinterpret_f16_p8(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) {
  return vreinterpret_f16_p16(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s8(int8x8_t a) {
  return vreinterpret_f32_s8(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s16(int16x4_t a) {
  return vreinterpret_f32_s16(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s32(int32x2_t a) {
  return vreinterpret_f32_s32(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s64(int64x1_t a) {
  return vreinterpret_f32_s64(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) {
  return vreinterpret_f32_u8(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) {
  return vreinterpret_f32_u16(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) {
  return vreinterpret_f32_u32(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) {
  return vreinterpret_f32_u64(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_f16(float16x4_t a) {
  return vreinterpret_f32_f16(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) {
  return vreinterpret_f32_p8(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) {
  return vreinterpret_f32_p16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s8(<8 x i8> %a) #0 {
// CHECK:   ret <8 x i8> %a
poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) {
  return vreinterpret_p8_s8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) {
  return vreinterpret_p8_s16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) {
  return vreinterpret_p8_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) {
  return vreinterpret_p8_s64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u8(<8 x i8> %a) #0 {
// CHECK:   ret <8 x i8> %a
poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) {
  return vreinterpret_p8_u8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) {
  return vreinterpret_p8_u16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) {
  return vreinterpret_p8_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) {
  return vreinterpret_p8_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) {
  return vreinterpret_p8_f16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) {
  return vreinterpret_p8_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) {
  return vreinterpret_p8_p16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) {
  return vreinterpret_p16_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s16(<4 x i16> %a) #0 {
// CHECK:   ret <4 x i16> %a
poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) {
  return vreinterpret_p16_s16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) {
  return vreinterpret_p16_s32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) {
  return vreinterpret_p16_s64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) {
  return vreinterpret_p16_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u16(<4 x i16> %a) #0 {
// CHECK:   ret <4 x i16> %a
poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) {
  return vreinterpret_p16_u16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) {
  return vreinterpret_p16_u32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) {
  return vreinterpret_p16_u64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) {
  return vreinterpret_p16_f16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) {
  return vreinterpret_p16_f32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) {
  return vreinterpret_p16_p8(a);
}

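// The vreinterpretq_* tests below exercise the same bit-pattern
// reinterpretations on 128-bit (quad-register) vectors; as above, each one
// lowers to at most a single bitcast and never modifies the data.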
// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) {
  return vreinterpretq_s8_s16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) {
  return vreinterpretq_s8_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) {
  return vreinterpretq_s8_s64(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u8(<16 x i8> %a) #0 {
// CHECK:   ret <16 x i8> %a
int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) {
  return vreinterpretq_s8_u8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) {
  return vreinterpretq_s8_u16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) {
  return vreinterpretq_s8_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) {
  return vreinterpretq_s8_u64(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) {
  return vreinterpretq_s8_f16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) {
  return vreinterpretq_s8_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p8(<16 x i8> %a) #0 {
// CHECK:   ret <16 x i8> %a
int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) {
  return vreinterpretq_s8_p8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) {
  return vreinterpretq_s8_p16(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) {
  return vreinterpretq_s16_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) {
  return vreinterpretq_s16_s32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) {
  return vreinterpretq_s16_s64(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) {
  return vreinterpretq_s16_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u16(<8 x i16> %a) #0 {
// CHECK:   ret <8 x i16> %a
int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) {
  return vreinterpretq_s16_u16(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) {
  return vreinterpretq_s16_u32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) {
  return vreinterpretq_s16_u64(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) {
  return vreinterpretq_s16_f16(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) {
  return vreinterpretq_s16_f32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) {
  return vreinterpretq_s16_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p16(<8 x i16> %a) #0 {
// CHECK:   ret <8 x i16> %a
int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) {
  return vreinterpretq_s16_p16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) {
  return vreinterpretq_s32_s8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) {
  return vreinterpretq_s32_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) {
  return vreinterpretq_s32_s64(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) {
  return vreinterpretq_s32_u8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) {
  return vreinterpretq_s32_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u32(<4 x i32> %a) #0 {
// CHECK:   ret <4 x i32> %a
int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) {
  return vreinterpretq_s32_u32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) {
  return vreinterpretq_s32_u64(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) {
  return vreinterpretq_s32_f16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) {
  return vreinterpretq_s32_f32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) {
  return vreinterpretq_s32_p8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) {
  return vreinterpretq_s32_p16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) {
  return vreinterpretq_s64_s8(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) {
  return vreinterpretq_s64_s16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) {
  return vreinterpretq_s64_s32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) {
  return vreinterpretq_s64_u8(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) {
  return vreinterpretq_s64_u16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) {
  return vreinterpretq_s64_u32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u64(<2 x i64> %a) #0 {
// CHECK:   ret <2 x i64> %a
int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) {
  return vreinterpretq_s64_u64(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) {
  return vreinterpretq_s64_f16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) {
  return vreinterpretq_s64_f32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) {
  return vreinterpretq_s64_p8(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) {
  return vreinterpretq_s64_p16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s8(<16 x i8> %a) #0 {
// CHECK:   ret <16 x i8> %a
uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) {
  return vreinterpretq_u8_s8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) {
  return vreinterpretq_u8_s16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) {
  return vreinterpretq_u8_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) {
  return vreinterpretq_u8_s64(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) {
  return vreinterpretq_u8_u16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) {
  return vreinterpretq_u8_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) {
  return vreinterpretq_u8_u64(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) {
  return vreinterpretq_u8_f16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) {
  return vreinterpretq_u8_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p8(<16 x i8> %a) #0 {
// CHECK:   ret <16 x i8> %a
uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) {
  return vreinterpretq_u8_p8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) {
  return vreinterpretq_u8_p16(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) {
  return vreinterpretq_u16_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s16(<8 x i16> %a) #0 {
// CHECK:   ret <8 x i16> %a
uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) {
  return vreinterpretq_u16_s16(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) {
  return vreinterpretq_u16_s32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) {
  return vreinterpretq_u16_s64(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) {
  return vreinterpretq_u16_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) {
  return vreinterpretq_u16_u32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) {
  return vreinterpretq_u16_u64(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) {
  return vreinterpretq_u16_f16(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) {
  return vreinterpretq_u16_f32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) {
  return vreinterpretq_u16_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p16(<8 x i16> %a) #0 {
// CHECK:   ret <8 x i16> %a
uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) {
  return vreinterpretq_u16_p16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) {
  return vreinterpretq_u32_s8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) {
  return vreinterpretq_u32_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s32(<4 x i32> %a) #0 {
// CHECK:   ret <4 x i32> %a
uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) {
  return vreinterpretq_u32_s32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) {
  return vreinterpretq_u32_s64(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) {
  return vreinterpretq_u32_u8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) {
  return vreinterpretq_u32_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) {
  return vreinterpretq_u32_u64(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) {
  return vreinterpretq_u32_f16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) {
  return vreinterpretq_u32_f32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) {
  return vreinterpretq_u32_p8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) {
  return vreinterpretq_u32_p16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) {
  return vreinterpretq_u64_s8(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) {
  return vreinterpretq_u64_s16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) {
  return vreinterpretq_u64_s32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s64(<2 x i64> %a) #0 {
// CHECK:   ret <2 x i64> %a
uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) {
  return vreinterpretq_u64_s64(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) {
  return vreinterpretq_u64_u8(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) {
  return vreinterpretq_u64_u16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) {
  return vreinterpretq_u64_u32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) {
  return vreinterpretq_u64_f16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) {
  return vreinterpretq_u64_f32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) {
  return vreinterpretq_u64_p8(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) {
  return vreinterpretq_u64_p16(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) {
  return vreinterpretq_f16_s8(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) {
  return vreinterpretq_f16_s16(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) {
  return vreinterpretq_f16_s32(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) {
  return vreinterpretq_f16_s64(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) {
  return vreinterpretq_f16_u8(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) {
  return vreinterpretq_f16_u16(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) {
  return vreinterpretq_f16_u32(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) {
  return vreinterpretq_f16_u64(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) {
  return vreinterpretq_f16_f32(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) {
  return vreinterpretq_f16_p8(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) {
  return vreinterpretq_f16_p16(a);
}

// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) {
  return vreinterpretq_f32_s8(a);
}

// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) {
  return vreinterpretq_f32_s16(a);
}

// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) {
  return vreinterpretq_f32_s32(a);
}

// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) {
  return vreinterpretq_f32_s64(a);
}

// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) {
  return vreinterpretq_f32_u8(a);
}

// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) {
  return vreinterpretq_f32_u16(a);
}

// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) {
  return vreinterpretq_f32_u32(a);
}

// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
   15311 float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) {
   15312   return vreinterpretq_f32_u64(a);
   15313 }
   15314 
   15315 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f16(<8 x half> %a) #0 {
   15316 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float>
   15317 // CHECK:   ret <4 x float> [[TMP0]]
   15318 float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) {
   15319   return vreinterpretq_f32_f16(a);
   15320 }
   15321 
   15322 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p8(<16 x i8> %a) #0 {
   15323 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
   15324 // CHECK:   ret <4 x float> [[TMP0]]
   15325 float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) {
   15326   return vreinterpretq_f32_p8(a);
   15327 }
   15328 
   15329 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p16(<8 x i16> %a) #0 {
   15330 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
   15331 // CHECK:   ret <4 x float> [[TMP0]]
   15332 float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) {
   15333   return vreinterpretq_f32_p16(a);
   15334 }
   15335 
   15336 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s8(<16 x i8> %a) #0 {
   15337 // CHECK:   ret <16 x i8> %a
   15338 poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) {
   15339   return vreinterpretq_p8_s8(a);
   15340 }
   15341 
   15342 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s16(<8 x i16> %a) #0 {
   15343 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   15344 // CHECK:   ret <16 x i8> [[TMP0]]
   15345 poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) {
   15346   return vreinterpretq_p8_s16(a);
   15347 }
   15348 
   15349 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s32(<4 x i32> %a) #0 {
   15350 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   15351 // CHECK:   ret <16 x i8> [[TMP0]]
   15352 poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) {
   15353   return vreinterpretq_p8_s32(a);
   15354 }
   15355 
   15356 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s64(<2 x i64> %a) #0 {
   15357 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   15358 // CHECK:   ret <16 x i8> [[TMP0]]
   15359 poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) {
   15360   return vreinterpretq_p8_s64(a);
   15361 }
   15362 
   15363 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u8(<16 x i8> %a) #0 {
   15364 // CHECK:   ret <16 x i8> %a
   15365 poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) {
   15366   return vreinterpretq_p8_u8(a);
   15367 }
   15368 
   15369 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u16(<8 x i16> %a) #0 {
   15370 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   15371 // CHECK:   ret <16 x i8> [[TMP0]]
   15372 poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) {
   15373   return vreinterpretq_p8_u16(a);
   15374 }
   15375 
   15376 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u32(<4 x i32> %a) #0 {
   15377 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   15378 // CHECK:   ret <16 x i8> [[TMP0]]
   15379 poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) {
   15380   return vreinterpretq_p8_u32(a);
   15381 }
   15382 
   15383 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u64(<2 x i64> %a) #0 {
   15384 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   15385 // CHECK:   ret <16 x i8> [[TMP0]]
   15386 poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) {
   15387   return vreinterpretq_p8_u64(a);
   15388 }
   15389 
   15390 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f16(<8 x half> %a) #0 {
   15391 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
   15392 // CHECK:   ret <16 x i8> [[TMP0]]
   15393 poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) {
   15394   return vreinterpretq_p8_f16(a);
   15395 }
   15396 
   15397 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f32(<4 x float> %a) #0 {
   15398 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   15399 // CHECK:   ret <16 x i8> [[TMP0]]
   15400 poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) {
   15401   return vreinterpretq_p8_f32(a);
   15402 }
   15403 
   15404 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p16(<8 x i16> %a) #0 {
   15405 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   15406 // CHECK:   ret <16 x i8> [[TMP0]]
   15407 poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) {
   15408   return vreinterpretq_p8_p16(a);
   15409 }
   15410 
   15411 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s8(<16 x i8> %a) #0 {
   15412 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
   15413 // CHECK:   ret <8 x i16> [[TMP0]]
   15414 poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) {
   15415   return vreinterpretq_p16_s8(a);
   15416 }
   15417 
   15418 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s16(<8 x i16> %a) #0 {
   15419 // CHECK:   ret <8 x i16> %a
   15420 poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) {
   15421   return vreinterpretq_p16_s16(a);
   15422 }
   15423 
   15424 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s32(<4 x i32> %a) #0 {
   15425 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
   15426 // CHECK:   ret <8 x i16> [[TMP0]]
   15427 poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) {
   15428   return vreinterpretq_p16_s32(a);
   15429 }
   15430 
   15431 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s64(<2 x i64> %a) #0 {
   15432 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
   15433 // CHECK:   ret <8 x i16> [[TMP0]]
   15434 poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) {
   15435   return vreinterpretq_p16_s64(a);
   15436 }
   15437 
   15438 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u8(<16 x i8> %a) #0 {
   15439 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
   15440 // CHECK:   ret <8 x i16> [[TMP0]]
   15441 poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) {
   15442   return vreinterpretq_p16_u8(a);
   15443 }
   15444 
   15445 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u16(<8 x i16> %a) #0 {
   15446 // CHECK:   ret <8 x i16> %a
   15447 poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) {
   15448   return vreinterpretq_p16_u16(a);
   15449 }
   15450 
   15451 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u32(<4 x i32> %a) #0 {
   15452 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
   15453 // CHECK:   ret <8 x i16> [[TMP0]]
   15454 poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) {
   15455   return vreinterpretq_p16_u32(a);
   15456 }
   15457 
   15458 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u64(<2 x i64> %a) #0 {
   15459 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
   15460 // CHECK:   ret <8 x i16> [[TMP0]]
   15461 poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) {
   15462   return vreinterpretq_p16_u64(a);
   15463 }
   15464 
   15465 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f16(<8 x half> %a) #0 {
   15466 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
   15467 // CHECK:   ret <8 x i16> [[TMP0]]
   15468 poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) {
   15469   return vreinterpretq_p16_f16(a);
   15470 }
   15471 
   15472 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f32(<4 x float> %a) #0 {
   15473 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
   15474 // CHECK:   ret <8 x i16> [[TMP0]]
   15475 poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) {
   15476   return vreinterpretq_p16_f32(a);
   15477 }
   15478 
   15479 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p8(<16 x i8> %a) #0 {
   15480 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
   15481 // CHECK:   ret <8 x i16> [[TMP0]]
   15482 poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) {
   15483   return vreinterpretq_p16_p8(a);
   15484 }
   15485 
   15486 
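// vrev16 reverses the bytes within each 16-bit halfword. Clang lowers it to a
// plain shufflevector with the pairwise byte-swap mask checked below, so no
// target intrinsic call is involved.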
// CHECK-LABEL: define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vrev16_s8(int8x8_t a) {
  return vrev16_s8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vrev16_u8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vrev16_u8(uint8x8_t a) {
  return vrev16_u8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vrev16_p8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev16_p8(poly8x8_t a) {
  return vrev16_p8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrev16q_s8(int8x16_t a) {
  return vrev16q_s8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev16q_u8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrev16q_u8(uint8x16_t a) {
  return vrev16q_u8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev16q_p8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vrev16q_p8(poly8x16_t a) {
  return vrev16q_p8(a);
}


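// vrev32 reverses the elements within each 32-bit word: byte lanes use the
// mask <3,2,1,0,...> and halfword lanes the mask <1,0,3,2,...>, again as a
// bare shufflevector.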
// CHECK-LABEL: define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vrev32_s8(int8x8_t a) {
  return vrev32_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vrev32_s16(int16x4_t a) {
  return vrev32_s16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vrev32_u8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vrev32_u8(uint8x8_t a) {
  return vrev32_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vrev32_u16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vrev32_u16(uint16x4_t a) {
  return vrev32_u16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vrev32_p8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev32_p8(poly8x8_t a) {
  return vrev32_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vrev32_p16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vrev32_p16(poly16x4_t a) {
  return vrev32_p16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrev32q_s8(int8x16_t a) {
  return vrev32q_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vrev32q_s16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vrev32q_s16(int16x8_t a) {
  return vrev32q_s16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev32q_u8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrev32q_u8(uint8x16_t a) {
  return vrev32q_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vrev32q_u16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vrev32q_u16(uint16x8_t a) {
  return vrev32q_u16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev32q_p8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vrev32q_p8(poly8x16_t a) {
  return vrev32q_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vrev32q_p16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vrev32q_p16(poly16x8_t a) {
  return vrev32q_p16(a);
}


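// vrev64 reverses the elements within each 64-bit doubleword; it also covers
// the float32 lanes, since the shuffle is type-agnostic.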
// CHECK-LABEL: define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vrev64_s8(int8x8_t a) {
  return vrev64_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vrev64_s16(int16x4_t a) {
  return vrev64_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
int32x2_t test_vrev64_s32(int32x2_t a) {
  return vrev64_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vrev64_u8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vrev64_u8(uint8x8_t a) {
  return vrev64_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vrev64_u16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vrev64_u16(uint16x4_t a) {
  return vrev64_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vrev64_u32(<2 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vrev64_u32(uint32x2_t a) {
  return vrev64_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vrev64_p8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev64_p8(poly8x8_t a) {
  return vrev64_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vrev64_p16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vrev64_p16(poly16x4_t a) {
  return vrev64_p16(a);
}

// CHECK-LABEL: define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0>
// CHECK:   ret <2 x float> [[SHUFFLE_I]]
float32x2_t test_vrev64_f32(float32x2_t a) {
  return vrev64_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrev64q_s8(int8x16_t a) {
  return vrev64q_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vrev64q_s16(int16x8_t a) {
  return vrev64q_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
int32x4_t test_vrev64q_s32(int32x4_t a) {
  return vrev64q_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev64q_u8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrev64q_u8(uint8x16_t a) {
  return vrev64q_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vrev64q_u16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vrev64q_u16(uint16x8_t a) {
  return vrev64q_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vrev64q_u32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
uint32x4_t test_vrev64q_u32(uint32x4_t a) {
  return vrev64q_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev64q_p8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vrev64q_p8(poly8x16_t a) {
  return vrev64q_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vrev64q_p16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vrev64q_p16(poly16x8_t a) {
  return vrev64q_p16(a);
}

// CHECK-LABEL: define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK:   ret <4 x float> [[SHUFFLE_I]]
float32x4_t test_vrev64q_f32(float32x4_t a) {
  return vrev64q_f32(a);
}


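// vrhadd is the rounding halving add: each lane computes (a + b + 1) >> 1
// without losing the carry. Signed element types select
// @llvm.arm.neon.vrhadds and unsigned ones @llvm.arm.neon.vrhaddu; the
// <8 x i8>/<16 x i8> round-trip bitcasts are an artifact of the generic NEON
// builtin lowering.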
// CHECK-LABEL: define <8 x i8> @test_vrhadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VRHADD_V_I]]
int8x8_t test_vrhadd_s8(int8x8_t a, int8x8_t b) {
  return vrhadd_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vrhadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vrhadd_s16(int16x4_t a, int16x4_t b) {
  return vrhadd_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vrhadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vrhadd_s32(int32x2_t a, int32x2_t b) {
  return vrhadd_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vrhadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VRHADD_V_I]]
uint8x8_t test_vrhadd_u8(uint8x8_t a, uint8x8_t b) {
  return vrhadd_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vrhadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vrhadd_u16(uint16x4_t a, uint16x4_t b) {
  return vrhadd_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vrhadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vrhadd_u32(uint32x2_t a, uint32x2_t b) {
  return vrhadd_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vrhaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VRHADDQ_V_I]]
int8x16_t test_vrhaddq_s8(int8x16_t a, int8x16_t b) {
  return vrhaddq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vrhaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b) {
  return vrhaddq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vrhaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vrhaddq_s32(int32x4_t a, int32x4_t b) {
  return vrhaddq_s32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vrhaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VRHADDQ_V_I]]
uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b) {
  return vrhaddq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vrhaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vrhaddq_u16(uint16x8_t a, uint16x8_t b) {
  return vrhaddq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vrhaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b) {
  return vrhaddq_u32(a, b);
}


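// vrshl is a rounding shift by a signed, per-lane shift count: positive counts
// shift left, negative counts shift right with rounding. Only the intrinsic
// (vrshifts vs. vrshiftu) changes for unsigned data; the count operand itself
// is always signed, as the C prototypes below show.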
// CHECK-LABEL: define <8 x i8> @test_vrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VRSHL_V_I]]
int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
  return vrshl_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
  return vrshl_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
  return vrshl_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4
// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
  return vrshl_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VRSHL_V_I]]
uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
  return vrshl_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
  return vrshl_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
  return vrshl_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4
// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
  return vrshl_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VRSHLQ_V_I]]
int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) {
  return vrshlq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4
// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) {
  return vrshlq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4
// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) {
  return vrshlq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4
// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) {
  return vrshlq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VRSHLQ_V_I]]
uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) {
  return vrshlq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4
// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) {
  return vrshlq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4
// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) {
  return vrshlq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4
// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) {
  return vrshlq_u64(a, b);
}


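// vrshrn_n is a rounding shift right by an immediate that narrows each lane to
// half its width. In the IR the immediate is encoded as a splat of the negated
// shift amount, so the shift by 1 in these tests appears as a splat of -1
// passed to @llvm.arm.neon.vrshiftn.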
// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i8> [[VRSHRN_N1]]
int8x8_t test_vrshrn_n_s16(int16x8_t a) {
  return vrshrn_n_s16(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i16> [[VRSHRN_N1]]
int16x4_t test_vrshrn_n_s32(int32x4_t a) {
  return vrshrn_n_s32(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i32> [[VRSHRN_N1]]
int32x2_t test_vrshrn_n_s64(int64x2_t a) {
  return vrshrn_n_s64(a, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i8> [[VRSHRN_N1]]
uint8x8_t test_vrshrn_n_u16(uint16x8_t a) {
  return vrshrn_n_u16(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i16> [[VRSHRN_N1]]
uint16x4_t test_vrshrn_n_u32(uint32x4_t a) {
  return vrshrn_n_u32(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i32> [[VRSHRN_N1]]
uint32x2_t test_vrshrn_n_u64(uint64x2_t a) {
  return vrshrn_n_u64(a, 1);
}


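// vrshr_n is a rounding shift right by an immediate. It is expressed through
// the rounding-shift-left intrinsics, so the count again shows up as a splat
// of -1 for a shift by 1.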
   16115 // CHECK-LABEL: define <8 x i8> @test_vrshr_n_s8(<8 x i8> %a) #0 {
   16116 // CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   16117 // CHECK:   ret <8 x i8> [[VRSHR_N]]
   16118 int8x8_t test_vrshr_n_s8(int8x8_t a) {
   16119   return vrshr_n_s8(a, 1);
   16120 }
   16121 
   16122 // CHECK-LABEL: define <4 x i16> @test_vrshr_n_s16(<4 x i16> %a) #0 {
   16123 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   16124 // CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   16125 // CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
   16126 // CHECK:   ret <4 x i16> [[VRSHR_N1]]
   16127 int16x4_t test_vrshr_n_s16(int16x4_t a) {
   16128   return vrshr_n_s16(a, 1);
   16129 }
   16130 
   16131 // CHECK-LABEL: define <2 x i32> @test_vrshr_n_s32(<2 x i32> %a) #0 {
   16132 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   16133 // CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   16134 // CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>)
   16135 // CHECK:   ret <2 x i32> [[VRSHR_N1]]
   16136 int32x2_t test_vrshr_n_s32(int32x2_t a) {
   16137   return vrshr_n_s32(a, 1);
   16138 }
   16139 
   16140 // CHECK-LABEL: define <1 x i64> @test_vrshr_n_s64(<1 x i64> %a) #0 {
   16141 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   16142 // CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   16143 // CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
   16144 // CHECK:   ret <1 x i64> [[VRSHR_N1]]
   16145 int64x1_t test_vrshr_n_s64(int64x1_t a) {
   16146   return vrshr_n_s64(a, 1);
   16147 }
   16148 
   16149 // CHECK-LABEL: define <8 x i8> @test_vrshr_n_u8(<8 x i8> %a) #0 {
   16150 // CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   16151 // CHECK:   ret <8 x i8> [[VRSHR_N]]
   16152 uint8x8_t test_vrshr_n_u8(uint8x8_t a) {
   16153   return vrshr_n_u8(a, 1);
   16154 }
   16155 
   16156 // CHECK-LABEL: define <4 x i16> @test_vrshr_n_u16(<4 x i16> %a) #0 {
   16157 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   16158 // CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   16159 // CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
   16160 // CHECK:   ret <4 x i16> [[VRSHR_N1]]
   16161 uint16x4_t test_vrshr_n_u16(uint16x4_t a) {
   16162   return vrshr_n_u16(a, 1);
   16163 }
   16164 
   16165 // CHECK-LABEL: define <2 x i32> @test_vrshr_n_u32(<2 x i32> %a) #0 {
   16166 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   16167 // CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   16168 // CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>)
   16169 // CHECK:   ret <2 x i32> [[VRSHR_N1]]
   16170 uint32x2_t test_vrshr_n_u32(uint32x2_t a) {
   16171   return vrshr_n_u32(a, 1);
   16172 }
   16173 
   16174 // CHECK-LABEL: define <1 x i64> @test_vrshr_n_u64(<1 x i64> %a) #0 {
   16175 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   16176 // CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   16177 // CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
   16178 // CHECK:   ret <1 x i64> [[VRSHR_N1]]
   16179 uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
   16180   return vrshr_n_u64(a, 1);
   16181 }
   16182 
// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) #0 {
// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   ret <16 x i8> [[VRSHR_N]]
int8x16_t test_vrshrq_n_s8(int8x16_t a) {
  return vrshrq_n_s8(a, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i16> [[VRSHR_N1]]
int16x8_t test_vrshrq_n_s16(int16x8_t a) {
  return vrshrq_n_s16(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i32> [[VRSHR_N1]]
int32x4_t test_vrshrq_n_s32(int32x4_t a) {
  return vrshrq_n_s32(a, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i64> [[VRSHR_N1]]
int64x2_t test_vrshrq_n_s64(int64x2_t a) {
  return vrshrq_n_s64(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) #0 {
// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   ret <16 x i8> [[VRSHR_N]]
uint8x16_t test_vrshrq_n_u8(uint8x16_t a) {
  return vrshrq_n_u8(a, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i16> [[VRSHR_N1]]
uint16x8_t test_vrshrq_n_u16(uint16x8_t a) {
  return vrshrq_n_u16(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i32> [[VRSHR_N1]]
uint32x4_t test_vrshrq_n_u32(uint32x4_t a) {
  return vrshrq_n_u32(a, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i64> [[VRSHR_N1]]
uint64x2_t test_vrshrq_n_u64(uint64x2_t a) {
  return vrshrq_n_u64(a, 1);
}

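// vrsqrte/vrsqrteq: VRSQRTE reciprocal square-root estimate. Lowered to the
// @llvm.arm.neon.vrsqrte intrinsic after the bitcast round-trip through
// <8 x i8>/<16 x i8> that the type-generic NEON argument handling inserts.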
// CHECK-LABEL: define <2 x float> @test_vrsqrte_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> [[VRSQRTE_V_I]]) #4
// CHECK:   ret <2 x float> [[VRSQRTE_V1_I]]
float32x2_t test_vrsqrte_f32(float32x2_t a) {
  return vrsqrte_f32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vrsqrte_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> [[VRSQRTE_V_I]]) #4
// CHECK:   ret <2 x i32> [[VRSQRTE_V1_I]]
uint32x2_t test_vrsqrte_u32(uint32x2_t a) {
  return vrsqrte_u32(a);
}

// CHECK-LABEL: define <4 x float> @test_vrsqrteq_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> [[VRSQRTEQ_V_I]]) #4
// CHECK:   ret <4 x float> [[VRSQRTEQ_V1_I]]
float32x4_t test_vrsqrteq_f32(float32x4_t a) {
  return vrsqrteq_f32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vrsqrteq_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> [[VRSQRTEQ_V_I]]) #4
// CHECK:   ret <4 x i32> [[VRSQRTEQ_V1_I]]
uint32x4_t test_vrsqrteq_u32(uint32x4_t a) {
  return vrsqrteq_u32(a);
}

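// vrsqrts/vrsqrtsq: VRSQRTS Newton-Raphson step, (3 - a*b) / 2, used to
// refine a vrsqrte estimate; maps to @llvm.arm.neon.vrsqrts.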
// CHECK-LABEL: define <2 x float> @test_vrsqrts_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> [[VRSQRTS_V_I]], <2 x float> [[VRSQRTS_V1_I]]) #4
// CHECK:   [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <2 x float>
// CHECK:   ret <2 x float> [[TMP2]]
float32x2_t test_vrsqrts_f32(float32x2_t a, float32x2_t b) {
  return vrsqrts_f32(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vrsqrtsq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> [[VRSQRTSQ_V_I]], <4 x float> [[VRSQRTSQ_V1_I]]) #4
// CHECK:   [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <4 x float>
// CHECK:   ret <4 x float> [[TMP2]]
float32x4_t test_vrsqrtsq_f32(float32x4_t a, float32x4_t b) {
  return vrsqrtsq_f32(a, b);
}

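// vrsra_n/vrsraq_n: rounding shift right by immediate and accumulate. The
// shift is emitted as @llvm.arm.neon.vrshifts/vrshiftu with a splat of -n
// (a rounding left shift by a negative amount), followed by a plain add.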
// CHECK-LABEL: define <8 x i8> @test_vrsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]]
// CHECK:   ret <8 x i8> [[VRSRA_N]]
int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) {
  return vrsra_n_s8(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vrsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[TMP3]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
// CHECK:   ret <4 x i16> [[VRSRA_N]]
int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) {
  return vrsra_n_s16(a, b, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vrsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[TMP3]], <2 x i32> <i32 -1, i32 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
// CHECK:   ret <2 x i32> [[VRSRA_N]]
int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) {
  return vrsra_n_s32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vrsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[TMP3]], <1 x i64> <i64 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
// CHECK:   ret <1 x i64> [[VRSRA_N]]
int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) {
  return vrsra_n_s64(a, b, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vrsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]]
// CHECK:   ret <8 x i8> [[VRSRA_N]]
uint8x8_t test_vrsra_n_u8(uint8x8_t a, uint8x8_t b) {
  return vrsra_n_u8(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vrsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[TMP3]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
// CHECK:   ret <4 x i16> [[VRSRA_N]]
uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) {
  return vrsra_n_u16(a, b, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vrsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[TMP3]], <2 x i32> <i32 -1, i32 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
// CHECK:   ret <2 x i32> [[VRSRA_N]]
uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) {
  return vrsra_n_u32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vrsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[TMP3]], <1 x i64> <i64 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
// CHECK:   ret <1 x i64> [[VRSRA_N]]
uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) {
  return vrsra_n_u64(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]]
// CHECK:   ret <16 x i8> [[VRSRA_N]]
int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) {
  return vrsraq_n_s8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[TMP3]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
// CHECK:   ret <8 x i16> [[VRSRA_N]]
int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) {
  return vrsraq_n_s16(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[TMP3]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
// CHECK:   ret <4 x i32> [[VRSRA_N]]
int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) {
  return vrsraq_n_s32(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
// CHECK:   ret <2 x i64> [[VRSRA_N]]
int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) {
  return vrsraq_n_s64(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]]
// CHECK:   ret <16 x i8> [[VRSRA_N]]
uint8x16_t test_vrsraq_n_u8(uint8x16_t a, uint8x16_t b) {
  return vrsraq_n_u8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[TMP3]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
// CHECK:   ret <8 x i16> [[VRSRA_N]]
uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) {
  return vrsraq_n_u16(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[TMP3]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
// CHECK:   ret <4 x i32> [[VRSRA_N]]
uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) {
  return vrsraq_n_u32(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
// CHECK:   ret <2 x i64> [[VRSRA_N]]
uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) {
  return vrsraq_n_u64(a, b, 1);
}

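// vrsubhn: rounding subtract and narrow, returning the high half of each
// lane of the rounded difference; maps to @llvm.arm.neon.vrsubhn.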
// CHECK-LABEL: define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4
// CHECK:   ret <8 x i8> [[VRSUBHN_V2_I]]
int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) {
  return vrsubhn_s16(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4
// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) {
  return vrsubhn_s32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4
// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) {
  return vrsubhn_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4
// CHECK:   ret <8 x i8> [[VRSUBHN_V2_I]]
uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) {
  return vrsubhn_u16(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4
// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) {
  return vrsubhn_u32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4
// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) {
  return vrsubhn_u64(a, b);
}

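// vset_lane/vsetq_lane: insert a scalar into one lane, lowered to a plain
// insertelement. The f16 variants go through stack temporaries because
// arm_neon.h implements them by reinterpreting the half values through i16.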
// CHECK-LABEL: define <8 x i8> @test_vset_lane_u8(i8 zeroext %a, <8 x i8> %b) #0 {
// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VSET_LANE]]
uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
  return vset_lane_u8(a, b, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vset_lane_u16(i16 zeroext %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VSET_LANE]]
uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) {
  return vset_lane_u16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vset_lane_u32(i32 %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
// CHECK:   ret <2 x i32> [[VSET_LANE]]
uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
  return vset_lane_u32(a, b, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vset_lane_s8(i8 signext %a, <8 x i8> %b) #0 {
// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VSET_LANE]]
int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) {
  return vset_lane_s8(a, b, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vset_lane_s16(i16 signext %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VSET_LANE]]
int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) {
  return vset_lane_s16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vset_lane_s32(i32 %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
// CHECK:   ret <2 x i32> [[VSET_LANE]]
int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) {
  return vset_lane_s32(a, b, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vset_lane_p8(i8 signext %a, <8 x i8> %b) #0 {
// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VSET_LANE]]
poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) {
  return vset_lane_p8(a, b, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vset_lane_p16(i16 signext %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VSET_LANE]]
poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) {
  return vset_lane_p16(a, b, 3);
}

// CHECK-LABEL: define <2 x float> @test_vset_lane_f32(float %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x float> [[TMP1]], float %a, i32 1
// CHECK:   ret <2 x float> [[VSET_LANE]]
float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) {
  return vset_lane_f32(a, b, 1);
}

// CHECK-LABEL: define <4 x half> @test_vset_lane_f16(half* %a, <4 x half> %b) #0 {
// CHECK:   [[__REINT_246:%.*]] = alloca half, align 2
// CHECK:   [[__REINT1_246:%.*]] = alloca <4 x half>, align 8
// CHECK:   [[__REINT2_246:%.*]] = alloca <4 x i16>, align 8
// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
// CHECK:   store half [[TMP0]], half* [[__REINT_246]], align 2
// CHECK:   store <4 x half> %b, <4 x half>* [[__REINT1_246]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast half* [[__REINT_246]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = bitcast <4 x half>* [[__REINT1_246]] to <4 x i16>*
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP3]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[TMP2]], i32 1
// CHECK:   store <4 x i16> [[VSET_LANE]], <4 x i16>* [[__REINT2_246]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16>* [[__REINT2_246]] to <4 x half>*
// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[TMP7]], align 8
// CHECK:   ret <4 x half> [[TMP8]]
float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) {
  return vset_lane_f16(*a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_u8(i8 zeroext %a, <16 x i8> %b) #0 {
// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VSET_LANE]]
uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) {
  return vsetq_lane_u8(a, b, 15);
}

// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_u16(i16 zeroext %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VSET_LANE]]
uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) {
  return vsetq_lane_u16(a, b, 7);
}

// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_u32(i32 %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3
// CHECK:   ret <4 x i32> [[VSET_LANE]]
uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) {
  return vsetq_lane_u32(a, b, 3);
}

// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_s8(i8 signext %a, <16 x i8> %b) #0 {
// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VSET_LANE]]
int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) {
  return vsetq_lane_s8(a, b, 15);
}

// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_s16(i16 signext %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VSET_LANE]]
int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) {
  return vsetq_lane_s16(a, b, 7);
}

// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_s32(i32 %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3
// CHECK:   ret <4 x i32> [[VSET_LANE]]
int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) {
  return vsetq_lane_s32(a, b, 3);
}

// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_p8(i8 signext %a, <16 x i8> %b) #0 {
// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VSET_LANE]]
poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) {
  return vsetq_lane_p8(a, b, 15);
}

// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_p16(i16 signext %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VSET_LANE]]
poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t b) {
  return vsetq_lane_p16(a, b, 7);
}

// CHECK-LABEL: define <4 x float> @test_vsetq_lane_f32(float %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP1]], float %a, i32 3
// CHECK:   ret <4 x float> [[VSET_LANE]]
float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
  return vsetq_lane_f32(a, b, 3);
}

// CHECK-LABEL: define <8 x half> @test_vsetq_lane_f16(half* %a, <8 x half> %b) #0 {
// CHECK:   [[__REINT_248:%.*]] = alloca half, align 2
// CHECK:   [[__REINT1_248:%.*]] = alloca <8 x half>, align 16
// CHECK:   [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16
// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
// CHECK:   store half [[TMP0]], half* [[__REINT_248]], align 2
// CHECK:   store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast half* [[__REINT_248]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = bitcast <8 x half>* [[__REINT1_248]] to <8 x i16>*
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[TMP2]], i32 3
// CHECK:   store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16>* [[__REINT2_248]] to <8 x half>*
// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 16
// CHECK:   ret <8 x half> [[TMP8]]
float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) {
  return vsetq_lane_f16(*a, b, 3);
}

// The optimizer is able to get rid of all moves now.
// CHECK-LABEL: define <1 x i64> @test_vset_lane_s64(i64 %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
// CHECK:   ret <1 x i64> [[VSET_LANE]]
int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) {
  return vset_lane_s64(a, b, 0);
}

// The optimizer is able to get rid of all moves now.
// CHECK-LABEL: define <1 x i64> @test_vset_lane_u64(i64 %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
// CHECK:   ret <1 x i64> [[VSET_LANE]]
uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) {
  return vset_lane_u64(a, b, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_s64(i64 %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
// CHECK:   ret <2 x i64> [[VSET_LANE]]
int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) {
  return vsetq_lane_s64(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_u64(i64 %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
// CHECK:   ret <2 x i64> [[VSET_LANE]]
uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) {
  return vsetq_lane_u64(a, b, 1);
}

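// vshl/vshlq (register form): each lane of %a is shifted left by the signed
// per-lane amount in %b (negative amounts shift right); maps to
// @llvm.arm.neon.vshifts/vshiftu.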
// CHECK-LABEL: define <8 x i8> @test_vshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VSHL_V_I]]
int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) {
  return vshl_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4
// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
  return vshl_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4
// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
  return vshl_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4
// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) {
  return vshl_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VSHL_V_I]]
uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) {
  return vshl_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4
// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
  return vshl_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4
// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
  return vshl_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4
// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
  return vshl_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VSHLQ_V_I]]
int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
  return vshlq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4
// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
  return vshlq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4
// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
  return vshlq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4
// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
  return vshlq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VSHLQ_V_I]]
uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
  return vshlq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4
// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
  return vshlq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4
// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
  return vshlq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4
// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
  return vshlq_u64(a, b);
}

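// vshll_n: shift left long. No intrinsic call is needed: the input is
// widened with sext/zext and then shifted with a plain IR shl.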
// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK:   ret <8 x i16> [[VSHLL_N]]
int16x8_t test_vshll_n_s8(int8x8_t a) {
  return vshll_n_s8(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
// CHECK:   ret <4 x i32> [[VSHLL_N]]
int32x4_t test_vshll_n_s16(int16x4_t a) {
  return vshll_n_s16(a, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 1, i64 1>
// CHECK:   ret <2 x i64> [[VSHLL_N]]
int64x2_t test_vshll_n_s32(int32x2_t a) {
  return vshll_n_s32(a, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK:   ret <8 x i16> [[VSHLL_N]]
uint16x8_t test_vshll_n_u8(uint8x8_t a) {
  return vshll_n_u8(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
// CHECK:   ret <4 x i32> [[VSHLL_N]]
uint32x4_t test_vshll_n_u16(uint16x4_t a) {
  return vshll_n_u16(a, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 1, i64 1>
// CHECK:   ret <2 x i64> [[VSHLL_N]]
uint64x2_t test_vshll_n_u32(uint32x2_t a) {
  return vshll_n_u32(a, 1);
}

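// vshl_n/vshlq_n: shift left by immediate, emitted as a plain IR shl by a
// splat constant.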
// CHECK-LABEL: define <8 x i8> @test_vshl_n_s8(<8 x i8> %a) #0 {
// CHECK:   [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK:   ret <8 x i8> [[VSHL_N]]
int8x8_t test_vshl_n_s8(int8x8_t a) {
  return vshl_n_s8(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vshl_n_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
// CHECK:   ret <4 x i16> [[VSHL_N]]
int16x4_t test_vshl_n_s16(int16x4_t a) {
  return vshl_n_s16(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vshl_n_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1>
// CHECK:   ret <2 x i32> [[VSHL_N]]
int32x2_t test_vshl_n_s32(int32x2_t a) {
  return vshl_n_s32(a, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vshl_n_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
// CHECK:   ret <1 x i64> [[VSHL_N]]
int64x1_t test_vshl_n_s64(int64x1_t a) {
  return vshl_n_s64(a, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vshl_n_u8(<8 x i8> %a) #0 {
// CHECK:   [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK:   ret <8 x i8> [[VSHL_N]]
uint8x8_t test_vshl_n_u8(uint8x8_t a) {
  return vshl_n_u8(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vshl_n_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
// CHECK:   ret <4 x i16> [[VSHL_N]]
uint16x4_t test_vshl_n_u16(uint16x4_t a) {
  return vshl_n_u16(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vshl_n_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1>
// CHECK:   ret <2 x i32> [[VSHL_N]]
uint32x2_t test_vshl_n_u32(uint32x2_t a) {
  return vshl_n_u32(a, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vshl_n_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
// CHECK:   ret <1 x i64> [[VSHL_N]]
uint64x1_t test_vshl_n_u64(uint64x1_t a) {
  return vshl_n_u64(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vshlq_n_s8(<16 x i8> %a) #0 {
// CHECK:   [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK:   ret <16 x i8> [[VSHL_N]]
int8x16_t test_vshlq_n_s8(int8x16_t a) {
  return vshlq_n_s8(a, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vshlq_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK:   ret <8 x i16> [[VSHL_N]]
int16x8_t test_vshlq_n_s16(int16x8_t a) {
  return vshlq_n_s16(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vshlq_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
// CHECK:   ret <4 x i32> [[VSHL_N]]
int32x4_t test_vshlq_n_s32(int32x4_t a) {
  return vshlq_n_s32(a, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vshlq_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1>
// CHECK:   ret <2 x i64> [[VSHL_N]]
int64x2_t test_vshlq_n_s64(int64x2_t a) {
  return vshlq_n_s64(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vshlq_n_u8(<16 x i8> %a) #0 {
// CHECK:   [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK:   ret <16 x i8> [[VSHL_N]]
uint8x16_t test_vshlq_n_u8(uint8x16_t a) {
  return vshlq_n_u8(a, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vshlq_n_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK:   ret <8 x i16> [[VSHL_N]]
uint16x8_t test_vshlq_n_u16(uint16x8_t a) {
  return vshlq_n_u16(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vshlq_n_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
// CHECK:   ret <4 x i32> [[VSHL_N]]
uint32x4_t test_vshlq_n_u32(uint32x4_t a) {
  return vshlq_n_u32(a, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vshlq_n_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1>
// CHECK:   ret <2 x i64> [[VSHL_N]]
uint64x2_t test_vshlq_n_u64(uint64x2_t a) {
  return vshlq_n_u64(a, 1);
}

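// vshrn_n: shift right and narrow, emitted as ashr (signed) or lshr
// (unsigned) followed by a trunc to the half-width vector.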
// CHECK-LABEL: define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VSHRN_N]]
int8x8_t test_vshrn_n_s16(int16x8_t a) {
  return vshrn_n_s16(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VSHRN_N]]
int16x4_t test_vshrn_n_s32(int32x4_t a) {
  return vshrn_n_s32(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1>
// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VSHRN_N]]
int32x2_t test_vshrn_n_s64(int64x2_t a) {
  return vshrn_n_s64(a, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VSHRN_N]]
uint8x8_t test_vshrn_n_u16(uint16x8_t a) {
  return vshrn_n_u16(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VSHRN_N]]
uint16x4_t test_vshrn_n_u32(uint32x4_t a) {
  return vshrn_n_u32(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1>
// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VSHRN_N]]
uint32x2_t test_vshrn_n_u64(uint64x2_t a) {
  return vshrn_n_u64(a, 1);
}

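// vshr_n/vshrq_n right shifts by an immediate lower to plain IR ashr (signed)
// or lshr (unsigned); no NEON intrinsic call is involved.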
// CHECK-LABEL: define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) #0 {
// CHECK:   [[VSHR_N:%.*]] = ashr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK:   ret <8 x i8> [[VSHR_N]]
int8x8_t test_vshr_n_s8(int8x8_t a) {
  return vshr_n_s8(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
// CHECK:   ret <4 x i16> [[VSHR_N]]
int16x4_t test_vshr_n_s16(int16x4_t a) {
  return vshr_n_s16(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], <i32 1, i32 1>
// CHECK:   ret <2 x i32> [[VSHR_N]]
int32x2_t test_vshr_n_s32(int32x2_t a) {
  return vshr_n_s32(a, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vshr_n_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], <i64 1>
// CHECK:   ret <1 x i64> [[VSHR_N]]
int64x1_t test_vshr_n_s64(int64x1_t a) {
  return vshr_n_s64(a, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) #0 {
// CHECK:   [[VSHR_N:%.*]] = lshr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK:   ret <8 x i8> [[VSHR_N]]
uint8x8_t test_vshr_n_u8(uint8x8_t a) {
  return vshr_n_u8(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
// CHECK:   ret <4 x i16> [[VSHR_N]]
uint16x4_t test_vshr_n_u16(uint16x4_t a) {
  return vshr_n_u16(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], <i32 1, i32 1>
// CHECK:   ret <2 x i32> [[VSHR_N]]
uint32x2_t test_vshr_n_u32(uint32x2_t a) {
  return vshr_n_u32(a, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vshr_n_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], <i64 1>
// CHECK:   ret <1 x i64> [[VSHR_N]]
uint64x1_t test_vshr_n_u64(uint64x1_t a) {
  return vshr_n_u64(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) #0 {
// CHECK:   [[VSHR_N:%.*]] = ashr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK:   ret <16 x i8> [[VSHR_N]]
int8x16_t test_vshrq_n_s8(int8x16_t a) {
  return vshrq_n_s8(a, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK:   ret <8 x i16> [[VSHR_N]]
int16x8_t test_vshrq_n_s16(int16x8_t a) {
  return vshrq_n_s16(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
// CHECK:   ret <4 x i32> [[VSHR_N]]
int32x4_t test_vshrq_n_s32(int32x4_t a) {
  return vshrq_n_s32(a, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1>
// CHECK:   ret <2 x i64> [[VSHR_N]]
int64x2_t test_vshrq_n_s64(int64x2_t a) {
  return vshrq_n_s64(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) #0 {
// CHECK:   [[VSHR_N:%.*]] = lshr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK:   ret <16 x i8> [[VSHR_N]]
uint8x16_t test_vshrq_n_u8(uint8x16_t a) {
  return vshrq_n_u8(a, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK:   ret <8 x i16> [[VSHR_N]]
uint16x8_t test_vshrq_n_u16(uint16x8_t a) {
  return vshrq_n_u16(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
// CHECK:   ret <4 x i32> [[VSHR_N]]
uint32x4_t test_vshrq_n_u32(uint32x4_t a) {
  return vshrq_n_u32(a, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1>
// CHECK:   ret <2 x i64> [[VSHR_N]]
uint64x2_t test_vshrq_n_u64(uint64x2_t a) {
  return vshrq_n_u64(a, 1);
}

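// vsli_n/vsliq_n (shift left and insert) lower to the @llvm.arm.neon.vshiftins
// intrinsic with the shift amount splatted as a positive constant vector.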
// CHECK-LABEL: define <8 x i8> @test_vsli_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <8 x i8> [[VSLI_N]]
int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) {
  return vsli_n_s8(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vsli_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <4 x i16> [[VSLI_N2]]
int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) {
  return vsli_n_s16(a, b, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vsli_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 1, i32 1>)
// CHECK:   ret <2 x i32> [[VSLI_N2]]
int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) {
  return vsli_n_s32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vsli_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 1>)
// CHECK:   ret <1 x i64> [[VSLI_N2]]
int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) {
  return vsli_n_s64(a, b, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vsli_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <8 x i8> [[VSLI_N]]
uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) {
  return vsli_n_u8(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vsli_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <4 x i16> [[VSLI_N2]]
uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) {
  return vsli_n_u16(a, b, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vsli_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 1, i32 1>)
// CHECK:   ret <2 x i32> [[VSLI_N2]]
uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) {
  return vsli_n_u32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vsli_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 1>)
// CHECK:   ret <1 x i64> [[VSLI_N2]]
uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) {
  return vsli_n_u64(a, b, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vsli_n_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <8 x i8> [[VSLI_N]]
poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) {
  return vsli_n_p8(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vsli_n_p16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <4 x i16> [[VSLI_N2]]
poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) {
  return vsli_n_p16(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <16 x i8> [[VSLI_N]]
int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) {
  return vsliq_n_s8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <8 x i16> [[VSLI_N2]]
int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) {
  return vsliq_n_s16(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
// CHECK:   ret <4 x i32> [[VSLI_N2]]
int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) {
  return vsliq_n_s32(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vsliq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 1, i64 1>)
// CHECK:   ret <2 x i64> [[VSLI_N2]]
int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) {
  return vsliq_n_s64(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vsliq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <16 x i8> [[VSLI_N]]
uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) {
  return vsliq_n_u8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vsliq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <8 x i16> [[VSLI_N2]]
uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) {
  return vsliq_n_u16(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vsliq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
// CHECK:   ret <4 x i32> [[VSLI_N2]]
uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) {
  return vsliq_n_u32(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vsliq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 1, i64 1>)
// CHECK:   ret <2 x i64> [[VSLI_N2]]
uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) {
  return vsliq_n_u64(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vsliq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <16 x i8> [[VSLI_N]]
poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) {
  return vsliq_n_p8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vsliq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <8 x i16> [[VSLI_N2]]
poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) {
  return vsliq_n_p16(a, b, 1);
}

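// vsra_n/vsraq_n (shift right and accumulate) lower to a right shift of the
// second operand followed by an add into the accumulator operand.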
// CHECK-LABEL: define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSRA_N:%.*]] = ashr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) {
  return vsra_n_s8(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1>
// CHECK:   [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <4 x i16> [[TMP4]]
int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) {
  return vsra_n_s16(a, b, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], <i32 1, i32 1>
// CHECK:   [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <2 x i32> [[TMP4]]
int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) {
  return vsra_n_s32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], <i64 1>
// CHECK:   [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <1 x i64> [[TMP4]]
int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) {
  return vsra_n_s64(a, b, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSRA_N:%.*]] = lshr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vsra_n_u8(uint8x8_t a, uint8x8_t b) {
  return vsra_n_u8(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1>
// CHECK:   [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <4 x i16> [[TMP4]]
uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) {
  return vsra_n_u16(a, b, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], <i32 1, i32 1>
// CHECK:   [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <2 x i32> [[TMP4]]
uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) {
  return vsra_n_u32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], <i64 1>
// CHECK:   [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <1 x i64> [[TMP4]]
uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) {
  return vsra_n_u64(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSRA_N:%.*]] = ashr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) {
  return vsraq_n_s8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK:   [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <8 x i16> [[TMP4]]
int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) {
  return vsraq_n_s16(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <4 x i32> [[TMP4]]
int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) {
  return vsraq_n_s32(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], <i64 1, i64 1>
// CHECK:   [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <2 x i64> [[TMP4]]
int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) {
  return vsraq_n_s64(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSRA_N:%.*]] = lshr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vsraq_n_u8(uint8x16_t a, uint8x16_t b) {
  return vsraq_n_u8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK:   [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <8 x i16> [[TMP4]]
uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) {
  return vsraq_n_u16(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <4 x i32> [[TMP4]]
uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) {
  return vsraq_n_u32(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], <i64 1, i64 1>
// CHECK:   [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <2 x i64> [[TMP4]]
uint64x2_t test_vsraq_n_u64(uint64x2_t a, uint64x2_t b) {
  return vsraq_n_u64(a, b, 1);
}

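// vsri_n/vsriq_n (shift right and insert) reuse @llvm.arm.neon.vshiftins; the
// right-shift amount is encoded as a negated splat (a shift of 1 becomes -1).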
// CHECK-LABEL: define <8 x i8> @test_vsri_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   ret <8 x i8> [[VSLI_N]]
int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) {
  return vsri_n_s8(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vsri_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <4 x i16> [[VSLI_N2]]
int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) {
  return vsri_n_s16(a, b, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vsri_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 -1, i32 -1>)
// CHECK:   ret <2 x i32> [[VSLI_N2]]
int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) {
  return vsri_n_s32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vsri_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 -1>)
// CHECK:   ret <1 x i64> [[VSLI_N2]]
int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) {
  return vsri_n_s64(a, b, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vsri_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   ret <8 x i8> [[VSLI_N]]
uint8x8_t test_vsri_n_u8(uint8x8_t a, uint8x8_t b) {
  return vsri_n_u8(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vsri_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <4 x i16> [[VSLI_N2]]
uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) {
  return vsri_n_u16(a, b, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vsri_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 -1, i32 -1>)
// CHECK:   ret <2 x i32> [[VSLI_N2]]
uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) {
  return vsri_n_u32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vsri_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 -1>)
// CHECK:   ret <1 x i64> [[VSLI_N2]]
uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) {
  return vsri_n_u64(a, b, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vsri_n_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   ret <8 x i8> [[VSLI_N]]
poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) {
  return vsri_n_p8(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vsri_n_p16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <4 x i16> [[VSLI_N2]]
poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) {
  return vsri_n_p16(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   ret <16 x i8> [[VSLI_N]]
int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) {
  return vsriq_n_s8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i16> [[VSLI_N2]]
int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) {
  return vsriq_n_s16(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i32> [[VSLI_N2]]
int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) {
  return vsriq_n_s32(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vsriq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i64> [[VSLI_N2]]
int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) {
  return vsriq_n_s64(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vsriq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   ret <16 x i8> [[VSLI_N]]
uint8x16_t test_vsriq_n_u8(uint8x16_t a, uint8x16_t b) {
  return vsriq_n_u8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vsriq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i16> [[VSLI_N2]]
uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) {
  return vsriq_n_u16(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vsriq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i32> [[VSLI_N2]]
uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) {
  return vsriq_n_u32(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vsriq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i64> [[VSLI_N2]]
uint64x2_t test_vsriq_n_u64(uint64x2_t a, uint64x2_t b) {
  return vsriq_n_u64(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vsriq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   ret <16 x i8> [[VSLI_N]]
poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) {
  return vsriq_n_p8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vsriq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i16> [[VSLI_N2]]
poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) {
  return vsriq_n_p16(a, b, 1);
}

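// vst1/vst1q stores lower to @llvm.arm.neon.vst1 with the pointer cast to i8*
// and the element alignment passed as the trailing i32 argument.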
   17949 // CHECK-LABEL: define void @test_vst1q_u8(i8* %a, <16 x i8> %b) #0 {
   17950 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
   17951 // CHECK:   ret void
   17952 void test_vst1q_u8(uint8_t * a, uint8x16_t b) {
   17953   vst1q_u8(a, b);
   17954 }
   17955 
   17956 // CHECK-LABEL: define void @test_vst1q_u16(i16* %a, <8 x i16> %b) #0 {
   17957 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
   17958 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   17959 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   17960 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
   17961 // CHECK:   ret void
   17962 void test_vst1q_u16(uint16_t * a, uint16x8_t b) {
   17963   vst1q_u16(a, b);
   17964 }
   17965 
   17966 // CHECK-LABEL: define void @test_vst1q_u32(i32* %a, <4 x i32> %b) #0 {
   17967 // CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
   17968 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   17969 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   17970 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* [[TMP0]], <4 x i32> [[TMP2]], i32 4)
   17971 // CHECK:   ret void
   17972 void test_vst1q_u32(uint32_t * a, uint32x4_t b) {
   17973   vst1q_u32(a, b);
   17974 }
   17975 
   17976 // CHECK-LABEL: define void @test_vst1q_u64(i64* %a, <2 x i64> %b) #0 {
   17977 // CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
   17978 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   17979 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   17980 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* [[TMP0]], <2 x i64> [[TMP2]], i32 4)
   17981 // CHECK:   ret void
   17982 void test_vst1q_u64(uint64_t * a, uint64x2_t b) {
   17983   vst1q_u64(a, b);
   17984 }
   17985 
   17986 // CHECK-LABEL: define void @test_vst1q_s8(i8* %a, <16 x i8> %b) #0 {
   17987 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
   17988 // CHECK:   ret void
   17989 void test_vst1q_s8(int8_t * a, int8x16_t b) {
   17990   vst1q_s8(a, b);
   17991 }
   17992 
   17993 // CHECK-LABEL: define void @test_vst1q_s16(i16* %a, <8 x i16> %b) #0 {
   17994 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
   17995 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   17996 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   17997 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
   17998 // CHECK:   ret void
   17999 void test_vst1q_s16(int16_t * a, int16x8_t b) {
   18000   vst1q_s16(a, b);
   18001 }
   18002 
   18003 // CHECK-LABEL: define void @test_vst1q_s32(i32* %a, <4 x i32> %b) #0 {
   18004 // CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
   18005 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   18006 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   18007 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* [[TMP0]], <4 x i32> [[TMP2]], i32 4)
   18008 // CHECK:   ret void
   18009 void test_vst1q_s32(int32_t * a, int32x4_t b) {
   18010   vst1q_s32(a, b);
   18011 }
   18012 
   18013 // CHECK-LABEL: define void @test_vst1q_s64(i64* %a, <2 x i64> %b) #0 {
   18014 // CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
   18015 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   18016 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   18017 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* [[TMP0]], <2 x i64> [[TMP2]], i32 4)
   18018 // CHECK:   ret void
   18019 void test_vst1q_s64(int64_t * a, int64x2_t b) {
   18020   vst1q_s64(a, b);
   18021 }
   18022 
   18023 // CHECK-LABEL: define void @test_vst1q_f16(half* %a, <8 x half> %b) #0 {
   18024 // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
   18025 // CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
   18026 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   18027 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
   18028 // CHECK:   ret void
   18029 void test_vst1q_f16(float16_t * a, float16x8_t b) {
   18030   vst1q_f16(a, b);
   18031 }
   18032 
   18033 // CHECK-LABEL: define void @test_vst1q_f32(float* %a, <4 x float> %b) #0 {
   18034 // CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
   18035 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
   18036 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   18037 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* [[TMP0]], <4 x float> [[TMP2]], i32 4)
   18038 // CHECK:   ret void
   18039 void test_vst1q_f32(float32_t * a, float32x4_t b) {
   18040   vst1q_f32(a, b);
   18041 }
   18042 
   18043 // CHECK-LABEL: define void @test_vst1q_p8(i8* %a, <16 x i8> %b) #0 {
   18044 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
   18045 // CHECK:   ret void
   18046 void test_vst1q_p8(poly8_t * a, poly8x16_t b) {
   18047   vst1q_p8(a, b);
   18048 }
   18049 
   18050 // CHECK-LABEL: define void @test_vst1q_p16(i16* %a, <8 x i16> %b) #0 {
   18051 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
   18052 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   18053 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   18054 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
   18055 // CHECK:   ret void
   18056 void test_vst1q_p16(poly16_t * a, poly16x8_t b) {
   18057   vst1q_p16(a, b);
   18058 }
   18059 
   18060 // CHECK-LABEL: define void @test_vst1_u8(i8* %a, <8 x i8> %b) #0 {
   18061 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
   18062 // CHECK:   ret void
   18063 void test_vst1_u8(uint8_t * a, uint8x8_t b) {
   18064   vst1_u8(a, b);
   18065 }
   18066 
   18067 // CHECK-LABEL: define void @test_vst1_u16(i16* %a, <4 x i16> %b) #0 {
   18068 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
   18069 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   18070 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   18071 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
   18072 // CHECK:   ret void
   18073 void test_vst1_u16(uint16_t * a, uint16x4_t b) {
   18074   vst1_u16(a, b);
   18075 }
   18076 
   18077 // CHECK-LABEL: define void @test_vst1_u32(i32* %a, <2 x i32> %b) #0 {
   18078 // CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
   18079 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   18080 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   18081 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* [[TMP0]], <2 x i32> [[TMP2]], i32 4)
   18082 // CHECK:   ret void
   18083 void test_vst1_u32(uint32_t * a, uint32x2_t b) {
   18084   vst1_u32(a, b);
   18085 }
   18086 
   18087 // CHECK-LABEL: define void @test_vst1_u64(i64* %a, <1 x i64> %b) #0 {
   18088 // CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
   18089 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   18090 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   18091 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP2]], i32 4)
   18092 // CHECK:   ret void
   18093 void test_vst1_u64(uint64_t * a, uint64x1_t b) {
   18094   vst1_u64(a, b);
   18095 }
   18096 
   18097 // CHECK-LABEL: define void @test_vst1_s8(i8* %a, <8 x i8> %b) #0 {
   18098 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
   18099 // CHECK:   ret void
   18100 void test_vst1_s8(int8_t * a, int8x8_t b) {
   18101   vst1_s8(a, b);
   18102 }
   18103 
   18104 // CHECK-LABEL: define void @test_vst1_s16(i16* %a, <4 x i16> %b) #0 {
   18105 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
   18106 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   18107 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   18108 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
   18109 // CHECK:   ret void
   18110 void test_vst1_s16(int16_t * a, int16x4_t b) {
   18111   vst1_s16(a, b);
   18112 }
   18113 
   18114 // CHECK-LABEL: define void @test_vst1_s32(i32* %a, <2 x i32> %b) #0 {
   18115 // CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
   18116 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   18117 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   18118 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* [[TMP0]], <2 x i32> [[TMP2]], i32 4)
   18119 // CHECK:   ret void
   18120 void test_vst1_s32(int32_t * a, int32x2_t b) {
   18121   vst1_s32(a, b);
   18122 }
   18123 
   18124 // CHECK-LABEL: define void @test_vst1_s64(i64* %a, <1 x i64> %b) #0 {
   18125 // CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
   18126 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   18127 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   18128 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP2]], i32 4)
   18129 // CHECK:   ret void
   18130 void test_vst1_s64(int64_t * a, int64x1_t b) {
   18131   vst1_s64(a, b);
   18132 }
   18133 
   18134 // CHECK-LABEL: define void @test_vst1_f16(half* %a, <4 x half> %b) #0 {
   18135 // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
   18136 // CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
   18137 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   18138 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
   18139 // CHECK:   ret void
   18140 void test_vst1_f16(float16_t * a, float16x4_t b) {
   18141   vst1_f16(a, b);
   18142 }
   18143 
   18144 // CHECK-LABEL: define void @test_vst1_f32(float* %a, <2 x float> %b) #0 {
   18145 // CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
   18146 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
   18147 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   18148 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* [[TMP0]], <2 x float> [[TMP2]], i32 4)
   18149 // CHECK:   ret void
   18150 void test_vst1_f32(float32_t * a, float32x2_t b) {
   18151   vst1_f32(a, b);
   18152 }
   18153 
   18154 // CHECK-LABEL: define void @test_vst1_p8(i8* %a, <8 x i8> %b) #0 {
   18155 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
   18156 // CHECK:   ret void
   18157 void test_vst1_p8(poly8_t * a, poly8x8_t b) {
   18158   vst1_p8(a, b);
   18159 }
   18160 
   18161 // CHECK-LABEL: define void @test_vst1_p16(i16* %a, <4 x i16> %b) #0 {
   18162 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
   18163 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   18164 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   18165 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
   18166 // CHECK:   ret void
   18167 void test_vst1_p16(poly16_t * a, poly16x4_t b) {
   18168   vst1_p16(a, b);
   18169 }
   18170 
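// A minimal usage sketch (not part of the generated checks; the helper
// name is hypothetical): vst1_* writes one whole 64-bit D register to
// memory, so pairing it with vld1_* gives an eight-lane copy.
void copy8_s8_sketch(int8_t *dst, const int8_t *src) {
  int8x8_t v = vld1_s8(src); // load eight consecutive s8 lanes
  vst1_s8(dst, v);           // store all eight lanes with one vst1
}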
   18171 
   18172 // CHECK-LABEL: define void @test_vst1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
   18173 // CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
   18174 // CHECK:   store i8 [[TMP0]], i8* %a, align 1
   18175 // CHECK:   ret void
   18176 void test_vst1q_lane_u8(uint8_t * a, uint8x16_t b) {
   18177   vst1q_lane_u8(a, b, 15);
   18178 }
   18179 
   18180 // CHECK-LABEL: define void @test_vst1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
   18181 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
   18182 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   18183 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   18184 // CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
   18185 // CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
   18186 // CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
   18187 // CHECK:   ret void
   18188 void test_vst1q_lane_u16(uint16_t * a, uint16x8_t b) {
   18189   vst1q_lane_u16(a, b, 7);
   18190 }
   18191 
   18192 // CHECK-LABEL: define void @test_vst1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
   18193 // CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
   18194 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   18195 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   18196 // CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
   18197 // CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
   18198 // CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
   18199 // CHECK:   ret void
   18200 void test_vst1q_lane_u32(uint32_t * a, uint32x4_t b) {
   18201   vst1q_lane_u32(a, b, 3);
   18202 }
   18203 
   18204 // CHECK-LABEL: define void @test_vst1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
   18205 // CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
   18206 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   18207 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   18208 // CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1>
   18209 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP3]], i32 4)
   18210 // CHECK:   ret void
   18211 void test_vst1q_lane_u64(uint64_t * a, uint64x2_t b) {
   18212   vst1q_lane_u64(a, b, 1);
   18213 }
   18214 
   18215 // CHECK-LABEL: define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
   18216 // CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
   18217 // CHECK:   store i8 [[TMP0]], i8* %a, align 1
   18218 // CHECK:   ret void
   18219 void test_vst1q_lane_s8(int8_t * a, int8x16_t b) {
   18220   vst1q_lane_s8(a, b, 15);
   18221 }
   18222 
   18223 // CHECK-LABEL: define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
   18224 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
   18225 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   18226 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   18227 // CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
   18228 // CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
   18229 // CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
   18230 // CHECK:   ret void
   18231 void test_vst1q_lane_s16(int16_t * a, int16x8_t b) {
   18232   vst1q_lane_s16(a, b, 7);
   18233 }
   18234 
   18235 // CHECK-LABEL: define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
   18236 // CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
   18237 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   18238 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   18239 // CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
   18240 // CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
   18241 // CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
   18242 // CHECK:   ret void
   18243 void test_vst1q_lane_s32(int32_t * a, int32x4_t b) {
   18244   vst1q_lane_s32(a, b, 3);
   18245 }
   18246 
   18247 // CHECK-LABEL: define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
   18248 // CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
   18249 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   18250 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   18251 // CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1>
   18252 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP3]], i32 4)
   18253 // CHECK:   ret void
   18254 void test_vst1q_lane_s64(int64_t * a, int64x2_t b) {
   18255   vst1q_lane_s64(a, b, 1);
   18256 }
   18257 
   18258 // CHECK-LABEL: define void @test_vst1q_lane_f16(half* %a, <8 x half> %b) #0 {
   18259 // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
   18260 // CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
   18261 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   18262 // CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
   18263 // CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
   18264 // CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
   18265 // CHECK:   ret void
   18266 void test_vst1q_lane_f16(float16_t * a, float16x8_t b) {
   18267   vst1q_lane_f16(a, b, 7);
   18268 }
   18269 
   18270 // CHECK-LABEL: define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) #0 {
   18271 // CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
   18272 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
   18273 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   18274 // CHECK:   [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
   18275 // CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
   18276 // CHECK:   store float [[TMP3]], float* [[TMP4]], align 4
   18277 // CHECK:   ret void
   18278 void test_vst1q_lane_f32(float32_t * a, float32x4_t b) {
   18279   vst1q_lane_f32(a, b, 3);
   18280 }
   18281 
   18282 // CHECK-LABEL: define void @test_vst1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
   18283 // CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
   18284 // CHECK:   store i8 [[TMP0]], i8* %a, align 1
   18285 // CHECK:   ret void
   18286 void test_vst1q_lane_p8(poly8_t * a, poly8x16_t b) {
   18287   vst1q_lane_p8(a, b, 15);
   18288 }
   18289 
   18290 // CHECK-LABEL: define void @test_vst1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
   18291 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
   18292 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   18293 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   18294 // CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
   18295 // CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
   18296 // CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
   18297 // CHECK:   ret void
   18298 void test_vst1q_lane_p16(poly16_t * a, poly16x8_t b) {
   18299   vst1q_lane_p16(a, b, 7);
   18300 }
   18301 
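// A minimal usage sketch (hypothetical helper): vst1q_lane_* stores a
// single lane of a Q register; the lane index must be an integer constant
// in range for the element count (0-7 for uint16x8_t).
void store_lane7_u16_sketch(uint16_t *dst, uint16x8_t v) {
  vst1q_lane_u16(dst, v, 7); // writes only lane 7 of v to *dst
}
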
   18302 // CHECK-LABEL: define void @test_vst1_lane_u8(i8* %a, <8 x i8> %b) #0 {
   18303 // CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
   18304 // CHECK:   store i8 [[TMP0]], i8* %a, align 1
   18305 // CHECK:   ret void
   18306 void test_vst1_lane_u8(uint8_t * a, uint8x8_t b) {
   18307   vst1_lane_u8(a, b, 7);
   18308 }
   18309 
   18310 // CHECK-LABEL: define void @test_vst1_lane_u16(i16* %a, <4 x i16> %b) #0 {
   18311 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
   18312 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   18313 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   18314 // CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
   18315 // CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
   18316 // CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
   18317 // CHECK:   ret void
   18318 void test_vst1_lane_u16(uint16_t * a, uint16x4_t b) {
   18319   vst1_lane_u16(a, b, 3);
   18320 }
   18321 
   18322 // CHECK-LABEL: define void @test_vst1_lane_u32(i32* %a, <2 x i32> %b) #0 {
   18323 // CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
   18324 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   18325 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   18326 // CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
   18327 // CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
   18328 // CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
   18329 // CHECK:   ret void
   18330 void test_vst1_lane_u32(uint32_t * a, uint32x2_t b) {
   18331   vst1_lane_u32(a, b, 1);
   18332 }
   18333 
   18334 // CHECK-LABEL: define void @test_vst1_lane_u64(i64* %a, <1 x i64> %b) #0 {
   18335 // CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
   18336 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   18337 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   18338 // CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
   18339 // CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
   18340 // CHECK:   store i64 [[TMP3]], i64* [[TMP4]], align 4
   18341 // CHECK:   ret void
   18342 void test_vst1_lane_u64(uint64_t * a, uint64x1_t b) {
   18343   vst1_lane_u64(a, b, 0);
   18344 }
   18345 
   18346 // CHECK-LABEL: define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) #0 {
   18347 // CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
   18348 // CHECK:   store i8 [[TMP0]], i8* %a, align 1
   18349 // CHECK:   ret void
   18350 void test_vst1_lane_s8(int8_t * a, int8x8_t b) {
   18351   vst1_lane_s8(a, b, 7);
   18352 }
   18353 
   18354 // CHECK-LABEL: define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) #0 {
   18355 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
   18356 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   18357 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   18358 // CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
   18359 // CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
   18360 // CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
   18361 // CHECK:   ret void
   18362 void test_vst1_lane_s16(int16_t * a, int16x4_t b) {
   18363   vst1_lane_s16(a, b, 3);
   18364 }
   18365 
   18366 // CHECK-LABEL: define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) #0 {
   18367 // CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
   18368 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   18369 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   18370 // CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
   18371 // CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
   18372 // CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
   18373 // CHECK:   ret void
   18374 void test_vst1_lane_s32(int32_t * a, int32x2_t b) {
   18375   vst1_lane_s32(a, b, 1);
   18376 }
   18377 
   18378 // CHECK-LABEL: define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) #0 {
   18379 // CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
   18380 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   18381 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   18382 // CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
   18383 // CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
   18384 // CHECK:   store i64 [[TMP3]], i64* [[TMP4]], align 4
   18385 // CHECK:   ret void
   18386 void test_vst1_lane_s64(int64_t * a, int64x1_t b) {
   18387   vst1_lane_s64(a, b, 0);
   18388 }
   18389 
   18390 // CHECK-LABEL: define void @test_vst1_lane_f16(half* %a, <4 x half> %b) #0 {
   18391 // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
   18392 // CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
   18393 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   18394 // CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
   18395 // CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
   18396 // CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
   18397 // CHECK:   ret void
   18398 void test_vst1_lane_f16(float16_t * a, float16x4_t b) {
   18399   vst1_lane_f16(a, b, 3);
   18400 }
   18401 
   18402 // CHECK-LABEL: define void @test_vst1_lane_f32(float* %a, <2 x float> %b) #0 {
   18403 // CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
   18404 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
   18405 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   18406 // CHECK:   [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
   18407 // CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
   18408 // CHECK:   store float [[TMP3]], float* [[TMP4]], align 4
   18409 // CHECK:   ret void
   18410 void test_vst1_lane_f32(float32_t * a, float32x2_t b) {
   18411   vst1_lane_f32(a, b, 1);
   18412 }
   18413 
   18414 // CHECK-LABEL: define void @test_vst1_lane_p8(i8* %a, <8 x i8> %b) #0 {
   18415 // CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
   18416 // CHECK:   store i8 [[TMP0]], i8* %a, align 1
   18417 // CHECK:   ret void
   18418 void test_vst1_lane_p8(poly8_t * a, poly8x8_t b) {
   18419   vst1_lane_p8(a, b, 7);
   18420 }
   18421 
   18422 // CHECK-LABEL: define void @test_vst1_lane_p16(i16* %a, <4 x i16> %b) #0 {
   18423 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
   18424 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   18425 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   18426 // CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
   18427 // CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
   18428 // CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
   18429 // CHECK:   ret void
   18430 void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) {
   18431   vst1_lane_p16(a, b, 3);
   18432 }
   18433 
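// A minimal usage sketch (hypothetical helper): the D-register forms take
// half the lane range of the Q forms (0-1 for float32x2_t), and as the
// checks above show they lower to a plain scalar store.
void store_lane1_f32_sketch(float32_t *dst, float32x2_t v) {
  vst1_lane_f32(dst, v, 1); // writes only lane 1 of v to *dst
}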
   18434 
   18435 // CHECK-LABEL: define void @test_vst2q_u8(i8* %a, [4 x i64] %b.coerce) #0 {
   18436 // CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
   18437 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
   18438 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
   18439 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
   18440 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
   18441 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
   18442 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
   18443 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
   18444 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
   18445 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
   18446 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
   18447 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
   18448 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
   18449 // CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
   18450 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
   18451 // CHECK:   ret void
   18452 void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) {
   18453   vst2q_u8(a, b);
   18454 }
   18455 
   18456 // CHECK-LABEL: define void @test_vst2q_u16(i16* %a, [4 x i64] %b.coerce) #0 {
   18457 // CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
   18458 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
   18459 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
   18460 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
   18461 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
   18462 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
   18463 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
   18464 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
   18465 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
   18466 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
   18467 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
   18468 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
   18469 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
   18470 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
   18471 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
   18472 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
   18473 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
   18474 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
   18475 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
   18476 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
   18477 // CHECK:   ret void
   18478 void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) {
   18479   vst2q_u16(a, b);
   18480 }
   18481 
   18482 // CHECK-LABEL: define void @test_vst2q_u32(i32* %a, [4 x i64] %b.coerce) #0 {
   18483 // CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
   18484 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
   18485 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
   18486 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
   18487 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
   18488 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
   18489 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
   18490 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
   18491 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
   18492 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
   18493 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
   18494 // CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
   18495 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
   18496 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
   18497 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
   18498 // CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
   18499 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
   18500 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
   18501 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
   18502 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4)
   18503 // CHECK:   ret void
   18504 void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) {
   18505   vst2q_u32(a, b);
   18506 }
   18507 
   18508 // CHECK-LABEL: define void @test_vst2q_s8(i8* %a, [4 x i64] %b.coerce) #0 {
   18509 // CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
   18510 // CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
   18511 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
   18512 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
   18513 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
   18514 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
   18515 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
   18516 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
   18517 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
   18518 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
   18519 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
   18520 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
   18521 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
   18522 // CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
   18523 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
   18524 // CHECK:   ret void
   18525 void test_vst2q_s8(int8_t * a, int8x16x2_t b) {
   18526   vst2q_s8(a, b);
   18527 }
   18528 
   18529 // CHECK-LABEL: define void @test_vst2q_s16(i16* %a, [4 x i64] %b.coerce) #0 {
   18530 // CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
   18531 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
   18532 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
   18533 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
   18534 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
   18535 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
   18536 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
   18537 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
   18538 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
   18539 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
   18540 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
   18541 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
   18542 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
   18543 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
   18544 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
   18545 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
   18546 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
   18547 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
   18548 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
   18549 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
   18550 // CHECK:   ret void
   18551 void test_vst2q_s16(int16_t * a, int16x8x2_t b) {
   18552   vst2q_s16(a, b);
   18553 }
   18554 
   18555 // CHECK-LABEL: define void @test_vst2q_s32(i32* %a, [4 x i64] %b.coerce) #0 {
   18556 // CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
   18557 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
   18558 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
   18559 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
   18560 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
   18561 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
   18562 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
   18563 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
   18564 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
   18565 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
   18566 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
   18567 // CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
   18568 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
   18569 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
   18570 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
   18571 // CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
   18572 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
   18573 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
   18574 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
   18575 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4)
   18576 // CHECK:   ret void
   18577 void test_vst2q_s32(int32_t * a, int32x4x2_t b) {
   18578   vst2q_s32(a, b);
   18579 }
   18580 
   18581 // CHECK-LABEL: define void @test_vst2q_f16(half* %a, [4 x i64] %b.coerce) #0 {
   18582 // CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
   18583 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
   18584 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
   18585 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
   18586 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
   18587 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
   18588 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
   18589 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
   18590 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
   18591 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
   18592 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
   18593 // CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
   18594 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
   18595 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
   18596 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
   18597 // CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
   18598 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
   18599 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
   18600 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
   18601 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
   18602 // CHECK:   ret void
   18603 void test_vst2q_f16(float16_t * a, float16x8x2_t b) {
   18604   vst2q_f16(a, b);
   18605 }
   18606 
   18607 // CHECK-LABEL: define void @test_vst2q_f32(float* %a, [4 x i64] %b.coerce) #0 {
   18608 // CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
   18609 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
   18610 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
   18611 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
   18612 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
   18613 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
   18614 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
   18615 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
   18616 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
   18617 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
   18618 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
   18619 // CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
   18620 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
   18621 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
   18622 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
   18623 // CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
   18624 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
   18625 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
   18626 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
   18627 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 4)
   18628 // CHECK:   ret void
   18629 void test_vst2q_f32(float32_t * a, float32x4x2_t b) {
   18630   vst2q_f32(a, b);
   18631 }
   18632 
   18633 // CHECK-LABEL: define void @test_vst2q_p8(i8* %a, [4 x i64] %b.coerce) #0 {
   18634 // CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
   18635 // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
   18636 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
   18637 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
   18638 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
   18639 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
   18640 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
   18641 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
   18642 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
   18643 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
   18644 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
   18645 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
   18646 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
   18647 // CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
   18648 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
   18649 // CHECK:   ret void
   18650 void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) {
   18651   vst2q_p8(a, b);
   18652 }
   18653 
   18654 // CHECK-LABEL: define void @test_vst2q_p16(i16* %a, [4 x i64] %b.coerce) #0 {
   18655 // CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
   18656 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
   18657 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
   18658 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
   18659 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
   18660 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
   18661 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
   18662 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
   18663 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
   18664 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
   18665 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
   18666 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
   18667 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
   18668 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
   18669 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
   18670 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
   18671 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
   18672 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
   18673 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
   18674 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
   18675 // CHECK:   ret void
   18676 void test_vst2q_p16(poly16_t * a, poly16x8x2_t b) {
   18677   vst2q_p16(a, b);
   18678 }
   18679 
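// A minimal usage sketch (hypothetical helper): vst2q_* interleaves the
// two registers of a uint16x8x2_t lane by lane (val[0][0], val[1][0],
// val[0][1], ...), so a vld2q/vst2q pair round-trips 16 elements.
void roundtrip_vst2q_u16_sketch(uint16_t *buf) {
  uint16x8x2_t v = vld2q_u16(buf); // deinterleave even/odd elements
  vst2q_u16(buf, v);               // re-interleave into the same layout
}
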
   18680 // CHECK-LABEL: define void @test_vst2_u8(i8* %a, [2 x i64] %b.coerce) #0 {
   18681 // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
   18682 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
   18683 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
   18684 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
   18685 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   18686 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
   18687 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
   18688 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   18689 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
   18690 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
   18691 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
   18692 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
   18693 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
   18694 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
   18695 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
   18696 // CHECK:   ret void
   18697 void test_vst2_u8(uint8_t * a, uint8x8x2_t b) {
   18698   vst2_u8(a, b);
   18699 }
   18700 
   18701 // CHECK-LABEL: define void @test_vst2_u16(i16* %a, [2 x i64] %b.coerce) #0 {
   18702 // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
   18703 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
   18704 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
   18705 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
   18706 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   18707 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
   18708 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
   18709 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   18710 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
   18711 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
   18712 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
   18713 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
   18714 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
   18715 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
   18716 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
   18717 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
   18718 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
   18719 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
   18720 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
   18721 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
   18722 // CHECK:   ret void
   18723 void test_vst2_u16(uint16_t * a, uint16x4x2_t b) {
   18724   vst2_u16(a, b);
   18725 }
   18726 
   18727 // CHECK-LABEL: define void @test_vst2_u32(i32* %a, [2 x i64] %b.coerce) #0 {
   18728 // CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
   18729 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
   18730 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
   18731 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
   18732 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   18733 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
   18734 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
   18735 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   18736 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
   18737 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
   18738 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
   18739 // CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
   18740 // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
   18741 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
   18742 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
   18743 // CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
   18744 // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
   18745 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
   18746 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
   18747 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4)
   18748 // CHECK:   ret void
   18749 void test_vst2_u32(uint32_t * a, uint32x2x2_t b) {
   18750   vst2_u32(a, b);
   18751 }
   18752 
   18753 // CHECK-LABEL: define void @test_vst2_u64(i64* %a, [2 x i64] %b.coerce) #0 {
   18754 // CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
   18755 // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
   18756 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
   18757 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <1 x i64>]* [[COERCE_DIVE]] to [2 x i64]*
   18758 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   18759 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
   18760 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
   18761 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   18762 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
   18763 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
   18764 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i32 0, i32 0
   18765 // CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
   18766 // CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
   18767 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
   18768 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i32 0, i32 1
   18769 // CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
   18770 // CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
   18771 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
   18772 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
   18773 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4)
   18774 // CHECK:   ret void
   18775 void test_vst2_u64(uint64_t * a, uint64x1x2_t b) {
   18776   vst2_u64(a, b);
   18777 }
   18778 
   18779 // CHECK-LABEL: define void @test_vst2_s8(i8* %a, [2 x i64] %b.coerce) #0 {
   18780 // CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
   18781 // CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
   18782 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
   18783 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
   18784 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   18785 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
   18786 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
   18787 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   18788 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
   18789 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
   18790 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
   18791 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
   18792 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
   18793 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
   18794 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
   18795 // CHECK:   ret void
   18796 void test_vst2_s8(int8_t * a, int8x8x2_t b) {
   18797   vst2_s8(a, b);
   18798 }
   18799 
   18800 // CHECK-LABEL: define void @test_vst2_s16(i16* %a, [2 x i64] %b.coerce) #0 {
   18801 // CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
   18802 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
   18803 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
   18804 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
   18805 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   18806 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
   18807 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
   18808 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   18809 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
   18810 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
   18811 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
   18812 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
   18813 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
   18814 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
   18815 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
   18816 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
   18817 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
   18818 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
   18819 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
   18820 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
   18821 // CHECK:   ret void
   18822 void test_vst2_s16(int16_t * a, int16x4x2_t b) {
   18823   vst2_s16(a, b);
   18824 }
   18825 
   18826 // CHECK-LABEL: define void @test_vst2_s32(i32* %a, [2 x i64] %b.coerce) #0 {
   18827 // CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
   18828 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
   18829 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
   18830 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
   18831 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   18832 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
   18833 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
   18834 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   18835 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
   18836 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
   18837 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
   18838 // CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
   18839 // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
   18840 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
   18841 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
   18842 // CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
   18843 // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
   18844 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
   18845 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
   18846 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4)
   18847 // CHECK:   ret void
   18848 void test_vst2_s32(int32_t * a, int32x2x2_t b) {
   18849   vst2_s32(a, b);
   18850 }
   18851 
   18852 // CHECK-LABEL: define void @test_vst2_s64(i64* %a, [2 x i64] %b.coerce) #0 {
   18853 // CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
   18854 // CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
   18855 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
   18856 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <1 x i64>]* [[COERCE_DIVE]] to [2 x i64]*
   18857 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   18858 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
   18859 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
   18860 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   18861 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
   18862 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
   18863 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i32 0, i32 0
   18864 // CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
   18865 // CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
   18866 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
   18867 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i32 0, i32 1
   18868 // CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
   18869 // CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
   18870 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
   18871 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
   18872 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4)
   18873 // CHECK:   ret void
   18874 void test_vst2_s64(int64_t * a, int64x1x2_t b) {
   18875   vst2_s64(a, b);
   18876 }
   18877 
   18878 // CHECK-LABEL: define void @test_vst2_f16(half* %a, [2 x i64] %b.coerce) #0 {
   18879 // CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
   18880 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
   18881 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
   18882 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x half>]* [[COERCE_DIVE]] to [2 x i64]*
   18883 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   18884 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
   18885 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
   18886 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   18887 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
   18888 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
   18889 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
   18890 // CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
   18891 // CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
   18892 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
   18893 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
   18894 // CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
   18895 // CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
   18896 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
   18897 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
   18898 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
   18899 // CHECK:   ret void
   18900 void test_vst2_f16(float16_t * a, float16x4x2_t b) {
   18901   vst2_f16(a, b);
   18902 }
   18903 
   18904 // CHECK-LABEL: define void @test_vst2_f32(float* %a, [2 x i64] %b.coerce) #0 {
   18905 // CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
   18906 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
   18907 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
   18908 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x float>]* [[COERCE_DIVE]] to [2 x i64]*
   18909 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
   18910 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
   18911 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
   18912 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
   18913 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
   18914 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
   18915 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
   18916 // CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
   18917 // CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
   18918 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 4)
// CHECK:   ret void
void test_vst2_f32(float32_t * a, float32x2x2_t b) {
  vst2_f32(a, b);
}

// CHECK-LABEL: define void @test_vst2_p8(i8* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
// CHECK:   ret void
void test_vst2_p8(poly8_t * a, poly8x8x2_t b) {
  vst2_p8(a, b);
}

// CHECK-LABEL: define void @test_vst2_p16(i16* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
// CHECK:   ret void
void test_vst2_p16(poly16_t * a, poly16x4x2_t b) {
  vst2_p16(a, b);
}

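// The vst2q_lane tests below exercise the lane-selecting two-register
// stores: each lowers to @llvm.arm.neon.vst2lane, with the lane index and
// the element alignment in bytes expected as the two trailing i32 operands.
// A minimal usage sketch, kept in comments so it does not perturb the
// checked IR (vld2q_u16 appears here only for illustration):
//   uint16x8x2_t v = vld2q_u16(src);
//   vst2q_lane_u16(dst, v, 7);  // store lane 7 of each register, interleaved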
// CHECK-LABEL: define void @test_vst2q_lane_u16(i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
// CHECK:   ret void
void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) {
  vst2q_lane_u16(a, b, 7);
}

// CHECK-LABEL: define void @test_vst2q_lane_u32(i32* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
// CHECK:   ret void
void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) {
  vst2q_lane_u32(a, b, 3);
}

// CHECK-LABEL: define void @test_vst2q_lane_s16(i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
// CHECK:   ret void
void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) {
  vst2q_lane_s16(a, b, 7);
}

// CHECK-LABEL: define void @test_vst2q_lane_s32(i32* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
// CHECK:   ret void
void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) {
  vst2q_lane_s32(a, b, 3);
}

// CHECK-LABEL: define void @test_vst2q_lane_f16(half* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
// CHECK:   ret void
void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
  vst2q_lane_f16(a, b, 7);
}

// CHECK-LABEL: define void @test_vst2q_lane_f32(float* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 3, i32 4)
// CHECK:   ret void
void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) {
  vst2q_lane_f32(a, b, 3);
}

// CHECK-LABEL: define void @test_vst2q_lane_p16(i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
// CHECK:   ret void
void test_vst2q_lane_p16(poly16_t * a, poly16x8x2_t b) {
  vst2q_lane_p16(a, b, 7);
}

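// The 64-bit (d-register) vst2_lane variants follow; their two-vector
// structs arrive coerced as [2 x i64] rather than [4 x i64], and the
// expected lane indices shrink with the narrower vectors (e.g. 0-7 for
// <8 x i8>, 0-1 for <2 x i32>).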
// CHECK-LABEL: define void @test_vst2_lane_u8(i8* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
// CHECK:   ret void
void test_vst2_lane_u8(uint8_t * a, uint8x8x2_t b) {
  vst2_lane_u8(a, b, 7);
}

// CHECK-LABEL: define void @test_vst2_lane_u16(i16* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
// CHECK:   ret void
void test_vst2_lane_u16(uint16_t * a, uint16x4x2_t b) {
  vst2_lane_u16(a, b, 3);
}

// CHECK-LABEL: define void @test_vst2_lane_u32(i32* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
// CHECK:   ret void
void test_vst2_lane_u32(uint32_t * a, uint32x2x2_t b) {
  vst2_lane_u32(a, b, 1);
}

// CHECK-LABEL: define void @test_vst2_lane_s8(i8* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
// CHECK:   ret void
void test_vst2_lane_s8(int8_t * a, int8x8x2_t b) {
  vst2_lane_s8(a, b, 7);
}

// CHECK-LABEL: define void @test_vst2_lane_s16(i16* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
// CHECK:   ret void
void test_vst2_lane_s16(int16_t * a, int16x4x2_t b) {
  vst2_lane_s16(a, b, 3);
}

// CHECK-LABEL: define void @test_vst2_lane_s32(i32* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
// CHECK:   ret void
void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) {
  vst2_lane_s32(a, b, 1);
}

// CHECK-LABEL: define void @test_vst2_lane_f16(half* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x half>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
// CHECK:   ret void
void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) {
  vst2_lane_f16(a, b, 3);
}

// CHECK-LABEL: define void @test_vst2_lane_f32(float* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x float>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 1, i32 4)
// CHECK:   ret void
void test_vst2_lane_f32(float32_t * a, float32x2x2_t b) {
  vst2_lane_f32(a, b, 1);
}

// CHECK-LABEL: define void @test_vst2_lane_p8(i8* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
// CHECK:   ret void
void test_vst2_lane_p8(poly8_t * a, poly8x8x2_t b) {
  vst2_lane_p8(a, b, 7);
}

// CHECK-LABEL: define void @test_vst2_lane_p16(i16* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
// CHECK:   ret void
void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) {
  vst2_lane_p16(a, b, 3);
}

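// vst3 tests: three-register structure stores. The q-register structs are
// coerced as [6 x i64] and lower to @llvm.arm.neon.vst3 with three vector
// operands plus an alignment operand. Sketch of a load/store round trip
// (illustrative only, not part of the checked output):
//   float32x4x3_t v = vld3q_f32(src);
//   vst3q_f32(dst, v);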
   19406 // CHECK-LABEL: define void @test_vst3q_u8(i8* %a, [6 x i64] %b.coerce) #0 {
   19407 // CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
   19408 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
   19409 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
   19410 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
   19411 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
   19412 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
   19413 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
   19414 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
   19415 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
   19416 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
   19417 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
   19418 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
   19419 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
   19420 // CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
   19421 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
   19422 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
   19423 // CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
   19424 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
   19425 // CHECK:   ret void
   19426 void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) {
   19427   vst3q_u8(a, b);
   19428 }
   19429 
   19430 // CHECK-LABEL: define void @test_vst3q_u16(i16* %a, [6 x i64] %b.coerce) #0 {
   19431 // CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
   19432 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
   19433 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
   19434 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
   19435 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
   19436 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
   19437 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
   19438 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
   19439 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
   19440 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
   19441 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
   19442 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
   19443 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
   19444 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
   19445 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
   19446 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
   19447 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
   19448 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
   19449 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
   19450 // CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
   19451 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
   19452 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
   19453 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
   19454 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
   19455 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
   19456 // CHECK:   ret void
   19457 void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) {
   19458   vst3q_u16(a, b);
   19459 }
   19460 
   19461 // CHECK-LABEL: define void @test_vst3q_u32(i32* %a, [6 x i64] %b.coerce) #0 {
   19462 // CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
   19463 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
   19464 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
   19465 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
   19466 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
   19467 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
   19468 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
   19469 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
   19470 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
   19471 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
   19472 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
   19473 // CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
   19474 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
   19475 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
   19476 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
   19477 // CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
   19478 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
   19479 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
   19480 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
   19481 // CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
   19482 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
   19483 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
   19484 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
   19485 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
   19486 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
   19487 // CHECK:   ret void
   19488 void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) {
   19489   vst3q_u32(a, b);
   19490 }
   19491 
   19492 // CHECK-LABEL: define void @test_vst3q_s8(i8* %a, [6 x i64] %b.coerce) #0 {
   19493 // CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
   19494 // CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
   19495 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
   19496 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
   19497 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
   19498 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
   19499 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
   19500 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
   19501 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
   19502 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
   19503 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
   19504 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
   19505 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
   19506 // CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
   19507 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
   19508 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
   19509 // CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
   19510 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
   19511 // CHECK:   ret void
   19512 void test_vst3q_s8(int8_t * a, int8x16x3_t b) {
   19513   vst3q_s8(a, b);
   19514 }
   19515 
   19516 // CHECK-LABEL: define void @test_vst3q_s16(i16* %a, [6 x i64] %b.coerce) #0 {
   19517 // CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
   19518 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
   19519 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
   19520 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
   19521 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
   19522 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
   19523 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
   19524 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
   19525 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
   19526 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
   19527 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
   19528 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
   19529 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
   19530 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
   19531 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
   19532 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
   19533 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
   19534 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
   19535 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
   19536 // CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
   19537 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
   19538 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
   19539 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
   19540 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
   19541 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
   19542 // CHECK:   ret void
   19543 void test_vst3q_s16(int16_t * a, int16x8x3_t b) {
   19544   vst3q_s16(a, b);
   19545 }
   19546 
   19547 // CHECK-LABEL: define void @test_vst3q_s32(i32* %a, [6 x i64] %b.coerce) #0 {
   19548 // CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
   19549 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
   19550 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
   19551 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
   19552 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
   19553 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
   19554 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
   19555 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
   19556 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
   19557 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
   19558 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
   19559 // CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
   19560 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
   19561 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
   19562 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
   19563 // CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
   19564 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
   19565 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
   19566 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
   19567 // CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
   19568 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
   19569 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
   19570 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
   19571 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
   19572 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
   19573 // CHECK:   ret void
   19574 void test_vst3q_s32(int32_t * a, int32x4x3_t b) {
   19575   vst3q_s32(a, b);
   19576 }
   19577 
   19578 // CHECK-LABEL: define void @test_vst3q_f16(half* %a, [6 x i64] %b.coerce) #0 {
   19579 // CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
   19580 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
   19581 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
   19582 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
   19583 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
   19584 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
   19585 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
   19586 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
   19587 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
   19588 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
   19589 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
   19590 // CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
   19591 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
   19592 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
   19593 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
   19594 // CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
   19595 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
   19596 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
   19597 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
   19598 // CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
   19599 // CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
   19600 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
   19601 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
   19602 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
   19603 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
   19604 // CHECK:   ret void
   19605 void test_vst3q_f16(float16_t * a, float16x8x3_t b) {
   19606   vst3q_f16(a, b);
   19607 }
   19608 
   19609 // CHECK-LABEL: define void @test_vst3q_f32(float* %a, [6 x i64] %b.coerce) #0 {
   19610 // CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
   19611 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
   19612 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
   19613 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
   19614 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
   19615 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
   19616 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
   19617 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
   19618 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
   19619 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
   19620 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
   19621 // CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
   19622 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
   19623 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
   19624 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
   19625 // CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
   19626 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
   19627 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
   19628 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
   19629 // CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
   19630 // CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
   19631 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
   19632 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
   19633 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
   19634 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 4)
   19635 // CHECK:   ret void
   19636 void test_vst3q_f32(float32_t * a, float32x4x3_t b) {
   19637   vst3q_f32(a, b);
   19638 }
   19639 
// CHECK-LABEL: define void @test_vst3q_p8(i8* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
// CHECK:   ret void
void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) {
  vst3q_p8(a, b);
}

// CHECK-LABEL: define void @test_vst3q_p16(i16* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
// CHECK:   ret void
void test_vst3q_p16(poly16_t * a, poly16x8x3_t b) {
  vst3q_p16(a, b);
}

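// The 64-bit (d-register) vst3 variants below pass the triple as [3 x i64]:
// the structs are 8-byte aligned and 24 bytes are copied, versus
// [6 x i64], 16-byte alignment and 48 bytes for the q-register forms above.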
// CHECK-LABEL: define void @test_vst3_u8(i8* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
// CHECK:   ret void
void test_vst3_u8(uint8_t * a, uint8x8x3_t b) {
  vst3_u8(a, b);
}

// CHECK-LABEL: define void @test_vst3_u16(i16* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
// CHECK:   ret void
void test_vst3_u16(uint16_t * a, uint16x4x3_t b) {
  vst3_u16(a, b);
}

// CHECK-LABEL: define void @test_vst3_u32(i32* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4)
// CHECK:   ret void
void test_vst3_u32(uint32_t * a, uint32x2x3_t b) {
  vst3_u32(a, b);
}

// CHECK-LABEL: define void @test_vst3_u64(i64* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <1 x i64>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4)
// CHECK:   ret void
void test_vst3_u64(uint64_t * a, uint64x1x3_t b) {
  vst3_u64(a, b);
}

// CHECK-LABEL: define void @test_vst3_s8(i8* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
// CHECK:   ret void
void test_vst3_s8(int8_t * a, int8x8x3_t b) {
  vst3_s8(a, b);
}

// CHECK-LABEL: define void @test_vst3_s16(i16* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
// CHECK:   ret void
void test_vst3_s16(int16_t * a, int16x4x3_t b) {
  vst3_s16(a, b);
}

// CHECK-LABEL: define void @test_vst3_s32(i32* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4)
// CHECK:   ret void
void test_vst3_s32(int32_t * a, int32x2x3_t b) {
  vst3_s32(a, b);
}

// CHECK-LABEL: define void @test_vst3_s64(i64* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <1 x i64>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4)
// CHECK:   ret void
void test_vst3_s64(int64_t * a, int64x1x3_t b) {
  vst3_s64(a, b);
}

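// Half-precision lanes have no vst3 overload of their own here: the
// <4 x half> values are reinterpreted through <8 x i8> into <4 x i16> and
// stored via the v4i16 intrinsic.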
// CHECK-LABEL: define void @test_vst3_f16(half* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x half>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
// CHECK:   ret void
void test_vst3_f16(float16_t * a, float16x4x3_t b) {
  vst3_f16(a, b);
}

// CHECK-LABEL: define void @test_vst3_f32(float* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x float>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 4)
// CHECK:   ret void
void test_vst3_f32(float32_t * a, float32x2x3_t b) {
  vst3_f32(a, b);
}

// CHECK-LABEL: define void @test_vst3_p8(i8* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
// CHECK:   ret void
void test_vst3_p8(poly8_t * a, poly8x8x3_t b) {
  vst3_p8(a, b);
}

// CHECK-LABEL: define void @test_vst3_p16(i16* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
// CHECK:   ret void
void test_vst3_p16(poly16_t * a, poly16x4x3_t b) {
  vst3_p16(a, b);
}


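// The *_lane variants below lower to @llvm.arm.neon.vst3lane, which takes
// the immediate lane index as an extra i32 operand in front of the trailing
// alignment argument.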
// CHECK-LABEL: define void @test_vst3q_lane_u16(i16* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
// CHECK:   ret void
void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) {
  vst3q_lane_u16(a, b, 7);
}

// CHECK-LABEL: define void @test_vst3q_lane_u32(i32* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
// CHECK:   ret void
void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) {
  vst3q_lane_u32(a, b, 3);
}

// CHECK-LABEL: define void @test_vst3q_lane_s16(i16* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
// CHECK:   ret void
void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) {
  vst3q_lane_s16(a, b, 7);
}

// CHECK-LABEL: define void @test_vst3q_lane_s32(i32* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
// CHECK:   ret void
void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) {
  vst3q_lane_s32(a, b, 3);
}

// CHECK-LABEL: define void @test_vst3q_lane_f16(half* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
// CHECK:   ret void
void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) {
  vst3q_lane_f16(a, b, 7);
}

// CHECK-LABEL: define void @test_vst3q_lane_f32(float* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 3, i32 4)
// CHECK:   ret void
void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) {
  vst3q_lane_f32(a, b, 3);
}

// CHECK-LABEL: define void @test_vst3q_lane_p16(i16* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
// CHECK:   ret void
void test_vst3q_lane_p16(poly16_t * a, poly16x8x3_t b) {
  vst3q_lane_p16(a, b, 7);
}

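// d-register lane variants: the lane immediate must stay in range for the
// element count, so these tests use the highest valid index (7 for
// <8 x i8>, 3 for <4 x i16>, 1 for <2 x i32>).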
   20264 // CHECK-LABEL: define void @test_vst3_lane_u8(i8* %a, [3 x i64] %b.coerce) #0 {
   20265 // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
   20266 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
   20267 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
   20268 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
   20269 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
   20270 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
   20271 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
   20272 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
   20273 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
   20274 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
   20275 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
   20276 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
   20277 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
   20278 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
   20279 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
   20280 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
   20281 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
   20282 // CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
   20283 // CHECK:   ret void
   20284 void test_vst3_lane_u8(uint8_t * a, uint8x8x3_t b) {
   20285   vst3_lane_u8(a, b, 7);
   20286 }
   20287 
// CHECK-LABEL: define void @test_vst3_lane_u16(i16* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
// CHECK:   ret void
void test_vst3_lane_u16(uint16_t * a, uint16x4x3_t b) {
  vst3_lane_u16(a, b, 3);
}

// CHECK-LABEL: define void @test_vst3_lane_u32(i32* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
// CHECK:   ret void
void test_vst3_lane_u32(uint32_t * a, uint32x2x3_t b) {
  vst3_lane_u32(a, b, 1);
}

// CHECK-LABEL: define void @test_vst3_lane_s8(i8* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
// CHECK:   ret void
void test_vst3_lane_s8(int8_t * a, int8x8x3_t b) {
  vst3_lane_s8(a, b, 7);
}

// CHECK-LABEL: define void @test_vst3_lane_s16(i16* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
// CHECK:   ret void
void test_vst3_lane_s16(int16_t * a, int16x4x3_t b) {
  vst3_lane_s16(a, b, 3);
}

// CHECK-LABEL: define void @test_vst3_lane_s32(i32* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
// CHECK:   ret void
void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) {
  vst3_lane_s32(a, b, 1);
}

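// There is no half-typed vst3lane intrinsic: the f16 checks below show the
// <4 x half> payloads being bitcast to <4 x i16> and stored through
// @llvm.arm.neon.vst3lane.p0i8.v4i16.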
// CHECK-LABEL: define void @test_vst3_lane_f16(half* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x half>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
// CHECK:   ret void
void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) {
  vst3_lane_f16(a, b, 3);
}

// CHECK-LABEL: define void @test_vst3_lane_f32(float* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x float>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 1, i32 4)
// CHECK:   ret void
void test_vst3_lane_f32(float32_t * a, float32x2x3_t b) {
  vst3_lane_f32(a, b, 1);
}

// CHECK-LABEL: define void @test_vst3_lane_p8(i8* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
// CHECK:   ret void
void test_vst3_lane_p8(poly8_t * a, poly8x8x3_t b) {
  vst3_lane_p8(a, b, 7);
}

// CHECK-LABEL: define void @test_vst3_lane_p16(i16* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
// CHECK:   ret void
void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) {
  vst3_lane_p16(a, b, 3);
}


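// vst4q tests: the quad-register forms coerce the four-vector struct to
// [8 x i64] and copy 64 bytes at 16-byte alignment before calling
// @llvm.arm.neon.vst4; the trailing i32 is again the alignment inferred
// for the element type.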
// CHECK-LABEL: define void @test_vst4q_u8(i8* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
// CHECK:   ret void
void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) {
  vst4q_u8(a, b);
}

// CHECK-LABEL: define void @test_vst4q_u16(i16* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
// CHECK:   ret void
void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) {
  vst4q_u16(a, b);
}

// CHECK-LABEL: define void @test_vst4q_u32(i32* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
// CHECK:   ret void
void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) {
  vst4q_u32(a, b);
}

// CHECK-LABEL: define void @test_vst4q_s8(i8* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
// CHECK:   ret void
void test_vst4q_s8(int8_t * a, int8x16x4_t b) {
  vst4q_s8(a, b);
}

// CHECK-LABEL: define void @test_vst4q_s16(i16* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
// CHECK:   ret void
void test_vst4q_s16(int16_t * a, int16x8x4_t b) {
  vst4q_s16(a, b);
}

// CHECK-LABEL: define void @test_vst4q_s32(i32* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
// CHECK:   ret void
void test_vst4q_s32(int32_t * a, int32x4x4_t b) {
  vst4q_s32(a, b);
}

// CHECK-LABEL: define void @test_vst4q_f16(half* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
// CHECK:   ret void
void test_vst4q_f16(float16_t * a, float16x8x4_t b) {
  vst4q_f16(a, b);
}

// CHECK-LABEL: define void @test_vst4q_f32(float* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 4)
// CHECK:   ret void
void test_vst4q_f32(float32_t * a, float32x4x4_t b) {
  vst4q_f32(a, b);
}

// CHECK-LABEL: define void @test_vst4q_p8(i8* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
// CHECK:   ret void
void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) {
  vst4q_p8(a, b);
}

// CHECK-LABEL: define void @test_vst4q_p16(i16* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
// CHECK:   ret void
void test_vst4q_p16(poly16_t * a, poly16x8x4_t b) {
  vst4q_p16(a, b);
}

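// vst4 tests: the 64-bit (d-register) forms use a [4 x i64] coercion and a
// 32-byte, 8-byte-aligned memcpy, but otherwise mirror the vst4q lowering
// above.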
   20887 // CHECK-LABEL: define void @test_vst4_u8(i8* %a, [4 x i64] %b.coerce) #0 {
   20888 // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
   20889 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
   20890 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
   20891 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
   20892 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
   20893 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
   20894 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
   20895 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
   20896 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
   20897 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
   20898 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
   20899 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
   20900 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
   20901 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
   20902 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
   20903 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
   20904 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
   20905 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
   20906 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
   20907 // CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
   20908 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
   20909 // CHECK:   ret void
   20910 void test_vst4_u8(uint8_t * a, uint8x8x4_t b) {
   20911   vst4_u8(a, b);
   20912 }
   20913 
   20914 // CHECK-LABEL: define void @test_vst4_u16(i16* %a, [4 x i64] %b.coerce) #0 {
   20915 // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
   20916 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
   20917 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
   20918 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
   20919 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
   20920 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
   20921 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
   20922 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
   20923 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
   20924 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
   20925 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
   20926 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
   20927 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
   20928 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
   20929 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
   20930 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
   20931 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
   20932 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
   20933 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
   20934 // CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
   20935 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
   20936 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
   20937 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
   20938 // CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
   20939 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
   20940 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
// CHECK:   ret void
void test_vst4_u16(uint16_t * a, uint16x4x4_t b) {
  vst4_u16(a, b);
}

// CHECK-LABEL: define void @test_vst4_u32(i32* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4)
// CHECK:   ret void
void test_vst4_u32(uint32_t * a, uint32x2x4_t b) {
  vst4_u32(a, b);
}

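// Editorial note: the checks above show the lowering shared by all of the
// d-register vst4 tests in this file. The 32-byte homogeneous aggregate
// (e.g. uint32x2x4_t) arrives coerced to [4 x i64], is spilled into the
// [[B]] alloca, copied into the local [[__S1]] temporary via llvm.memcpy,
// and each member vector is reloaded and (for non-8-bit element types)
// bitcast before feeding the @llvm.arm.neon.vst4 intrinsic. A minimal
// caller sketch (hypothetical, not part of the FileCheck-verified output):
//
//   uint32x2x4_t v = vld4_u32(src);  // de-interleaving load of 8 u32s
//   vst4_u32(dst, v);                // re-interleaving store tested above
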
// CHECK-LABEL: define void @test_vst4_u64(i64* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <1 x i64>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4)
// CHECK:   ret void
void test_vst4_u64(uint64_t * a, uint64x1x4_t b) {
  vst4_u64(a, b);
}

// CHECK-LABEL: define void @test_vst4_s8(i8* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
// CHECK:   ret void
void test_vst4_s8(int8_t * a, int8x8x4_t b) {
  vst4_s8(a, b);
}

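// Editorial note: the 8-bit variants are the simplest case. The members are
// already <8 x i8> and the destination pointer is already i8*, so no
// bitcasts are needed and the loaded vectors feed
// @llvm.arm.neon.vst4.p0i8.v8i8 directly with a 1-byte alignment operand.
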
// CHECK-LABEL: define void @test_vst4_s16(i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
// CHECK:   ret void
void test_vst4_s16(int16_t * a, int16x4x4_t b) {
  vst4_s16(a, b);
}

// CHECK-LABEL: define void @test_vst4_s32(i32* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4)
// CHECK:   ret void
void test_vst4_s32(int32_t * a, int32x2x4_t b) {
  vst4_s32(a, b);
}

// CHECK-LABEL: define void @test_vst4_s64(i64* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <1 x i64>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4)
// CHECK:   ret void
void test_vst4_s64(int64_t * a, int64x1x4_t b) {
  vst4_s64(a, b);
}

// CHECK-LABEL: define void @test_vst4_f16(half* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x half>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
// CHECK:   ret void
void test_vst4_f16(float16_t * a, float16x4x4_t b) {
  vst4_f16(a, b);
}

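// Editorial note: in the f16 test above the <4 x half> members are routed
// through <8 x i8> and then to <4 x i16> before the call, so the store is
// checked against the v4i16 flavour of the intrinsic; presumably the
// intrinsic is selected on element bit width rather than on a half-precision
// vector type here, with the bitcasts preserving the raw bit pattern.
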
// CHECK-LABEL: define void @test_vst4_f32(float* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x float>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 4)
// CHECK:   ret void
void test_vst4_f32(float32_t * a, float32x2x4_t b) {
  vst4_f32(a, b);
}

// CHECK-LABEL: define void @test_vst4_p8(i8* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
// CHECK:   ret void
void test_vst4_p8(poly8_t * a, poly8x8x4_t b) {
  vst4_p8(a, b);
}

// CHECK-LABEL: define void @test_vst4_p16(i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
// CHECK:   ret void
void test_vst4_p16(poly16_t * a, poly16x4x4_t b) {
  vst4_p16(a, b);
}

// CHECK-LABEL: define void @test_vst4q_lane_u16(i16* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
// CHECK:   ret void
void test_vst4q_lane_u16(uint16_t * a, uint16x8x4_t b) {
  vst4q_lane_u16(a, b, 7);
}

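// Editorial note: for the vst4lane checks, the last two i32 operands of
// @llvm.arm.neon.vst4lane are the lane index and the alignment in bytes, so
// "i32 7, i32 2" above stores lane 7 of each <8 x i16> member with 2-byte
// alignment. A hypothetical call mirroring the test:
//
//   vst4q_lane_u16(p, q, 7);  // 7 is the highest valid lane for 8 x 16-bit
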
// CHECK-LABEL: define void @test_vst4q_lane_u32(i32* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4)
// CHECK:   ret void
void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) {
  vst4q_lane_u32(a, b, 3);
}

// CHECK-LABEL: define void @test_vst4q_lane_s16(i16* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
// CHECK:   ret void
void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) {
  vst4q_lane_s16(a, b, 7);
}

// CHECK-LABEL: define void @test_vst4q_lane_s32(i32* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4)
// CHECK:   ret void
void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) {
  vst4q_lane_s32(a, b, 3);
}

// CHECK-LABEL: define void @test_vst4q_lane_f16(half* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
// CHECK:   ret void
void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
  vst4q_lane_f16(a, b, 7);
}

// CHECK-LABEL: define void @test_vst4q_lane_f32(float* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 3, i32 4)
// CHECK:   ret void
void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) {
  vst4q_lane_f32(a, b, 3);
}

// CHECK-LABEL: define void @test_vst4q_lane_p16(i16* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
// CHECK:   ret void
void test_vst4q_lane_p16(poly16_t * a, poly16x8x4_t b) {
  vst4q_lane_p16(a, b, 7);
}

// CHECK-LABEL: define void @test_vst4_lane_u8(i8* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
// CHECK:   ret void
void test_vst4_lane_u8(uint8_t * a, uint8x8x4_t b) {
  vst4_lane_u8(a, b, 7);
}

// CHECK-LABEL: define void @test_vst4_lane_u16(i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
// CHECK:   ret void
void test_vst4_lane_u16(uint16_t * a, uint16x4x4_t b) {
  vst4_lane_u16(a, b, 3);
}

// CHECK-LABEL: define void @test_vst4_lane_u32(i32* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4)
// CHECK:   ret void
void test_vst4_lane_u32(uint32_t * a, uint32x2x4_t b) {
  vst4_lane_u32(a, b, 1);
}

// CHECK-LABEL: define void @test_vst4_lane_s8(i8* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
// CHECK:   ret void
void test_vst4_lane_s8(int8_t * a, int8x8x4_t b) {
  vst4_lane_s8(a, b, 7);
}

// CHECK-LABEL: define void @test_vst4_lane_s16(i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
   21679 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
   21680 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
   21681 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
   21682 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
   21683 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
   21684 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
   21685 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
   21686 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
   21687 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
   21688 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
   21689 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
   21690 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
   21691 // CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
   21692 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
   21693 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
   21694 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
   21695 // CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
   21696 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
   21697 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
   21698 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
   21699 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
   21700 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
   21701 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
   21702 // CHECK:   ret void
   21703 void test_vst4_lane_s16(int16_t * a, int16x4x4_t b) {
   21704   vst4_lane_s16(a, b, 3);
   21705 }
   21706 
   21707 // CHECK-LABEL: define void @test_vst4_lane_s32(i32* %a, [4 x i64] %b.coerce) #0 {
   21708 // CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
   21709 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
   21710 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
   21711 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
   21712 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
   21713 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
   21714 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
   21715 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
   21716 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
   21717 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
   21718 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
   21719 // CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
   21720 // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
   21721 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
   21722 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
   21723 // CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
   21724 // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
   21725 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
   21726 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
   21727 // CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
   21728 // CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
   21729 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
   21730 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
   21731 // CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
   21732 // CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
   21733 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
   21734 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
   21735 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
   21736 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
   21737 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4)
   21738 // CHECK:   ret void
   21739 void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) {
   21740   vst4_lane_s32(a, b, 1);
   21741 }
   21742 
   21743 // CHECK-LABEL: define void @test_vst4_lane_f16(half* %a, [4 x i64] %b.coerce) #0 {
   21744 // CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
   21745 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
   21746 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
   21747 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x half>]* [[COERCE_DIVE]] to [4 x i64]*
   21748 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
   21749 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
   21750 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
   21751 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
   21752 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
   21753 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
   21754 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
   21755 // CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
   21756 // CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
   21757 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
   21758 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i32 0, i32 1
   21759 // CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
   21760 // CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
   21761 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
   21762 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i32 0, i32 2
   21763 // CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
   21764 // CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
   21765 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
   21766 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
   21767 // CHECK:   [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
   21768 // CHECK:   [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
   21769 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
   21770 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
   21771 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
   21772 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
   21773 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
   21774 // CHECK:   ret void
   21775 void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) {
   21776   vst4_lane_f16(a, b, 3);
   21777 }
   21778 
   21779 // CHECK-LABEL: define void @test_vst4_lane_f32(float* %a, [4 x i64] %b.coerce) #0 {
   21780 // CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
   21781 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
   21782 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
   21783 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x float>]* [[COERCE_DIVE]] to [4 x i64]*
   21784 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
   21785 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
   21786 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
   21787 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
   21788 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
   21789 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
   21790 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
   21791 // CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
   21792 // CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
   21793 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
   21794 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i32 0, i32 1
   21795 // CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
   21796 // CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
   21797 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
   21798 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i32 0, i32 2
   21799 // CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
   21800 // CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
   21801 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
   21802 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i32 0, i32 3
   21803 // CHECK:   [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
   21804 // CHECK:   [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
   21805 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
   21806 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
   21807 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
   21808 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
   21809 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 1, i32 4)
   21810 // CHECK:   ret void
   21811 void test_vst4_lane_f32(float32_t * a, float32x2x4_t b) {
   21812   vst4_lane_f32(a, b, 1);
   21813 }
   21814 
   21815 // CHECK-LABEL: define void @test_vst4_lane_p8(i8* %a, [4 x i64] %b.coerce) #0 {
   21816 // CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
   21817 // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
   21818 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
   21819 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
   21820 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
   21821 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
   21822 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
   21823 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
   21824 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
   21825 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
   21826 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
   21827 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
   21828 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
   21829 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
   21830 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
   21831 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
   21832 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
   21833 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
   21834 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
   21835 // CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
   21836 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
   21837 // CHECK:   ret void
   21838 void test_vst4_lane_p8(poly8_t * a, poly8x8x4_t b) {
   21839   vst4_lane_p8(a, b, 7);
   21840 }
   21841 
   21842 // CHECK-LABEL: define void @test_vst4_lane_p16(i16* %a, [4 x i64] %b.coerce) #0 {
   21843 // CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
   21844 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
   21845 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
   21846 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
   21847 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
   21848 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
   21849 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
   21850 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
   21851 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
   21852 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
   21853 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
   21854 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
   21855 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
   21856 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
   21857 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
   21858 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
   21859 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
   21860 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
   21861 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
   21862 // CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
   21863 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
   21864 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
   21865 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
   21866 // CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
   21867 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
   21868 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
   21869 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
   21870 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
   21871 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
   21872 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
   21873 // CHECK:   ret void
   21874 void test_vst4_lane_p16(poly16_t * a, poly16x4x4_t b) {
   21875   vst4_lane_p16(a, b, 3);
   21876 }
   21877 
   21878 
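// vsub/vsubq: element-wise vector subtraction. As the checks below show,
// these intrinsics lower directly to plain LLVM sub/fsub instructions rather
// than to a target-specific intrinsic call.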
// CHECK-LABEL: define <8 x i8> @test_vsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[SUB_I]]
int8x8_t test_vsub_s8(int8x8_t a, int8x8_t b) {
  return vsub_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[SUB_I]]
int16x4_t test_vsub_s16(int16x4_t a, int16x4_t b) {
  return vsub_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[SUB_I]]
int32x2_t test_vsub_s32(int32x2_t a, int32x2_t b) {
  return vsub_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vsub_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[SUB_I]]
int64x1_t test_vsub_s64(int64x1_t a, int64x1_t b) {
  return vsub_s64(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vsub_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, %b
// CHECK:   ret <2 x float> [[SUB_I]]
float32x2_t test_vsub_f32(float32x2_t a, float32x2_t b) {
  return vsub_f32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[SUB_I]]
uint8x8_t test_vsub_u8(uint8x8_t a, uint8x8_t b) {
  return vsub_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[SUB_I]]
uint16x4_t test_vsub_u16(uint16x4_t a, uint16x4_t b) {
  return vsub_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[SUB_I]]
uint32x2_t test_vsub_u32(uint32x2_t a, uint32x2_t b) {
  return vsub_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vsub_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[SUB_I]]
uint64x1_t test_vsub_u64(uint64x1_t a, uint64x1_t b) {
  return vsub_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[SUB_I]]
int8x16_t test_vsubq_s8(int8x16_t a, int8x16_t b) {
  return vsubq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[SUB_I]]
int16x8_t test_vsubq_s16(int16x8_t a, int16x8_t b) {
  return vsubq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vsubq_s32(int32x4_t a, int32x4_t b) {
  return vsubq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[SUB_I]]
int64x2_t test_vsubq_s64(int64x2_t a, int64x2_t b) {
  return vsubq_s64(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vsubq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, %b
// CHECK:   ret <4 x float> [[SUB_I]]
float32x4_t test_vsubq_f32(float32x4_t a, float32x4_t b) {
  return vsubq_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[SUB_I]]
uint8x16_t test_vsubq_u8(uint8x16_t a, uint8x16_t b) {
  return vsubq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[SUB_I]]
uint16x8_t test_vsubq_u16(uint16x8_t a, uint16x8_t b) {
  return vsubq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[SUB_I]]
uint32x4_t test_vsubq_u32(uint32x4_t a, uint32x4_t b) {
  return vsubq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[SUB_I]]
uint64x2_t test_vsubq_u64(uint64x2_t a, uint64x2_t b) {
  return vsubq_u64(a, b);
}

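// vsubhn: subtract-and-narrow-high. The IR pattern is a wide sub followed by
// a logical shift right by half the element width and a trunc, i.e. only the
// high half of each difference is kept.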
// CHECK-LABEL: define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
// CHECK:   [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VSUBHN2_I]]
int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) {
  return vsubhn_s16(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
// CHECK:   [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
// CHECK:   [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VSUBHN2_I]]
int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) {
  return vsubhn_s32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
// CHECK:   [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
// CHECK:   [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VSUBHN2_I]]
int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) {
  return vsubhn_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
// CHECK:   [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VSUBHN2_I]]
uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) {
  return vsubhn_u16(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
// CHECK:   [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
// CHECK:   [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VSUBHN2_I]]
uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) {
  return vsubhn_u32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
// CHECK:   [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
// CHECK:   [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VSUBHN2_I]]
uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) {
  return vsubhn_u64(a, b);
}

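// vsubl: long (widening) subtraction. Both narrow operands are first extended
// to the double-width element type (sext for the signed variants, zext for
// the unsigned ones) and then subtracted.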
// CHECK-LABEL: define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
  return vsubl_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
  return vsubl_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64>
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
  return vsubl_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
  return vsubl_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
  return vsubl_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) {
  return vsubl_u32(a, b);
}

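// vsubw: wide subtraction. Only the second, narrow operand is extended
// (sext/zext as appropriate) before being subtracted from the already-wide
// first operand.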
// CHECK-LABEL: define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
  return vsubw_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) {
  return vsubw_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) {
  return vsubw_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) {
  return vsubw_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) {
  return vsubw_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) {
  return vsubw_u32(a, b);
}

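// vtbl1: single-register table lookup, lowered to @llvm.arm.neon.vtbl1.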
// CHECK-LABEL: define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL1_I]]
uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
  return vtbl1_u8(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL1_I]]
int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
  return vtbl1_s8(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL1_I]]
poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
  return vtbl1_p8(a, b);
}

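// vtbl2: two-register table lookup. The [2 x i64]-coerced struct argument is
// spilled to a temporary, the two <8 x i8> table registers are reloaded from
// it, and the lookup is performed by @llvm.arm.neon.vtbl2.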
// CHECK-LABEL: define <8 x i8> @test_vtbl2_u8([2 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[A]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL2_I]]
uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
  return vtbl2_u8(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtbl2_s8([2 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[A]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL2_I]]
int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
  return vtbl2_s8(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtbl2_p8([2 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[A]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL2_I]]
poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
  return vtbl2_p8(a, b);
}

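// vtbl3: as above, but with a three-register table and @llvm.arm.neon.vtbl3.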
// CHECK-LABEL: define <8 x i8> @test_vtbl3_u8([3 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[A]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL3_I]]
uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
  return vtbl3_u8(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtbl3_s8([3 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[A]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL3_I]]
int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
  return vtbl3_s8(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtbl3_p8([3 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[A]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL3_I]]
poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
  return vtbl3_p8(a, b);
}

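// vtbl4: as above, but with a four-register table and @llvm.arm.neon.vtbl4.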
// CHECK-LABEL: define <8 x i8> @test_vtbl4_u8([4 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[A]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL4_I]]
uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
  return vtbl4_u8(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtbl4_s8([4 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[A]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL4_I]]
int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
  return vtbl4_s8(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtbl4_p8([4 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[A]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
   22466 // CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
   22467 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
   22468 // CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
   22469 // CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
   22470 // CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
   22471 // CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
   22472 // CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
   22473 // CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
   22474 // CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b) #4
   22475 // CHECK:   ret <8 x i8> [[VTBL4_I]]
   22476 poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
   22477   return vtbl4_p8(a, b);
   22478 }
   22479 
   22480 
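// Note on the vtbx family below: it differs from vtbl in its out-of-range
// behaviour. Where vtbl zeroes a result lane whose index in the final operand
// falls outside the table, vtbx leaves the corresponding lane of the first
// operand (%a) intact. Each vtbxN form lowers to the matching
// @llvm.arm.neon.vtbxN intrinsic, as the checks below verify.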
// CHECK-LABEL: define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX1_I]]
uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vtbx1_u8(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX1_I]]
int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
  return vtbx1_s8(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX1_I]]
poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) {
  return vtbx1_p8(a, b, c);
}

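// From vtbx2 onward the table argument is a homogeneous NEON struct
// (uint8x8x2_t etc.), which the apcs-gnu lowering passes coerced to an
// [N x i64] array. The checks below trace that round trip: the coerced
// argument is spilled to an alloca, reloaded, copied into the inlined
// intrinsic's parameter slot, and each <8 x i8> table register is loaded
// back out before the @llvm.arm.neon.vtbxN call.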
// CHECK-LABEL: define <8 x i8> @test_vtbx2_u8(<8 x i8> %a, [2 x i64] %b.coerce, <8 x i8> %c) #0 {
// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX2_I]]
uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
  return vtbx2_u8(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vtbx2_s8(<8 x i8> %a, [2 x i64] %b.coerce, <8 x i8> %c) #0 {
// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX2_I]]
int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
  return vtbx2_s8(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vtbx2_p8(<8 x i8> %a, [2 x i64] %b.coerce, <8 x i8> %c) #0 {
// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX2_I]]
poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
  return vtbx2_p8(a, b, c);
}


// CHECK-LABEL: define <8 x i8> @test_vtbx3_u8(<8 x i8> %a, [3 x i64] %b.coerce, <8 x i8> %c) #0 {
// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX3_I]]
uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
  return vtbx3_u8(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vtbx3_s8(<8 x i8> %a, [3 x i64] %b.coerce, <8 x i8> %c) #0 {
// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX3_I]]
int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
  return vtbx3_s8(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vtbx3_p8(<8 x i8> %a, [3 x i64] %b.coerce, <8 x i8> %c) #0 {
// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX3_I]]
poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
  return vtbx3_p8(a, b, c);
}


// CHECK-LABEL: define <8 x i8> @test_vtbx4_u8(<8 x i8> %a, [4 x i64] %b.coerce, <8 x i8> %c) #0 {
// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX4_I]]
uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
  return vtbx4_u8(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vtbx4_s8(<8 x i8> %a, [4 x i64] %b.coerce, <8 x i8> %c) #0 {
// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX4_I]]
int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
  return vtbx4_s8(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vtbx4_p8(<8 x i8> %a, [4 x i64] %b.coerce, <8 x i8> %c) #0 {
// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX4_I]]
poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
  return vtbx4_p8(a, b, c);
}

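// vtrn returns both transposition results at once, so each test below yields
// a two-element struct through an sret pointer. The first shufflevector mask
// interleaves the even lanes of %a and %b, the second the odd lanes; the two
// results are stored to consecutive slots of the local struct, which is then
// memcpy'd into %agg.result.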
// CHECK-LABEL: define void @test_vtrn_s8(%struct.int8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
  return vtrn_s8(a, b);
}

// CHECK-LABEL: define void @test_vtrn_s16(%struct.int16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
  return vtrn_s16(a, b);
}

// CHECK-LABEL: define void @test_vtrn_s32(%struct.int32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
  return vtrn_s32(a, b);
}

// CHECK-LABEL: define void @test_vtrn_u8(%struct.uint8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
  return vtrn_u8(a, b);
}

// CHECK-LABEL: define void @test_vtrn_u16(%struct.uint16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
  return vtrn_u16(a, b);
}

// CHECK-LABEL: define void @test_vtrn_u32(%struct.uint32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
  return vtrn_u32(a, b);
}

// CHECK-LABEL: define void @test_vtrn_f32(%struct.float32x2x2_t* noalias sret %agg.result, <2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
// CHECK:   store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
// CHECK:   store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
  return vtrn_f32(a, b);
}

// CHECK-LABEL: define void @test_vtrn_p8(%struct.poly8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
  return vtrn_p8(a, b);
}

// CHECK-LABEL: define void @test_vtrn_p16(%struct.poly16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
  return vtrn_p16(a, b);
}

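// The q-register variants below repeat the same pattern on 128-bit vectors:
// note the 16-byte alignment of the alloca and the 32-byte result memcpy.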
// CHECK-LABEL: define void @test_vtrnq_s8(%struct.int8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
  return vtrnq_s8(a, b);
}

// CHECK-LABEL: define void @test_vtrnq_s16(%struct.int16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
  return vtrnq_s16(a, b);
}

// CHECK-LABEL: define void @test_vtrnq_s32(%struct.int32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
  return vtrnq_s32(a, b);
}

// CHECK-LABEL: define void @test_vtrnq_u8(%struct.uint8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
  return vtrnq_u8(a, b);
}

// CHECK-LABEL: define void @test_vtrnq_u16(%struct.uint16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
  return vtrnq_u16(a, b);
}

// CHECK-LABEL: define void @test_vtrnq_u32(%struct.uint32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
  return vtrnq_u32(a, b);
}

// CHECK-LABEL: define void @test_vtrnq_f32(%struct.float32x4x2_t* noalias sret %agg.result, <4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK:   store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK:   store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
  return vtrnq_f32(a, b);
}

// CHECK-LABEL: define void @test_vtrnq_p8(%struct.poly8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
  return vtrnq_p8(a, b);
}

// CHECK-LABEL: define void @test_vtrnq_p16(%struct.poly16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
  return vtrnq_p16(a, b);
}


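// vtst has no dedicated LLVM intrinsic; it is open-coded as (a & b) != 0,
// with the <N x i1> comparison result sign-extended so every lane becomes
// either all-ones or all-zero.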
// CHECK-LABEL: define <8 x i8> @test_vtst_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = and <8 x i8> %a, %b
// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VTST_I]]
uint8x8_t test_vtst_s8(int8x8_t a, int8x8_t b) {
  return vtst_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vtst_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VTST_I]]
uint16x4_t test_vtst_s16(int16x4_t a, int16x4_t b) {
  return vtst_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vtst_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VTST_I]]
uint32x2_t test_vtst_s32(int32x2_t a, int32x2_t b) {
  return vtst_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtst_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = and <8 x i8> %a, %b
// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VTST_I]]
uint8x8_t test_vtst_u8(uint8x8_t a, uint8x8_t b) {
  return vtst_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vtst_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VTST_I]]
uint16x4_t test_vtst_u16(uint16x4_t a, uint16x4_t b) {
  return vtst_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vtst_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VTST_I]]
uint32x2_t test_vtst_u32(uint32x2_t a, uint32x2_t b) {
  return vtst_u32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtst_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = and <8 x i8> %a, %b
// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VTST_I]]
uint8x8_t test_vtst_p8(poly8x8_t a, poly8x8_t b) {
  return vtst_p8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vtst_p16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VTST_I]]
uint16x4_t test_vtst_p16(poly16x4_t a, poly16x4_t b) {
  return vtst_p16(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vtstq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = and <16 x i8> %a, %b
// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
// CHECK:   ret <16 x i8> [[VTST_I]]
uint8x16_t test_vtstq_s8(int8x16_t a, int8x16_t b) {
  return vtstq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vtstq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
// CHECK:   ret <8 x i16> [[VTST_I]]
uint16x8_t test_vtstq_s16(int16x8_t a, int16x8_t b) {
  return vtstq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vtstq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VTST_I]]
uint32x4_t test_vtstq_s32(int32x4_t a, int32x4_t b) {
  return vtstq_s32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vtstq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = and <16 x i8> %a, %b
// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
// CHECK:   ret <16 x i8> [[VTST_I]]
uint8x16_t test_vtstq_u8(uint8x16_t a, uint8x16_t b) {
  return vtstq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vtstq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
// CHECK:   ret <8 x i16> [[VTST_I]]
uint16x8_t test_vtstq_u16(uint16x8_t a, uint16x8_t b) {
  return vtstq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vtstq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VTST_I]]
uint32x4_t test_vtstq_u32(uint32x4_t a, uint32x4_t b) {
  return vtstq_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vtstq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = and <16 x i8> %a, %b
// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
// CHECK:   ret <16 x i8> [[VTST_I]]
uint8x16_t test_vtstq_p8(poly8x16_t a, poly8x16_t b) {
  return vtstq_p8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vtstq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
// CHECK:   ret <8 x i16> [[VTST_I]]
uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) {
  return vtstq_p16(a, b);
}


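// vuzp (unzip) de-interleaves its operands: the first vector of the result
// collects the even-indexed lanes of the concatenation (a, b) and the second
// collects the odd-indexed lanes, matching the shufflevector masks checked
// below. The two-vector result struct is returned indirectly through the
// sret pointer under this ABI, hence the final memcpy into %agg.result.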
// CHECK-LABEL: define void @test_vuzp_s8(%struct.int8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
  return vuzp_s8(a, b);
}

// CHECK-LABEL: define void @test_vuzp_s16(%struct.int16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
  return vuzp_s16(a, b);
}

// CHECK-LABEL: define void @test_vuzp_s32(%struct.int32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
  return vuzp_s32(a, b);
}

// CHECK-LABEL: define void @test_vuzp_u8(%struct.uint8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
  return vuzp_u8(a, b);
}

// CHECK-LABEL: define void @test_vuzp_u16(%struct.uint16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
  return vuzp_u16(a, b);
}

// CHECK-LABEL: define void @test_vuzp_u32(%struct.uint32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
  return vuzp_u32(a, b);
}

// CHECK-LABEL: define void @test_vuzp_f32(%struct.float32x2x2_t* noalias sret %agg.result, <2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
// CHECK:   store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
// CHECK:   store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
  return vuzp_f32(a, b);
}

// CHECK-LABEL: define void @test_vuzp_p8(%struct.poly8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
  return vuzp_p8(a, b);
}

// CHECK-LABEL: define void @test_vuzp_p16(%struct.poly16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
  return vuzp_p16(a, b);
}

// CHECK-LABEL: define void @test_vuzpq_s8(%struct.int8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
  return vuzpq_s8(a, b);
}

// CHECK-LABEL: define void @test_vuzpq_s16(%struct.int16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
  return vuzpq_s16(a, b);
}

// CHECK-LABEL: define void @test_vuzpq_s32(%struct.int32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
  return vuzpq_s32(a, b);
}

// CHECK-LABEL: define void @test_vuzpq_u8(%struct.uint8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
  return vuzpq_u8(a, b);
}

// CHECK-LABEL: define void @test_vuzpq_u16(%struct.uint16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
  return vuzpq_u16(a, b);
}

// CHECK-LABEL: define void @test_vuzpq_u32(%struct.uint32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
  return vuzpq_u32(a, b);
}

// CHECK-LABEL: define void @test_vuzpq_f32(%struct.float32x4x2_t* noalias sret %agg.result, <4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK:   store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK:   store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
  return vuzpq_f32(a, b);
}

// CHECK-LABEL: define void @test_vuzpq_p8(%struct.poly8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
  return vuzpq_p8(a, b);
}

// CHECK-LABEL: define void @test_vuzpq_p16(%struct.poly16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
  return vuzpq_p16(a, b);
}


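// vzip (interleave) merges lanes pairwise: the first vector of the result
// alternates the low halves of a and b (a0, b0, a1, b1, ...) and the second
// alternates the high halves, matching the shufflevector masks checked below.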
   23644 // CHECK-LABEL: define void @test_vzip_s8(%struct.int8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
   23645 // CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
   23646 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
   23647 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
   23648 // CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   23649 // CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
   23650 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
   23651 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   23652 // CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
   23653 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
   23654 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
   23655 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
   23656 // CHECK:   ret void
   23657 int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
   23658   return vzip_s8(a, b);
   23659 }
   23660 
   23661 // CHECK-LABEL: define void @test_vzip_s16(%struct.int16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
   23662 // CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
   23663 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
   23664 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   23665 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   23666 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
   23667 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   23668 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   23669 // CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   23670 // CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
   23671 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
   23672 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   23673 // CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
   23674 // CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
   23675 // CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
   23676 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
   23677 // CHECK:   ret void
   23678 int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
   23679   return vzip_s16(a, b);
   23680 }
   23681 
   23682 // CHECK-LABEL: define void @test_vzip_s32(%struct.int32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
   23683 // CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
   23684 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
   23685 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   23686 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   23687 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
   23688 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   23689 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   23690 // CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
   23691 // CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]]
   23692 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
   23693 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
   23694 // CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]]
   23695 // CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
   23696 // CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
   23697 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
   23698 // CHECK:   ret void
   23699 int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
   23700   return vzip_s32(a, b);
   23701 }
   23702 
   23703 // CHECK-LABEL: define void @test_vzip_u8(%struct.uint8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
   23704 // CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
   23705 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
   23706 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
   23707 // CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   23708 // CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
   23709 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
   23710 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   23711 // CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
   23712 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
   23713 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
   23714 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
   23715 // CHECK:   ret void
   23716 uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
   23717   return vzip_u8(a, b);
   23718 }
   23719 
   23720 // CHECK-LABEL: define void @test_vzip_u16(%struct.uint16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
   23721 // CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
   23722 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
   23723 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   23724 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   23725 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
   23726 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   23727 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   23728 // CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   23729 // CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
   23730 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
   23731 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   23732 // CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
   23733 // CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
   23734 // CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
   23735 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
   23736 // CHECK:   ret void
   23737 uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
   23738   return vzip_u16(a, b);
   23739 }
   23740 
   23741 // CHECK-LABEL: define void @test_vzip_u32(%struct.uint32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
   23742 // CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
   23743 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
   23744 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   23745 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   23746 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
   23747 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   23748 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   23749 // CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
   23750 // CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]]
   23751 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
   23752 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
   23753 // CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]]
   23754 // CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
   23755 // CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
   23756 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
   23757 // CHECK:   ret void
   23758 uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
   23759   return vzip_u32(a, b);
   23760 }
   23761 
   23762 // CHECK-LABEL: define void @test_vzip_f32(%struct.float32x2x2_t* noalias sret %agg.result, <2 x float> %a, <2 x float> %b) #0 {
   23763 // CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
   23764 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
   23765 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
   23766 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
   23767 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
   23768 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   23769 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
   23770 // CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
   23771 // CHECK:   store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]]
   23772 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
   23773 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
   23774 // CHECK:   store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP6]]
   23775 // CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
   23776 // CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
   23777 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
   23778 // CHECK:   ret void
   23779 float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
   23780   return vzip_f32(a, b);
   23781 }
   23782 
   23783 // CHECK-LABEL: define void @test_vzip_p8(%struct.poly8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
   23784 // CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
   23785 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
   23786 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
   23787 // CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   23788 // CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
   23789 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
   23790 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   23791 // CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
   23792 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
   23793 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
   23794 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
   23795 // CHECK:   ret void
   23796 poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
   23797   return vzip_p8(a, b);
   23798 }
   23799 
   23800 // CHECK-LABEL: define void @test_vzip_p16(%struct.poly16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
   23801 // CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
   23802 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
   23803 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   23804 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   23805 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
   23806 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   23807 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   23808 // CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   23809 // CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
   23810 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
   23811 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   23812 // CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
   23813 // CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
   23814 // CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
   23815 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
   23816 // CHECK:   ret void
   23817 poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
   23818   return vzip_p16(a, b);
   23819 }
   23820 
   23821 // CHECK-LABEL: define void @test_vzipq_s8(%struct.int8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
   23822 // CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
   23823 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
   23824 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
   23825 // CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   23826 // CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
   23827 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
   23828 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   23829 // CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
   23830 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
   23831 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
   23832 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
   23833 // CHECK:   ret void
   23834 int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
   23835   return vzipq_s8(a, b);
   23836 }
   23837 
   23838 // CHECK-LABEL: define void @test_vzipq_s16(%struct.int16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
   23839 // CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
   23840 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
   23841 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   23842 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   23843 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
   23844 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   23845 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
   23846 // CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   23847 // CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
   23848 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
   23849 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   23850 // CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
   23851 // CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
   23852 // CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
   23853 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
   23854 // CHECK:   ret void
   23855 int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
   23856   return vzipq_s16(a, b);
   23857 }
   23858 
   23859 // CHECK-LABEL: define void @test_vzipq_s32(%struct.int32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
   23860 // CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
   23861 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
   23862 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   23863 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   23864 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
   23865 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   23866 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
   23867 // CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   23868 // CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]]
   23869 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
   23870 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   23871 // CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]]
   23872 // CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
   23873 // CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
   23874 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
   23875 // CHECK:   ret void
   23876 int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
   23877   return vzipq_s32(a, b);
   23878 }
   23879 
   23880 // CHECK-LABEL: define void @test_vzipq_u8(%struct.uint8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
   23881 // CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
   23882 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
   23883 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
   23884 // CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   23885 // CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
   23886 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
   23887 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   23888 // CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
   23889 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
   23890 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
   23891 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
   23892 // CHECK:   ret void
   23893 uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
   23894   return vzipq_u8(a, b);
   23895 }
   23896 
// CHECK-LABEL: define void @test_vzipq_u16(%struct.uint16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
  return vzipq_u16(a, b);
}

// CHECK-LABEL: define void @test_vzipq_u32(%struct.uint32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
  return vzipq_u32(a, b);
}

// CHECK-LABEL: define void @test_vzipq_f32(%struct.float32x4x2_t* noalias sret %agg.result, <4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// CHECK:   store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
// CHECK:   store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
  return vzipq_f32(a, b);
}

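// NOTE: The polynomial variants below lower to exactly the same IR as the
// signed and unsigned tests above: zipping is a pure lane permutation, so
// the shuffle masks depend only on lane count, not on element type.
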
// CHECK-LABEL: define void @test_vzipq_p8(%struct.poly8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
  return vzipq_p8(a, b);
}

// CHECK-LABEL: define void @test_vzipq_p16(%struct.poly16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
  return vzipq_p16(a, b);
}