// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN:  -ffp-contract=fast -emit-llvm -o - %s | opt -S -mem2reg \
// RUN:  | FileCheck %s
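// The IR is piped through 'opt -S -mem2reg' so that argument allocas are
// promoted to SSA values before FileCheck matches the patterns below.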

// Test AArch64 NEON intrinsics operating on poly64 types.

#include <arm_neon.h>

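// In the generated IR, poly64x1_t lowers to <1 x i64> and poly64x2_t to
// <2 x i64>, so the CHECK lines below match on those integer vector types.
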
// CHECK-LABEL: define <1 x i64> @test_vceq_p64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[SEXT_I]]
uint64x1_t test_vceq_p64(poly64x1_t a, poly64x1_t b) {
  return vceq_p64(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vceqq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i64> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[SEXT_I]]
uint64x2_t test_vceqq_p64(poly64x2_t a, poly64x2_t b) {
  return vceqq_p64(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vtst_p64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64>
// CHECK:   ret <1 x i64> [[VTST_I]]
uint64x1_t test_vtst_p64(poly64x1_t a, poly64x1_t b) {
  return vtst_p64(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vtstq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64>
// CHECK:   ret <2 x i64> [[VTST_I]]
uint64x2_t test_vtstq_p64(poly64x2_t a, poly64x2_t b) {
  return vtstq_p64(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vbsl_p64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
// CHECK:   [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]]
// CHECK:   [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1>
// CHECK:   [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]]
// CHECK:   [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
// CHECK:   ret <1 x i64> [[VBSL5_I]]
poly64x1_t test_vbsl_p64(poly64x1_t a, poly64x1_t b, poly64x1_t c) {
  return vbsl_p64(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vbslq_p64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
// CHECK:   [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]]
// CHECK:   [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1>
// CHECK:   [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]]
// CHECK:   [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
// CHECK:   ret <2 x i64> [[VBSL5_I]]
poly64x2_t test_vbslq_p64(poly64x2_t a, poly64x2_t b, poly64x2_t c) {
  return vbslq_p64(a, b, c);
}

// CHECK-LABEL: define i64 @test_vget_lane_p64(<1 x i64> %v) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %v to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
// CHECK:   ret i64 [[VGET_LANE]]
poly64_t test_vget_lane_p64(poly64x1_t v) {
  return vget_lane_p64(v, 0);
}

// CHECK-LABEL: define i64 @test_vgetq_lane_p64(<2 x i64> %v) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
// CHECK:   ret i64 [[VGETQ_LANE]]
poly64_t test_vgetq_lane_p64(poly64x2_t v) {
  return vgetq_lane_p64(v, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vset_lane_p64(i64 %a, <1 x i64> %v) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %v to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
// CHECK:   ret <1 x i64> [[VSET_LANE]]
poly64x1_t test_vset_lane_p64(poly64_t a, poly64x1_t v) {
  return vset_lane_p64(a, v, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_p64(i64 %a, <2 x i64> %v) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
// CHECK:   ret <2 x i64> [[VSET_LANE]]
poly64x2_t test_vsetq_lane_p64(poly64_t a, poly64x2_t v) {
  return vsetq_lane_p64(a, v, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vcopy_lane_p64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP3]], i64 [[VGET_LANE]], i32 0
// CHECK:   ret <1 x i64> [[VSET_LANE]]
poly64x1_t test_vcopy_lane_p64(poly64x1_t a, poly64x1_t b) {
  return vcopy_lane_p64(a, 0, b, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGET_LANE]], i32 1
// CHECK:   ret <2 x i64> [[VSET_LANE]]
poly64x2_t test_vcopyq_lane_p64(poly64x2_t a, poly64x1_t b) {
  return vcopyq_lane_p64(a, 1, b, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vcopyq_laneq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGETQ_LANE]], i32 1
// CHECK:   ret <2 x i64> [[VSET_LANE]]
poly64x2_t test_vcopyq_laneq_p64(poly64x2_t a, poly64x2_t b) {
  return vcopyq_laneq_p64(a, 1, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vcreate_p64(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
poly64x1_t test_vcreate_p64(uint64_t a) {
  return vcreate_p64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vdup_n_p64(i64 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
// CHECK:   ret <1 x i64> [[VECINIT_I]]
poly64x1_t test_vdup_n_p64(poly64_t a) {
  return vdup_n_p64(a);
}

// CHECK-LABEL: define <2 x i64> @test_vdupq_n_p64(i64 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
// CHECK:   ret <2 x i64> [[VECINIT1_I]]
poly64x2_t test_vdupq_n_p64(poly64_t a) {
  return vdupq_n_p64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vmov_n_p64(i64 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
// CHECK:   ret <1 x i64> [[VECINIT_I]]
poly64x1_t test_vmov_n_p64(poly64_t a) {
  return vmov_n_p64(a);
}

// CHECK-LABEL: define <2 x i64> @test_vmovq_n_p64(i64 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
// CHECK:   ret <2 x i64> [[VECINIT1_I]]
poly64x2_t test_vmovq_n_p64(poly64_t a) {
  return vmovq_n_p64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vdup_lane_p64(<1 x i64> %vec) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[SHUFFLE]]
poly64x1_t test_vdup_lane_p64(poly64x1_t vec) {
  return vdup_lane_p64(vec, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_p64(<1 x i64> %vec) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <2 x i32> zeroinitializer
// CHECK:   ret <2 x i64> [[SHUFFLE]]
poly64x2_t test_vdupq_lane_p64(poly64x1_t vec) {
  return vdupq_lane_p64(vec, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vdupq_laneq_p64(<2 x i64> %vec) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i64> %vec, <2 x i64> %vec, <2 x i32> <i32 1, i32 1>
// CHECK:   ret <2 x i64> [[SHUFFLE]]
poly64x2_t test_vdupq_laneq_p64(poly64x2_t vec) {
  return vdupq_laneq_p64(vec, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vcombine_p64(<1 x i64> %low, <1 x i64> %high) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
poly64x2_t test_vcombine_p64(poly64x1_t low, poly64x1_t high) {
  return vcombine_p64(low, high);
}

// CHECK-LABEL: define <1 x i64> @test_vld1_p64(i64* %ptr) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %ptr to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
// CHECK:   ret <1 x i64> [[TMP2]]
poly64x1_t test_vld1_p64(poly64_t const * ptr) {
  return vld1_p64(ptr);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_p64(i64* %ptr) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %ptr to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
// CHECK:   ret <2 x i64> [[TMP2]]
poly64x2_t test_vld1q_p64(poly64_t const * ptr) {
  return vld1q_p64(ptr);
}

// CHECK-LABEL: define void @test_vst1_p64(i64* %ptr, <1 x i64> %val) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %ptr to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %val to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]]
// CHECK:   ret void
void test_vst1_p64(poly64_t * ptr, poly64x1_t val) {
  return vst1_p64(ptr, val);
}

// CHECK-LABEL: define void @test_vst1q_p64(i64* %ptr, <2 x i64> %val) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %ptr to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %val to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]]
// CHECK:   ret void
void test_vst1q_p64(poly64_t * ptr, poly64x2_t val) {
  return vst1q_p64(ptr, val);
}

// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_p64(i64* %ptr) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.poly64x1x2_t [[TMP6]]
poly64x1x2_t test_vld2_p64(poly64_t const * ptr) {
  return vld2_p64(ptr);
}

// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_p64(i64* %ptr) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.poly64x2x2_t [[TMP6]]
poly64x2x2_t test_vld2q_p64(poly64_t const * ptr) {
  return vld2q_p64(ptr);
}

// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_p64(i64* %ptr) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
// CHECK:   ret %struct.poly64x1x3_t [[TMP6]]
poly64x1x3_t test_vld3_p64(poly64_t const * ptr) {
  return vld3_p64(ptr);
}

// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_p64(i64* %ptr) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
// CHECK:   ret %struct.poly64x2x3_t [[TMP6]]
poly64x2x3_t test_vld3q_p64(poly64_t const * ptr) {
  return vld3q_p64(ptr);
}

// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_p64(i64* %ptr) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
// CHECK:   ret %struct.poly64x1x4_t [[TMP6]]
poly64x1x4_t test_vld4_p64(poly64_t const * ptr) {
  return vld4_p64(ptr);
}

// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_p64(i64* %ptr) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.poly64x2x4_t [[TMP6]]
poly64x2x4_t test_vld4q_p64(poly64_t const * ptr) {
  return vld4q_p64(ptr);
}

// CHECK-LABEL: define void @test_vst2_p64(i64* %ptr, [2 x <1 x i64>] %val.coerce) #0 {
// CHECK:   [[VAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[VAL]], i32 0, i32 0
// CHECK:   store [2 x <1 x i64>] [[VAL]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[VAL]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL2]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX3]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK:   call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2_p64(poly64_t * ptr, poly64x1x2_t val) {
  return vst2_p64(ptr, val);
}

// CHECK-LABEL: define void @test_vst2q_p64(i64* %ptr, [2 x <2 x i64>] %val.coerce) #0 {
// CHECK:   [[VAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[VAL]], i32 0, i32 0
// CHECK:   store [2 x <2 x i64>] [[VAL]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[VAL]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL2]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX3]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK:   call void @llvm.aarch64.neon.st2.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2q_p64(poly64_t * ptr, poly64x2x2_t val) {
  return vst2q_p64(ptr, val);
}

// CHECK-LABEL: define void @test_vst3_p64(i64* %ptr, [3 x <1 x i64>] %val.coerce) #0 {
// CHECK:   [[VAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[VAL]], i32 0, i32 0
// CHECK:   store [3 x <1 x i64>] [[VAL]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[VAL]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL2]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX3]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL4]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX5]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK:   call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3_p64(poly64_t * ptr, poly64x1x3_t val) {
  return vst3_p64(ptr, val);
}

// CHECK-LABEL: define void @test_vst3q_p64(i64* %ptr, [3 x <2 x i64>] %val.coerce) #0 {
// CHECK:   [[VAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[VAL]], i32 0, i32 0
// CHECK:   store [3 x <2 x i64>] [[VAL]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[VAL]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL2]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX3]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL4]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX5]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK:   call void @llvm.aarch64.neon.st3.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3q_p64(poly64_t * ptr, poly64x2x3_t val) {
  return vst3q_p64(ptr, val);
}

// CHECK-LABEL: define void @test_vst4_p64(i64* %ptr, [4 x <1 x i64>] %val.coerce) #0 {
// CHECK:   [[VAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[VAL]], i32 0, i32 0
// CHECK:   store [4 x <1 x i64>] [[VAL]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[VAL]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL2]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX3]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL4]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX5]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL6:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL6]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX7]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
// CHECK:   call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst4_p64(poly64_t * ptr, poly64x1x4_t val) {
  return vst4_p64(ptr, val);
}

// CHECK-LABEL: define void @test_vst4q_p64(i64* %ptr, [4 x <2 x i64>] %val.coerce) #0 {
// CHECK:   [[VAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[VAL]], i32 0, i32 0
// CHECK:   store [4 x <2 x i64>] [[VAL]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[VAL]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL2]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX3]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL4]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX5]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL6:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL6]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX7]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
// CHECK:   call void @llvm.aarch64.neon.st4.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst4q_p64(poly64_t * ptr, poly64x2x4_t val) {
  return vst4q_p64(ptr, val);
}

// CHECK-LABEL: define <1 x i64> @test_vext_p64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[VEXT]]
poly64x1_t test_vext_p64(poly64x1_t a, poly64x1_t b) {
  return vext_p64(a, b, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vextq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK:   ret <2 x i64> [[VEXT]]
poly64x2_t test_vextq_p64(poly64x2_t a, poly64x2_t b) {
  return vextq_p64(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vzip1q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
poly64x2_t test_vzip1q_p64(poly64x2_t a, poly64x2_t b) {
  return vzip1q_p64(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vzip2q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
poly64x2_t test_vzip2q_p64(poly64x2_t a, poly64x2_t b) {
  return vzip2q_p64(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vuzp1q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
poly64x2_t test_vuzp1q_p64(poly64x2_t a, poly64x2_t b) {
  return vuzp1q_p64(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vuzp2q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
poly64x2_t test_vuzp2q_p64(poly64x2_t a, poly64x2_t b) {
  return vuzp2q_p64(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vtrn1q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
poly64x2_t test_vtrn1q_p64(poly64x2_t a, poly64x2_t b) {
  return vtrn1q_p64(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vtrn2q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
poly64x2_t test_vtrn2q_p64(poly64x2_t a, poly64x2_t b) {
  return vtrn2q_p64(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vsri_n_p64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 33)
// CHECK:   ret <1 x i64> [[VSRI_N2]]
poly64x1_t test_vsri_n_p64(poly64x1_t a, poly64x1_t b) {
  return vsri_n_p64(a, b, 33);
}

// CHECK-LABEL: define <2 x i64> @test_vsriq_n_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 64)
// CHECK:   ret <2 x i64> [[VSRI_N2]]
poly64x2_t test_vsriq_n_p64(poly64x2_t a, poly64x2_t b) {
  return vsriq_n_p64(a, b, 64);
}