; ARM NEON lane-indexed load (vld1/vld2/vld3/vld4 single-lane) codegen tests.
      1 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
      2 ; RUN: llc < %s -march=arm -mattr=+neon -regalloc=basic | FileCheck %s
      3 
      4 define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind {
      5 ;CHECK: vld1lanei8:
      6 ;Check the (default) alignment value.
      7 ;CHECK: vld1.8 {d16[3]}, [r0]
        ; Insert a scalar i8 loaded from %A into lane 3 of the vector loaded
        ; from %B; this should select a lane-indexed vld1.8 with no ":<align>"
        ; suffix on the address operand.
      8 	%tmp1 = load <8 x i8>* %B
      9 	%tmp2 = load i8* %A, align 8
     10 	%tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 3
     11         ret <8 x i8> %tmp3
     12 }
     13 
     14 define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind {
     15 ;CHECK: vld1lanei16:
     16 ;Check the alignment value.  Max for this instruction is 16 bits:
     17 ;CHECK: vld1.16 {d16[2]}, [r0, :16]
        ; The scalar load is marked "align 8", but the emitted alignment
        ; suffix must be capped at 16 bits for a one-lane vld1.16.
     18 	%tmp1 = load <4 x i16>* %B
     19 	%tmp2 = load i16* %A, align 8
     20 	%tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 2
     21         ret <4 x i16> %tmp3
     22 }
     23 
     24 define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind {
     25 ;CHECK: vld1lanei32:
     26 ;Check the alignment value.  Max for this instruction is 32 bits:
     27 ;CHECK: vld1.32 {d16[1]}, [r0, :32]
        ; The scalar load is marked "align 8"; the alignment suffix must be
        ; capped at 32 bits for a one-lane vld1.32.
     28 	%tmp1 = load <2 x i32>* %B
     29 	%tmp2 = load i32* %A, align 8
     30 	%tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
     31         ret <2 x i32> %tmp3
     32 }
     33 
     34 define <2 x float> @vld1lanef(float* %A, <2 x float>* %B) nounwind {
     35 ;CHECK: vld1lanef:
     36 ;CHECK: vld1.32 {d16[1]}, [r0]
        ; Float variant: a 4-byte-aligned float insert into lane 1 should use
        ; vld1.32 with no alignment suffix (align 4 is the element size).
     37 	%tmp1 = load <2 x float>* %B
     38 	%tmp2 = load float* %A, align 4
     39 	%tmp3 = insertelement <2 x float> %tmp1, float %tmp2, i32 1
     40 	ret <2 x float> %tmp3
     41 }
     42 
     43 define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
     44 ;CHECK: vld1laneQi8:
     45 ;CHECK: vld1.8 {d17[1]}, [r0]
        ; Q-register variant: lane 9 of a <16 x i8> lives in the high D
        ; register (d17, lane 1), which the CHECK pattern pins down.
     46 	%tmp1 = load <16 x i8>* %B
     47 	%tmp2 = load i8* %A, align 8
     48 	%tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9
     49 	ret <16 x i8> %tmp3
     50 }
     51 
     52 define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
     53 ;CHECK: vld1laneQi16:
     54 ;CHECK: vld1.16 {d17[1]}, [r0, :16]
        ; Q-register variant: lane 5 of a <8 x i16> maps to d17 lane 1;
        ; alignment suffix capped at 16 bits.
     55 	%tmp1 = load <8 x i16>* %B
     56 	%tmp2 = load i16* %A, align 8
     57 	%tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5
     58 	ret <8 x i16> %tmp3
     59 }
     60 
     61 define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
     62 ;CHECK: vld1laneQi32:
     63 ;CHECK: vld1.32 {d17[1]}, [r0, :32]
        ; Q-register variant: lane 3 of a <4 x i32> maps to d17 lane 1;
        ; alignment suffix capped at 32 bits.
     64 	%tmp1 = load <4 x i32>* %B
     65 	%tmp2 = load i32* %A, align 8
     66 	%tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3
     67 	ret <4 x i32> %tmp3
     68 }
     69 
     70 define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind {
     71 ;CHECK: vld1laneQf:
     72 ;CHECK: vld1.32 {d16[0]}, [r0]
        ; Q-register float variant: lane 0 maps to the low D register (d16);
        ; the unannotated float load gets no alignment suffix.
     73 	%tmp1 = load <4 x float>* %B
     74 	%tmp2 = load float* %A
     75 	%tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 0
     76 	ret <4 x float> %tmp3
     77 }
     78 
     79 %struct.__neon_int8x8x2_t = type { <8 x i8>,  <8 x i8> }
     80 %struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
     81 %struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
     82 %struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }
     83 
     84 %struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
     85 %struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
     86 %struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }
     87 
     88 define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
     89 ;CHECK: vld2lanei8:
     90 ;Check the alignment value.  Max for this instruction is 16 bits:
     91 ;CHECK: vld2.8 {d16[1], d17[1]}, [r0, :16]
        ; Two-register lane load via the vld2lane intrinsic; the requested
        ; alignment of 4 bytes is capped to the 16-bit max for vld2.8.
        ; The two results are summed so neither load is dead.
     92 	%tmp1 = load <8 x i8>* %B
     93 	%tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
     94         %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
     95         %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
     96         %tmp5 = add <8 x i8> %tmp3, %tmp4
     97 	ret <8 x i8> %tmp5
     98 }
     99 
    100 define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
    101 ;CHECK: vld2lanei16:
    102 ;Check the alignment value.  Max for this instruction is 32 bits:
    103 ;CHECK: vld2.16 {d16[1], d17[1]}, [r0, :32]
        ; Requested alignment of 8 bytes is capped to the 32-bit max for
        ; vld2.16 on lane 1.
    104 	%tmp0 = bitcast i16* %A to i8*
    105 	%tmp1 = load <4 x i16>* %B
    106 	%tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
    107         %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
    108         %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
    109         %tmp5 = add <4 x i16> %tmp3, %tmp4
    110 	ret <4 x i16> %tmp5
    111 }
    112 
    113 define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
    114 ;CHECK: vld2lanei32:
    115 ;CHECK: vld2.32
        ; Basic selection check only: alignment 1 means no suffix to verify,
        ; just that a vld2.32 is emitted.
    116 	%tmp0 = bitcast i32* %A to i8*
    117 	%tmp1 = load <2 x i32>* %B
    118 	%tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
    119         %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
    120         %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
    121         %tmp5 = add <2 x i32> %tmp3, %tmp4
    122 	ret <2 x i32> %tmp5
    123 }
    124 
    125 ;Check for a post-increment updating load.
    126 define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind {
    127 ;CHECK: vld2lanei32_update:
    128 ;CHECK: vld2.32 {d16[1], d17[1]}, [{{r[0-9]+}}]!
        ; The GEP by 2 x i32 (8 bytes == the amount vld2.32 consumes) plus the
        ; store back through %ptr should fold into the "!" post-increment
        ; addressing form.
    129 	%A = load i32** %ptr
    130 	%tmp0 = bitcast i32* %A to i8*
    131 	%tmp1 = load <2 x i32>* %B
    132 	%tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
    133 	%tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
    134 	%tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
    135 	%tmp5 = add <2 x i32> %tmp3, %tmp4
    136 	%tmp6 = getelementptr i32* %A, i32 2
    137 	store i32* %tmp6, i32** %ptr
    138 	ret <2 x i32> %tmp5
    139 }
    140 
    141 define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
    142 ;CHECK: vld2lanef:
    143 ;CHECK: vld2.32
        ; Float variant of the two-register lane load; only instruction
        ; selection is checked.
    144 	%tmp0 = bitcast float* %A to i8*
    145 	%tmp1 = load <2 x float>* %B
    146 	%tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
    147         %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
    148         %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
    149         %tmp5 = fadd <2 x float> %tmp3, %tmp4
    150 	ret <2 x float> %tmp5
    151 }
    152 
    153 define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
    154 ;CHECK: vld2laneQi16:
    155 ;Check the (default) alignment.
        ; Q-register variant: lane 5 maps to the odd D registers (d17/d19);
        ; alignment 1 means no suffix on the address operand.
    156 ;CHECK: vld2.16 {d17[1], d19[1]}, [{{r[0-9]+}}]
    157 	%tmp0 = bitcast i16* %A to i8*
    158 	%tmp1 = load <8 x i16>* %B
    159 	%tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
    160         %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
    161         %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
    162         %tmp5 = add <8 x i16> %tmp3, %tmp4
    163 	ret <8 x i16> %tmp5
    164 }
    165 
    166 define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
    167 ;CHECK: vld2laneQi32:
    168 ;Check the alignment value.  Max for this instruction is 64 bits:
    169 ;CHECK: vld2.32 {d17[0], d19[0]}, [{{r[0-9]+}}, :64]
        ; Requested 16-byte alignment is capped to the 64-bit max for a
        ; Q-register vld2.32; lane 2 maps to d17/d19 lane 0.
    170 	%tmp0 = bitcast i32* %A to i8*
    171 	%tmp1 = load <4 x i32>* %B
    172 	%tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
    173         %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
    174         %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
    175         %tmp5 = add <4 x i32> %tmp3, %tmp4
    176 	ret <4 x i32> %tmp5
    177 }
    178 
    179 define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
    180 ;CHECK: vld2laneQf:
    181 ;CHECK: vld2.32
        ; Q-register float variant; only instruction selection is checked.
    182 	%tmp0 = bitcast float* %A to i8*
    183 	%tmp1 = load <4 x float>* %B
    184 	%tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
    185         %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
    186         %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
    187         %tmp5 = fadd <4 x float> %tmp3, %tmp4
    188 	ret <4 x float> %tmp5
    189 }
    190 
    191 declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
    192 declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
    193 declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
    194 declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly
    195 
    196 declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
    197 declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
    198 declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly
    199 
    200 %struct.__neon_int8x8x3_t = type { <8 x i8>,  <8 x i8>,  <8 x i8> }
    201 %struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
    202 %struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
    203 %struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
    204 
    205 %struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
    206 %struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
    207 %struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }
    208 
    209 define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
    210 ;CHECK: vld3lanei8:
    211 ;CHECK: vld3.8
        ; Three-register lane load; all three results are summed so none of
        ; them is dead.  Only instruction selection is checked.
    212 	%tmp1 = load <8 x i8>* %B
    213 	%tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
    214         %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
    215         %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
    216         %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
    217         %tmp6 = add <8 x i8> %tmp3, %tmp4
    218         %tmp7 = add <8 x i8> %tmp5, %tmp6
    219 	ret <8 x i8> %tmp7
    220 }
    221 
    222 define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
    223 ;CHECK: vld3lanei16:
    224 ;Check the (default) alignment value.  VLD3 does not support alignment.
    225 ;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
        ; The intrinsic requests alignment 8, but vld3-lane must be emitted
        ; with no alignment suffix at all.
    226 	%tmp0 = bitcast i16* %A to i8*
    227 	%tmp1 = load <4 x i16>* %B
    228 	%tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
    229         %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
    230         %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
    231         %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
    232         %tmp6 = add <4 x i16> %tmp3, %tmp4
    233         %tmp7 = add <4 x i16> %tmp5, %tmp6
    234 	ret <4 x i16> %tmp7
    235 }
    236 
    237 define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
    238 ;CHECK: vld3lanei32:
    239 ;CHECK: vld3.32
        ; Only instruction selection is checked.
    240 	%tmp0 = bitcast i32* %A to i8*
    241 	%tmp1 = load <2 x i32>* %B
    242 	%tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
    243         %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
    244         %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
    245         %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
    246         %tmp6 = add <2 x i32> %tmp3, %tmp4
    247         %tmp7 = add <2 x i32> %tmp5, %tmp6
    248 	ret <2 x i32> %tmp7
    249 }
    250 
    251 define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
    252 ;CHECK: vld3lanef:
    253 ;CHECK: vld3.32
        ; Float variant; only instruction selection is checked.
    254 	%tmp0 = bitcast float* %A to i8*
    255 	%tmp1 = load <2 x float>* %B
    256 	%tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
    257         %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
    258         %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
    259         %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
    260         %tmp6 = fadd <2 x float> %tmp3, %tmp4
    261         %tmp7 = fadd <2 x float> %tmp5, %tmp6
    262 	ret <2 x float> %tmp7
    263 }
    264 
    265 define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
    266 ;CHECK: vld3laneQi16:
    267 ;Check the (default) alignment value.  VLD3 does not support alignment.
    268 ;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
        ; Q-register variant: even with alignment 8 requested, no alignment
        ; suffix may appear on a vld3-lane.
    269 	%tmp0 = bitcast i16* %A to i8*
    270 	%tmp1 = load <8 x i16>* %B
    271 	%tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
    272         %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
    273         %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
    274         %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
    275         %tmp6 = add <8 x i16> %tmp3, %tmp4
    276         %tmp7 = add <8 x i16> %tmp5, %tmp6
    277 	ret <8 x i16> %tmp7
    278 }
    279 
    280 ;Check for a post-increment updating load with register increment.
    281 define <8 x i16> @vld3laneQi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
    282 ;CHECK: vld3laneQi16_update:
    283 ;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}], {{r[0-9]+}}
        ; The GEP advances %A by a variable %inc, so the post-increment must
        ; use the register-increment form ("], rM") rather than "!".
    284 	%A = load i16** %ptr
    285 	%tmp0 = bitcast i16* %A to i8*
    286 	%tmp1 = load <8 x i16>* %B
    287 	%tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
    288 	%tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
    289 	%tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
    290 	%tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
    291 	%tmp6 = add <8 x i16> %tmp3, %tmp4
    292 	%tmp7 = add <8 x i16> %tmp5, %tmp6
    293 	%tmp8 = getelementptr i16* %A, i32 %inc
    294 	store i16* %tmp8, i16** %ptr
    295 	ret <8 x i16> %tmp7
    296 }
    297 
    298 define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
    299 ;CHECK: vld3laneQi32:
    300 ;CHECK: vld3.32
        ; Q-register variant on lane 3; only instruction selection is checked.
    301 	%tmp0 = bitcast i32* %A to i8*
    302 	%tmp1 = load <4 x i32>* %B
    303 	%tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
    304         %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
    305         %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
    306         %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
    307         %tmp6 = add <4 x i32> %tmp3, %tmp4
    308         %tmp7 = add <4 x i32> %tmp5, %tmp6
    309 	ret <4 x i32> %tmp7
    310 }
    311 
    312 define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
    313 ;CHECK: vld3laneQf:
    314 ;CHECK: vld3.32
        ; Q-register float variant; only instruction selection is checked.
    315 	%tmp0 = bitcast float* %A to i8*
    316 	%tmp1 = load <4 x float>* %B
    317 	%tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
    318         %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
    319         %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
    320         %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
    321         %tmp6 = fadd <4 x float> %tmp3, %tmp4
    322         %tmp7 = fadd <4 x float> %tmp5, %tmp6
    323 	ret <4 x float> %tmp7
    324 }
    325 
    326 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
    327 declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
    328 declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
    329 declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
    330 
    331 declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
    332 declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
    333 declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
    334 
    335 %struct.__neon_int8x8x4_t = type { <8 x i8>,  <8 x i8>,  <8 x i8>,  <8 x i8> }
    336 %struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
    337 %struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
    338 %struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }
    339 
    340 %struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
    341 %struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
    342 %struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
    343 
    344 define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
    345 ;CHECK: vld4lanei8:
    346 ;Check the alignment value.  Max for this instruction is 32 bits:
    347 ;CHECK: vld4.8 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}, :32]
        ; Four-register lane load; the requested 8-byte alignment is capped
        ; to the 32-bit max for vld4.8.  All four results are summed.
    348 	%tmp1 = load <8 x i8>* %B
    349 	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
    350         %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
    351         %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
    352         %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
    353         %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
    354         %tmp7 = add <8 x i8> %tmp3, %tmp4
    355         %tmp8 = add <8 x i8> %tmp5, %tmp6
    356         %tmp9 = add <8 x i8> %tmp7, %tmp8
    357 	ret <8 x i8> %tmp9
    358 }
    359 
    360 ;Check for a post-increment updating load.
    361 define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
    362 ;CHECK: vld4lanei8_update:
    363 ;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}, :32]!
        ; The GEP by 4 bytes (exactly what vld4.8 consumes) plus the store
        ; back through %ptr should fold into the "!" post-increment form,
        ; keeping the capped :32 alignment suffix.
    364 	%A = load i8** %ptr
    365 	%tmp1 = load <8 x i8>* %B
    366 	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
    367 	%tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
    368 	%tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
    369 	%tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
    370 	%tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
    371 	%tmp7 = add <8 x i8> %tmp3, %tmp4
    372 	%tmp8 = add <8 x i8> %tmp5, %tmp6
    373 	%tmp9 = add <8 x i8> %tmp7, %tmp8
    374 	%tmp10 = getelementptr i8* %A, i32 4
    375 	store i8* %tmp10, i8** %ptr
    376 	ret <8 x i8> %tmp9
    377 }
    378 
    379 define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
    380 ;CHECK: vld4lanei16:
    381 ;Check that a power-of-two alignment smaller than the total size of the memory
    382 ;being loaded is ignored.
    383 ;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}]
        ; Alignment 4 is below the 8 bytes vld4.16 accesses, so no alignment
        ; suffix may be emitted.
    384 	%tmp0 = bitcast i16* %A to i8*
    385 	%tmp1 = load <4 x i16>* %B
    386 	%tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
    387         %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
    388         %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
    389         %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
    390         %tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
    391         %tmp7 = add <4 x i16> %tmp3, %tmp4
    392         %tmp8 = add <4 x i16> %tmp5, %tmp6
    393         %tmp9 = add <4 x i16> %tmp7, %tmp8
    394 	ret <4 x i16> %tmp9
    395 }
    396 
    397 define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
    398 ;CHECK: vld4lanei32:
    399 ;Check the alignment value.  An 8-byte alignment is allowed here even though
    400 ;it is smaller than the total size of the memory being loaded.
    401 ;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}, :64]
        ; vld4.32 accesses 16 bytes, but :64 is a legal suffix for it, so the
        ; requested 8-byte alignment is honored.
    402 	%tmp0 = bitcast i32* %A to i8*
    403 	%tmp1 = load <2 x i32>* %B
    404 	%tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
    405         %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
    406         %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
    407         %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
    408         %tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
    409         %tmp7 = add <2 x i32> %tmp3, %tmp4
    410         %tmp8 = add <2 x i32> %tmp5, %tmp6
    411         %tmp9 = add <2 x i32> %tmp7, %tmp8
    412 	ret <2 x i32> %tmp9
    413 }
    414 
    415 define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
    416 ;CHECK: vld4lanef:
    417 ;CHECK: vld4.32
        ; Float variant; only instruction selection is checked.
    418 	%tmp0 = bitcast float* %A to i8*
    419 	%tmp1 = load <2 x float>* %B
    420 	%tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
    421         %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
    422         %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
    423         %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
    424         %tmp6 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 3
    425         %tmp7 = fadd <2 x float> %tmp3, %tmp4
    426         %tmp8 = fadd <2 x float> %tmp5, %tmp6
    427         %tmp9 = fadd <2 x float> %tmp7, %tmp8
    428 	ret <2 x float> %tmp9
    429 }
    430 
    431 define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
    432 ;CHECK: vld4laneQi16:
    433 ;Check the alignment value.  Max for this instruction is 64 bits:
    434 ;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [{{r[0-9]+}}, :64]
        ; Q-register variant: requested 16-byte alignment is capped to the
        ; 64-bit max; lane 1 lives in the even D registers d16/d18/d20/d22.
    435 	%tmp0 = bitcast i16* %A to i8*
    436 	%tmp1 = load <8 x i16>* %B
    437 	%tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
    438         %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
    439         %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
    440         %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
    441         %tmp6 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 3
    442         %tmp7 = add <8 x i16> %tmp3, %tmp4
    443         %tmp8 = add <8 x i16> %tmp5, %tmp6
    444         %tmp9 = add <8 x i16> %tmp7, %tmp8
    445 	ret <8 x i16> %tmp9
    446 }
    447 
    448 define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
    449 ;CHECK: vld4laneQi32:
    450 ;Check the (default) alignment.
    451 ;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [{{r[0-9]+}}]
        ; Alignment 1 means no suffix; lane 2 maps to the odd D registers
        ; d17/d19/d21/d23, lane 0.
    452 	%tmp0 = bitcast i32* %A to i8*
    453 	%tmp1 = load <4 x i32>* %B
    454 	%tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
    455         %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
    456         %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
    457         %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
    458         %tmp6 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 3
    459         %tmp7 = add <4 x i32> %tmp3, %tmp4
    460         %tmp8 = add <4 x i32> %tmp5, %tmp6
    461         %tmp9 = add <4 x i32> %tmp7, %tmp8
    462 	ret <4 x i32> %tmp9
    463 }
    464 
    465 define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
    466 ;CHECK: vld4laneQf:
    467 ;CHECK: vld4.32
        ; Q-register float variant; only instruction selection is checked.
    468 	%tmp0 = bitcast float* %A to i8*
    469 	%tmp1 = load <4 x float>* %B
    470 	%tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
    471         %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
    472         %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
    473         %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
    474         %tmp6 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 3
    475         %tmp7 = fadd <4 x float> %tmp3, %tmp4
    476         %tmp8 = fadd <4 x float> %tmp5, %tmp6
    477         %tmp9 = fadd <4 x float> %tmp7, %tmp8
    478 	ret <4 x float> %tmp9
    479 }
    480 
    481 declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
    482 declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
    483 declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
    484 declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
    485 
    486 declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
    487 declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
    488 declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
    489 
    490 ; Radar 8776599: If one of the operands to a QQQQ REG_SEQUENCE is a register
    491 ; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because
    492 ; we don't currently have a QQQQ_VFP2 super-regclass.  (The "0" for the low
    493 ; part of %ins67 is supposed to be loaded by a VLDRS instruction in this test.)
    494 define <8 x i16> @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
    495 ;CHECK: test_qqqq_regsequence_subreg
    496 ;CHECK: vld3.16
        ; Regression test (Radar 8776599): %tmp78 is built from an i64 shifted
        ; into the high half of an i128 with a zero low half, which feeds a
        ; QQQQ REG_SEQUENCE; compilation must not fail on the register-class
        ; copy.  Only successful selection of a vld3.16 is checked.
    497   %tmp63 = extractvalue [6 x i64] %b, 5
    498   %tmp64 = zext i64 %tmp63 to i128
    499   %tmp65 = shl i128 %tmp64, 64
    500   %ins67 = or i128 %tmp65, 0
    501   %tmp78 = bitcast i128 %ins67 to <8 x i16>
    502   %vld3_lane = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %tmp78, i32 1, i32 2)
    503   %tmp3 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 0
    504   %tmp4 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 1
    505   %tmp5 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 2
    506   %tmp6 = add <8 x i16> %tmp3, %tmp4
    507   %tmp7 = add <8 x i16> %tmp5, %tmp6
    508   ret <8 x i16> %tmp7
    509 }
    510 
    511 declare void @llvm.trap() nounwind
    512