Home | History | Annotate | Download | only in ARM
      1 ; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s
      2 
      3 ; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon -regalloc=basic %s -o - \
      4 ; RUN:	| FileCheck %s
      5 
      6 define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind {
        ; Inserts an i8 loaded from %A into lane 3 of the <8 x i8> read from %B.
        ; The expected output is a single-lane vld1.8 with no alignment qualifier,
        ; even though the scalar load is marked align 8.
      7 ;CHECK-LABEL: vld1lanei8:
      8 ;Check the (default) alignment value.
      9 ;CHECK: vld1.8 {d16[3]}, [r0]
     10 	%tmp1 = load <8 x i8>* %B
     11 	%tmp2 = load i8* %A, align 8
     12 	%tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 3
     13         ret <8 x i8> %tmp3
     14 }
     15 
     16 define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind {
        ; Inserts an i16 loaded from %A (align 8) into lane 2 of the vector read
        ; from %B.  The expected qualifier is :16 even though the load declares a
        ; larger alignment.
     17 ;CHECK-LABEL: vld1lanei16:
     18 ;Check the alignment value.  Max for this instruction is 16 bits:
     19 ;CHECK: vld1.16 {d16[2]}, [r0:16]
     20 	%tmp1 = load <4 x i16>* %B
     21 	%tmp2 = load i16* %A, align 8
     22 	%tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 2
     23         ret <4 x i16> %tmp3
     24 }
     25 
     26 define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind {
        ; Inserts an i32 loaded from %A (align 8) into lane 1 of the vector read
        ; from %B.  The expected qualifier is :32 even though the load declares a
        ; larger alignment.
     27 ;CHECK-LABEL: vld1lanei32:
     28 ;Check the alignment value.  Max for this instruction is 32 bits:
     29 ;CHECK: vld1.32 {d16[1]}, [r0:32]
     30 	%tmp1 = load <2 x i32>* %B
     31 	%tmp2 = load i32* %A, align 8
     32 	%tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
     33         ret <2 x i32> %tmp3
     34 }
     35 
     36 define <2 x i32> @vld1lanei32a32(i32* %A, <2 x i32>* %B) nounwind {
        ; Same shape as the align-8 i32 case, but the scalar load is marked
        ; align 4 (exactly 32 bits); the expected qualifier is still :32.
     37 ;CHECK-LABEL: vld1lanei32a32:
     38 ;Check the alignment value.  Legal values are none or :32.
     39 ;CHECK: vld1.32 {d16[1]}, [r0:32]
     40 	%tmp1 = load <2 x i32>* %B
     41 	%tmp2 = load i32* %A, align 4
     42 	%tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
     43         ret <2 x i32> %tmp3
     44 }
     45 
     46 define <2 x float> @vld1lanef(float* %A, <2 x float>* %B) nounwind {
        ; Floating-point variant: inserts a float loaded from %A (align 4) into
        ; lane 1 of the <2 x float> read from %B; expected to use vld1.32 with :32.
     47 ;CHECK-LABEL: vld1lanef:
     48 ;CHECK: vld1.32 {d16[1]}, [r0:32]
     49 	%tmp1 = load <2 x float>* %B
     50 	%tmp2 = load float* %A, align 4
     51 	%tmp3 = insertelement <2 x float> %tmp1, float %tmp2, i32 1
     52 	ret <2 x float> %tmp3
     53 }
     54 
     55 define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
        ; 128-bit variant: inserts an i8 into lane 9 of a <16 x i8>.  The expected
        ; output addresses it as lane 1 of d17 (9 minus the 8 lanes of the first
        ; 64-bit half), with no alignment qualifier.
     56 ;CHECK-LABEL: vld1laneQi8:
     57 ;CHECK: vld1.8 {d17[1]}, [r0]
     58 	%tmp1 = load <16 x i8>* %B
     59 	%tmp2 = load i8* %A, align 8
     60 	%tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9
     61 	ret <16 x i8> %tmp3
     62 }
     63 
     64 define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
        ; 128-bit variant: inserts an i16 (load marked align 8) into lane 5 of an
        ; <8 x i16>.  Expected as lane 1 of d17 (5 minus the 4 lanes of the first
        ; 64-bit half) with a :16 qualifier.
     65 ;CHECK-LABEL: vld1laneQi16:
     66 ;CHECK: vld1.16 {d17[1]}, [r0:16]
     67 	%tmp1 = load <8 x i16>* %B
     68 	%tmp2 = load i16* %A, align 8
     69 	%tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5
     70 	ret <8 x i16> %tmp3
     71 }
     72 
     73 define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
        ; 128-bit variant: inserts an i32 (load marked align 8) into lane 3 of a
        ; <4 x i32>.  Expected as lane 1 of d17 (3 minus the 2 lanes of the first
        ; 64-bit half) with a :32 qualifier.
     74 ;CHECK-LABEL: vld1laneQi32:
     75 ;CHECK: vld1.32 {d17[1]}, [r0:32]
     76 	%tmp1 = load <4 x i32>* %B
     77 	%tmp2 = load i32* %A, align 8
     78 	%tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3
     79 	ret <4 x i32> %tmp3
     80 }
     81 
     82 define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind {
        ; 128-bit float variant: inserts a float into lane 0 of a <4 x float>,
        ; expected as lane 0 of d16.  The scalar load carries no explicit align;
        ; the :32 in the expected output presumably comes from float's default
        ; alignment -- NOTE(review): confirm against the target data layout.
     83 ;CHECK-LABEL: vld1laneQf:
     84 ;CHECK: vld1.32 {d16[0]}, [r0:32]
     85 	%tmp1 = load <4 x float>* %B
     86 	%tmp2 = load float* %A
     87 	%tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 0
     88 	ret <4 x float> %tmp3
     89 }
     90 
     ; Two-member aggregate types returned by the vld2lane intrinsics used in this
     ; file: the 64-bit vector forms first, then the 128-bit forms (there is no
     ; 128-bit i8 form in this group).
     91 %struct.__neon_int8x8x2_t = type { <8 x i8>,  <8 x i8> }
     92 %struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
     93 %struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
     94 %struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }
     95 
     96 %struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
     97 %struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
     98 %struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }
     99 
    100 define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
        ; Two-vector lane load from %A into lane 1, with the vector read from %B
        ; supplied as both input vectors.  The last call operand (4) is the
        ; alignment in bytes; the expected output uses :16.  The two returned
        ; vectors are added so both are used by the returned value.
    101 ;CHECK-LABEL: vld2lanei8:
    102 ;Check the alignment value.  Max for this instruction is 16 bits:
    103 ;CHECK: vld2.8 {d16[1], d17[1]}, [r0:16]
    104 	%tmp1 = load <8 x i8>* %B
    105 	%tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
    106         %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
    107         %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
    108         %tmp5 = add <8 x i8> %tmp3, %tmp4
    109 	ret <8 x i8> %tmp5
    110 }
    111 
    112 define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
        ; Two-vector lane load into lane 1 of <4 x i16> vectors.  %A is bitcast to
        ; i8* because the intrinsic takes an i8* address.  The alignment operand
        ; is 8 bytes; the expected output uses :32.
    113 ;CHECK-LABEL: vld2lanei16:
    114 ;Check the alignment value.  Max for this instruction is 32 bits:
    115 ;CHECK: vld2.16 {d16[1], d17[1]}, [r0:32]
    116 	%tmp0 = bitcast i16* %A to i8*
    117 	%tmp1 = load <4 x i16>* %B
    118 	%tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
    119         %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
    120         %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
    121         %tmp5 = add <4 x i16> %tmp3, %tmp4
    122 	ret <4 x i16> %tmp5
    123 }
    124 
    125 define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
        ; Two-vector lane load into lane 1 of <2 x i32> vectors with an alignment
        ; operand of 1.  Only the vld2.32 mnemonic is matched; registers and
        ; addressing form are not checked here.
    126 ;CHECK-LABEL: vld2lanei32:
    127 ;CHECK: vld2.32
    128 	%tmp0 = bitcast i32* %A to i8*
    129 	%tmp1 = load <2 x i32>* %B
    130 	%tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
    131         %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
    132         %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
    133         %tmp5 = add <2 x i32> %tmp3, %tmp4
    134 	ret <2 x i32> %tmp5
    135 }
    136 
    137 ;Check for a post-increment updating load.
    138 define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind {
        ; The address is read from %ptr, used for a two-vector lane load, then
        ; advanced by 2 i32 elements (8 bytes, the size of the two lanes loaded)
        ; and stored back.  The expected output is the writeback form, with a
        ; trailing "!" on the address operand.
    139 ;CHECK-LABEL: vld2lanei32_update:
    140 ;CHECK: vld2.32 {d16[1], d17[1]}, [{{r[0-9]+}}]!
    141 	%A = load i32** %ptr
    142 	%tmp0 = bitcast i32* %A to i8*
    143 	%tmp1 = load <2 x i32>* %B
    144 	%tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
    145 	%tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
    146 	%tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
    147 	%tmp5 = add <2 x i32> %tmp3, %tmp4
    148 	%tmp6 = getelementptr i32* %A, i32 2
    149 	store i32* %tmp6, i32** %ptr
    150 	ret <2 x i32> %tmp5
    151 }
    152 
    153 define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
        ; Floating-point two-vector lane load into lane 1 with an alignment
        ; operand of 1; the results are combined with fadd.  Only the vld2.32
        ; mnemonic is matched.
    154 ;CHECK-LABEL: vld2lanef:
    155 ;CHECK: vld2.32
    156 	%tmp0 = bitcast float* %A to i8*
    157 	%tmp1 = load <2 x float>* %B
    158 	%tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
    159         %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
    160         %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
    161         %tmp5 = fadd <2 x float> %tmp3, %tmp4
    162 	ret <2 x float> %tmp5
    163 }
    164 
    165 define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
        ; 128-bit two-vector lane load into lane 5 with an alignment operand of 1.
        ; Lane 5 is expected as lane 1 of the odd D registers (d17, d19), and no
        ; alignment qualifier is expected on the address.
    166 ;CHECK-LABEL: vld2laneQi16:
    167 ;Check the (default) alignment.
    168 ;CHECK: vld2.16 {d17[1], d19[1]}, [{{r[0-9]+}}]
    169 	%tmp0 = bitcast i16* %A to i8*
    170 	%tmp1 = load <8 x i16>* %B
    171 	%tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
    172         %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
    173         %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
    174         %tmp5 = add <8 x i16> %tmp3, %tmp4
    175 	ret <8 x i16> %tmp5
    176 }
    177 
    178 define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
        ; 128-bit two-vector lane load into lane 2 with an alignment operand of 16
        ; bytes.  Lane 2 is expected as lane 0 of d17 and d19, and the qualifier
        ; is expected to be :64 rather than the full 128 bits.
    179 ;CHECK-LABEL: vld2laneQi32:
    180 ;Check the alignment value.  Max for this instruction is 64 bits:
    181 ;CHECK: vld2.32 {d17[0], d19[0]}, [{{r[0-9]+}}:64]
    182 	%tmp0 = bitcast i32* %A to i8*
    183 	%tmp1 = load <4 x i32>* %B
    184 	%tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
    185         %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
    186         %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
    187         %tmp5 = add <4 x i32> %tmp3, %tmp4
    188 	ret <4 x i32> %tmp5
    189 }
    190 
    191 define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
        ; 128-bit floating-point two-vector lane load into lane 1 with an
        ; alignment operand of 1; results are combined with fadd.  Only the
        ; vld2.32 mnemonic is matched.
    192 ;CHECK-LABEL: vld2laneQf:
    193 ;CHECK: vld2.32
    194 	%tmp0 = bitcast float* %A to i8*
    195 	%tmp1 = load <4 x float>* %B
    196 	%tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
    197         %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
    198         %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
    199         %tmp5 = fadd <4 x float> %tmp3, %tmp4
    200 	ret <4 x float> %tmp5
    201 }
    202 
    ; Declarations of the vld2lane intrinsics exercised by the tests in this file.
    ; Operands in order: i8* address, two input vectors, then two i32 values that
    ; the calls in this file use as the lane index and the alignment in bytes.
    203 declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
    204 declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
    205 declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
    206 declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly
    207 
    208 declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
    209 declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
    210 declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly
    211 
    ; Three-member aggregate types returned by the vld3lane intrinsics used in
    ; this file: the 64-bit vector forms first, then the 128-bit forms.
    212 %struct.__neon_int8x8x3_t = type { <8 x i8>,  <8 x i8>,  <8 x i8> }
    213 %struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
    214 %struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
    215 %struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
    216 
    217 %struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
    218 %struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
    219 %struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }
    220 
    221 define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
        ; Three-vector lane load from %A into lane 1, with the vector read from %B
        ; supplied as all three input vectors and an alignment operand of 1.  The
        ; three results are summed; only the vld3.8 mnemonic is matched.
    222 ;CHECK-LABEL: vld3lanei8:
    223 ;CHECK: vld3.8
    224 	%tmp1 = load <8 x i8>* %B
    225 	%tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
    226         %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
    227         %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
    228         %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
    229         %tmp6 = add <8 x i8> %tmp3, %tmp4
    230         %tmp7 = add <8 x i8> %tmp5, %tmp6
    231 	ret <8 x i8> %tmp7
    232 }
    233 
    234 define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
        ; Three-vector lane load into lane 1 with an alignment operand of 8 bytes.
        ; The expected output has no alignment qualifier on the address despite
        ; the requested alignment; register numbers are matched loosely.
    235 ;CHECK-LABEL: vld3lanei16:
    236 ;Check the (default) alignment value.  VLD3 does not support alignment.
    237 ;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
    238 	%tmp0 = bitcast i16* %A to i8*
    239 	%tmp1 = load <4 x i16>* %B
    240 	%tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
    241         %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
    242         %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
    243         %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
    244         %tmp6 = add <4 x i16> %tmp3, %tmp4
    245         %tmp7 = add <4 x i16> %tmp5, %tmp6
    246 	ret <4 x i16> %tmp7
    247 }
    248 
    249 define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
        ; Three-vector lane load into lane 1 of <2 x i32> vectors with an
        ; alignment operand of 1.  Only the vld3.32 mnemonic is matched.
    250 ;CHECK-LABEL: vld3lanei32:
    251 ;CHECK: vld3.32
    252 	%tmp0 = bitcast i32* %A to i8*
    253 	%tmp1 = load <2 x i32>* %B
    254 	%tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
    255         %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
    256         %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
    257         %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
    258         %tmp6 = add <2 x i32> %tmp3, %tmp4
    259         %tmp7 = add <2 x i32> %tmp5, %tmp6
    260 	ret <2 x i32> %tmp7
    261 }
    262 
    263 define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
        ; Floating-point three-vector lane load into lane 1 with an alignment
        ; operand of 1; results are combined with fadd.  Only the vld3.32
        ; mnemonic is matched.
    264 ;CHECK-LABEL: vld3lanef:
    265 ;CHECK: vld3.32
    266 	%tmp0 = bitcast float* %A to i8*
    267 	%tmp1 = load <2 x float>* %B
    268 	%tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
    269         %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
    270         %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
    271         %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
    272         %tmp6 = fadd <2 x float> %tmp3, %tmp4
    273         %tmp7 = fadd <2 x float> %tmp5, %tmp6
    274 	ret <2 x float> %tmp7
    275 }
    276 
    277 define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
        ; 128-bit three-vector lane load into lane 1 with an alignment operand of
        ; 8 bytes.  The expected output has no alignment qualifier on the address;
        ; register numbers are matched loosely.
    278 ;CHECK-LABEL: vld3laneQi16:
    279 ;Check the (default) alignment value.  VLD3 does not support alignment.
    280 ;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
    281 	%tmp0 = bitcast i16* %A to i8*
    282 	%tmp1 = load <8 x i16>* %B
    283 	%tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
    284         %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
    285         %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
    286         %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
    287         %tmp6 = add <8 x i16> %tmp3, %tmp4
    288         %tmp7 = add <8 x i16> %tmp5, %tmp6
    289 	ret <8 x i16> %tmp7
    290 }
    291 
    292 ;Check for a post-increment updating load with register increment.
    293 define <8 x i16> @vld3laneQi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
        ; The address is read from %ptr, used for a three-vector lane load, then
        ; advanced by the run-time value %inc (in i16 elements) and stored back.
        ; Because the step is not a constant, the expected output uses the
        ; register post-increment form: "[rN], rM".
    294 ;CHECK-LABEL: vld3laneQi16_update:
    295 ;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}], {{r[0-9]+}}
    296 	%A = load i16** %ptr
    297 	%tmp0 = bitcast i16* %A to i8*
    298 	%tmp1 = load <8 x i16>* %B
    299 	%tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
    300 	%tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
    301 	%tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
    302 	%tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
    303 	%tmp6 = add <8 x i16> %tmp3, %tmp4
    304 	%tmp7 = add <8 x i16> %tmp5, %tmp6
    305 	%tmp8 = getelementptr i16* %A, i32 %inc
    306 	store i16* %tmp8, i16** %ptr
    307 	ret <8 x i16> %tmp7
    308 }
    309 
    310 define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
        ; 128-bit three-vector lane load into lane 3 of <4 x i32> vectors with an
        ; alignment operand of 1.  Only the vld3.32 mnemonic is matched.
    311 ;CHECK-LABEL: vld3laneQi32:
    312 ;CHECK: vld3.32
    313 	%tmp0 = bitcast i32* %A to i8*
    314 	%tmp1 = load <4 x i32>* %B
    315 	%tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
    316         %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
    317         %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
    318         %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
    319         %tmp6 = add <4 x i32> %tmp3, %tmp4
    320         %tmp7 = add <4 x i32> %tmp5, %tmp6
    321 	ret <4 x i32> %tmp7
    322 }
    323 
    324 define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
        ; 128-bit floating-point three-vector lane load into lane 1 with an
        ; alignment operand of 1; results are combined with fadd.  Only the
        ; vld3.32 mnemonic is matched.
    325 ;CHECK-LABEL: vld3laneQf:
    326 ;CHECK: vld3.32
    327 	%tmp0 = bitcast float* %A to i8*
    328 	%tmp1 = load <4 x float>* %B
    329 	%tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
    330         %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
    331         %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
    332         %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
    333         %tmp6 = fadd <4 x float> %tmp3, %tmp4
    334         %tmp7 = fadd <4 x float> %tmp5, %tmp6
    335 	ret <4 x float> %tmp7
    336 }
    337 
    ; Declarations of the vld3lane intrinsics exercised by the tests in this file.
    ; Operands in order: i8* address, three input vectors, then two i32 values
    ; that the calls in this file use as the lane index and the alignment in bytes.
    338 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
    339 declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
    340 declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
    341 declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
    342 
    343 declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
    344 declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
    345 declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
    346 
    ; Four-member aggregate types returned by the vld4lane intrinsics used in
    ; this file: the 64-bit vector forms first, then the 128-bit forms.
    347 %struct.__neon_int8x8x4_t = type { <8 x i8>,  <8 x i8>,  <8 x i8>,  <8 x i8> }
    348 %struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
    349 %struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
    350 %struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }
    351 
    352 %struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
    353 %struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
    354 %struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
    355 
    356 define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
        ; Four-vector lane load from %A into lane 1, with the vector read from %B
        ; supplied as all four input vectors.  The alignment operand is 8 bytes;
        ; the expected output uses :32.  The four results are summed pairwise.
    357 ;CHECK-LABEL: vld4lanei8:
    358 ;Check the alignment value.  Max for this instruction is 32 bits:
    359 ;CHECK: vld4.8 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}:32]
    360 	%tmp1 = load <8 x i8>* %B
    361 	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
    362         %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
    363         %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
    364         %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
    365         %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
    366         %tmp7 = add <8 x i8> %tmp3, %tmp4
    367         %tmp8 = add <8 x i8> %tmp5, %tmp6
    368         %tmp9 = add <8 x i8> %tmp7, %tmp8
    369 	ret <8 x i8> %tmp9
    370 }
    371 
    372 ;Check for a post-increment updating load.
    373 define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
        ; The address is read from %ptr, used for a four-vector lane load, then
        ; advanced by 4 bytes (the four i8 lanes loaded) and stored back.  The
        ; expected output is the writeback form ("!") combined with the :32
        ; alignment qualifier.
    374 ;CHECK-LABEL: vld4lanei8_update:
    375 ;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:32]!
    376 	%A = load i8** %ptr
    377 	%tmp1 = load <8 x i8>* %B
    378 	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
    379 	%tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
    380 	%tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
    381 	%tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
    382 	%tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
    383 	%tmp7 = add <8 x i8> %tmp3, %tmp4
    384 	%tmp8 = add <8 x i8> %tmp5, %tmp6
    385 	%tmp9 = add <8 x i8> %tmp7, %tmp8
    386 	%tmp10 = getelementptr i8* %A, i32 4
    387 	store i8* %tmp10, i8** %ptr
    388 	ret <8 x i8> %tmp9
    389 }
    390 
    391 define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
        ; Four-vector lane load into lane 1 with an alignment operand of 4 bytes.
        ; Four i16 lanes total 8 bytes, so the requested alignment is smaller than
        ; the access and the expected output has no alignment qualifier.
    392 ;CHECK-LABEL: vld4lanei16:
    393 ;Check that a power-of-two alignment smaller than the total size of the memory
    394 ;being loaded is ignored.
    395 ;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}]
    396 	%tmp0 = bitcast i16* %A to i8*
    397 	%tmp1 = load <4 x i16>* %B
    398 	%tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
    399         %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
    400         %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
    401         %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
    402         %tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
    403         %tmp7 = add <4 x i16> %tmp3, %tmp4
    404         %tmp8 = add <4 x i16> %tmp5, %tmp6
    405         %tmp9 = add <4 x i16> %tmp7, %tmp8
    406 	ret <4 x i16> %tmp9
    407 }
    408 
    409 define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
        ; Four-vector lane load into lane 1 with an alignment operand of 8 bytes.
        ; Four i32 lanes total 16 bytes, yet the 8-byte alignment is still
        ; expected to be emitted, as :64.
    410 ;CHECK-LABEL: vld4lanei32:
    411 ;Check the alignment value.  An 8-byte alignment is allowed here even though
    412 ;it is smaller than the total size of the memory being loaded.
    413 ;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:64]
    414 	%tmp0 = bitcast i32* %A to i8*
    415 	%tmp1 = load <2 x i32>* %B
    416 	%tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
    417         %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
    418         %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
    419         %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
    420         %tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
    421         %tmp7 = add <2 x i32> %tmp3, %tmp4
    422         %tmp8 = add <2 x i32> %tmp5, %tmp6
    423         %tmp9 = add <2 x i32> %tmp7, %tmp8
    424 	ret <2 x i32> %tmp9
    425 }
    426 
    427 define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
        ; Floating-point four-vector lane load into lane 1 with an alignment
        ; operand of 1; results are combined with fadd.  Only the vld4.32
        ; mnemonic is matched.
    428 ;CHECK-LABEL: vld4lanef:
    429 ;CHECK: vld4.32
    430 	%tmp0 = bitcast float* %A to i8*
    431 	%tmp1 = load <2 x float>* %B
    432 	%tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
    433         %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
    434         %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
    435         %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
    436         %tmp6 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 3
    437         %tmp7 = fadd <2 x float> %tmp3, %tmp4
    438         %tmp8 = fadd <2 x float> %tmp5, %tmp6
    439         %tmp9 = fadd <2 x float> %tmp7, %tmp8
    440 	ret <2 x float> %tmp9
    441 }
    442 
    443 define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
        ; 128-bit four-vector lane load into lane 1 with an alignment operand of
        ; 16 bytes.  Lane 1 lies in the low half of each vector, so the even D
        ; registers (d16, d18, d20, d22) are expected; the qualifier is expected
        ; to be :64.
    444 ;CHECK-LABEL: vld4laneQi16:
    445 ;Check the alignment value.  Max for this instruction is 64 bits:
    446 ;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [{{r[0-9]+}}:64]
    447 	%tmp0 = bitcast i16* %A to i8*
    448 	%tmp1 = load <8 x i16>* %B
    449 	%tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
    450         %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
    451         %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
    452         %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
    453         %tmp6 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 3
    454         %tmp7 = add <8 x i16> %tmp3, %tmp4
    455         %tmp8 = add <8 x i16> %tmp5, %tmp6
    456         %tmp9 = add <8 x i16> %tmp7, %tmp8
    457 	ret <8 x i16> %tmp9
    458 }
    459 
    460 define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
        ; 128-bit four-vector lane load into lane 2 with an alignment operand of 1.
        ; Lane 2 is expected as lane 0 of the odd D registers (d17, d19, d21,
        ; d23), and no alignment qualifier is expected on the address.
    461 ;CHECK-LABEL: vld4laneQi32:
    462 ;Check the (default) alignment.
    463 ;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [{{r[0-9]+}}]
    464 	%tmp0 = bitcast i32* %A to i8*
    465 	%tmp1 = load <4 x i32>* %B
    466 	%tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
    467         %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
    468         %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
    469         %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
    470         %tmp6 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 3
    471         %tmp7 = add <4 x i32> %tmp3, %tmp4
    472         %tmp8 = add <4 x i32> %tmp5, %tmp6
    473         %tmp9 = add <4 x i32> %tmp7, %tmp8
    474 	ret <4 x i32> %tmp9
    475 }
    476 
    477 define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
        ; 128-bit floating-point four-vector lane load into lane 1 with an
        ; alignment operand of 1; results are combined with fadd.  Only the
        ; vld4.32 mnemonic is matched.
    478 ;CHECK-LABEL: vld4laneQf:
    479 ;CHECK: vld4.32
    480 	%tmp0 = bitcast float* %A to i8*
    481 	%tmp1 = load <4 x float>* %B
    482 	%tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
    483         %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
    484         %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
    485         %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
    486         %tmp6 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 3
    487         %tmp7 = fadd <4 x float> %tmp3, %tmp4
    488         %tmp8 = fadd <4 x float> %tmp5, %tmp6
    489         %tmp9 = fadd <4 x float> %tmp7, %tmp8
    490 	ret <4 x float> %tmp9
    491 }
    492 
    ; Declarations of the vld4lane intrinsics exercised by the tests in this file.
    ; Operands in order: i8* address, four input vectors, then two i32 values
    ; that the calls in this file use as the lane index and the alignment in bytes.
    493 declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
    494 declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
    495 declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
    496 declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
    497 
    498 declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
    499 declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
    500 declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
    501 
    502 ; Radar 8776599: If one of the operands to a QQQQ REG_SEQUENCE is a register
    503 ; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because
    504 ; we don't currently have a QQQQ_VFP2 super-regclass.  (The "0" for the low
    505 ; part of %ins67 is supposed to be loaded by a VLDRS instruction in this test.)
    506 define <8 x i16> @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
        ; Builds a 128-bit value whose upper 64 bits are element 5 of %b and whose
        ; lower 64 bits are zero, bitcasts it to <8 x i16>, and passes it as the
        ; third input vector of a three-vector lane load (lane 1, alignment
        ; operand 2).  The address and the other two vectors are undef.  Only the
        ; presence of a vld3.16 in the output is matched.
    507 ;CHECK-LABEL: test_qqqq_regsequence_subreg:
    508 ;CHECK: vld3.16
    509   %tmp63 = extractvalue [6 x i64] %b, 5
    510   %tmp64 = zext i64 %tmp63 to i128
    511   %tmp65 = shl i128 %tmp64, 64
    512   %ins67 = or i128 %tmp65, 0
    513   %tmp78 = bitcast i128 %ins67 to <8 x i16>
    514   %vld3_lane = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %tmp78, i32 1, i32 2)
    515   %tmp3 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 0
    516   %tmp4 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 1
    517   %tmp5 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 2
    518   %tmp6 = add <8 x i16> %tmp3, %tmp4
    519   %tmp7 = add <8 x i16> %tmp5, %tmp6
    520   ret <8 x i16> %tmp7
    521 }
    522 
    ; NOTE(review): nothing in the visible part of this file calls @llvm.trap;
    ; presumably a leftover from the original reduced test case -- confirm
    ; before removing.
    523 declare void @llvm.trap() nounwind
    524