Home | History | Annotate | Download | only in ARM
      1 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
      2 
      3 define void @vst1lanei8(i8* %A, <8 x i8>* %B) nounwind {
        ; Loads an <8 x i8> vector from %B, extracts element 3 and stores it to %A
        ; (align 8). The expected vst1.8 has no alignment qualifier on [r0].
      4 ;CHECK: vst1lanei8:
      5 ;Check the (default) alignment.
      6 ;CHECK: vst1.8 {d16[3]}, [r0]
      7 	%tmp1 = load <8 x i8>* %B
      8         %tmp2 = extractelement <8 x i8> %tmp1, i32 3
      9         store i8 %tmp2, i8* %A, align 8
     10 	ret void
     11 }
     12 
     13 ;Check for a post-increment updating store.
     14 define void @vst1lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
        ; The destination %A is loaded from %ptr. After element 3 of the vector is
        ; stored through %A, %A advanced by one i8 (%tmp3) is written back to %ptr.
        ; The expectation below looks for the writeback ("!") form of vst1.8.
     15 ;CHECK: vst1lanei8_update:
     16 ;CHECK: vst1.8 {d16[3]}, [r2]!
     17 	%A = load i8** %ptr
     18 	%tmp1 = load <8 x i8>* %B
     19 	%tmp2 = extractelement <8 x i8> %tmp1, i32 3
     20 	store i8 %tmp2, i8* %A, align 8
     21 	%tmp3 = getelementptr i8* %A, i32 1
     22 	store i8* %tmp3, i8** %ptr
     23 	ret void
     24 }
     25 
     26 define void @vst1lanei16(i16* %A, <4 x i16>* %B) nounwind {
        ; Stores element 2 of the <4 x i16> vector loaded from %B to %A. The IR
        ; store is marked align 8, while the expected instruction uses :16.
     27 ;CHECK: vst1lanei16:
     28 ;Check the alignment value.  Max for this instruction is 16 bits:
     29 ;CHECK: vst1.16 {d16[2]}, [r0, :16]
     30 	%tmp1 = load <4 x i16>* %B
     31         %tmp2 = extractelement <4 x i16> %tmp1, i32 2
     32         store i16 %tmp2, i16* %A, align 8
     33 	ret void
     34 }
     35 
     36 define void @vst1lanei32(i32* %A, <2 x i32>* %B) nounwind {
        ; Stores element 1 of the <2 x i32> vector loaded from %B to %A. The IR
        ; store is marked align 8, while the expected instruction uses :32.
     37 ;CHECK: vst1lanei32:
     38 ;Check the alignment value.  Max for this instruction is 32 bits:
     39 ;CHECK: vst1.32 {d16[1]}, [r0, :32]
     40 	%tmp1 = load <2 x i32>* %B
     41         %tmp2 = extractelement <2 x i32> %tmp1, i32 1
     42         store i32 %tmp2, i32* %A, align 8
     43 	ret void
     44 }
     45 
     46 define void @vst1lanef(float* %A, <2 x float>* %B) nounwind {
        ; Stores element 1 of the <2 x float> vector loaded from %B to %A. The IR
        ; store carries no explicit align, and the expected vst1.32 has no
        ; alignment qualifier on [r0].
     47 ;CHECK: vst1lanef:
     48 ;CHECK: vst1.32 {d16[1]}, [r0]
     49 	%tmp1 = load <2 x float>* %B
     50         %tmp2 = extractelement <2 x float> %tmp1, i32 1
     51         store float %tmp2, float* %A
     52 	ret void
     53 }
     54 
     55 define void @vst1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
        ; Stores element 9 of the <16 x i8> vector loaded from %B to %A (align 8).
        ; The instruction expectation below is spelled "CHE-CK", so FileCheck with
        ; its default prefix skips it; only the function label is verified.
     56 ;CHECK: vst1laneQi8:
     57 ; // Can use scalar load. No need to use vectors.
     58 ; // CHE-CK: vst1.8 {d17[1]}, [r0]
     59 	%tmp1 = load <16 x i8>* %B
     60         %tmp2 = extractelement <16 x i8> %tmp1, i32 9
     61         store i8 %tmp2, i8* %A, align 8
     62 	ret void
     63 }
     64 
     65 define void @vst1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
        ; Stores element 5 of the <8 x i16> vector loaded from %B to %A (align 8).
        ; The expected instruction addresses that element as d17[1] and uses :16.
     66 ;CHECK: vst1laneQi16:
     67 ;CHECK: vst1.16 {d17[1]}, [r0, :16]
     68 	%tmp1 = load <8 x i16>* %B
     69         %tmp2 = extractelement <8 x i16> %tmp1, i32 5
     70         store i16 %tmp2, i16* %A, align 8
     71 	ret void
     72 }
     73 
     74 define void @vst1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
        ; Stores element 3 of the <4 x i32> vector loaded from %B to %A (align 8).
        ; The instruction expectation below is spelled "CHE-CK", so FileCheck with
        ; its default prefix skips it; only the function label is verified.
     75 ;CHECK: vst1laneQi32:
     76 ; // Can use scalar load. No need to use vectors.
     77 ; // CHE-CK: vst1.32 {d17[1]}, [r0, :32]
     78 	%tmp1 = load <4 x i32>* %B
     79         %tmp2 = extractelement <4 x i32> %tmp1, i32 3
     80         store i32 %tmp2, i32* %A, align 8
     81 	ret void
     82 }
     83 
     84 ;Check for a post-increment updating store.
     85 define void @vst1laneQi32_update(i32** %ptr, <4 x i32>* %B) nounwind {
        ; The destination %A is loaded from %ptr. After element 3 of the vector is
        ; stored through %A, %A advanced by one i32 (%tmp3) is written back to
        ; %ptr. The instruction expectation is spelled "CHE-CK", so FileCheck with
        ; its default prefix skips it; only the function label is verified.
     86 ;CHECK: vst1laneQi32_update:
     87 ; // Can use scalar load. No need to use vectors.
     88 ; // CHE-CK: vst1.32 {d17[1]}, [r1, :32]!
     89 	%A = load i32** %ptr
     90 	%tmp1 = load <4 x i32>* %B
     91 	%tmp2 = extractelement <4 x i32> %tmp1, i32 3
     92 	store i32 %tmp2, i32* %A, align 8
     93 	%tmp3 = getelementptr i32* %A, i32 1
     94 	store i32* %tmp3, i32** %ptr
     95 	ret void
     96 }
     97 
     98 define void @vst1laneQf(float* %A, <4 x float>* %B) nounwind {
        ; Stores element 3 of the <4 x float> vector loaded from %B to %A; the IR
        ; store carries no explicit align. The instruction expectation is spelled
        ; "CHE-CK", so FileCheck with its default prefix skips it; only the
        ; function label is verified.
     99 ;CHECK: vst1laneQf:
    100 ; // Can use scalar load. No need to use vectors.
    101 ; // CHE-CK: vst1.32 {d17[1]}, [r0]
    102 	%tmp1 = load <4 x float>* %B
    103         %tmp2 = extractelement <4 x float> %tmp1, i32 3
    104         store float %tmp2, float* %A
    105 	ret void
    106 }
    107 
    108 define void @vst2lanei8(i8* %A, <8 x i8>* %B) nounwind {
        ; Calls the v8i8 vst2lane intrinsic on %A with %tmp1 as both vector
        ; operands and trailing i32 operands 1 and 4 (lane and alignment, judging
        ; by the expected [1] lanes and the capped :16 qualifier).
    109 ;CHECK: vst2lanei8:
    110 ;Check the alignment value.  Max for this instruction is 16 bits:
    111 ;CHECK: vst2.8 {d16[1], d17[1]}, [r0, :16]
    112 	%tmp1 = load <8 x i8>* %B
    113 	call void @llvm.arm.neon.vst2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
    114 	ret void
    115 }
    116 
    117 define void @vst2lanei16(i16* %A, <4 x i16>* %B) nounwind {
        ; Bitcasts %A to i8* and calls the v4i16 vst2lane intrinsic with %tmp1 as
        ; both vector operands and trailing i32 operands 1 and 8 (lane and
        ; alignment, judging by the expected [1] lanes and the :32 qualifier).
    118 ;CHECK: vst2lanei16:
    119 ;Check the alignment value.  Max for this instruction is 32 bits:
    120 ;CHECK: vst2.16 {d16[1], d17[1]}, [r0, :32]
    121 	%tmp0 = bitcast i16* %A to i8*
    122 	%tmp1 = load <4 x i16>* %B
    123 	call void @llvm.arm.neon.vst2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
    124 	ret void
    125 }
    126 
    127 ;Check for a post-increment updating store with register increment.
    128 define void @vst2lanei16_update(i16** %ptr, <4 x i16>* %B, i32 %inc) nounwind {
        ; The destination %A is loaded from %ptr. After the vst2lane call
        ; (trailing i32 operands 1 and 2), %A advanced by %inc i16 elements is
        ; written back to %ptr. The expectation looks for the "[r1], r2" form.
    129 ;CHECK: vst2lanei16_update:
    130 ;CHECK: vst2.16 {d16[1], d17[1]}, [r1], r2
    131 	%A = load i16** %ptr
    132 	%tmp0 = bitcast i16* %A to i8*
    133 	%tmp1 = load <4 x i16>* %B
    134 	call void @llvm.arm.neon.vst2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 2)
    135 	%tmp2 = getelementptr i16* %A, i32 %inc
    136 	store i16* %tmp2, i16** %ptr
    137 	ret void
    138 }
    139 
    140 define void @vst2lanei32(i32* %A, <2 x i32>* %B) nounwind {
        ; Bitcasts %A to i8* and calls the v2i32 vst2lane intrinsic with %tmp1 as
        ; both vector operands and trailing i32 operands 1 and 1. Only the vst2.32
        ; mnemonic is matched; register list and address form are not checked.
    141 ;CHECK: vst2lanei32:
    142 ;CHECK: vst2.32
    143 	%tmp0 = bitcast i32* %A to i8*
    144 	%tmp1 = load <2 x i32>* %B
    145 	call void @llvm.arm.neon.vst2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
    146 	ret void
    147 }
    148 
    149 define void @vst2lanef(float* %A, <2 x float>* %B) nounwind {
        ; Bitcasts %A to i8* and calls the v2f32 vst2lane intrinsic with %tmp1 as
        ; both vector operands and trailing i32 operands 1 and 1. Only the vst2.32
        ; mnemonic is matched; register list and address form are not checked.
    150 ;CHECK: vst2lanef:
    151 ;CHECK: vst2.32
    152 	%tmp0 = bitcast float* %A to i8*
    153 	%tmp1 = load <2 x float>* %B
    154 	call void @llvm.arm.neon.vst2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
    155 	ret void
    156 }
    157 
    158 define void @vst2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
        ; Bitcasts %A to i8* and calls the v8i16 vst2lane intrinsic with %tmp1 as
        ; both vector operands and trailing i32 operands 5 and 1. The expected
        ; instruction names d17[1], d19[1] and has no alignment qualifier.
    159 ;CHECK: vst2laneQi16:
    160 ;Check the (default) alignment.
    161 ;CHECK: vst2.16 {d17[1], d19[1]}, [r0]
    162 	%tmp0 = bitcast i16* %A to i8*
    163 	%tmp1 = load <8 x i16>* %B
    164 	call void @llvm.arm.neon.vst2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
    165 	ret void
    166 }
    167 
    168 define void @vst2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
        ; Bitcasts %A to i8* and calls the v4i32 vst2lane intrinsic with %tmp1 as
        ; both vector operands and trailing i32 operands 2 and 16. The expected
        ; instruction names d17[0], d19[0] and uses the :64 qualifier.
    169 ;CHECK: vst2laneQi32:
    170 ;Check the alignment value.  Max for this instruction is 64 bits:
    171 ;CHECK: vst2.32 {d17[0], d19[0]}, [r0, :64]
    172 	%tmp0 = bitcast i32* %A to i8*
    173 	%tmp1 = load <4 x i32>* %B
    174 	call void @llvm.arm.neon.vst2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
    175 	ret void
    176 }
    177 
    178 define void @vst2laneQf(float* %A, <4 x float>* %B) nounwind {
        ; Bitcasts %A to i8* and calls the v4f32 vst2lane intrinsic with %tmp1 as
        ; both vector operands and trailing i32 operands 3 and 1. Only the vst2.32
        ; mnemonic is matched; register list and address form are not checked.
    179 ;CHECK: vst2laneQf:
    180 ;CHECK: vst2.32
    181 	%tmp0 = bitcast float* %A to i8*
    182 	%tmp1 = load <4 x float>* %B
    183 	call void @llvm.arm.neon.vst2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 3, i32 1)
    184 	ret void
    185 }
    186 
    187 declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind
    188 declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind
    189 declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind
    190 declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind
    191 ; The four vst2lane declarations above cover the 64-bit vector types and the
        ; three below the 128-bit ones. Each takes an i8* address, two vectors of
        ; the named type and two trailing i32 operands (presumably lane index and
        ; alignment -- confirm against the intrinsic definition).
    192 declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind
    193 declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind
    194 declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind
    195 
    196 define void @vst3lanei8(i8* %A, <8 x i8>* %B) nounwind {
        ; Calls the v8i8 vst3lane intrinsic on %A with %tmp1 as all three vector
        ; operands and trailing i32 operands 1 and 1. Only the vst3.8 mnemonic is
        ; matched; register list and address form are not checked.
    197 ;CHECK: vst3lanei8:
    198 ;CHECK: vst3.8
    199 	%tmp1 = load <8 x i8>* %B
    200 	call void @llvm.arm.neon.vst3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
    201 	ret void
    202 }
    203 
    204 define void @vst3lanei16(i16* %A, <4 x i16>* %B) nounwind {
        ; Bitcasts %A to i8* and calls the v4i16 vst3lane intrinsic with %tmp1 as
        ; all three vector operands and trailing i32 operands 1 and 8. Despite the
        ; 8 passed as last operand, the expected address is plain [r0].
    205 ;CHECK: vst3lanei16:
    206 ;Check the (default) alignment value.  VST3 does not support alignment.
    207 ;CHECK: vst3.16 {d16[1], d17[1], d18[1]}, [r0]
    208 	%tmp0 = bitcast i16* %A to i8*
    209 	%tmp1 = load <4 x i16>* %B
    210 	call void @llvm.arm.neon.vst3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
    211 	ret void
    212 }
    213 
    214 define void @vst3lanei32(i32* %A, <2 x i32>* %B) nounwind {
        ; Bitcasts %A to i8* and calls the v2i32 vst3lane intrinsic with %tmp1 as
        ; all three vector operands and trailing i32 operands 1 and 1. Only the
        ; vst3.32 mnemonic is matched; operands are not checked.
    215 ;CHECK: vst3lanei32:
    216 ;CHECK: vst3.32
    217 	%tmp0 = bitcast i32* %A to i8*
    218 	%tmp1 = load <2 x i32>* %B
    219 	call void @llvm.arm.neon.vst3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
    220 	ret void
    221 }
    222 
    223 define void @vst3lanef(float* %A, <2 x float>* %B) nounwind {
        ; Bitcasts %A to i8* and calls the v2f32 vst3lane intrinsic with %tmp1 as
        ; all three vector operands and trailing i32 operands 1 and 1. Only the
        ; vst3.32 mnemonic is matched; operands are not checked.
    224 ;CHECK: vst3lanef:
    225 ;CHECK: vst3.32
    226 	%tmp0 = bitcast float* %A to i8*
    227 	%tmp1 = load <2 x float>* %B
    228 	call void @llvm.arm.neon.vst3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
    229 	ret void
    230 }
    231 
    232 define void @vst3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
        ; Bitcasts %A to i8* and calls the v8i16 vst3lane intrinsic with %tmp1 as
        ; all three vector operands and trailing i32 operands 6 and 8. The
        ; expected instruction names d17[2], d19[2], d21[2] and plain [r0].
    233 ;CHECK: vst3laneQi16:
    234 ;Check the (default) alignment value.  VST3 does not support alignment.
    235 ;CHECK: vst3.16 {d17[2], d19[2], d21[2]}, [r0]
    236 	%tmp0 = bitcast i16* %A to i8*
    237 	%tmp1 = load <8 x i16>* %B
    238 	call void @llvm.arm.neon.vst3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 6, i32 8)
    239 	ret void
    240 }
    241 
    242 define void @vst3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
        ; Bitcasts %A to i8* and calls the v4i32 vst3lane intrinsic with %tmp1 as
        ; all three vector operands and trailing i32 operands 0 and 1. Only the
        ; vst3.32 mnemonic is matched; operands are not checked.
    243 ;CHECK: vst3laneQi32:
    244 ;CHECK: vst3.32
    245 	%tmp0 = bitcast i32* %A to i8*
    246 	%tmp1 = load <4 x i32>* %B
    247 	call void @llvm.arm.neon.vst3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
    248 	ret void
    249 }
    250 
    251 ;Check for a post-increment updating store.
    252 define void @vst3laneQi32_update(i32** %ptr, <4 x i32>* %B) nounwind {
        ; The destination %A is loaded from %ptr. After the vst3lane call
        ; (trailing i32 operands 0 and 1), %A advanced by three i32 elements is
        ; written back to %ptr. The expectation looks for the writeback ("!")
        ; form with d16[0], d18[0], d20[0].
    253 ;CHECK: vst3laneQi32_update:
    254 ;CHECK: vst3.32 {d16[0], d18[0], d20[0]}, [r1]!
    255 	%A = load i32** %ptr
    256 	%tmp0 = bitcast i32* %A to i8*
    257 	%tmp1 = load <4 x i32>* %B
    258 	call void @llvm.arm.neon.vst3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
    259 	%tmp2 = getelementptr i32* %A, i32 3
    260 	store i32* %tmp2, i32** %ptr
    261 	ret void
    262 }
    263 
    264 define void @vst3laneQf(float* %A, <4 x float>* %B) nounwind {
        ; Bitcasts %A to i8* and calls the v4f32 vst3lane intrinsic with %tmp1 as
        ; all three vector operands and trailing i32 operands 1 and 1. Only the
        ; vst3.32 mnemonic is matched; operands are not checked.
    265 ;CHECK: vst3laneQf:
    266 ;CHECK: vst3.32
    267 	%tmp0 = bitcast float* %A to i8*
    268 	%tmp1 = load <4 x float>* %B
    269 	call void @llvm.arm.neon.vst3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
    270 	ret void
    271 }
    272 
    273 declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
    274 declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
    275 declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
    276 declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind
    277 ; The four vst3lane declarations above cover the 64-bit vector types and the
        ; three below the 128-bit ones. Each takes an i8* address, three vectors
        ; of the named type and two trailing i32 operands (presumably lane index
        ; and alignment -- confirm against the intrinsic definition).
    278 declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
    279 declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
    280 declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind
    281 
    282 
    283 define void @vst4lanei8(i8* %A, <8 x i8>* %B) nounwind {
        ; Calls the v8i8 vst4lane intrinsic on %A with %tmp1 as all four vector
        ; operands and trailing i32 operands 1 and 8. The expected instruction
        ; uses lane [1] of d16-d19 and the :32 qualifier.
    284 ;CHECK: vst4lanei8:
    285 ;Check the alignment value.  Max for this instruction is 32 bits:
    286 ;CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0, :32]
    287 	%tmp1 = load <8 x i8>* %B
    288 	call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
    289 	ret void
    290 }
    291 
    292 ;Check for a post-increment updating store.
    293 define void @vst4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
        ; The destination %A is loaded from %ptr. After the vst4lane call
        ; (trailing i32 operands 1 and 8), %A advanced by four i8 elements is
        ; written back to %ptr. The expectation looks for the writeback ("!")
        ; form together with the :32 qualifier.
    294 ;CHECK: vst4lanei8_update:
    295 ;CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1, :32]!
    296 	%A = load i8** %ptr
    297 	%tmp1 = load <8 x i8>* %B
    298 	call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
    299 	%tmp2 = getelementptr i8* %A, i32 4
    300 	store i8* %tmp2, i8** %ptr
    301 	ret void
    302 }
    303 
    304 define void @vst4lanei16(i16* %A, <4 x i16>* %B) nounwind {
        ; Bitcasts %A to i8* and calls the v4i16 vst4lane intrinsic with %tmp1 as
        ; all four vector operands and trailing i32 operands 1 and 1. Only the
        ; vst4.16 mnemonic is matched; operands are not checked.
    305 ;CHECK: vst4lanei16:
    306 ;CHECK: vst4.16
    307 	%tmp0 = bitcast i16* %A to i8*
    308 	%tmp1 = load <4 x i16>* %B
    309 	call void @llvm.arm.neon.vst4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
    310 	ret void
    311 }
    312 
    313 define void @vst4lanei32(i32* %A, <2 x i32>* %B) nounwind {
        ; Bitcasts %A to i8* and calls the v2i32 vst4lane intrinsic with %tmp1 as
        ; all four vector operands and trailing i32 operands 1 and 16. The
        ; expected instruction uses lane [1] of d16-d19 and the :128 qualifier.
    314 ;CHECK: vst4lanei32:
    315 ;Check the alignment value.  Max for this instruction is 128 bits:
    316 ;CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0, :128]
    317 	%tmp0 = bitcast i32* %A to i8*
    318 	%tmp1 = load <2 x i32>* %B
    319 	call void @llvm.arm.neon.vst4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 16)
    320 	ret void
    321 }
    322 
    323 define void @vst4lanef(float* %A, <2 x float>* %B) nounwind {
        ; Bitcasts %A to i8* and calls the v2f32 vst4lane intrinsic with %tmp1 as
        ; all four vector operands and trailing i32 operands 1 and 1. Only the
        ; vst4.32 mnemonic is matched; operands are not checked.
    324 ;CHECK: vst4lanef:
    325 ;CHECK: vst4.32
    326 	%tmp0 = bitcast float* %A to i8*
    327 	%tmp1 = load <2 x float>* %B
    328 	call void @llvm.arm.neon.vst4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
    329 	ret void
    330 }
    331 
    332 define void @vst4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
        ; Bitcasts %A to i8* and calls the v8i16 vst4lane intrinsic with %tmp1 as
        ; all four vector operands and trailing i32 operands 7 and 16. The
        ; expected instruction names d17[3], d19[3], d21[3], d23[3] and uses :64.
    333 ;CHECK: vst4laneQi16:
    334 ;Check the alignment value.  Max for this instruction is 64 bits:
    335 ;CHECK: vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0, :64]
    336 	%tmp0 = bitcast i16* %A to i8*
    337 	%tmp1 = load <8 x i16>* %B
    338 	call void @llvm.arm.neon.vst4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 7, i32 16)
    339 	ret void
    340 }
    341 
    342 define void @vst4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
        ; Bitcasts %A to i8* and calls the v4i32 vst4lane intrinsic with %tmp1 as
        ; all four vector operands and trailing i32 operands 2 and 1. The expected
        ; instruction names d17[0], d19[0], d21[0], d23[0] and plain [r0].
    343 ;CHECK: vst4laneQi32:
    344 ;Check the (default) alignment.
    345 ;CHECK: vst4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0]
    346 	%tmp0 = bitcast i32* %A to i8*
    347 	%tmp1 = load <4 x i32>* %B
    348 	call void @llvm.arm.neon.vst4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
    349 	ret void
    350 }
    351 
    352 define void @vst4laneQf(float* %A, <4 x float>* %B) nounwind {
        ; Bitcasts %A to i8* and calls the v4f32 vst4lane intrinsic with %tmp1 as
        ; all four vector operands and trailing i32 operands 1 and 1. Only the
        ; vst4.32 mnemonic is matched; operands are not checked.
    353 ;CHECK: vst4laneQf:
    354 ;CHECK: vst4.32
    355 	%tmp0 = bitcast float* %A to i8*
    356 	%tmp1 = load <4 x float>* %B
    357 	call void @llvm.arm.neon.vst4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
    358 	ret void
    359 }
    360 
    361 declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
    362 declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
    363 declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
    364 declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind
    365 ; The four vst4lane declarations above cover the 64-bit vector types and the
        ; three below the 128-bit ones. Each takes an i8* address, four vectors of
        ; the named type and two trailing i32 operands (presumably lane index and
        ; alignment -- confirm against the intrinsic definition).
    366 declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
    367 declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
    368 declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind
    369