; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
;
; Codegen test for ARM NEON single-lane stores (vst1/vst2/vst3/vst4 lane
; forms). Every function below is compiled by llc for ARM with NEON enabled,
; and the resulting assembly is matched by FileCheck against the expectations
; written in the comments inside that function.
;
; Notes for readers:
;  * The vst1 tests are written as plain IR (load vector, extractelement,
;    scalar store); the vst2/vst3/vst4 tests call the
;    llvm.arm.neon.vstNlane.* intrinsics declared after each group.
;  * For the intrinsics, the two trailing i32 operands appear to be the lane
;    index and the alignment in bytes; this matches the lane numbers and the
;    ":<bits>" alignment suffixes in the expected instructions.
;  * Some expectations are spelled with a hyphen inside the directive word.
;    Those look intentionally disabled, so only the function label is matched
;    for those functions.

; ---------------------------------------------------------------------------
; VST1 lane: extractelement followed by a scalar store.
; ---------------------------------------------------------------------------

; Stores lane 3 of a 64-bit <8 x i8> vector through %A.
define void @vst1lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vst1lanei8:
;Check the (default) alignment.
;CHECK: vst1.8 {d16[3]}, [r0]
  %tmp1 = load <8 x i8>* %B
  %tmp2 = extractelement <8 x i8> %tmp1, i32 3
  store i8 %tmp2, i8* %A, align 8
  ret void
}

;Check for a post-increment updating store.
; The destination pointer is loaded from %ptr, advanced by one i8 after the
; store and written back; the expectation matches the writeback ("!") form.
define void @vst1lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
;CHECK: vst1lanei8_update:
;CHECK: vst1.8 {d16[3]}, [r2]!
  %A = load i8** %ptr
  %tmp1 = load <8 x i8>* %B
  %tmp2 = extractelement <8 x i8> %tmp1, i32 3
  store i8 %tmp2, i8* %A, align 8
  %tmp3 = getelementptr i8* %A, i32 1
  store i8* %tmp3, i8** %ptr
  ret void
}

; Stores lane 2 of a <4 x i16> vector; the store is marked align 8 but the
; expected instruction carries only a 16-bit alignment hint.
define void @vst1lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst1lanei16:
;Check the alignment value.  Max for this instruction is 16 bits:
;CHECK: vst1.16 {d16[2]}, [r0, :16]
  %tmp1 = load <4 x i16>* %B
  %tmp2 = extractelement <4 x i16> %tmp1, i32 2
  store i16 %tmp2, i16* %A, align 8
  ret void
}

; Stores lane 1 of a <2 x i32> vector; the store is marked align 8 but the
; expected instruction carries only a 32-bit alignment hint.
define void @vst1lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst1lanei32:
;Check the alignment value.  Max for this instruction is 32 bits:
;CHECK: vst1.32 {d16[1]}, [r0, :32]
  %tmp1 = load <2 x i32>* %B
  %tmp2 = extractelement <2 x i32> %tmp1, i32 1
  store i32 %tmp2, i32* %A, align 8
  ret void
}

; Float variant: the store has no explicit alignment and the expected
; instruction has no alignment suffix.
define void @vst1lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst1lanef:
;CHECK: vst1.32 {d16[1]}, [r0]
  %tmp1 = load <2 x float>* %B
  %tmp2 = extractelement <2 x float> %tmp1, i32 1
  store float %tmp2, float* %A
  ret void
}

; 128-bit (Q register) source, lane 9. The instruction expectation below is
; disabled; only the function label is matched.
define void @vst1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK: vst1laneQi8:
; // Can use scalar load. No need to use vectors.
; // CHE-CK: vst1.8 {d17[1]}, [r0]
  %tmp1 = load <16 x i8>* %B
  %tmp2 = extractelement <16 x i8> %tmp1, i32 9
  store i8 %tmp2, i8* %A, align 8
  ret void
}

; 128-bit source, lane 5: expected to be emitted as lane 1 of d17.
define void @vst1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst1laneQi16:
;CHECK: vst1.16 {d17[1]}, [r0, :16]
  %tmp1 = load <8 x i16>* %B
  %tmp2 = extractelement <8 x i16> %tmp1, i32 5
  store i16 %tmp2, i16* %A, align 8
  ret void
}

; 128-bit source, lane 3. The instruction expectation below is disabled;
; only the function label is matched.
define void @vst1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst1laneQi32:
; // Can use scalar load. No need to use vectors.
; // CHE-CK: vst1.32 {d17[1]}, [r0, :32]
  %tmp1 = load <4 x i32>* %B
  %tmp2 = extractelement <4 x i32> %tmp1, i32 3
  store i32 %tmp2, i32* %A, align 8
  ret void
}

;Check for a post-increment updating store.
; Same as above with the pointer advanced by one i32 and written back to %ptr.
; The instruction expectation is disabled; only the label is matched.
define void @vst1laneQi32_update(i32** %ptr, <4 x i32>* %B) nounwind {
;CHECK: vst1laneQi32_update:
; // Can use scalar load. No need to use vectors.
; // CHE-CK: vst1.32 {d17[1]}, [r1, :32]!
  %A = load i32** %ptr
  %tmp1 = load <4 x i32>* %B
  %tmp2 = extractelement <4 x i32> %tmp1, i32 3
  store i32 %tmp2, i32* %A, align 8
  %tmp3 = getelementptr i32* %A, i32 1
  store i32* %tmp3, i32** %ptr
  ret void
}

; Float 128-bit variant, lane 3. The instruction expectation is disabled;
; only the label is matched.
define void @vst1laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vst1laneQf:
; // Can use scalar load. No need to use vectors.
; // CHE-CK: vst1.32 {d17[1]}, [r0]
  %tmp1 = load <4 x float>* %B
  %tmp2 = extractelement <4 x float> %tmp1, i32 3
  store float %tmp2, float* %A
  ret void
}

; ---------------------------------------------------------------------------
; VST2 lane: llvm.arm.neon.vst2lane.* intrinsics. The same loaded vector is
; passed for both vector operands.
; ---------------------------------------------------------------------------

; Lane 1, alignment operand 4: expected hint is capped at 16 bits.
define void @vst2lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vst2lanei8:
;Check the alignment value.  Max for this instruction is 16 bits:
;CHECK: vst2.8 {d16[1], d17[1]}, [r0, :16]
  %tmp1 = load <8 x i8>* %B
  call void @llvm.arm.neon.vst2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
  ret void
}

; Lane 1, alignment operand 8: expected hint is capped at 32 bits.
define void @vst2lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst2lanei16:
;Check the alignment value.  Max for this instruction is 32 bits:
;CHECK: vst2.16 {d16[1], d17[1]}, [r0, :32]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  call void @llvm.arm.neon.vst2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  ret void
}

;Check for a post-increment updating store with register increment.
; The pointer loaded from %ptr is advanced by %inc elements and written back.
define void @vst2lanei16_update(i16** %ptr, <4 x i16>* %B, i32 %inc) nounwind {
;CHECK: vst2lanei16_update:
;CHECK: vst2.16 {d16[1], d17[1]}, [r1], r2
  %A = load i16** %ptr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  call void @llvm.arm.neon.vst2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 2)
  %tmp2 = getelementptr i16* %A, i32 %inc
  store i16* %tmp2, i16** %ptr
  ret void
}

; Only the mnemonic is matched for this variant.
define void @vst2lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst2lanei32:
;CHECK: vst2.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  call void @llvm.arm.neon.vst2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  ret void
}

; Only the mnemonic is matched for this variant.
define void @vst2lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst2lanef:
;CHECK: vst2.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>* %B
  call void @llvm.arm.neon.vst2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  ret void
}

; 128-bit operands, lane 5: expected as lane 1 of the odd D registers
; (d17, d19). Alignment operand 1 yields no alignment suffix.
define void @vst2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst2laneQi16:
;Check the (default) alignment.
;CHECK: vst2.16 {d17[1], d19[1]}, [r0]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  call void @llvm.arm.neon.vst2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
  ret void
}

; 128-bit operands, lane 2: expected as lane 0 of d17/d19. Alignment operand
; 16 is capped at a 64-bit hint.
define void @vst2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst2laneQi32:
;Check the alignment value.  Max for this instruction is 64 bits:
;CHECK: vst2.32 {d17[0], d19[0]}, [r0, :64]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  call void @llvm.arm.neon.vst2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
  ret void
}

; Only the mnemonic is matched for this variant.
define void @vst2laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vst2laneQf:
;CHECK: vst2.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>* %B
  call void @llvm.arm.neon.vst2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 3, i32 1)
  ret void
}

; Intrinsic declarations used by the vst2 tests (64-bit, then 128-bit vectors).
declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind

declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind

; ---------------------------------------------------------------------------
; VST3 lane: llvm.arm.neon.vst3lane.* intrinsics. The same loaded vector is
; passed for all three vector operands.
; ---------------------------------------------------------------------------

; Only the mnemonic is matched for this variant.
define void @vst3lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vst3lanei8:
;CHECK: vst3.8
  %tmp1 = load <8 x i8>* %B
  call void @llvm.arm.neon.vst3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
  ret void
}

; Alignment operand 8 is passed, yet the expected instruction has no
; alignment suffix.
define void @vst3lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst3lanei16:
;Check the (default) alignment value.  VST3 does not support alignment.
;CHECK: vst3.16 {d16[1], d17[1], d18[1]}, [r0]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  call void @llvm.arm.neon.vst3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  ret void
}

; Only the mnemonic is matched for this variant.
define void @vst3lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst3lanei32:
;CHECK: vst3.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  call void @llvm.arm.neon.vst3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  ret void
}

; Only the mnemonic is matched for this variant.
define void @vst3lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst3lanef:
;CHECK: vst3.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>* %B
  call void @llvm.arm.neon.vst3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  ret void
}

; 128-bit operands, lane 6: expected as lane 2 of the odd D registers
; (d17, d19, d21), again without an alignment suffix.
define void @vst3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst3laneQi16:
;Check the (default) alignment value.  VST3 does not support alignment.
;CHECK: vst3.16 {d17[2], d19[2], d21[2]}, [r0]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  call void @llvm.arm.neon.vst3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 6, i32 8)
  ret void
}

; Only the mnemonic is matched for this variant.
define void @vst3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst3laneQi32:
;CHECK: vst3.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  call void @llvm.arm.neon.vst3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
  ret void
}

;Check for a post-increment updating store.
; The pointer is advanced by three i32 elements (the amount stored) and
; written back to %ptr; the expectation matches the writeback ("!") form.
define void @vst3laneQi32_update(i32** %ptr, <4 x i32>* %B) nounwind {
;CHECK: vst3laneQi32_update:
;CHECK: vst3.32 {d16[0], d18[0], d20[0]}, [r1]!
  %A = load i32** %ptr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  call void @llvm.arm.neon.vst3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
  %tmp2 = getelementptr i32* %A, i32 3
  store i32* %tmp2, i32** %ptr
  ret void
}

; Only the mnemonic is matched for this variant.
define void @vst3laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vst3laneQf:
;CHECK: vst3.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>* %B
  call void @llvm.arm.neon.vst3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  ret void
}

; Intrinsic declarations used by the vst3 tests (64-bit, then 128-bit vectors).
declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind

declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind


; ---------------------------------------------------------------------------
; VST4 lane: llvm.arm.neon.vst4lane.* intrinsics. The same loaded vector is
; passed for all four vector operands.
; ---------------------------------------------------------------------------

; Lane 1, alignment operand 8: expected hint is capped at 32 bits.
define void @vst4lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vst4lanei8:
;Check the alignment value.  Max for this instruction is 32 bits:
;CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0, :32]
  %tmp1 = load <8 x i8>* %B
  call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  ret void
}

;Check for a post-increment updating store.
; The pointer is advanced by four i8 elements (the amount stored) and written
; back to %ptr; the expectation matches the writeback ("!") form.
define void @vst4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
;CHECK: vst4lanei8_update:
;CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1, :32]!
  %A = load i8** %ptr
  %tmp1 = load <8 x i8>* %B
  call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  %tmp2 = getelementptr i8* %A, i32 4
  store i8* %tmp2, i8** %ptr
  ret void
}

; Only the mnemonic is matched for this variant.
define void @vst4lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst4lanei16:
;CHECK: vst4.16
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  call void @llvm.arm.neon.vst4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
  ret void
}

; Lane 1, alignment operand 16: expected hint is the full 128 bits.
define void @vst4lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst4lanei32:
;Check the alignment value.  Max for this instruction is 128 bits:
;CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0, :128]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  call void @llvm.arm.neon.vst4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 16)
  ret void
}

; Only the mnemonic is matched for this variant.
define void @vst4lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst4lanef:
;CHECK: vst4.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>* %B
  call void @llvm.arm.neon.vst4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  ret void
}

; 128-bit operands, lane 7: expected as lane 3 of the odd D registers.
; Alignment operand 16 is capped at a 64-bit hint.
define void @vst4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst4laneQi16:
;Check the alignment value.  Max for this instruction is 64 bits:
;CHECK: vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0, :64]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  call void @llvm.arm.neon.vst4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 7, i32 16)
  ret void
}

; 128-bit operands, lane 2: expected as lane 0 of the odd D registers.
; Alignment operand 1 yields no alignment suffix.
define void @vst4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst4laneQi32:
;Check the (default) alignment.
;CHECK: vst4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  call void @llvm.arm.neon.vst4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
  ret void
}

; Only the mnemonic is matched for this variant.
define void @vst4laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vst4laneQf:
;CHECK: vst4.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>* %B
  call void @llvm.arm.neon.vst4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  ret void
}

; Intrinsic declarations used by the vst4 tests (64-bit, then 128-bit vectors).
declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind

declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind