; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
; RUN: llc < %s -march=arm -mattr=+neon -regalloc=basic | FileCheck %s

; Code-generation tests for the ARM NEON "load single element to one lane"
; instructions. Every test is run twice: once with the default register
; allocator and once with -regalloc=basic (see the RUN lines above).
;
; The VLD1 tests below express the lane load as a scalar load followed by an
; insertelement and expect the pair to be selected as one VLD1 lane instruction.

; 8-bit element into lane 3 of a 64-bit vector; although the scalar load is
; 8-byte aligned, the expected instruction carries no alignment qualifier.
define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vld1lanei8:
;Check the (default) alignment value.
;CHECK: vld1.8 {d16[3]}, [r0]
  %tmp1 = load <8 x i8>* %B
  %tmp2 = load i8* %A, align 8
  %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 3
  ret <8 x i8> %tmp3
}

; 16-bit element into lane 2; the 8-byte IR alignment is expected to be
; clamped to :16.
define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vld1lanei16:
;Check the alignment value.  Max for this instruction is 16 bits:
;CHECK: vld1.16 {d16[2]}, [r0, :16]
  %tmp1 = load <4 x i16>* %B
  %tmp2 = load i16* %A, align 8
  %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 2
  ret <4 x i16> %tmp3
}

; 32-bit element into lane 1; the 8-byte IR alignment is expected to be
; clamped to :32.
define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vld1lanei32:
;Check the alignment value.  Max for this instruction is 32 bits:
;CHECK: vld1.32 {d16[1]}, [r0, :32]
  %tmp1 = load <2 x i32>* %B
  %tmp2 = load i32* %A, align 8
  %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
  ret <2 x i32> %tmp3
}

; Same as vld1lanei32 but with a 4-byte aligned scalar load, which should
; also produce the :32 qualifier.
define <2 x i32> @vld1lanei32a32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vld1lanei32a32:
;Check the alignment value.  Legal values are none or :32.
;CHECK: vld1.32 {d16[1]}, [r0, :32]
  %tmp1 = load <2 x i32>* %B
  %tmp2 = load i32* %A, align 4
  %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
  ret <2 x i32> %tmp3
}

; Float variant: 4-byte aligned scalar load into lane 1.
define <2 x float> @vld1lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vld1lanef:
;CHECK: vld1.32 {d16[1]}, [r0, :32]
  %tmp1 = load <2 x float>* %B
  %tmp2 = load float* %A, align 4
  %tmp3 = insertelement <2 x float> %tmp1, float %tmp2, i32 1
  ret <2 x float> %tmp3
}

; 128-bit vector: element 9 of <16 x i8> is expected as lane 1 of the second
; D register (d17).
define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK: vld1laneQi8:
;CHECK: vld1.8 {d17[1]}, [r0]
  %tmp1 = load <16 x i8>* %B
  %tmp2 = load i8* %A, align 8
  %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9
  ret <16 x i8> %tmp3
}

; 128-bit vector: element 5 of <8 x i16> is expected as d17[1], with :16.
define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vld1laneQi16:
;CHECK: vld1.16 {d17[1]}, [r0, :16]
  %tmp1 = load <8 x i16>* %B
  %tmp2 = load i16* %A, align 8
  %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5
  ret <8 x i16> %tmp3
}

; 128-bit vector: element 3 of <4 x i32> is expected as d17[1], with :32.
define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vld1laneQi32:
;CHECK: vld1.32 {d17[1]}, [r0, :32]
  %tmp1 = load <4 x i32>* %B
  %tmp2 = load i32* %A, align 8
  %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3
  ret <4 x i32> %tmp3
}

; 128-bit float vector: element 0 is expected as d16[0]. The scalar load has
; no explicit alignment, yet :32 is still expected.
define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vld1laneQf:
;CHECK: vld1.32 {d16[0]}, [r0, :32]
  %tmp1 = load <4 x float>* %B
  %tmp2 = load float* %A
  %tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 0
  ret <4 x float> %tmp3
}

; Two-vector aggregates returned by the vld2lane intrinsics.
%struct.__neon_int8x8x2_t = type { <8 x i8>,  <8 x i8> }
%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }

%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }

; The vld2lane tests call the intrinsic directly. Its trailing two i32
; operands are the lane index and the alignment in bytes; both results are
; added together so that neither is dead.

; Lane 1 with a 4-byte alignment operand; expected to be clamped to :16.
define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vld2lanei8:
;Check the alignment value.  Max for this instruction is 16 bits:
;CHECK: vld2.8 {d16[1], d17[1]}, [r0, :16]
  %tmp1 = load <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
  %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

; Lane 1 with an 8-byte alignment operand; expected to be clamped to :32.
define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vld2lanei16:
;Check the alignment value.  Max for this instruction is 32 bits:
;CHECK: vld2.16 {d16[1], d17[1]}, [r0, :32]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

; Only the opcode is verified here.
define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vld2lanei32:
;CHECK: vld2.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

;Check for a post-increment updating load.
; The pointer is advanced by two i32 elements (the amount the instruction
; reads) and stored back, so the writeback form ("!") is expected.
define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind {
;CHECK: vld2lanei32_update:
;CHECK: vld2.32 {d16[1], d17[1]}, [{{r[0-9]+}}]!
  %A = load i32** %ptr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  %tmp6 = getelementptr i32* %A, i32 2
  store i32* %tmp6, i32** %ptr
  ret <2 x i32> %tmp5
}

; Float variant; only the opcode is verified.
define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vld2lanef:
;CHECK: vld2.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
  %tmp5 = fadd <2 x float> %tmp3, %tmp4
  ret <2 x float> %tmp5
}

; 128-bit vectors: lane 5 of <8 x i16> is expected as lane 1 of the odd D
; registers (d17, d19). Alignment operand 1 means no qualifier.
define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vld2laneQi16:
;Check the (default) alignment.
;CHECK: vld2.16 {d17[1], d19[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
  %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

; 128-bit vectors: lane 2 of <4 x i32> is expected as d17[0], d19[0]; the
; 16-byte alignment operand is expected to be clamped to :64.
define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vld2laneQi32:
;Check the alignment value.  Max for this instruction is 64 bits:
;CHECK: vld2.32 {d17[0], d19[0]}, [{{r[0-9]+}}, :64]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
  %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

; 128-bit float variant; only the opcode is verified.
define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vld2laneQf:
;CHECK: vld2.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}

declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly

%struct.__neon_int8x8x3_t = type { <8 x i8>,  <8 x i8>,  <8 x i8> }
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }

%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }

; The vld3lane tests call the intrinsic directly. Its trailing two i32
; operands are the lane index and the alignment in bytes; all three results
; are summed so that none of them is dead.

; Only the opcode is verified here.
define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vld3lanei8:
;CHECK: vld3.8
  %tmp1 = load <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
  %tmp6 = add <8 x i8> %tmp3, %tmp4
  %tmp7 = add <8 x i8> %tmp5, %tmp6
  ret <8 x i8> %tmp7
}

; An 8-byte alignment operand is passed, but no alignment qualifier is
; expected on the instruction.
define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vld3lanei16:
;Check the (default) alignment value.  VLD3 does not support alignment.
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
  %tmp6 = add <4 x i16> %tmp3, %tmp4
  %tmp7 = add <4 x i16> %tmp5, %tmp6
  ret <4 x i16> %tmp7
}

; Only the opcode is verified here.
define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vld3lanei32:
;CHECK: vld3.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
  %tmp6 = add <2 x i32> %tmp3, %tmp4
  %tmp7 = add <2 x i32> %tmp5, %tmp6
  ret <2 x i32> %tmp7
}

; Float variant; only the opcode is verified.
define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vld3lanef:
;CHECK: vld3.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
  %tmp6 = fadd <2 x float> %tmp3, %tmp4
  %tmp7 = fadd <2 x float> %tmp5, %tmp6
  ret <2 x float> %tmp7
}

; 128-bit vectors with an 8-byte alignment operand; still no alignment
; qualifier is expected.
define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vld3laneQi16:
;Check the (default) alignment value.  VLD3 does not support alignment.
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  ret <8 x i16> %tmp7
}

;Check for a post-increment updating load with register increment.
; The pointer is advanced by the run-time value %inc (in i16 elements) and
; stored back, so the register-offset writeback form is expected.
define <8 x i16> @vld3laneQi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
;CHECK: vld3laneQi16_update:
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}], {{r[0-9]+}}
  %A = load i16** %ptr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  %tmp8 = getelementptr i16* %A, i32 %inc
  store i16* %tmp8, i16** %ptr
  ret <8 x i16> %tmp7
}

; 128-bit vectors, lane 3; only the opcode is verified.
define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vld3laneQi32:
;CHECK: vld3.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
  %tmp6 = add <4 x i32> %tmp3, %tmp4
  %tmp7 = add <4 x i32> %tmp5, %tmp6
  ret <4 x i32> %tmp7
}

; 128-bit float variant; only the opcode is verified.
define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vld3laneQf:
;CHECK: vld3.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
  %tmp6 = fadd <4 x float> %tmp3, %tmp4
  %tmp7 = fadd <4 x float> %tmp5, %tmp6
  ret <4 x float> %tmp7
}

declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

; Four-vector aggregates returned by the vld4lane intrinsics.
%struct.__neon_int8x8x4_t = type { <8 x i8>,  <8 x i8>,  <8 x i8>,  <8 x i8> }
%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }

%struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }

; The vld4lane tests follow the same pattern: the trailing two i32 operands
; are the lane index and the alignment in bytes, and all four results are
; summed so that none of them is dead.

; Lane 1 with an 8-byte alignment operand; expected to be clamped to :32.
define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vld4lanei8:
;Check the alignment value.  Max for this instruction is 32 bits:
;CHECK: vld4.8 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}, :32]
  %tmp1 = load <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
  %tmp7 = add <8 x i8> %tmp3, %tmp4
  %tmp8 = add <8 x i8> %tmp5, %tmp6
  %tmp9 = add <8 x i8> %tmp7, %tmp8
  ret <8 x i8> %tmp9
}

;Check for a post-increment updating load.
; The pointer is advanced by four bytes (the amount the instruction reads)
; and stored back, so the writeback form ("!") is expected.
define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
;CHECK: vld4lanei8_update:
;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}, :32]!
  %A = load i8** %ptr
  %tmp1 = load <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
  %tmp7 = add <8 x i8> %tmp3, %tmp4
  %tmp8 = add <8 x i8> %tmp5, %tmp6
  %tmp9 = add <8 x i8> %tmp7, %tmp8
  %tmp10 = getelementptr i8* %A, i32 4
  store i8* %tmp10, i8** %ptr
  ret <8 x i8> %tmp9
}

; 4-byte alignment operand while four 16-bit elements (8 bytes) are loaded.
define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vld4lanei16:
;Check that a power-of-two alignment smaller than the total size of the memory
;being loaded is ignored.
;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
  %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
  %tmp7 = add <4 x i16> %tmp3, %tmp4
  %tmp8 = add <4 x i16> %tmp5, %tmp6
  %tmp9 = add <4 x i16> %tmp7, %tmp8
  ret <4 x i16> %tmp9
}

; 8-byte alignment operand while four 32-bit elements (16 bytes) are loaded.
define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vld4lanei32:
;Check the alignment value.  An 8-byte alignment is allowed here even though
;it is smaller than the total size of the memory being loaded.
;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}, :64]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
  %tmp7 = add <2 x i32> %tmp3, %tmp4
  %tmp8 = add <2 x i32> %tmp5, %tmp6
  %tmp9 = add <2 x i32> %tmp7, %tmp8
  ret <2 x i32> %tmp9
}

; Float variant; only the opcode is verified.
define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vld4lanef:
;CHECK: vld4.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 3
  %tmp7 = fadd <2 x float> %tmp3, %tmp4
  %tmp8 = fadd <2 x float> %tmp5, %tmp6
  %tmp9 = fadd <2 x float> %tmp7, %tmp8
  ret <2 x float> %tmp9
}

; 128-bit vectors: lane 1 of <8 x i16> is expected in the even D registers
; (d16, d18, d20, d22); the 16-byte alignment operand is expected to be
; clamped to :64.
define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vld4laneQi16:
;Check the alignment value.  Max for this instruction is 64 bits:
;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [{{r[0-9]+}}, :64]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
  %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 3
  %tmp7 = add <8 x i16> %tmp3, %tmp4
  %tmp8 = add <8 x i16> %tmp5, %tmp6
  %tmp9 = add <8 x i16> %tmp7, %tmp8
  ret <8 x i16> %tmp9
}

; 128-bit vectors: lane 2 of <4 x i32> is expected as lane 0 of the odd D
; registers (d17, d19, d21, d23). Alignment operand 1 means no qualifier.
define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vld4laneQi32:
;Check the (default) alignment.
;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 3
  %tmp7 = add <4 x i32> %tmp3, %tmp4
  %tmp8 = add <4 x i32> %tmp5, %tmp6
  %tmp9 = add <4 x i32> %tmp7, %tmp8
  ret <4 x i32> %tmp9
}

; 128-bit float variant; only the opcode is verified.
define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vld4laneQf:
;CHECK: vld4.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 3
  %tmp7 = fadd <4 x float> %tmp3, %tmp4
  %tmp8 = fadd <4 x float> %tmp5, %tmp6
  %tmp9 = fadd <4 x float> %tmp7, %tmp8
  ret <4 x float> %tmp9
}

declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

; Radar 8776599: If one of the operands to a QQQQ REG_SEQUENCE is a register
; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because
; we don't currently have a QQQQ_VFP2 super-regclass.  (The "0" for the low
; part of %ins67 is supposed to be loaded by a VLDRS instruction in this test.)
; Regression test: element 5 of %b is zero-extended to i128 and shifted into
; the upper 64 bits (the low half stays zero). The result is bitcast to
; <8 x i16> and passed as the third vector operand of a vld3lane call whose
; other vector operands are undef. The test only requires that a vld3.16 is
; still emitted for this input.
define <8 x i16> @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
;CHECK: test_qqqq_regsequence_subreg:
;CHECK: vld3.16
  %tmp63 = extractvalue [6 x i64] %b, 5
  %tmp64 = zext i64 %tmp63 to i128
  %tmp65 = shl i128 %tmp64, 64
  %ins67 = or i128 %tmp65, 0
  %tmp78 = bitcast i128 %ins67 to <8 x i16>
  %vld3_lane = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %tmp78, i32 1, i32 2)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  ret <8 x i16> %tmp7
}

; Not referenced by the function above; kept so the module's declarations are
; unchanged.
declare void @llvm.trap() nounwind