; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
; RUN: llc < %s -march=arm -mattr=+neon -regalloc=basic | FileCheck %s

define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vld1lanei8:
;Check the (default) alignment value.
;CHECK: vld1.8 {d16[3]}, [r0]
	%tmp1 = load <8 x i8>* %B
	%tmp2 = load i8* %A, align 8
	%tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 3
	ret <8 x i8> %tmp3
}

define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vld1lanei16:
;Check the alignment value. Max for this instruction is 16 bits:
;CHECK: vld1.16 {d16[2]}, [r0, :16]
	%tmp1 = load <4 x i16>* %B
	%tmp2 = load i16* %A, align 8
	%tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 2
	ret <4 x i16> %tmp3
}

define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vld1lanei32:
;Check the alignment value. Max for this instruction is 32 bits:
;CHECK: vld1.32 {d16[1]}, [r0, :32]
	%tmp1 = load <2 x i32>* %B
	%tmp2 = load i32* %A, align 8
	%tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
	ret <2 x i32> %tmp3
}

define <2 x float> @vld1lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vld1lanef:
;CHECK: vld1.32 {d16[1]}, [r0]
	%tmp1 = load <2 x float>* %B
	%tmp2 = load float* %A, align 4
	%tmp3 = insertelement <2 x float> %tmp1, float %tmp2, i32 1
	ret <2 x float> %tmp3
}

define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK: vld1laneQi8:
;CHECK: vld1.8 {d17[1]}, [r0]
	%tmp1 = load <16 x i8>* %B
	%tmp2 = load i8* %A, align 8
	%tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9
	ret <16 x i8> %tmp3
}

define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vld1laneQi16:
;CHECK: vld1.16 {d17[1]}, [r0, :16]
	%tmp1 = load <8 x i16>* %B
	%tmp2 = load i16* %A, align 8
	%tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5
	ret <8 x i16> %tmp3
}
define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vld1laneQi32:
;CHECK: vld1.32 {d17[1]}, [r0, :32]
	%tmp1 = load <4 x i32>* %B
	%tmp2 = load i32* %A, align 8
	%tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3
	ret <4 x i32> %tmp3
}

define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vld1laneQf:
;CHECK: vld1.32 {d16[0]}, [r0]
	%tmp1 = load <4 x float>* %B
	%tmp2 = load float* %A
	%tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 0
	ret <4 x float> %tmp3
}

%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }

%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }

define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vld2lanei8:
;Check the alignment value. Max for this instruction is 16 bits:
;CHECK: vld2.8 {d16[1], d17[1]}, [r0, :16]
	%tmp1 = load <8 x i8>* %B
	%tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
	%tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
	%tmp5 = add <8 x i8> %tmp3, %tmp4
	ret <8 x i8> %tmp5
}

define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vld2lanei16:
;Check the alignment value. Max for this instruction is 32 bits:
;CHECK: vld2.16 {d16[1], d17[1]}, [r0, :32]
	%tmp0 = bitcast i16* %A to i8*
	%tmp1 = load <4 x i16>* %B
	%tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
	%tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
	%tmp5 = add <4 x i16> %tmp3, %tmp4
	ret <4 x i16> %tmp5
}

define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vld2lanei32:
;CHECK: vld2.32
	%tmp0 = bitcast i32* %A to i8*
	%tmp1 = load <2 x i32>* %B
	%tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
	%tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
	%tmp5 = add <2 x i32> %tmp3, %tmp4
	ret <2 x i32> %tmp5
}

;Check for a post-increment updating load.
define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind {
;CHECK: vld2lanei32_update:
;CHECK: vld2.32 {d16[1], d17[1]}, [{{r[0-9]+}}]!
	%A = load i32** %ptr
	%tmp0 = bitcast i32* %A to i8*
	%tmp1 = load <2 x i32>* %B
	%tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
	%tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
	%tmp5 = add <2 x i32> %tmp3, %tmp4
	%tmp6 = getelementptr i32* %A, i32 2
	store i32* %tmp6, i32** %ptr
	ret <2 x i32> %tmp5
}

define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vld2lanef:
;CHECK: vld2.32
	%tmp0 = bitcast float* %A to i8*
	%tmp1 = load <2 x float>* %B
	%tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
	%tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
	%tmp5 = fadd <2 x float> %tmp3, %tmp4
	ret <2 x float> %tmp5
}

define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vld2laneQi16:
;Check the (default) alignment.
;CHECK: vld2.16 {d17[1], d19[1]}, [{{r[0-9]+}}]
	%tmp0 = bitcast i16* %A to i8*
	%tmp1 = load <8 x i16>* %B
	%tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
	%tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
	%tmp5 = add <8 x i16> %tmp3, %tmp4
	ret <8 x i16> %tmp5
}

define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vld2laneQi32:
;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vld2.32 {d17[0], d19[0]}, [{{r[0-9]+}}, :64]
	%tmp0 = bitcast i32* %A to i8*
	%tmp1 = load <4 x i32>* %B
	%tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
	%tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
	%tmp5 = add <4 x i32> %tmp3, %tmp4
	ret <4 x i32> %tmp5
}

define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vld2laneQf:
;CHECK: vld2.32
	%tmp0 = bitcast float* %A to i8*
	%tmp1 = load <4 x float>* %B
	%tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
	%tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
	%tmp5 = fadd <4 x float> %tmp3, %tmp4
	ret <4 x float> %tmp5
}

declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly

%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }

%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }

define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vld3lanei8:
;CHECK: vld3.8
	%tmp1 = load <8 x i8>* %B
	%tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
	%tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
	%tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
	%tmp6 = add <8 x i8> %tmp3, %tmp4
	%tmp7 = add <8 x i8> %tmp5, %tmp6
	ret <8 x i8> %tmp7
}

define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vld3lanei16:
;Check the (default) alignment value. VLD3 does not support alignment.
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
	%tmp0 = bitcast i16* %A to i8*
	%tmp1 = load <4 x i16>* %B
	%tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
	%tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
	%tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
	%tmp6 = add <4 x i16> %tmp3, %tmp4
	%tmp7 = add <4 x i16> %tmp5, %tmp6
	ret <4 x i16> %tmp7
}

define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vld3lanei32:
;CHECK: vld3.32
	%tmp0 = bitcast i32* %A to i8*
	%tmp1 = load <2 x i32>* %B
	%tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
	%tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
	%tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
	%tmp6 = add <2 x i32> %tmp3, %tmp4
	%tmp7 = add <2 x i32> %tmp5, %tmp6
	ret <2 x i32> %tmp7
}

define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vld3lanef:
;CHECK: vld3.32
	%tmp0 = bitcast float* %A to i8*
	%tmp1 = load <2 x float>* %B
	%tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
	%tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
	%tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
	%tmp6 = fadd <2 x float> %tmp3, %tmp4
	%tmp7 = fadd <2 x float> %tmp5, %tmp6
	ret <2 x float> %tmp7
}

define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vld3laneQi16:
;Check the (default) alignment value. VLD3 does not support alignment.
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
	%tmp0 = bitcast i16* %A to i8*
	%tmp1 = load <8 x i16>* %B
	%tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
	%tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
	%tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
	%tmp6 = add <8 x i16> %tmp3, %tmp4
	%tmp7 = add <8 x i16> %tmp5, %tmp6
	ret <8 x i16> %tmp7
}

;Check for a post-increment updating load with register increment.
define <8 x i16> @vld3laneQi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
;CHECK: vld3laneQi16_update:
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}], {{r[0-9]+}}
	%A = load i16** %ptr
	%tmp0 = bitcast i16* %A to i8*
	%tmp1 = load <8 x i16>* %B
	%tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
	%tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
	%tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
	%tmp6 = add <8 x i16> %tmp3, %tmp4
	%tmp7 = add <8 x i16> %tmp5, %tmp6
	%tmp8 = getelementptr i16* %A, i32 %inc
	store i16* %tmp8, i16** %ptr
	ret <8 x i16> %tmp7
}

define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vld3laneQi32:
;CHECK: vld3.32
	%tmp0 = bitcast i32* %A to i8*
	%tmp1 = load <4 x i32>* %B
	%tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
	%tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
	%tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
	%tmp6 = add <4 x i32> %tmp3, %tmp4
	%tmp7 = add <4 x i32> %tmp5, %tmp6
	ret <4 x i32> %tmp7
}

define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vld3laneQf:
;CHECK: vld3.32
	%tmp0 = bitcast float* %A to i8*
	%tmp1 = load <4 x float>* %B
	%tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
	%tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
	%tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
	%tmp6 = fadd <4 x float> %tmp3, %tmp4
	%tmp7 = fadd <4 x float> %tmp5, %tmp6
	ret <4 x float> %tmp7
}

declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

%struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }

%struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }

define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vld4lanei8:
;Check the alignment value. Max for this instruction is 32 bits:
;CHECK: vld4.8 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}, :32]
	%tmp1 = load <8 x i8>* %B
	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
	%tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
	%tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
	%tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
	%tmp7 = add <8 x i8> %tmp3, %tmp4
	%tmp8 = add <8 x i8> %tmp5, %tmp6
	%tmp9 = add <8 x i8> %tmp7, %tmp8
	ret <8 x i8> %tmp9
}

;Check for a post-increment updating load.
define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
;CHECK: vld4lanei8_update:
;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}, :32]!
	%A = load i8** %ptr
	%tmp1 = load <8 x i8>* %B
	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
	%tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
	%tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
	%tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
	%tmp7 = add <8 x i8> %tmp3, %tmp4
	%tmp8 = add <8 x i8> %tmp5, %tmp6
	%tmp9 = add <8 x i8> %tmp7, %tmp8
	%tmp10 = getelementptr i8* %A, i32 4
	store i8* %tmp10, i8** %ptr
	ret <8 x i8> %tmp9
}

define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vld4lanei16:
;Check that a power-of-two alignment smaller than the total size of the memory
;being loaded is ignored.
;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}]
	%tmp0 = bitcast i16* %A to i8*
	%tmp1 = load <4 x i16>* %B
	%tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
	%tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
	%tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
	%tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
	%tmp7 = add <4 x i16> %tmp3, %tmp4
	%tmp8 = add <4 x i16> %tmp5, %tmp6
	%tmp9 = add <4 x i16> %tmp7, %tmp8
	ret <4 x i16> %tmp9
}

define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vld4lanei32:
;Check the alignment value. An 8-byte alignment is allowed here even though
;it is smaller than the total size of the memory being loaded.
;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}, :64]
	%tmp0 = bitcast i32* %A to i8*
	%tmp1 = load <2 x i32>* %B
	%tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
	%tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
	%tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
	%tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
	%tmp7 = add <2 x i32> %tmp3, %tmp4
	%tmp8 = add <2 x i32> %tmp5, %tmp6
	%tmp9 = add <2 x i32> %tmp7, %tmp8
	ret <2 x i32> %tmp9
}

define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vld4lanef:
;CHECK: vld4.32
	%tmp0 = bitcast float* %A to i8*
	%tmp1 = load <2 x float>* %B
	%tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
	%tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
	%tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
	%tmp6 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 3
	%tmp7 = fadd <2 x float> %tmp3, %tmp4
	%tmp8 = fadd <2 x float> %tmp5, %tmp6
	%tmp9 = fadd <2 x float> %tmp7, %tmp8
	ret <2 x float> %tmp9
}

define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vld4laneQi16:
;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [{{r[0-9]+}}, :64]
	%tmp0 = bitcast i16* %A to i8*
	%tmp1 = load <8 x i16>* %B
	%tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
	%tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
	%tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
	%tmp6 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 3
	%tmp7 = add <8 x i16> %tmp3, %tmp4
	%tmp8 = add <8 x i16> %tmp5, %tmp6
	%tmp9 = add <8 x i16> %tmp7, %tmp8
	ret <8 x i16> %tmp9
}

define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vld4laneQi32:
;Check the (default) alignment.
;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [{{r[0-9]+}}]
	%tmp0 = bitcast i32* %A to i8*
	%tmp1 = load <4 x i32>* %B
	%tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
	%tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
	%tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
	%tmp6 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 3
	%tmp7 = add <4 x i32> %tmp3, %tmp4
	%tmp8 = add <4 x i32> %tmp5, %tmp6
	%tmp9 = add <4 x i32> %tmp7, %tmp8
	ret <4 x i32> %tmp9
}

define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vld4laneQf:
;CHECK: vld4.32
	%tmp0 = bitcast float* %A to i8*
	%tmp1 = load <4 x float>* %B
	%tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
	%tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
	%tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
	%tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
	%tmp6 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 3
	%tmp7 = fadd <4 x float> %tmp3, %tmp4
	%tmp8 = fadd <4 x float> %tmp5, %tmp6
	%tmp9 = fadd <4 x float> %tmp7, %tmp8
	ret <4 x float> %tmp9
}

declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

; Radar 8776599: If one of the operands to a QQQQ REG_SEQUENCE is a register
; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because
; we don't currently have a QQQQ_VFP2 super-regclass. (The "0" for the low
; part of %ins67 is supposed to be loaded by a VLDRS instruction in this test.)
define <8 x i16> @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
;CHECK: test_qqqq_regsequence_subreg
;CHECK: vld3.16
	%tmp63 = extractvalue [6 x i64] %b, 5
	%tmp64 = zext i64 %tmp63 to i128
	%tmp65 = shl i128 %tmp64, 64
	%ins67 = or i128 %tmp65, 0
	%tmp78 = bitcast i128 %ins67 to <8 x i16>
	%vld3_lane = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %tmp78, i32 1, i32 2)
	%tmp3 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 0
	%tmp4 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 1
	%tmp5 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 2
	%tmp6 = add <8 x i16> %tmp3, %tmp4
	%tmp7 = add <8 x i16> %tmp5, %tmp6
	ret <8 x i16> %tmp7
}

declare void @llvm.trap() nounwind