; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s

; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon -regalloc=basic %s -o - \
; RUN:     | FileCheck %s

; Lane-load selection tests for ARM NEON. Each function either inserts a
; scalar load into a vector lane or calls an llvm.arm.neon.vldNlane intrinsic.
; The expected-output patterns pin the selected vldN instruction and, in many
; cases, its lane registers and address alignment qualifier. Both invocations
; above are verified against the same patterns; the second one additionally
; passes -regalloc=basic.

; i8 scalar load (align 8) inserted into lane 3 of a <8 x i8>; the expected
; vld1.8 has no alignment qualifier on the address.
define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld1lanei8:
;Check the (default) alignment value.
;CHECK: vld1.8 {d16[3]}, [r0]
  %tmp1 = load <8 x i8>* %B
  %tmp2 = load i8* %A, align 8
  %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 3
  ret <8 x i8> %tmp3
}

; i16 scalar load (align 8) inserted into lane 2; the expected address
; qualifier is :16 even though the load is more strictly aligned.
define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld1lanei16:
;Check the alignment value. Max for this instruction is 16 bits:
;CHECK: vld1.16 {d16[2]}, [r0:16]
  %tmp1 = load <4 x i16>* %B
  %tmp2 = load i16* %A, align 8
  %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 2
  ret <4 x i16> %tmp3
}

; i32 scalar load (align 8) inserted into lane 1; the expected address
; qualifier is :32.
define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld1lanei32:
;Check the alignment value. Max for this instruction is 32 bits:
;CHECK: vld1.32 {d16[1]}, [r0:32]
  %tmp1 = load <2 x i32>* %B
  %tmp2 = load i32* %A, align 8
  %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
  ret <2 x i32> %tmp3
}

; Same as vld1lanei32 but the scalar load is only 4-byte aligned; the :32
; qualifier is still expected.
define <2 x i32> @vld1lanei32a32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld1lanei32a32:
;Check the alignment value. Legal values are none or :32.
;CHECK: vld1.32 {d16[1]}, [r0:32]
  %tmp1 = load <2 x i32>* %B
  %tmp2 = load i32* %A, align 4
  %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
  ret <2 x i32> %tmp3
}

; Float variant: 4-byte-aligned float load inserted into lane 1 of <2 x float>.
define <2 x float> @vld1lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld1lanef:
;CHECK: vld1.32 {d16[1]}, [r0:32]
  %tmp1 = load <2 x float>* %B
  %tmp2 = load float* %A, align 4
  %tmp3 = insertelement <2 x float> %tmp1, float %tmp2, i32 1
  ret <2 x float> %tmp3
}

; 128-bit variant: element 9 of a <16 x i8> is expected to be addressed as
; lane 1 of d17.
define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: vld1laneQi8:
;CHECK: vld1.8 {d17[1]}, [r0]
  %tmp1 = load <16 x i8>* %B
  %tmp2 = load i8* %A, align 8
  %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9
  ret <16 x i8> %tmp3
}

; 128-bit variant: element 5 of a <8 x i16> is expected to be addressed as
; lane 1 of d17, with a :16 address qualifier.
define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld1laneQi16:
;CHECK: vld1.16 {d17[1]}, [r0:16]
  %tmp1 = load <8 x i16>* %B
  %tmp2 = load i16* %A, align 8
  %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5
  ret <8 x i16> %tmp3
}

; 128-bit variant: element 3 of a <4 x i32> is expected to be addressed as
; lane 1 of d17, with a :32 address qualifier.
define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld1laneQi32:
;CHECK: vld1.32 {d17[1]}, [r0:32]
  %tmp1 = load <4 x i32>* %B
  %tmp2 = load i32* %A, align 8
  %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3
  ret <4 x i32> %tmp3
}

; 128-bit float variant: element 0 is expected as d16[0]. The scalar load
; carries no explicit align attribute, yet :32 is expected.
define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld1laneQf:
;CHECK: vld1.32 {d16[0]}, [r0:32]
  %tmp1 = load <4 x float>* %B
  %tmp2 = load float* %A
  %tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 0
  ret <4 x float> %tmp3
}

; Two-vector aggregate return types of the vld2lane intrinsics (64-bit vectors).
%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }

; Two-vector aggregate return types of the vld2lane intrinsics (128-bit vectors).
%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }

; The vld2lane calls below pass, in order: the address as i8*, the two input
; vectors, the lane index, and a trailing i32 that the alignment comments in
; this file treat as the alignment in bytes. Both result vectors are summed
; so that both are used by the return value.

; Lane 1 with trailing operand 4; the expected qualifier is :16.
define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld2lanei8:
;Check the alignment value. Max for this instruction is 16 bits:
;CHECK: vld2.8 {d16[1], d17[1]}, [r0:16]
  %tmp1 = load <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
  %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

; Lane 1 with trailing operand 8; the expected qualifier is :32.
define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld2lanei16:
;Check the alignment value. Max for this instruction is 32 bits:
;CHECK: vld2.16 {d16[1], d17[1]}, [r0:32]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

; Lane 1 with trailing operand 1; only the vld2.32 mnemonic is verified.
define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld2lanei32:
;CHECK: vld2.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

;Check for a post-increment updating load.
;The address is read from %ptr, advanced by two i32 elements after the lane
;load, and written back; the expected instruction uses the "!" writeback form.
define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld2lanei32_update:
;CHECK: vld2.32 {d16[1], d17[1]}, [{{r[0-9]+}}]!
  %A = load i32** %ptr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  %tmp6 = getelementptr i32* %A, i32 2
  store i32* %tmp6, i32** %ptr
  ret <2 x i32> %tmp5
}

; Float variant of vld2lanei32; the results are combined with fadd.
define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld2lanef:
;CHECK: vld2.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
  %tmp5 = fadd <2 x float> %tmp3, %tmp4
  ret <2 x float> %tmp5
}

; 128-bit variant: lane 5 with trailing operand 1; the expected registers are
; d17[1] and d19[1] and the address has no alignment qualifier.
define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld2laneQi16:
;Check the (default) alignment.
;CHECK: vld2.16 {d17[1], d19[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
  %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

; 128-bit variant: lane 2 with trailing operand 16; the expected registers are
; d17[0] and d19[0] and the expected qualifier is :64.
define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld2laneQi32:
;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vld2.32 {d17[0], d19[0]}, [{{r[0-9]+}}:64]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
  %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

; 128-bit float variant; only the vld2.32 mnemonic is verified.
define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld2laneQf:
;CHECK: vld2.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}

; Declarations of the vld2lane intrinsics used above (64-bit vector forms).
declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly

; Declarations of the vld2lane intrinsics used above (128-bit vector forms).
declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly

; Three-vector aggregate of <8 x i8>, returned by the vld3lane.v8i8 intrinsic.
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
; Three-vector aggregate return types of the vld3lane intrinsics (64-bit vectors).
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }

; Three-vector aggregate return types of the vld3lane intrinsics (128-bit vectors).
%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }

; The vld3lane calls below pass the address as i8*, three input vectors, the
; lane index, and a trailing i32 that the alignment comments in this file
; treat as the alignment in bytes. All three result vectors are summed so
; that each one is used by the return value.

; Lane 1 with trailing operand 1; only the vld3.8 mnemonic is verified.
define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld3lanei8:
;CHECK: vld3.8
  %tmp1 = load <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
  %tmp6 = add <8 x i8> %tmp3, %tmp4
  %tmp7 = add <8 x i8> %tmp5, %tmp6
  ret <8 x i8> %tmp7
}

; Lane 1 with trailing operand 8; the expected address has no alignment
; qualifier despite the requested value.
define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld3lanei16:
;Check the (default) alignment value. VLD3 does not support alignment.
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
  %tmp6 = add <4 x i16> %tmp3, %tmp4
  %tmp7 = add <4 x i16> %tmp5, %tmp6
  ret <4 x i16> %tmp7
}

; Lane 1 with trailing operand 1; only the vld3.32 mnemonic is verified.
define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld3lanei32:
;CHECK: vld3.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
  %tmp6 = add <2 x i32> %tmp3, %tmp4
  %tmp7 = add <2 x i32> %tmp5, %tmp6
  ret <2 x i32> %tmp7
}

; Float variant of vld3lanei32; the results are combined with fadd.
define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld3lanef:
;CHECK: vld3.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
  %tmp6 = fadd <2 x float> %tmp3, %tmp4
  %tmp7 = fadd <2 x float> %tmp5, %tmp6
  ret <2 x float> %tmp7
}

; 128-bit variant: lane 1 with trailing operand 8; the expected address has
; no alignment qualifier.
define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld3laneQi16:
;Check the (default) alignment value. VLD3 does not support alignment.
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  ret <8 x i16> %tmp7
}

;Check for a post-increment updating load with register increment.
;The address is read from %ptr and advanced by the runtime value %inc (in i16
;elements) before being written back, so the expected instruction takes the
;increment from a register instead of using the "!" form.
define <8 x i16> @vld3laneQi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
;CHECK-LABEL: vld3laneQi16_update:
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}], {{r[0-9]+}}
  %A = load i16** %ptr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  %tmp8 = getelementptr i16* %A, i32 %inc
  store i16* %tmp8, i16** %ptr
  ret <8 x i16> %tmp7
}

; 128-bit variant: lane 3 with trailing operand 1; only the vld3.32 mnemonic
; is verified.
define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld3laneQi32:
;CHECK: vld3.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
  %tmp6 = add <4 x i32> %tmp3, %tmp4
  %tmp7 = add <4 x i32> %tmp5, %tmp6
  ret <4 x i32> %tmp7
}

; 128-bit float variant; only the vld3.32 mnemonic is verified.
define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld3laneQf:
;CHECK: vld3.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
  %tmp6 = fadd <4 x float> %tmp3, %tmp4
  %tmp7 = fadd <4 x float> %tmp5, %tmp6
  ret <4 x float> %tmp7
}

; Declarations of the vld3lane intrinsics used in this file (64-bit vector forms).
declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

; Declarations of the vld3lane intrinsics used in this file (128-bit vector forms).
declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

; Four-vector aggregate return types of the vld4lane intrinsics (64-bit vectors).
%struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }

; Four-vector aggregate return types of the vld4lane intrinsics (128-bit vectors).
%struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }

; The vld4lane calls below pass the address as i8*, four input vectors, the
; lane index, and a trailing i32 that the alignment comments in this file
; treat as the alignment in bytes. All four result vectors are summed so that
; each one is used by the return value.

; Lane 1 with trailing operand 8; the expected qualifier is :32.
define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld4lanei8:
;Check the alignment value. Max for this instruction is 32 bits:
;CHECK: vld4.8 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}:32]
  %tmp1 = load <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
  %tmp7 = add <8 x i8> %tmp3, %tmp4
  %tmp8 = add <8 x i8> %tmp5, %tmp6
  %tmp9 = add <8 x i8> %tmp7, %tmp8
  ret <8 x i8> %tmp9
}

;Check for a post-increment updating load.
;The address is read from %ptr, advanced by 4 bytes after the lane load, and
;written back; the expected instruction uses the "!" writeback form together
;with the :32 qualifier.
define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld4lanei8_update:
;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:32]!
  %A = load i8** %ptr
  %tmp1 = load <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
  %tmp7 = add <8 x i8> %tmp3, %tmp4
  %tmp8 = add <8 x i8> %tmp5, %tmp6
  %tmp9 = add <8 x i8> %tmp7, %tmp8
  %tmp10 = getelementptr i8* %A, i32 4
  store i8* %tmp10, i8** %ptr
  ret <8 x i8> %tmp9
}

; Lane 1 with trailing operand 4; the expected address has no alignment
; qualifier.
define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vld4lanei16:
;Check that a power-of-two alignment smaller than the total size of the memory
;being loaded is ignored.
;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
  %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
  %tmp7 = add <4 x i16> %tmp3, %tmp4
  %tmp8 = add <4 x i16> %tmp5, %tmp6
  %tmp9 = add <4 x i16> %tmp7, %tmp8
  ret <4 x i16> %tmp9
}

; Lane 1 with trailing operand 8; the expected qualifier is :64.
define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vld4lanei32:
;Check the alignment value. An 8-byte alignment is allowed here even though
;it is smaller than the total size of the memory being loaded.
;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:64]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
  %tmp7 = add <2 x i32> %tmp3, %tmp4
  %tmp8 = add <2 x i32> %tmp5, %tmp6
  %tmp9 = add <2 x i32> %tmp7, %tmp8
  ret <2 x i32> %tmp9
}

; Float variant: lane 1 with trailing operand 1; only the vld4.32 mnemonic is
; verified and the results are combined with fadd.
define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vld4lanef:
;CHECK: vld4.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 3
  %tmp7 = fadd <2 x float> %tmp3, %tmp4
  %tmp8 = fadd <2 x float> %tmp5, %tmp6
  %tmp9 = fadd <2 x float> %tmp7, %tmp8
  ret <2 x float> %tmp9
}

; 128-bit variant: lane 1 with trailing operand 16; the expected registers are
; the even D registers d16..d22 and the expected qualifier is :64.
define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vld4laneQi16:
;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [{{r[0-9]+}}:64]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
  %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 3
  %tmp7 = add <8 x i16> %tmp3, %tmp4
  %tmp8 = add <8 x i16> %tmp5, %tmp6
  %tmp9 = add <8 x i16> %tmp7, %tmp8
  ret <8 x i16> %tmp9
}

; 128-bit variant: lane 2 with trailing operand 1; the expected registers are
; the odd D registers d17..d23 at lane 0, with no alignment qualifier.
define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vld4laneQi32:
;Check the (default) alignment.
;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 3
  %tmp7 = add <4 x i32> %tmp3, %tmp4
  %tmp8 = add <4 x i32> %tmp5, %tmp6
  %tmp9 = add <4 x i32> %tmp7, %tmp8
  ret <4 x i32> %tmp9
}

; 128-bit float variant; only the vld4.32 mnemonic is verified.
define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vld4laneQf:
;CHECK: vld4.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 3
  %tmp7 = fadd <4 x float> %tmp3, %tmp4
  %tmp8 = fadd <4 x float> %tmp5, %tmp6
  %tmp9 = fadd <4 x float> %tmp7, %tmp8
  ret <4 x float> %tmp9
}

; Declarations of the vld4lane intrinsics used above (64-bit vector forms).
declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

; Declarations of the vld4lane intrinsics used above (128-bit vector forms).
declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

; Radar 8776599: If one of the operands to a QQQQ REG_SEQUENCE is a register
; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because
; we don't currently have a QQQQ_VFP2 super-regclass. (The "0" for the low
; part of %ins67 is supposed to be loaded by a VLDRS instruction in this test.)
define <8 x i16> @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
;CHECK-LABEL: test_qqqq_regsequence_subreg:
;CHECK: vld3.16
;Build a 128-bit value whose upper 64 bits are element 5 of %b and whose lower
;64 bits are zero, then reinterpret it as <8 x i16>.
  %tmp63 = extractvalue [6 x i64] %b, 5
  %tmp64 = zext i64 %tmp63 to i128
  %tmp65 = shl i128 %tmp64, 64
  %ins67 = or i128 %tmp65, 0
  %tmp78 = bitcast i128 %ins67 to <8 x i16>
;Only the third vector operand is a defined value; the address and the first
;two vector operands are undef. Lane index is 1 and the trailing operand is 2.
  %vld3_lane = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %tmp78, i32 1, i32 2)
;Sum all three result vectors so that each one is used by the return value.
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  ret <8 x i16> %tmp7
}

; NOTE(review): @llvm.trap is declared here but is not called anywhere in the
; visible part of this file - confirm whether the declaration is still needed.
declare void @llvm.trap() nounwind