; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s

; Simple load of v4f16
define <4 x half> @load_64(<4 x half>* nocapture readonly %a) #0 {
; CHECK-LABEL: load_64:
; CHECK: ldr d0, [x0]
entry:
  %0 = load <4 x half>, <4 x half>* %a, align 8
  ret <4 x half> %0
}

; Simple load of v8f16
define <8 x half> @load_128(<8 x half>* nocapture readonly %a) #0 {
; CHECK-LABEL: load_128:
; CHECK: ldr q0, [x0]
entry:
  %0 = load <8 x half>, <8 x half>* %a, align 16
  ret <8 x half> %0
}

; Duplicating load to v4f16
define <4 x half> @load_dup_64(half* nocapture readonly %a) #0 {
; CHECK-LABEL: load_dup_64:
; CHECK: ld1r { v0.4h }, [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <4 x half> undef, half %0, i32 0
  %2 = shufflevector <4 x half> %1, <4 x half> undef, <4 x i32> zeroinitializer
  ret <4 x half> %2
}

; Duplicating load to v8f16
define <8 x half> @load_dup_128(half* nocapture readonly %a) #0 {
; CHECK-LABEL: load_dup_128:
; CHECK: ld1r { v0.8h }, [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <8 x half> undef, half %0, i32 0
  %2 = shufflevector <8 x half> %1, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %2
}

; Load to one lane of v4f16
define <4 x half> @load_lane_64(half* nocapture readonly %a, <4 x half> %b) #0 {
; CHECK-LABEL: load_lane_64:
; CHECK: ld1 { v0.h }[2], [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <4 x half> %b, half %0, i32 2
  ret <4 x half> %1
}

; Load to one lane of v8f16
define <8 x half> @load_lane_128(half* nocapture readonly %a, <8 x half> %b) #0 {
; CHECK-LABEL: load_lane_128:
; CHECK: ld1 { v0.h }[5], [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <8 x half> %b, half %0, i32 5
  ret <8 x half> %1
}

; Simple store of v4f16
define void @store_64(<4 x half>* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: store_64:
; CHECK: str d0, [x0]
entry:
  store <4 x half> %b, <4 x half>* %a, align 8
  ret void
}

; Simple store of v8f16
define void @store_128(<8 x half>* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: store_128:
; CHECK: str q0, [x0]
entry:
  store <8 x half> %b, <8 x half>* %a, align 16
  ret void
}

; Store from one lane of v4f16
define void @store_lane_64(half* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: store_lane_64:
; CHECK: st1 { v0.h }[2], [x0]
entry:
  %0 = extractelement <4 x half> %b, i32 2
  store half %0, half* %a, align 2
  ret void
}

; Store from one lane of v8f16
define void @store_lane_128(half* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: store_lane_128:
; CHECK: st1 { v0.h }[5], [x0]
entry:
  %0 = extractelement <8 x half> %b, i32 5
  store half %0, half* %a, align 2
  ret void
}
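
; Note (editorial): the intrinsic declarations below correspond, informally,
; to the ACLE <arm_neon.h> families vld2_f16/vld3_f16/vld4_f16 and
; vst2_f16/vst3_f16/vst4_f16 (q-suffixed for the 128-bit forms); the mapping
; is an aside, not something this test asserts.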

; NEON intrinsics - (de-)interleaving loads and stores
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>*)
declare void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>*)
declare void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)

; Load 2 x v4f16 with de-interleaving
define { <4 x half>, <4 x half> } @load_interleave_64_2(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_2:
; CHECK: ld2 { v0.4h, v1.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load 3 x v4f16 with de-interleaving
define { <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_3(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_3:
; CHECK: ld3 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 4 x v4f16 with de-interleaving
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_4(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_4:
; CHECK: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Store 2 x v4f16 with interleaving
define void @store_interleave_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_interleave_64_2:
; CHECK: st2 { v0.4h, v1.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
  ret void
}

; Store 3 x v4f16 with interleaving
define void @store_interleave_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_interleave_64_3:
; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
  ret void
}

; Store 4 x v4f16 with interleaving
define void @store_interleave_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_interleave_64_4:
; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
  ret void
}
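
; Note (editorial): ld2/st2 (de-)interleave, so st2 of { a0 a1 a2 a3 } and
; { b0 b1 b2 b3 } writes memory as a0, b0, a1, b1, ... and ld2 reverses it;
; the 128-bit variants below exercise the same pattern on q registers.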

; Load 2 x v8f16 with de-interleaving
define { <8 x half>, <8 x half> } @load_interleave_128_2(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_2:
; CHECK: ld2 { v0.8h, v1.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load 3 x v8f16 with de-interleaving
define { <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_3(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_3:
; CHECK: ld3 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load 4 x v8f16 with de-interleaving
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_4(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_4:
; CHECK: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; Store 2 x v8f16 with interleaving
define void @store_interleave_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_interleave_128_2:
; CHECK: st2 { v0.8h, v1.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
  ret void
}

; Store 3 x v8f16 with interleaving
define void @store_interleave_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_interleave_128_3:
; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
  ret void
}

; Store 4 x v8f16 with interleaving
define void @store_interleave_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_interleave_128_4:
; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
  ret void
}
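
; Note (editorial): the ld2r/ld3r/ld4r forms tested next read one element per
; register from memory and replicate it to every lane, e.g. ld2r on memory
; a, b yields v0 = { a, a, a, a } and v1 = { b, b, b, b }.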

; NEON intrinsics - duplicating loads
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half*)

; Load 2 x v4f16 with duplication
define { <4 x half>, <4 x half> } @load_dup_64_2(half* %a) #0 {
; CHECK-LABEL: load_dup_64_2:
; CHECK: ld2r { v0.4h, v1.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load 3 x v4f16 with duplication
define { <4 x half>, <4 x half>, <4 x half> } @load_dup_64_3(half* %a) #0 {
; CHECK-LABEL: load_dup_64_3:
; CHECK: ld3r { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 4 x v4f16 with duplication
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_dup_64_4(half* %a) #0 {
; CHECK-LABEL: load_dup_64_4:
; CHECK: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 2 x v8f16 with duplication
define { <8 x half>, <8 x half> } @load_dup_128_2(half* %a) #0 {
; CHECK-LABEL: load_dup_128_2:
; CHECK: ld2r { v0.8h, v1.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load 3 x v8f16 with duplication
define { <8 x half>, <8 x half>, <8 x half> } @load_dup_128_3(half* %a) #0 {
; CHECK-LABEL: load_dup_128_3:
; CHECK: ld3r { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load 4 x v8f16 with duplication
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_dup_128_4(half* %a) #0 {
; CHECK-LABEL: load_dup_128_4:
; CHECK: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}


; NEON intrinsics - loads and stores to/from one lane
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
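
; Note (editorial): the i64 operand of the ld2lane/st2lane (and 3/4) intrinsics
; is the lane index; the backend expects a constant here and folds it into the
; instruction's [lane] field (0-3 for v4f16, 0-7 for v8f16).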

; Load one lane of 2 x v4f16
define { <4 x half>, <4 x half> } @load_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: load_lane_64_2:
; CHECK: ld2 { v0.h, v1.h }[2], [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load one lane of 3 x v4f16
define { <4 x half>, <4 x half>, <4 x half> } @load_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: load_lane_64_3:
; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load one lane of 4 x v4f16
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: load_lane_64_4:
; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Store one lane of 2 x v4f16
define void @store_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_lane_64_2:
; CHECK: st2 { v0.h, v1.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
  ret void
}

; Store one lane of 3 x v4f16
define void @store_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_lane_64_3:
; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
  ret void
}

; Store one lane of 4 x v4f16
define void @store_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_lane_64_4:
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
  ret void
}

; Load one lane of 2 x v8f16
define { <8 x half>, <8 x half> } @load_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: load_lane_128_2:
; CHECK: ld2 { v0.h, v1.h }[2], [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load one lane of 3 x v8f16
define { <8 x half>, <8 x half>, <8 x half> } @load_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: load_lane_128_3:
; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load one lane of 4 x v8f16
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: load_lane_128_4:
; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; Store one lane of 2 x v8f16
define void @store_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_lane_128_2:
; CHECK: st2 { v0.h, v1.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
  ret void
}

; Store one lane of 3 x v8f16
define void @store_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_lane_128_3:
; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
  ret void
}

; Store one lane of 4 x v8f16
define void @store_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_lane_128_4:
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
  ret void
}

; NEON intrinsics - load/store without interleaving
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>*)
declare void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>*)
declare void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
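
; Note (editorial): the ld1x2/st1x2 (and x3/x4) intrinsics are the
; multi-register ld1/st1 forms, which keep elements in linear order; in ACLE
; these presumably surface as vld1_f16_x2, vst1q_f16_x4 and friends.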

; Load 2 x v4f16 without de-interleaving
define { <4 x half>, <4 x half> } @load_64_2(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_2:
; CHECK: ld1 { v0.4h, v1.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load 3 x v4f16 without de-interleaving
define { <4 x half>, <4 x half>, <4 x half> } @load_64_3(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_3:
; CHECK: ld1 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 4 x v4f16 without de-interleaving
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_64_4(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_4:
; CHECK: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Store 2 x v4f16 without interleaving
define void @store_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_64_2:
; CHECK: st1 { v0.4h, v1.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
  ret void
}

; Store 3 x v4f16 without interleaving
define void @store_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_64_3:
; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
  ret void
}

; Store 4 x v4f16 without interleaving
define void @store_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_64_4:
; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
  ret void
}

; Load 2 x v8f16 without de-interleaving
define { <8 x half>, <8 x half> } @load_128_2(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_2:
; CHECK: ld1 { v0.8h, v1.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load 3 x v8f16 without de-interleaving
define { <8 x half>, <8 x half>, <8 x half> } @load_128_3(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_3:
; CHECK: ld1 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load 4 x v8f16 without de-interleaving
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_128_4(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_4:
; CHECK: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; Store 2 x v8f16 without interleaving
define void @store_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_128_2:
; CHECK: st1 { v0.8h, v1.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
  ret void
}

; Store 3 x v8f16 without interleaving
define void @store_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_128_3:
; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
  ret void
}

; Store 4 x v8f16 without interleaving
define void @store_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_128_4:
; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
  ret void
}
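
; Minimal definitions for the attribute groups referenced above (assumed to be
; plain nounwind) so the file parses standalone.
attributes #0 = { nounwind }
attributes #1 = { nounwind }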