; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s

; Simple load of v4f16
define <4 x half> @load_64(<4 x half>* nocapture readonly %a) #0 {
; CHECK-LABEL: load_64:
; CHECK: ldr d0, [x0]
entry:
  %0 = load <4 x half>, <4 x half>* %a, align 8
  ret <4 x half> %0
}

; Simple load of v8f16
define <8 x half> @load_128(<8 x half>* nocapture readonly %a) #0 {
; CHECK-LABEL: load_128:
; CHECK: ldr q0, [x0]
entry:
  %0 = load <8 x half>, <8 x half>* %a, align 16
  ret <8 x half> %0
}

; Duplicating load to v4f16
define <4 x half> @load_dup_64(half* nocapture readonly %a) #0 {
; CHECK-LABEL: load_dup_64:
; CHECK: ld1r { v0.4h }, [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <4 x half> undef, half %0, i32 0
  %2 = shufflevector <4 x half> %1, <4 x half> undef, <4 x i32> zeroinitializer
  ret <4 x half> %2
}

; Duplicating load to v8f16
define <8 x half> @load_dup_128(half* nocapture readonly %a) #0 {
; CHECK-LABEL: load_dup_128:
; CHECK: ld1r { v0.8h }, [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <8 x half> undef, half %0, i32 0
  %2 = shufflevector <8 x half> %1, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %2
}

; Load to one lane of v4f16
define <4 x half> @load_lane_64(half* nocapture readonly %a, <4 x half> %b) #0 {
; CHECK-LABEL: load_lane_64:
; CHECK: ld1 { v0.h }[2], [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <4 x half> %b, half %0, i32 2
  ret <4 x half> %1
}

; Load to one lane of v8f16
define <8 x half> @load_lane_128(half* nocapture readonly %a, <8 x half> %b) #0 {
; CHECK-LABEL: load_lane_128:
; CHECK: ld1 { v0.h }[5], [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <8 x half> %b, half %0, i32 5
  ret <8 x half> %1
}

; Simple store of v4f16
define void @store_64(<4 x half>* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: store_64:
; CHECK: str d0, [x0]
entry:
  store <4 x half> %b, <4 x half>* %a, align 8
  ret void
}

; Simple store of v8f16
define void @store_128(<8 x half>* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: store_128:
; CHECK: str q0, [x0]
entry:
  store <8 x half> %b, <8 x half>* %a, align 16
  ret void
}

; Store from one lane of v4f16
define void @store_lane_64(half* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: store_lane_64:
; CHECK: st1 { v0.h }[2], [x0]
entry:
  %0 = extractelement <4 x half> %b, i32 2
  store half %0, half* %a, align 2
  ret void
}

; Store from one lane of v8f16
define void @store_lane_128(half* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: store_lane_128:
; CHECK: st1 { v0.h }[5], [x0]
entry:
  %0 = extractelement <8 x half> %b, i32 5
  store half %0, half* %a, align 2
  ret void
}
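
; For reference, the plain IR patterns above (whole-vector load/store, scalar
; load plus splat, insert/extract of a single lane) are what clang typically
; emits for ACLE code like the following. A minimal C sketch (assuming clang
; targeting AArch64 with __fp16 support; the function names are illustrative
; and not part of this test):
;
;   #include <arm_neon.h>
;   float16x4_t splat(const float16_t *p) {
;     return vdup_n_f16(*p);            // scalar load + splat -> ld1r
;   }
;   float16x8_t set_lane(const float16_t *p, float16x8_t v) {
;     return vsetq_lane_f16(*p, v, 5);  // load into one lane -> ld1 { v.h }[5]
;   }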

; NEON intrinsics - (de-)interleaving loads and stores
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>*)
declare void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>*)
declare void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
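
; For reference, these ld2/ld3/ld4 and st2/st3/st4 intrinsics are what the
; ACLE (de-)interleaving intrinsics from <arm_neon.h> typically lower to.
; A minimal C sketch (assuming clang targeting AArch64 with __fp16 support;
; the function names are illustrative and not part of this test):
;
;   #include <arm_neon.h>
;   float16x4x2_t deinterleave2(const float16_t *p) {
;     return vld2_f16(p);   // llvm.aarch64.neon.ld2.v4f16 -> ld2 { v0.4h, v1.4h }
;   }
;   void interleave4(float16_t *p, float16x8x4_t v) {
;     vst4q_f16(p, v);      // llvm.aarch64.neon.st4.v8f16 -> st4 { v0.8h - v3.8h }
;   }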

; Load 2 x v4f16 with de-interleaving
define { <4 x half>, <4 x half> } @load_interleave_64_2(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_2:
; CHECK: ld2 { v0.4h, v1.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load 3 x v4f16 with de-interleaving
define { <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_3(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_3:
; CHECK: ld3 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 4 x v4f16 with de-interleaving
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_4(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_4:
; CHECK: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Store 2 x v4f16 with interleaving
define void @store_interleave_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_interleave_64_2:
; CHECK: st2 { v0.4h, v1.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
  ret void
}

; Store 3 x v4f16 with interleaving
define void @store_interleave_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_interleave_64_3:
; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
  ret void
}

; Store 4 x v4f16 with interleaving
define void @store_interleave_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_interleave_64_4:
; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
  ret void
}

; Load 2 x v8f16 with de-interleaving
define { <8 x half>, <8 x half> } @load_interleave_128_2(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_2:
; CHECK: ld2 { v0.8h, v1.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load 3 x v8f16 with de-interleaving
define { <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_3(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_3:
; CHECK: ld3 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load 4 x v8f16 with de-interleaving
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_4(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_4:
; CHECK: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; Store 2 x v8f16 with interleaving
define void @store_interleave_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_interleave_128_2:
; CHECK: st2 { v0.8h, v1.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
  ret void
}

; Store 3 x v8f16 with interleaving
define void @store_interleave_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_interleave_128_3:
; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
  ret void
}

; Store 4 x v8f16 with interleaving
define void @store_interleave_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_interleave_128_4:
; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
  ret void
}

; NEON intrinsics - duplicating loads
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half*)
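
; For reference, these ld2r/ld3r/ld4r intrinsics are what the ACLE duplicating
; loads from <arm_neon.h> typically lower to. A minimal C sketch (assuming
; clang targeting AArch64 with __fp16 support; the function name is
; illustrative and not part of this test):
;
;   #include <arm_neon.h>
;   float16x4x2_t dup2(const float16_t *p) {
;     return vld2_dup_f16(p);   // llvm.aarch64.neon.ld2r.v4f16 -> ld2r { v0.4h, v1.4h }
;   }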

; Load 2 x v4f16 with duplication
define { <4 x half>, <4 x half> } @load_dup_64_2(half* %a) #0 {
; CHECK-LABEL: load_dup_64_2:
; CHECK: ld2r { v0.4h, v1.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load 3 x v4f16 with duplication
define { <4 x half>, <4 x half>, <4 x half> } @load_dup_64_3(half* %a) #0 {
; CHECK-LABEL: load_dup_64_3:
; CHECK: ld3r { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 4 x v4f16 with duplication
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_dup_64_4(half* %a) #0 {
; CHECK-LABEL: load_dup_64_4:
; CHECK: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 2 x v8f16 with duplication
define { <8 x half>, <8 x half> } @load_dup_128_2(half* %a) #0 {
; CHECK-LABEL: load_dup_128_2:
; CHECK: ld2r { v0.8h, v1.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load 3 x v8f16 with duplication
define { <8 x half>, <8 x half>, <8 x half> } @load_dup_128_3(half* %a) #0 {
; CHECK-LABEL: load_dup_128_3:
; CHECK: ld3r { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load 4 x v8f16 with duplication
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_dup_128_4(half* %a) #0 {
; CHECK-LABEL: load_dup_128_4:
; CHECK: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; NEON intrinsics - loads and stores to/from one lane
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
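
; For reference, these ldNlane/stNlane intrinsics are what the ACLE lane
; load/store intrinsics from <arm_neon.h> typically lower to. A minimal C
; sketch (assuming clang targeting AArch64 with __fp16 support; the function
; names are illustrative and not part of this test):
;
;   #include <arm_neon.h>
;   float16x4x2_t load_lane(const float16_t *p, float16x4x2_t v) {
;     return vld2_lane_f16(p, v, 2);   // -> ld2 { v0.h, v1.h }[2], [x0]
;   }
;   void store_lane(float16_t *p, float16x4x3_t v) {
;     vst3_lane_f16(p, v, 2);          // -> st3 { v0.h, v1.h, v2.h }[2], [x0]
;   }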

; Load one lane of 2 x v4f16
define { <4 x half>, <4 x half> } @load_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: load_lane_64_2:
; CHECK: ld2 { v0.h, v1.h }[2], [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load one lane of 3 x v4f16
define { <4 x half>, <4 x half>, <4 x half> } @load_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: load_lane_64_3:
; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load one lane of 4 x v4f16
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: load_lane_64_4:
; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Store one lane of 2 x v4f16
define void @store_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_lane_64_2:
; CHECK: st2 { v0.h, v1.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
  ret void
}

; Store one lane of 3 x v4f16
define void @store_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_lane_64_3:
; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
  ret void
}

; Store one lane of 4 x v4f16
define void @store_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_lane_64_4:
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
  ret void
}

; Load one lane of 2 x v8f16
define { <8 x half>, <8 x half> } @load_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: load_lane_128_2:
; CHECK: ld2 { v0.h, v1.h }[2], [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load one lane of 3 x v8f16
define { <8 x half>, <8 x half>, <8 x half> } @load_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: load_lane_128_3:
; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load one lane of 4 x v8f16
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: load_lane_128_4:
; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; Store one lane of 2 x v8f16
define void @store_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_lane_128_2:
; CHECK: st2 { v0.h, v1.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
  ret void
}

; Store one lane of 3 x v8f16
define void @store_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_lane_128_3:
; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
  ret void
}

; Store one lane of 4 x v8f16
define void @store_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_lane_128_4:
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
  ret void
}

; NEON intrinsics - load/store without interleaving
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>*)
declare void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>*)
declare void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
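
; For reference, these ld1xN/st1xN intrinsics are what the ACLE multi-register
; contiguous (non-interleaving) load/store intrinsics from <arm_neon.h>
; typically lower to. A minimal C sketch (assuming clang targeting AArch64
; with __fp16 support; the function names are illustrative and not part of
; this test):
;
;   #include <arm_neon.h>
;   float16x4x2_t load_pair(const float16_t *p) {
;     return vld1_f16_x2(p);   // llvm.aarch64.neon.ld1x2.v4f16 -> ld1 { v0.4h, v1.4h }
;   }
;   void store_pair(float16_t *p, float16x4x2_t v) {
;     vst1_f16_x2(p, v);       // llvm.aarch64.neon.st1x2.v4f16 -> st1 { v0.4h, v1.4h }
;   }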

; Load 2 x v4f16 without de-interleaving
define { <4 x half>, <4 x half> } @load_64_2(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_2:
; CHECK: ld1 { v0.4h, v1.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load 3 x v4f16 without de-interleaving
define { <4 x half>, <4 x half>, <4 x half> } @load_64_3(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_3:
; CHECK: ld1 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 4 x v4f16 without de-interleaving
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_64_4(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_4:
; CHECK: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Store 2 x v4f16 without interleaving
define void @store_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_64_2:
; CHECK: st1 { v0.4h, v1.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
  ret void
}

; Store 3 x v4f16 without interleaving
define void @store_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_64_3:
; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
  ret void
}

; Store 4 x v4f16 without interleaving
define void @store_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_64_4:
; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
  ret void
}

; Load 2 x v8f16 without de-interleaving
define { <8 x half>, <8 x half> } @load_128_2(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_2:
; CHECK: ld1 { v0.8h, v1.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load 3 x v8f16 without de-interleaving
define { <8 x half>, <8 x half>, <8 x half> } @load_128_3(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_3:
; CHECK: ld1 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load 4 x v8f16 without de-interleaving
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_128_4(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_4:
; CHECK: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; Store 2 x v8f16 without interleaving
define void @store_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_128_2:
; CHECK: st1 { v0.8h, v1.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
  ret void
}

; Store 3 x v8f16 without interleaving
define void @store_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_128_3:
; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
  ret void
}

; Store 4 x v8f16 without interleaving
define void @store_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_128_4:
; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
  ret void
}
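
; The #0/#1 attribute group definitions are missing from this copy of the
; file; they are assumed here to be plain nounwind (an assumption, added so
; the IR parses):
attributes #0 = { nounwind }
attributes #1 = { nounwind }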