Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
      2 ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
      3 
      4 ; GCN-LABEL: {{^}}load_local_lo_v2i16_undeflo:
      5 ; GCN: s_waitcnt
      6 ; GFX9-NEXT: ds_read_u16_d16 v0, v0
      7 ; GFX9-NEXT: s_waitcnt
      8 ; GFX9-NEXT: s_setpc_b64
      9 
     10 ; VI: ds_read_u16
     11 define <2 x i16> @load_local_lo_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
          ; Loads one i16 through the addrspace(3) pointer %in and returns it as
          ; element 0 of a <2 x i16>; element 1 is left undef.
     12 entry:
     13   %load = load i16, i16 addrspace(3)* %in
     14   %build = insertelement <2 x i16> undef, i16 %load, i32 0
     15   ret <2 x i16> %build
     16 }
     17 
     18 ; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo:
     19 ; GCN: s_waitcnt
     20 ; GFX9-NEXT: ds_read_u16_d16 v0, v0
     21 ; GFX9-NEXT: s_waitcnt
     22 ; GFX9-NEXT: s_setpc_b64
     23 
     24 ; VI: ds_read_u16
     25 define <2 x i16> @load_local_lo_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
          ; Loads one i16 through the addrspace(3) pointer %in and returns a <2 x i16>.
          ; %reg is written to element 0 first and then overwritten at the same index
          ; by the loaded value, so element 1 of the result stays undef.
     26 entry:
     27   %load = load i16, i16 addrspace(3)* %in
     28   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
     29   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
     30   ret <2 x i16> %build1
     31 }
     32 
     33 ; Show that we get reasonable regalloc without physreg constraints.
     34 ; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg:
     35 ; GCN: s_waitcnt
     36 ; GFX9-NEXT: ds_read_u16_d16 v0, v0
     37 ; GFX9-NEXT: s_waitcnt
     38 ; GFX9-NEXT: global_store_dword v[0:1], v0, off{{$}}
     39 ; GFX9-NEXT: s_waitcnt
     40 ; GFX9-NEXT: s_setpc_b64
     41 
     42 ; VI: ds_read_u16
     43 define void @load_local_lo_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
          ; Same vector construction as the value-returning variant (both inserts
          ; target element 0, so the loaded i16 wins and element 1 stays undef), but
          ; the result is stored through an undef addrspace(1) pointer instead of
          ; being returned, so the result is not tied to a return register.
     44 entry:
     45   %load = load i16, i16 addrspace(3)* %in
     46   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
     47   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
     48   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
     49   ret void
     50 }
     51 
     52 ; GCN-LABEL: {{^}}load_local_lo_v2i16_zerolo:
     53 ; GCN: s_waitcnt
     54 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
     55 ; GFX9-NEXT: ds_read_u16_d16 v1, v0
     56 ; GFX9-NEXT: s_waitcnt
     57 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
     58 ; GFX9-NEXT: s_setpc_b64
     59 
     60 ; VI: ds_read_u16 v
     61 define <2 x i16> @load_local_lo_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
          ; Loads one i16 through the addrspace(3) pointer %in and inserts it at
          ; element 0 of an all-zero <2 x i16>; element 1 remains 0 in the result.
     62 entry:
     63   %load = load i16, i16 addrspace(3)* %in
     64   %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
     65   ret <2 x i16> %build
     66 }
     67 
     68 ; GCN-LABEL: {{^}}load_local_lo_v2f16_fpimm:
     69 ; GCN: s_waitcnt
     70 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0
     71 ; GFX9-NEXT: ds_read_u16_d16 v1, v0
     72 ; GFX9-NEXT: s_waitcnt
     73 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
     74 ; GFX9-NEXT: s_setpc_b64
     75 
     76 ; VI: ds_read_u16 v
     77 define <2 x half> @load_local_lo_v2f16_fpimm(half addrspace(3)* %in) #0 {
          ; Loads one half through the addrspace(3) pointer %in and inserts it at
          ; element 0 of the constant vector <0.0, 2.0>; element 1 keeps the
          ; constant 2.0 in the returned value.
     78 entry:
     79   %load = load half, half addrspace(3)* %in
     80   %build = insertelement <2 x half> <half 0.0, half 2.0>, half %load, i32 0
     81   ret <2 x half> %build
     82 }
     83 
     84 ; GCN-LABEL: {{^}}load_local_lo_v2f16_reghi_vreg:
     85 ; GCN: s_waitcnt
     86 ; GFX9-NEXT: ds_read_u16_d16 v1, v0
     87 ; GFX9-NEXT: s_waitcnt
     88 ; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}}
     89 ; GFX9-NEXT: s_waitcnt
     90 ; GFX9-NEXT: s_setpc_b64
     91 
     92 ; VI: ds_read_u16 v
     93 define void @load_local_lo_v2f16_reghi_vreg(half addrspace(3)* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x half>, replaces element 0 with a half
          ; loaded through the addrspace(3) pointer %in (element 1 still comes from
          ; %reg), and stores the vector through an undef addrspace(1) pointer.
     94 entry:
     95   %reg.bc = bitcast i32 %reg to <2 x half>
     96   %load = load half, half addrspace(3)* %in
     97   %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
     98   store <2 x half> %build1, <2 x half> addrspace(1)* undef
     99   ret void
    100 }
    101 
    102 ; GCN-LABEL: {{^}}load_local_lo_v2f16_reglo_vreg:
    103 
    104 ; GFX9: ds_read_u16 v
    105 ; GFX9: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
    106 ; GFX9: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}}
    107 ; GFX9: global_store_dword
    108 
    109 ; VI: ds_read_u16 v
    110 define void @load_local_lo_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
          ; Builds a <2 x half> from two scalars: the half argument %reg goes into
          ; element 1 and a half loaded through the addrspace(3) pointer %in goes
          ; into element 0. The vector is stored through an undef addrspace(1) pointer.
    111 entry:
    112   %load = load half, half addrspace(3)* %in
    113   %build0 = insertelement <2 x half> undef, half %reg, i32 1
    114   %build1 = insertelement <2 x half> %build0, half %load, i32 0
    115   store <2 x half> %build1, <2 x half> addrspace(1)* undef
    116   ret void
    117 }
    118 
    119 ; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_zexti8:
    120 ; GCN: s_waitcnt
    121 ; GFX9-NEXT: ds_read_u8_d16 v1, v0
    122 ; GFX9-NEXT: s_waitcnt
    123 ; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}}
    124 ; GFX9-NEXT: s_waitcnt
    125 ; GFX9-NEXT: s_setpc_b64
    126 
    127 ; VI: ds_read_u8 v
    128 define void @load_local_lo_v2i16_reghi_vreg_zexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x i16>, loads an i8 through the
          ; addrspace(3) pointer %in, zero-extends it to i16 and inserts it at
          ; element 0 (element 1 still comes from %reg). The vector is stored
          ; through an undef addrspace(1) pointer.
    129 entry:
    130   %reg.bc = bitcast i32 %reg to <2 x i16>
    131   %load = load i8, i8 addrspace(3)* %in
    132   %ext = zext i8 %load to i16
    133   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
    134   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    135   ret void
    136 }
    137 
    138 ; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg_zexti8:
    139 ; GCN: s_waitcnt
    140 ; GFX9: ds_read_u8 v
    141 ; GFX9: global_store_dword
    142 ; GFX9-NEXT: s_waitcnt
    143 ; GFX9-NEXT: s_setpc_b64
    144 
    145 ; VI: ds_read_u8 v
    146 define void @load_local_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
          ; Builds a <2 x i16> from two scalars: the i16 argument %reg goes into
          ; element 1 and a zero-extended i8 loaded through the addrspace(3) pointer
          ; %in goes into element 0. The vector is stored through an undef
          ; addrspace(1) pointer.
    147 entry:
    148   %load = load i8, i8 addrspace(3)* %in
    149   %ext = zext i8 %load to i16
    150   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
    151   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0
    152   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    153   ret void
    154 }
    155 
    156 ; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_sexti8:
    157 ; GCN: s_waitcnt
    158 ; GFX9-NEXT: ds_read_i8_d16 v1, v0
    159 ; GFX9-NEXT: s_waitcnt
    160 ; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}}
    161 ; GFX9-NEXT: s_waitcnt
    162 ; GFX9-NEXT: s_setpc_b64
    163 
    164 ; VI: ds_read_i8 v
    165 define void @load_local_lo_v2i16_reghi_vreg_sexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x i16>, loads an i8 through the
          ; addrspace(3) pointer %in, sign-extends it to i16 and inserts it at
          ; element 0 (element 1 still comes from %reg). The vector is stored
          ; through an undef addrspace(1) pointer.
    166 entry:
    167   %reg.bc = bitcast i32 %reg to <2 x i16>
    168   %load = load i8, i8 addrspace(3)* %in
    169   %ext = sext i8 %load to i16
    170   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
    171   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    172   ret void
    173 }
    174 
    175 ; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg_sexti8:
    176 ; GCN: s_waitcnt
    177 ; GFX9: ds_read_i8 v
    178 ; GFX9: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
    179 ; GFX9: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}}
    180 
    181 ; VI: ds_read_i8 v
    182 define void @load_local_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
          ; Builds a <2 x i16> from two scalars: the i16 argument %reg goes into
          ; element 1 and a sign-extended i8 loaded through the addrspace(3) pointer
          ; %in goes into element 0. The vector is stored through an undef
          ; addrspace(1) pointer.
    183 entry:
    184   %load = load i8, i8 addrspace(3)* %in
    185   %ext = sext i8 %load to i16
    186   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
    187   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0
    188   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    189   ret void
    190 }
    191 
    192 ; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg:
    193 ; GCN: s_waitcnt
    194 ; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
    195 ; GFX9-NEXT: s_waitcnt
    196 ; GFX9-NEXT: global_store_dword
    197 ; GFX9-NEXT: s_waitcnt
    198 ; GFX9-NEXT: s_setpc_b64
    199 define void @load_global_lo_v2i16_reglo_vreg(i16 addrspace(1)* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x i16> and replaces element 0 with an i16
          ; loaded from %in offset by -2047 i16 elements (the accompanying GFX9 check
          ; expects this to appear as an immediate offset of -4094). The vector is
          ; stored through an undef addrspace(1) pointer.
    200 entry:
    201   %reg.bc = bitcast i32 %reg to <2 x i16>
    202   %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
    203   %load = load i16, i16 addrspace(1)* %gep
    204   %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
    205   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    206   ret void
    207 }
    208 
    209 ; GCN-LABEL: {{^}}load_global_lo_v2f16_reglo_vreg:
    210 ; GCN: s_waitcnt
    211 ; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
    212 ; GFX9-NEXT: s_waitcnt
    213 ; GFX9-NEXT: global_store_dword
    214 ; GFX9-NEXT: s_waitcnt
    215 ; GFX9-NEXT: s_setpc_b64
    216 define void @load_global_lo_v2f16_reglo_vreg(half addrspace(1)* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x half> and replaces element 0 with a half
          ; loaded from %in offset by -2047 half elements (the accompanying GFX9 check
          ; expects an immediate offset of -4094). The vector is stored through an
          ; undef addrspace(1) pointer.
    217 entry:
    218   %reg.bc = bitcast i32 %reg to <2 x half>
    219   %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
    220   %load = load half, half addrspace(1)* %gep
    221   %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
    222   store <2 x half> %build1, <2 x half> addrspace(1)* undef
    223   ret void
    224 }
    225 
    226 ; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg_zexti8:
    227 ; GCN: s_waitcnt
    228 ; GFX9-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095
    229 ; GFX9-NEXT: s_waitcnt
    230 ; GFX9-NEXT: global_store_dword
    231 ; GFX9-NEXT: s_waitcnt
    232 ; GFX9-NEXT: s_setpc_b64
    233 define void @load_global_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x i16>, loads an i8 from %in offset by
          ; -4095 bytes, zero-extends it to i16 and inserts it at element 0. The
          ; vector is stored through an undef addrspace(1) pointer.
    234 entry:
    235   %reg.bc = bitcast i32 %reg to <2 x i16>
    236   %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
    237   %load = load i8, i8 addrspace(1)* %gep
    238   %ext = zext i8 %load to i16
    239   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
    240   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    241   ret void
    242 }
    243 
    244 ; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg_sexti8:
    245 ; GCN: s_waitcnt
    246 ; GFX9-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095
    247 ; GFX9-NEXT: s_waitcnt
    248 ; GFX9-NEXT: global_store_dword
    249 ; GFX9-NEXT: s_waitcnt
    250 ; GFX9-NEXT: s_setpc_b64
    251 define void @load_global_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x i16>, loads an i8 from %in offset by
          ; -4095 bytes, sign-extends it to i16 and inserts it at element 0. The
          ; vector is stored through an undef addrspace(1) pointer.
    252 entry:
    253   %reg.bc = bitcast i32 %reg to <2 x i16>
    254   %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
    255   %load = load i8, i8 addrspace(1)* %gep
    256   %ext = sext i8 %load to i16
    257   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
    258   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    259   ret void
    260 }
    261 
    262 ; GCN-LABEL: {{^}}load_flat_lo_v2i16_reghi_vreg:
    263 ; GCN: s_waitcnt
    264 ; GFX9-NEXT: flat_load_short_d16 v2, v[0:1]
    265 ; GFX9-NEXT: s_waitcnt
    266 ; GFX9-NEXT: global_store_dword v[0:1], v2
    267 ; GFX9-NEXT: s_waitcnt
    268 ; GFX9-NEXT: s_setpc_b64
    269 
    270 ; VI: flat_load_ushort v{{[0-9]+}}
    271 ; VI: v_or_b32_e32
    272 define void @load_flat_lo_v2i16_reghi_vreg(i16* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x i16> and replaces element 0 with an i16
          ; loaded through %in, a pointer with no addrspace qualifier. The vector is
          ; stored through an undef addrspace(1) pointer.
    273 entry:
    274   %reg.bc = bitcast i32 %reg to <2 x i16>
    275   %load = load i16, i16* %in
    276   %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
    277   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    278   ret void
    279 }
    280 
    281 ; GCN-LABEL: {{^}}load_flat_lo_v2f16_reghi_vreg:
    282 ; GCN: s_waitcnt
    283 ; GFX9-NEXT: flat_load_short_d16 v2, v[0:1]
    284 ; GFX9-NEXT: s_waitcnt
    285 ; GFX9-NEXT: global_store_dword v[0:1], v2
    286 ; GFX9-NEXT: s_waitcnt
    287 ; GFX9-NEXT: s_setpc_b64
    288 
    289 ; VI: flat_load_ushort v{{[0-9]+}}
    290 ; VI: v_or_b32_e32
    291 define void @load_flat_lo_v2f16_reghi_vreg(half* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x half> and replaces element 0 with a half
          ; loaded through %in, a pointer with no addrspace qualifier. The vector is
          ; stored through an undef addrspace(1) pointer.
    292 entry:
    293   %reg.bc = bitcast i32 %reg to <2 x half>
    294   %load = load half, half* %in
    295   %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
    296   store <2 x half> %build1, <2 x half> addrspace(1)* undef
    297   ret void
    298 }
    299 
    300 ; GCN-LABEL: {{^}}load_flat_lo_v2i16_reglo_vreg_zexti8:
    301 ; GCN: s_waitcnt
    302 ; GFX9-NEXT: flat_load_ubyte_d16 v2, v[0:1]
    303 ; GFX9-NEXT: s_waitcnt
    304 ; GFX9-NEXT: global_store_dword v[0:1], v2
    305 ; GFX9-NEXT: s_waitcnt
    306 ; GFX9-NEXT: s_setpc_b64
    307 
    308 ; VI: flat_load_ubyte v{{[0-9]+}}
    309 ; VI: v_or_b32_e32
    310 define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x i16>, loads an i8 through %in (a pointer
          ; with no addrspace qualifier), zero-extends it to i16 and inserts it at
          ; element 0. The vector is stored through an undef addrspace(1) pointer.
    311 entry:
    312   %reg.bc = bitcast i32 %reg to <2 x i16>
    313   %load = load i8, i8* %in
    314   %ext = zext i8 %load to i16
    315   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
    316   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    317   ret void
    318 }
    319 
    320 ; GCN-LABEL: {{^}}load_flat_lo_v2i16_reglo_vreg_sexti8:
    321 ; GCN: s_waitcnt
    322 ; GFX9-NEXT: flat_load_sbyte_d16 v2, v[0:1]
    323 ; GFX9-NEXT: s_waitcnt
    324 ; GFX9-NEXT: global_store_dword v[0:1], v2
    325 ; GFX9-NEXT: s_waitcnt
    326 ; GFX9-NEXT: s_setpc_b64
    327 
    328 ; VI: flat_load_sbyte v{{[0-9]+}}
    329 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
    330 
    331 define void @load_flat_lo_v2i16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x i16>, loads an i8 through %in (a pointer
          ; with no addrspace qualifier), sign-extends it to i16 and inserts it at
          ; element 0. The vector is stored through an undef addrspace(1) pointer.
    332 entry:
    333   %reg.bc = bitcast i32 %reg to <2 x i16>
    334   %load = load i8, i8* %in
    335   %ext = sext i8 %load to i16
    336   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
    337   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    338   ret void
    339 }
    340 
    341 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg:
    342 ; GCN: s_waitcnt
    343 ; GFX9: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094{{$}}
    344 ; GFX9-NEXT: s_waitcnt
    345 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
    346 ; GFX9-NEXT: s_waitcnt
    347 ; GFX9-NEXT: s_setpc_b64
    348 
    349 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
    350 define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x i16> and replaces element 0 with an i16
          ; loaded from the byval addrspace(5) argument %in offset by 2045 i16
          ; elements. The vector is stored through an undef addrspace(1) pointer.
          ; NOTE(review): the checks expect offset:4094, which presumably is the
          ; 4090-byte GEP offset plus the byval slot's own offset - confirm.
    351 entry:
    352   %reg.bc = bitcast i32 %reg to <2 x i16>
    353   %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045
    354   %load = load i16, i16 addrspace(5)* %gep
    355   %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
    356   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    357   ret void
    358 }
    359 
    360 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg:
    361 ; GCN: s_waitcnt
    362 ; GFX9: buffer_load_ushort v1, off, s[0:3], s5 offset:4094{{$}}
    363 ; GFX9-NEXT: s_waitcnt
    364 ; GFX9: v_and_b32
    365 ; GFX9: v_lshl_or_b32
    366 
    367 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
    368 ; GFX9-NEXT: s_waitcnt
    369 ; GFX9-NEXT: s_setpc_b64
    370 
    371 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
    372 define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
          ; Builds a <2 x i16> from two scalars: the i16 argument %reg goes into
          ; element 1 and an i16 loaded from the byval addrspace(5) argument %in
          ; (offset by 2045 i16 elements) goes into element 0. The vector is stored
          ; through an undef addrspace(1) pointer.
    373 entry:
    374   %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045
    375   %load = load i16, i16 addrspace(5)* %gep
    376   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
    377   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
    378   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    379   ret void
    380 }
    381 
    382 ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg:
    383 ; GCN: s_waitcnt
    384 ; GFX9: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094{{$}}
    385 ; GFX9-NEXT: s_waitcnt
    386 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
    387 ; GFX9-NEXT: s_waitcnt
    388 ; GFX9-NEXT: s_setpc_b64
    389 
    390 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
    391 define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x half> and replaces element 0 with a half
          ; loaded from the byval addrspace(5) argument %in offset by 2045 half
          ; elements. The vector is stored through an undef addrspace(1) pointer.
    392 entry:
    393   %reg.bc = bitcast i32 %reg to <2 x half>
    394   %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2045
    395   %load = load half, half addrspace(5)* %gep
    396   %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
    397   store <2 x half> %build1, <2 x half> addrspace(1)* undef
    398   ret void
    399 }
    400 
    401 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff:
    402 ; GCN: s_waitcnt
    403 ; GFX9-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}}
    404 ; GFX9-NEXT: s_waitcnt
    405 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
    406 ; GFX9-NEXT: s_waitcnt
    407 ; GFX9-NEXT: s_setpc_b64
    408 
    409 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
    410 define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x i16> and replaces element 0 with an i16
          ; read by a volatile load from the constant addrspace(5) address 4094.
          ; The %in argument is not used by the body. The vector is stored through
          ; an undef addrspace(1) pointer.
    411 entry:
    412   %reg.bc = bitcast i32 %reg to <2 x i16>
    413   %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
    414   %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
    415   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    416   ret void
    417 }
    418 
    419 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg_nooff:
    420 ; GCN: s_waitcnt
    421 ; GFX9-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}}
    422 ; GFX9-NEXT: s_waitcnt
    423 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
    424 ; GFX9-NEXT: s_waitcnt
    425 ; GFX9-NEXT: s_setpc_b64
    426 
    427 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
    428 define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x i16> and replaces element 0 with an i16
          ; read by a volatile load from the constant addrspace(5) address 4094.
          ; The %in argument is not used by the body. The vector is stored through
          ; an undef addrspace(1) pointer.
          ; NOTE(review): apart from the function name, this IR matches
          ; @load_private_lo_v2i16_reglo_vreg_nooff instruction for instruction;
          ; confirm whether a distinct pattern was intended here.
    429 entry:
    430   %reg.bc = bitcast i32 %reg to <2 x i16>
    431   %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
    432   %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
    433   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    434   ret void
    435 }
    436 
    437 ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_nooff:
    438 ; GCN: s_waitcnt
    439 ; GFX9-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}}
    440 ; GFX9-NEXT: s_waitcnt
    441 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
    442 ; GFX9-NEXT: s_waitcnt
    443 ; GFX9-NEXT: s_setpc_b64
    444 
    445 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
    446 define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x half> and replaces element 0 with a half
          ; read by a volatile load from the constant addrspace(5) address 4094.
          ; The %in argument is not used by the body. The vector is stored through
          ; an undef addrspace(1) pointer.
    447 entry:
    448   %reg.bc = bitcast i32 %reg to <2 x half>
    449   %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
    450   %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
    451   store <2 x half> %build1, <2 x half> addrspace(1)* undef
    452   ret void
    453 }
    454 
    455 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8:
    456 ; GCN: s_waitcnt
    457 ; GFX9: buffer_load_ubyte_d16 v0, off, s[0:3], s5 offset:4095{{$}}
    458 ; GFX9-NEXT: s_waitcnt
    459 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
    460 ; GFX9-NEXT: s_waitcnt
    461 ; GFX9-NEXT: s_setpc_b64
    462 
    463 ; VI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}}
    464 define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x i16>, loads an i8 from the byval
          ; addrspace(5) argument %in offset by 4091 bytes, zero-extends it to i16
          ; and inserts it at element 0. The vector is stored through an undef
          ; addrspace(1) pointer.
    465 entry:
    466   %reg.bc = bitcast i32 %reg to <2 x i16>
    467   %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
    468   %load = load i8, i8 addrspace(5)* %gep
    469   %ext = zext i8 %load to i16
    470   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
    471   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    472   ret void
    473 }
    474 
    475 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8:
    476 ; GCN: s_waitcnt
    477 ; GFX9: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095{{$}}
    478 ; GFX9-NEXT: s_waitcnt
    479 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
    480 ; GFX9-NEXT: s_waitcnt
    481 ; GFX9-NEXT: s_setpc_b64
    482 
    483 ; VI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}}
    484 define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x i16>, loads an i8 from the byval
          ; addrspace(5) argument %in offset by 4091 bytes, sign-extends it to i16
          ; and inserts it at element 0. The vector is stored through an undef
          ; addrspace(1) pointer.
    485 entry:
    486   %reg.bc = bitcast i32 %reg to <2 x i16>
    487   %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
    488   %load = load i8, i8 addrspace(5)* %gep
    489   %ext = sext i8 %load to i16
    490   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
    491   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    492   ret void
    493 }
    494 
    495 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
    496 ; GCN: s_waitcnt
    497 ; GFX9-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s4 offset:4094{{$}}
    498 ; GFX9-NEXT: s_waitcnt
    499 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
    500 ; GFX9-NEXT: s_waitcnt
    501 ; GFX9-NEXT: s_setpc_b64
    502 
    503 ; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
    504 define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x i16>, does a volatile i8 load from the
          ; constant addrspace(5) address 4094, zero-extends it to i16 and inserts it
          ; at element 0. The %in argument is not used by the body. The vector is
          ; stored through an undef addrspace(1) pointer.
    505 entry:
    506   %reg.bc = bitcast i32 %reg to <2 x i16>
    507   %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
    508   %ext = zext i8 %load to i16
    509   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
    510   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    511   ret void
    512 }
    513 
    514 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
    515 ; GCN: s_waitcnt
    516 ; GFX9-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], s4 offset:4094{{$}}
    517 ; GFX9-NEXT: s_waitcnt
    518 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
    519 ; GFX9-NEXT: s_waitcnt
    520 ; GFX9-NEXT: s_setpc_b64
    521 
    522 ; VI: buffer_load_sbyte v0, off, s[0:3], s4 offset:4094{{$}}
    523 define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x i16>, does a volatile i8 load from the
          ; constant addrspace(5) address 4094, sign-extends it to i16 and inserts it
          ; at element 0. The %in argument is not used by the body. The vector is
          ; stored through an undef addrspace(1) pointer.
    524 entry:
    525   %reg.bc = bitcast i32 %reg to <2 x i16>
    526   %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
    527   %ext = sext i8 %load to i16
    528   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
    529   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    530   ret void
    531 }
    532 
    533 ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
    534 ; GCN: s_waitcnt
    535 ; GFX9-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s4 offset:4094{{$}}
    536 ; GFX9-NEXT: s_waitcnt
    537 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
    538 ; GFX9-NEXT: s_waitcnt
    539 ; GFX9-NEXT: s_setpc_b64
    540 
    541 ; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
    542 define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x half>, does a volatile i8 load from the
          ; constant addrspace(5) address 4094, zero-extends it to i16, bitcasts those
          ; 16 bits to half and inserts the result at element 0. The %in argument is
          ; not used by the body. The vector is stored through an undef addrspace(1)
          ; pointer.
    543 entry:
    544   %reg.bc = bitcast i32 %reg to <2 x half>
    545   %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
    546   %ext = zext i8 %load to i16
    547   %bc.ext = bitcast i16 %ext to half
    548   %build1 = insertelement <2 x half> %reg.bc, half %bc.ext, i32 0
    549   store <2 x half> %build1, <2 x half> addrspace(1)* undef
    550   ret void
    551 }
    552 
    553 ; GCN-LABEL: {{^}}load_constant_lo_v2i16_reglo_vreg:
    554 ; GCN: s_waitcnt
    555 ; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
    556 ; GFX9-NEXT: s_waitcnt
    557 ; GFX9-NEXT: global_store_dword
    558 ; GFX9-NEXT: s_waitcnt
    559 ; GFX9-NEXT: s_setpc_b64
    560 
    561 ; VI: flat_load_ushort
    562 define void @load_constant_lo_v2i16_reglo_vreg(i16 addrspace(4)* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x i16> and replaces element 0 with an i16
          ; loaded from the addrspace(4) pointer %in offset by -2047 i16 elements.
          ; The vector is stored through an undef addrspace(1) pointer.
    563 entry:
    564   %reg.bc = bitcast i32 %reg to <2 x i16>
    565   %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
    566   %load = load i16, i16 addrspace(4)* %gep
    567   %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
    568   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    569   ret void
    570 }
    571 
    572 ; GCN-LABEL: {{^}}load_constant_lo_v2f16_reglo_vreg:
    573 ; GCN: s_waitcnt
    574 ; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
    575 ; GFX9-NEXT: s_waitcnt
    576 ; GFX9-NEXT: global_store_dword
    577 ; GFX9-NEXT: s_waitcnt
    578 ; GFX9-NEXT: s_setpc_b64
    579 
    580 ; VI: flat_load_ushort
    581 define void @load_constant_lo_v2f16_reglo_vreg(half addrspace(4)* %in, i32 %reg) #0 {
          ; Reinterprets the i32 %reg as <2 x half> and replaces element 0 with a half
          ; loaded from the addrspace(4) pointer %in offset by -2047 half elements.
          ; The vector is stored through an undef addrspace(1) pointer.
          ; The label check above is anchored with {{^}} and a trailing ':' like
          ; every other label in this file, so it matches only the function's own
          ; label line rather than any line that merely contains the name.
    582 entry:
    583   %reg.bc = bitcast i32 %reg to <2 x half>
    584   %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
    585   %load = load half, half addrspace(4)* %gep
    586   %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
    587   store <2 x half> %build1, <2 x half> addrspace(1)* undef
    588   ret void
    589 }
    590 
    591 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_to_offset:
    592 ; GFX9: buffer_store_dword
    593 ; GFX9-NEXT: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094
    594 
    595 ; VI: buffer_load_ushort v
    596 define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
          ; Allocates two addrspace(5) objects ([10 x i32] and [4096 x i16]), does a
          ; volatile store of 123 to the start of the first, then a volatile i16 load
          ; from element 2025 of the second. The loaded value replaces element 0 of
          ; %reg reinterpreted as <2 x i16>, and the vector is stored through an
          ; undef addrspace(1) pointer.
          ; NOTE(review): the index presumably is chosen so the load lands at the
          ; offset:4094 the checks expect once the first object is accounted for
          ; - confirm against the frame layout.
    597 entry:
    598   %obj0 = alloca [10 x i32], align 4, addrspace(5)
    599   %obj1 = alloca [4096 x i16], align 2, addrspace(5)
    600   %reg.bc = bitcast i32 %reg to <2 x i16>
    601   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
    602   store volatile i32 123, i32 addrspace(5)* %bc
    603   %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025
    604   %load = load volatile i16, i16 addrspace(5)* %gep
    605   %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
    606   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    607   ret void
    608 }
    609 
    610 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
    611 ; GFX9: buffer_store_dword
    612 ; GFX9-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095
    613 
    614 ; VI: buffer_load_sbyte v
    615 define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
          ; Allocates two addrspace(5) objects ([10 x i32] and [4096 x i8]), does a
          ; volatile store of 123 to the start of the first, then a volatile i8 load
          ; from element 4051 of the second. The byte is sign-extended to i16 and
          ; replaces element 0 of %reg reinterpreted as <2 x i16>; the vector is
          ; stored through an undef addrspace(1) pointer.
    616 entry:
    617   %obj0 = alloca [10 x i32], align 4, addrspace(5)
    618   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
    619   %reg.bc = bitcast i32 %reg to <2 x i16>
    620   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
    621   store volatile i32 123, i32 addrspace(5)* %bc
    622   %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
    623   %load = load volatile i8, i8 addrspace(5)* %gep
    624   %load.ext = sext i8 %load to i16
    625   %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
    626   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    627   ret void
    628 }
    629 
    630 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
    631 ; GFX9: buffer_store_dword
    632 ; GFX9-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s5 offset:4095
    633 
    634 ; VI: buffer_load_ubyte v
    635 define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
          ; Allocates two addrspace(5) objects ([10 x i32] and [4096 x i8]), does a
          ; volatile store of 123 to the start of the first, then a volatile i8 load
          ; from element 4051 of the second. The byte is zero-extended to i16 and
          ; replaces element 0 of %reg reinterpreted as <2 x i16>; the vector is
          ; stored through an undef addrspace(1) pointer.
    636 entry:
    637   %obj0 = alloca [10 x i32], align 4, addrspace(5)
    638   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
    639   %reg.bc = bitcast i32 %reg to <2 x i16>
    640   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
    641   store volatile i32 123, i32 addrspace(5)* %bc
    642   %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
    643   %load = load volatile i8, i8 addrspace(5)* %gep
    644   %load.ext = zext i8 %load to i16
    645   %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
    646   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    647   ret void
    648 }
    649 
    650 attributes #0 = { nounwind }
    651