Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
      2 ; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s
      3 ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s
      4 
      5 ; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo:
      6 ; GCN: s_waitcnt
      7 ; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
      8 ; GFX900-NEXT: s_waitcnt
      9 ; GFX900-NEXT: s_setpc_b64
     10 
     11 ; NO-D16-HI: ds_read_u16 v
        ; Loads an i16 through the addrspace(3) pointer and returns it as element 1
        ; of a <2 x i16> whose element 0 is undef. The gfx900 checks above expect a
        ; single ds_read_u16_d16_hi that writes v0 directly; the other two targets
        ; are only checked for a plain ds_read_u16.
     12 define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
     13 entry:
     14   %load = load i16, i16 addrspace(3)* %in
     15   %build = insertelement <2 x i16> undef, i16 %load, i32 1
     16   ret <2 x i16> %build
     17 }
     18 
     19 ; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo:
     20 ; GCN: s_waitcnt
     21 ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
     22 ; GFX900-NEXT: s_waitcnt
     23 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
     24 ; GFX900-NEXT: s_setpc_b64
     25 
     26 ; NO-D16-HI: ds_read_u16 v
        ; Returns <2 x i16> { %reg, i16 loaded via the addrspace(3) pointer }.
        ; The gfx900 checks above expect the d16_hi load to target v1 and the
        ; result to be copied into v0 before returning.
     27 define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
     28 entry:
     29   %load = load i16, i16 addrspace(3)* %in
     30   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
     31   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
     32   ret <2 x i16> %build1
     33 }
     34 
     35 ; Show that we get reasonable regalloc without physreg constraints.
     36 ; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg:
     37 ; GCN: s_waitcnt
     38 ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
     39 ; GFX900-NEXT: s_waitcnt
     40 ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
     41 ; GFX900-NEXT: s_waitcnt
     42 ; GFX900-NEXT: s_setpc_b64
     43 
     44 ; NO-D16-HI: ds_read_u16 v
        ; Builds <2 x i16> { %reg, i16 loaded via the addrspace(3) pointer } and
        ; stores it to an undef addrspace(1) pointer instead of returning it, so
        ; the result is not tied to a return register. The gfx900 checks above
        ; expect the d16_hi load to write v1 with no extra copy before the store.
     45 define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
     46 entry:
     47   %load = load i16, i16 addrspace(3)* %in
     48   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
     49   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
     50   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
     51   ret void
     52 }
     53 
     54 ; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo:
     55 ; GCN: s_waitcnt
     56 ; GFX900-NEXT: v_mov_b32_e32 v1, 0
     57 ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
     58 ; GFX900-NEXT: s_waitcnt
     59 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
     60 ; GFX900-NEXT: s_setpc_b64
     61 
     62 ; NO-D16-HI: ds_read_u16 v
        ; Inserts the loaded i16 into element 1 of a zeroinitializer <2 x i16>, so
        ; element 0 is a known zero. The gfx900 checks above expect v1 to be set
        ; to 0 before the d16_hi load writes into it.
     63 define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
     64 entry:
     65   %load = load i16, i16 addrspace(3)* %in
     66   %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
     67   ret <2 x i16> %build
     68 }
     69 
     70 ; FIXME: Remove m0 initialization
     71 ; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift:
     72 ; GCN: s_waitcnt
     73 ; GFX900-NEXT: ds_read_u16 v0, v0
     74 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
     75 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
     76 ; GFX900-NEXT: s_setpc_b64
     77 
     78 ; NO-D16-HI: ds_read_u16 v
     79 ; NO-D16-HI: v_lshlrev_b32_e32 v0, 16, v0
        ; Scalar form of "loaded i16 in the high half, zero in the low half": the
        ; i16 is zero-extended to i32 and shifted left by 16. The checks above
        ; expect a plain ds_read_u16 followed by a shift on all three targets,
        ; i.e. no d16_hi load here.
     80 define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 {
     81 entry:
     82   %load = load i16, i16 addrspace(3)* %in
     83   %zext = zext i16 %load to i32
     84   %shift = shl i32 %zext, 16
     85   ret i32 %shift
     86 }
     87 
     88 ; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg:
     89 ; GCN: s_waitcnt
     90 ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
     91 ; GFX900-NEXT: s_waitcnt
     92 ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
     93 ; GFX900-NEXT: s_waitcnt
     94 ; GFX900-NEXT: s_setpc_b64
     95 
     96 ; NO-D16-HI: ds_read_u16 v
        ; Half-precision variant: builds <2 x half> { %reg, half loaded via the
        ; addrspace(3) pointer } and stores it to an undef addrspace(1) pointer.
        ; The gfx900 checks above expect ds_read_u16_d16_hi for the half load.
     97 define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
     98 entry:
     99   %load = load half, half addrspace(3)* %in
    100   %build0 = insertelement <2 x half> undef, half %reg, i32 0
    101   %build1 = insertelement <2 x half> %build0, half %load, i32 1
    102   store <2 x half> %build1, <2 x half> addrspace(1)* undef
    103   ret void
    104 }
    105 
    106 ; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_zexti8:
    107 ; GCN: s_waitcnt
    108 ; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
    109 ; GFX900-NEXT: s_waitcnt
    110 ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
    111 ; GFX900-NEXT: s_waitcnt
    112 ; GFX900-NEXT: s_setpc_b64
    113 
    114 ; NO-D16-HI: ds_read_u8 v
        ; Loads an i8 via the addrspace(3) pointer, zero-extends it to i16 and
        ; places it in element 1 next to %reg in element 0. The gfx900 checks
        ; above expect the extension to be folded into ds_read_u8_d16_hi.
    115 define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
    116 entry:
    117   %load = load i8, i8 addrspace(3)* %in
    118   %ext = zext i8 %load to i16
    119   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
    120   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
    121   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    122   ret void
    123 }
    124 
    125 ; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_sexti8:
    126 ; GCN: s_waitcnt
    127 ; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
    128 ; GFX900-NEXT: s_waitcnt
    129 ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
    130 ; GFX900-NEXT: s_waitcnt
    131 ; GFX900-NEXT: s_setpc_b64
    132 
    133 ; NO-D16-HI: ds_read_i8 v
        ; Loads an i8 via the addrspace(3) pointer, sign-extends it to i16 and
        ; places it in element 1 next to %reg in element 0. The gfx900 checks
        ; above expect the extension to be folded into ds_read_i8_d16_hi.
    134 define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
    135 entry:
    136   %load = load i8, i8 addrspace(3)* %in
    137   %ext = sext i8 %load to i16
    138   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
    139   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
    140   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    141   ret void
    142 }
    143 
    144 ; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg:
    145 ; GCN: s_waitcnt
    146 ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
    147 ; GFX900-NEXT: s_waitcnt
    148 ; GFX900-NEXT: global_store_dword
    149 ; GFX900-NEXT: s_waitcnt
    150 ; GFX900-NEXT: s_setpc_b64
        ; Loads an i16 from the addrspace(1) pointer at element index -2047
        ; (-4094 bytes) into element 1 of <2 x i16> { %reg, ... } and stores the
        ; vector. The gfx900 checks above expect the GEP to be folded into the
        ; instruction's immediate as offset:-4094. Only gfx900 has checks past
        ; the first s_waitcnt for this function.
    151 define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 {
    152 entry:
    153   %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
    154   %load = load i16, i16 addrspace(1)* %gep
    155   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
    156   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
    157   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    158   ret void
    159 }
    160 
    161 ; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg:
    162 ; GCN: s_waitcnt
    163 ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
    164 ; GFX900-NEXT: s_waitcnt
    165 ; GFX900-NEXT: global_store_dword
    166 ; GFX900-NEXT: s_waitcnt
    167 ; GFX900-NEXT: s_setpc_b64
        ; Half-precision variant: loads a half from the addrspace(1) pointer at
        ; element index -2047 (-4094 bytes) into element 1 of
        ; <2 x half> { %reg, ... } and stores the vector. The gfx900 checks above
        ; expect global_load_short_d16_hi with offset:-4094.
    168 define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 {
    169 entry:
    170   %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
    171   %load = load half, half addrspace(1)* %gep
    172   %build0 = insertelement <2 x half> undef, half %reg, i32 0
    173   %build1 = insertelement <2 x half> %build0, half %load, i32 1
    174   store <2 x half> %build1, <2 x half> addrspace(1)* undef
    175   ret void
    176 }
    177 
    178 ; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_zexti8:
    179 ; GCN: s_waitcnt
    180 ; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
    181 ; GFX900-NEXT: s_waitcnt
    182 ; GFX900-NEXT: global_store_dword
    183 ; GFX900-NEXT: s_waitcnt
    184 ; GFX900-NEXT: s_setpc_b64
        ; Loads an i8 from the addrspace(1) pointer at byte offset -4095,
        ; zero-extends it to i16 and places it in element 1 next to %reg. The
        ; gfx900 checks above expect global_load_ubyte_d16_hi with offset:-4095.
    185 define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
    186 entry:
    187   %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
    188   %load = load i8, i8 addrspace(1)* %gep
    189   %ext = zext i8 %load to i16
    190   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
    191   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
    192   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    193   ret void
    194 }
    195 
    196 ; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_sexti8:
    197 ; GCN: s_waitcnt
    198 ; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
    199 ; GFX900-NEXT: s_waitcnt
    200 ; GFX900-NEXT: global_store_dword
    201 ; GFX900-NEXT: s_waitcnt
    202 ; GFX900-NEXT: s_setpc_b64
        ; Loads an i8 from the addrspace(1) pointer at byte offset -4095,
        ; sign-extends it to i16 and places it in element 1 next to %reg. The
        ; gfx900 checks above expect global_load_sbyte_d16_hi with offset:-4095.
    203 define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
    204 entry:
    205   %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
    206   %load = load i8, i8 addrspace(1)* %gep
    207   %ext = sext i8 %load to i16
    208   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
    209   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
    210   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    211   ret void
    212 }
    213 
    214 ; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg:
    215 ; GCN: s_waitcnt
    216 ; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
    217 ; GFX900-NEXT: s_waitcnt
    218 ; GFX900-NEXT: global_store_dword v[0:1], v2
    219 ; GFX900-NEXT: s_waitcnt
    220 ; GFX900-NEXT: s_setpc_b64
    221 
    222 ; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
    223 ; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
    224 ; GFX803: v_or_b32_sdwa
    225 ; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
        ; Loads an i16 through a generic (no addrspace) pointer into element 1 of
        ; <2 x i16> { %reg, ... } and stores the vector to an undef addrspace(1)
        ; pointer. The gfx900 checks above expect flat_load_short_d16_hi; the
        ; other targets are checked for a plain flat_load_ushort followed by a
        ; shift-and-or sequence to pack the halves.
        ; The label directive is anchored to the start of the line, like the
        ; other function labels in this file, so it can only match the label.
    226 define void @load_flat_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 {
    227 entry:
    228   %load = load i16, i16* %in
    229   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
    230   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
    231   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    232   ret void
    233 }
    234 
    235 ; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg:
    236 ; GCN: s_waitcnt
    237 ; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
    238 ; GFX900-NEXT: s_waitcnt
    239 ; GFX900-NEXT: global_store_dword v[0:1], v2
    240 ; GFX900-NEXT: s_waitcnt
    241 ; GFX900-NEXT: s_setpc_b64
    242 
    243 ; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
    244 ; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
    245 ; GFX803: v_or_b32_sdwa
    246 ; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
        ; Half-precision variant: loads a half through a generic (no addrspace)
        ; pointer into element 1 of <2 x half> { %reg, ... } and stores the
        ; vector. The gfx900 checks above expect flat_load_short_d16_hi; the
        ; other targets are checked for flat_load_ushort plus shift-and-or.
    247 define void @load_flat_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 {
    248 entry:
    249   %load = load half, half* %in
    250   %build0 = insertelement <2 x half> undef, half %reg, i32 0
    251   %build1 = insertelement <2 x half> %build0, half %load, i32 1
    252   store <2 x half> %build1, <2 x half> addrspace(1)* undef
    253   ret void
    254 }
    255 
    256 ; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_zexti8:
    257 ; GCN: s_waitcnt
    258 ; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
    259 ; GFX900-NEXT: s_waitcnt
    260 ; GFX900-NEXT: global_store_dword v[0:1], v2
    261 ; GFX900-NEXT: s_waitcnt
    262 ; GFX900-NEXT: s_setpc_b64
    263 
    264 ; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
    265 ; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
    266 ; GFX803: v_or_b32_sdwa
    267 ; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
        ; Loads an i8 through a generic (no addrspace) pointer, zero-extends it to
        ; i16 and places it in element 1 next to %reg. The gfx900 checks above
        ; expect flat_load_ubyte_d16_hi; the other targets are checked for
        ; flat_load_ubyte plus shift-and-or.
    268 define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 {
    269 entry:
    270   %load = load i8, i8* %in
    271   %ext = zext i8 %load to i16
    272   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
    273   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
    274   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    275   ret void
    276 }
    277 
    278 ; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_sexti8:
    279 ; GCN: s_waitcnt
    280 ; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
    281 ; GFX900-NEXT: s_waitcnt
    282 ; GFX900-NEXT: global_store_dword v[0:1], v2
    283 ; GFX900-NEXT: s_waitcnt
    284 ; GFX900-NEXT: s_setpc_b64
    285 
    286 ; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
    287 ; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
    288 ; GFX803: v_or_b32_sdwa
    289 ; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
        ; Loads an i8 through a generic (no addrspace) pointer, sign-extends it to
        ; i16 and places it in element 1 next to %reg. The gfx900 checks above
        ; expect flat_load_sbyte_d16_hi; the other targets are checked for
        ; flat_load_sbyte plus shift-and-or.
    290 define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 {
    291 entry:
    292   %load = load i8, i8* %in
    293   %ext = sext i8 %load to i16
    294   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
    295   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
    296   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    297   ret void
    298 }
    299 
    300 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg:
    301 ; GCN: s_waitcnt
    302 ; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}}
    303 ; GFX900-NEXT: s_waitcnt
    304 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
    305 ; GFX900-NEXT: s_waitcnt
    306 ; GFX900-NEXT: s_setpc_b64
    307 
    308 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
        ; Loads an i16 from a byval addrspace(5) argument at element index 2045
        ; (4090 bytes past %in) into element 1 of <2 x i16> { %reg, ... } and
        ; stores the vector. All targets are checked for an "off" addressing
        ; form with immediate offset:4094.
        ; NOTE(review): 4094 = 4090 + 4, so the byval object presumably starts 4
        ; bytes into the frame - confirm.
    309 define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
    310 entry:
    311   %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045
    312   %load = load i16, i16 addrspace(5)* %gep
    313   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
    314   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
    315   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    316   ret void
    317 }
    318 
    319 ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg:
    320 ; GCN: s_waitcnt
    321 ; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}}
    322 ; GFX900-NEXT: s_waitcnt
    323 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
    324 ; GFX900-NEXT: s_waitcnt
    325 ; GFX900-NEXT: s_setpc_b64
    326 
    327 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
        ; Half-precision variant: loads a half from a byval addrspace(5) argument
        ; at element index 2045 (4090 bytes past %in) into element 1 of
        ; <2 x half> { %reg, ... } and stores the vector. All targets are checked
        ; for an "off" addressing form with immediate offset:4094.
        ; NOTE(review): 4094 = 4090 + 4, so the byval object presumably starts 4
        ; bytes into the frame - confirm.
    328 define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval %in, half %reg) #0 {
    329 entry:
    330   %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2045
    331   %load = load half, half addrspace(5)* %gep
    332   %build0 = insertelement <2 x half> undef, half %reg, i32 0
    333   %build1 = insertelement <2 x half> %build0, half %load, i32 1
    334   store <2 x half> %build1, <2 x half> addrspace(1)* undef
    335   ret void
    336 }
    337 
    338 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
    339 ; GCN: s_waitcnt
    340 ; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s4 offset:4094{{$}}
    341 ; GFX900: s_waitcnt
    342 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
    343 ; GFX900-NEXT: s_waitcnt
    344 ; GFX900-NEXT: s_setpc_b64
    345 
    346 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
        ; Volatile i16 load from the constant addrspace(5) address 4094 (built
        ; with inttoptr) into element 1 of <2 x i16> { %reg, ... }. The byval
        ; %in argument is declared but never read. The checks above expect s4 as
        ; the scalar offset register here, where the pointer-based private
        ; tests in this file expect s5. The gfx900 load and the s_waitcnt after
        ; it are matched with plain (non -NEXT) directives, so other
        ; instructions may precede them.
    347 define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval %in, i16 %reg) #0 {
    348 entry:
    349   %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
    350   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
    351   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
    352   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    353   ret void
    354 }
    355 
    356 ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
    357 ; GCN: s_waitcnt
    358 ; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
    359 ; GFX900-NEXT: s_waitcnt
    360 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
    361 ; GFX900-NEXT: s_waitcnt
    362 ; GFX900-NEXT: s_setpc_b64
    363 
    364 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
        ; Volatile half load from the constant addrspace(5) address 4094 (built
        ; with inttoptr) into element 1 of <2 x half> { %reg, ... }. The %in
        ; argument is never read. The checks above expect the constant address
        ; to appear as offset:4094 with s4 as the scalar offset register.
    365 define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
    366 entry:
    367   %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
    368   %build0 = insertelement <2 x half> undef, half %reg, i32 0
    369   %build1 = insertelement <2 x half> %build0, half %load, i32 1
    370   store <2 x half> %build1, <2 x half> addrspace(1)* undef
    371   ret void
    372 }
    373 
    374 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8:
    375 ; GCN: s_waitcnt
    376 ; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}}
    377 ; GFX900-NEXT: s_waitcnt
    378 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
    379 ; GFX900-NEXT: s_waitcnt
    380 ; GFX900-NEXT: s_setpc_b64
    381 
    382 ; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}}
        ; Loads an i8 from a byval addrspace(5) argument at byte offset 4091,
        ; zero-extends it to i16 and places it in element 1 next to %reg. All
        ; targets are checked for immediate offset:4095.
        ; NOTE(review): 4095 = 4091 + 4, so the byval object presumably starts 4
        ; bytes into the frame - confirm.
    383 define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
    384 entry:
    385   %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
    386   %load = load i8, i8 addrspace(5)* %gep
    387   %ext = zext i8 %load to i16
    388   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
    389   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
    390   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    391   ret void
    392 }
    393 
    394 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8:
    395 ; GCN: s_waitcnt
    396 ; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}}
    397 ; GFX900-NEXT: s_waitcnt
    398 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
    399 ; GFX900-NEXT: s_waitcnt
    400 ; GFX900-NEXT: s_setpc_b64
    401 
    402 ; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}}
        ; Loads an i8 from a byval addrspace(5) argument at byte offset 4091,
        ; sign-extends it to i16 and places it in element 1 next to %reg. All
        ; targets are checked for immediate offset:4095.
        ; NOTE(review): 4095 = 4091 + 4, so the byval object presumably starts 4
        ; bytes into the frame - confirm.
    403 define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
    404 entry:
    405   %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
    406   %load = load i8, i8 addrspace(5)* %gep
    407   %ext = sext i8 %load to i16
    408   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
    409   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
    410   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    411   ret void
    412 }
    413 
    414 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
    415 ; GCN: s_waitcnt
    416 ; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
    417 ; GFX900-NEXT: s_waitcnt
    418 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
    419 ; GFX900-NEXT: s_waitcnt
    420 ; GFX900-NEXT: s_setpc_b64
    421 
    422 ; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
        ; Volatile i8 load from the constant addrspace(5) address 4094 (built
        ; with inttoptr), zero-extended to i16 and placed in element 1 next to
        ; %reg. The %in argument is never read. The gfx900 checks above expect
        ; buffer_load_ubyte_d16_hi with offset:4094.
    423 define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
    424 entry:
    425   %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
    426   %ext = zext i8 %load to i16
    427   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
    428   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
    429   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    430   ret void
    431 }
    432 
    433 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
    434 ; GCN: s_waitcnt
    435 ; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
    436 ; GFX900-NEXT: s_waitcnt
    437 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
    438 ; GFX900-NEXT: s_waitcnt
    439 ; GFX900-NEXT: s_setpc_b64
    440 
    441 ; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s4 offset:4094{{$}}
        ; Volatile i8 load from the constant addrspace(5) address 4094 (built
        ; with inttoptr), sign-extended to i16 and placed in element 1 next to
        ; %reg. The %in argument is never read. The gfx900 checks above expect
        ; buffer_load_sbyte_d16_hi with offset:4094.
    442 define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
    443 entry:
    444   %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
    445   %ext = sext i8 %load to i16
    446   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
    447   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
    448   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    449   ret void
    450 }
    451 
    452 ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
    453 ; GCN: s_waitcnt
    454 ; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
    455 ; GFX900-NEXT: s_waitcnt
    456 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
    457 ; GFX900-NEXT: s_waitcnt
    458 ; GFX900-NEXT: s_setpc_b64
    459 
    460 ; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
        ; Volatile i8 load from the constant addrspace(5) address 4094,
        ; zero-extended to i16 and then bitcast to half before being inserted as
        ; element 1 of <2 x half> { %reg, ... }. The gfx900 checks above expect
        ; buffer_load_ubyte_d16_hi even with the bitcast in between.
    461 define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
    462 entry:
    463   %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
    464   %ext = zext i8 %load to i16
    465   %bc.ext = bitcast i16 %ext to half
    466   %build0 = insertelement <2 x half> undef, half %reg, i32 0
    467   %build1 = insertelement <2 x half> %build0, half %bc.ext, i32 1
    468   store <2 x half> %build1, <2 x half> addrspace(1)* undef
    469   ret void
    470 }
    471 
    472 ; GCN-LABEL: {{^}}load_constant_hi_v2i16_reglo_vreg:
    473 ; GCN: s_waitcnt
    474 ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
    475 ; GFX900-NEXT: s_waitcnt
    476 ; GFX900-NEXT: global_store_dword
    477 ; GFX900-NEXT: s_waitcnt
    478 ; GFX900-NEXT: s_setpc_b64
    479 
    480 ; GFX803: flat_load_ushort
    481 ; GFX906: global_load_ushort
        ; Loads an i16 from the addrspace(4) pointer at element index -2047
        ; (-4094 bytes) into element 1 of <2 x i16> { %reg, ... } and stores the
        ; vector. The gfx900 checks above expect global_load_short_d16_hi with
        ; offset:-4094, the same form the addrspace(1) test in this file expects.
    482 define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 {
    483 entry:
    484   %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
    485   %load = load i16, i16 addrspace(4)* %gep
    486   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
    487   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
    488   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    489   ret void
    490 }
    491 
    492 ; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg:
    493 ; GCN: s_waitcnt
    494 ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
    495 ; GFX900-NEXT: s_waitcnt
    496 ; GFX900-NEXT: global_store_dword
    497 ; GFX900-NEXT: s_waitcnt
    498 ; GFX900-NEXT: s_setpc_b64
    499 
    500 ; GFX803: flat_load_ushort
    501 ; GFX906: global_load_ushort
        ; Half-precision variant: loads a half from the addrspace(4) pointer at
        ; element index -2047 (-4094 bytes) into element 1 of
        ; <2 x half> { %reg, ... } and stores the vector. The gfx900 checks above
        ; expect global_load_short_d16_hi with offset:-4094.
        ; The label directive is anchored at line start and ends with the colon,
        ; so it matches only the function label and not other lines that merely
        ; mention the symbol name.
    502 define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 {
    503 entry:
    504   %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
    505   %load = load half, half addrspace(4)* %gep
    506   %build0 = insertelement <2 x half> undef, half %reg, i32 0
    507   %build1 = insertelement <2 x half> %build0, half %load, i32 1
    508   store <2 x half> %build1, <2 x half> addrspace(1)* undef
    509   ret void
    510 }
    511 
    512 ; Local object gives known offset, so requires converting from offen
    513 ; to offset variant.
    514 
    515 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
    516 ; GFX900: buffer_store_dword
    517 ; GFX900-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4094
        ; Allocates two addrspace(5) objects, writes a volatile i32 123 to the
        ; first ([10 x i32]) and loads i16 element 2025 (byte 4050) of the second
        ; ([4096 x i16]) into element 1 of <2 x i16> { %reg, ... }. Only gfx900
        ; is checked: the d16_hi load must directly follow the store and use the
        ; "off" form with offset:4094.
        ; NOTE(review): 4094 = 4050 + 44, presumably the 40 bytes of %obj0 plus a
        ; 4-byte frame base - confirm.
    518 define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 {
    519 entry:
    520   %obj0 = alloca [10 x i32], align 4, addrspace(5)
    521   %obj1 = alloca [4096 x i16], align 2, addrspace(5)
    522   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
    523   store volatile i32 123, i32 addrspace(5)* %bc
    524   %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025
    525   %load = load i16, i16 addrspace(5)* %gep
    526   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
    527   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
    528   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    529   ret void
    530 }
    531 
    532 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
    533 ; GFX900: buffer_store_dword
    534 ; GFX900-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095
        ; Allocates two addrspace(5) objects, writes a volatile i32 123 to the
        ; first ([10 x i32]) and loads i8 element 4051 of the second
        ; ([4096 x i8]), sign-extending it to i16 for element 1 of
        ; <2 x i16> { %reg, ... }. Only gfx900 is checked: the sbyte d16_hi load
        ; must directly follow the store and use offset:4095.
        ; NOTE(review): 4095 = 4051 + 44, presumably the 40 bytes of %obj0 plus a
        ; 4-byte frame base - confirm.
    535 define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 {
    536 entry:
    537   %obj0 = alloca [10 x i32], align 4, addrspace(5)
    538   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
    539   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
    540   store volatile i32 123, i32 addrspace(5)* %bc
    541   %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
    542   %load = load i8, i8 addrspace(5)* %gep
    543   %ext = sext i8 %load to i16
    544   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
    545   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
    546   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    547   ret void
    548 }
    549 
    550 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
    551 ; GFX900: buffer_store_dword
    552 ; GFX900-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095
        ; Allocates two addrspace(5) objects, writes a volatile i32 123 to the
        ; first ([10 x i32]) and loads i8 element 4051 of the second
        ; ([4096 x i8]), zero-extending it to i16 for element 1 of
        ; <2 x i16> { %reg, ... }. Only gfx900 is checked: the ubyte d16_hi load
        ; must directly follow the store and use offset:4095.
        ; NOTE(review): 4095 = 4051 + 44, presumably the 40 bytes of %obj0 plus a
        ; 4-byte frame base - confirm.
    553 define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 {
    554 entry:
    555   %obj0 = alloca [10 x i32], align 4, addrspace(5)
    556   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
    557   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
    558   store volatile i32 123, i32 addrspace(5)* %bc
    559   %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
    560   %load = load i8, i8 addrspace(5)* %gep
    561   %ext = zext i8 %load to i16
    562   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
    563   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
    564   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
    565   ret void
    566 }
    567 
    568 ; FIXME: Remove m0 init and waitcnt between reads
    569 ; FIXME: Is there a cost to using the extload over not?
    570 ; GCN-LABEL: {{^}}load_local_v2i16_split:
    571 ; GCN: s_waitcnt
    572 ; GFX900-NEXT: ds_read_u16 v1, v0
    573 ; GFX900-NEXT: s_waitcnt
    574 ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
    575 ; GFX900-NEXT: s_waitcnt
    576 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
    577 ; GFX900-NEXT: s_setpc_b64
        ; Builds a <2 x i16> from two volatile i16 loads of adjacent
        ; addrspace(3) elements (%in and %in + 1 element). The loads are
        ; volatile, so they stay as two separate 16-bit accesses. The gfx900
        ; checks above expect a ds_read_u16 for the low half, then a d16_hi read
        ; at offset:2 into the same register, with an s_waitcnt in between.
    578 define <2 x i16> @load_local_v2i16_split(i16 addrspace(3)* %in) #0 {
    579 entry:
    580   %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
    581   %load0 = load volatile i16, i16 addrspace(3)* %in
    582   %load1 = load volatile i16, i16 addrspace(3)* %gep
    583   %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
    584   %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
    585   ret <2 x i16> %build1
    586 }
    587 
    588 ; FIXME: Remove waitcnt between reads
    589 ; GCN-LABEL: {{^}}load_global_v2i16_split:
    590 ; GCN: s_waitcnt
    591 ; GFX900-NEXT: global_load_ushort v2
    592 ; GFX900-NEXT: s_waitcnt
    593 ; GFX900-NEXT: global_load_short_d16_hi v2
    594 ; GFX900-NEXT: s_waitcnt
    595 ; GFX900-NEXT: v_mov_b32_e32 v0, v2
    596 ; GFX900-NEXT: s_setpc_b64
        ; Builds a <2 x i16> from two volatile i16 loads of adjacent
        ; addrspace(1) elements (%in and %in + 1 element). The gfx900 checks
        ; above expect a global_load_ushort for the low half, then a
        ; global_load_short_d16_hi into the same register (v2), with an
        ; s_waitcnt in between.
    597 define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 {
    598 entry:
    599   %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
    600   %load0 = load volatile i16, i16 addrspace(1)* %in
    601   %load1 = load volatile i16, i16 addrspace(1)* %gep
    602   %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
    603   %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
    604   ret <2 x i16> %build1
    605 }
    606 
    607 ; FIXME: Remove waitcnt between reads
    608 ; GCN-LABEL: {{^}}load_flat_v2i16_split:
    609 ; GCN: s_waitcnt
    610 ; GFX900-NEXT: flat_load_ushort v2
    611 ; GFX900-NEXT: s_waitcnt
    612 ; GFX900-NEXT: flat_load_short_d16_hi v2
    613 ; GFX900-NEXT: s_waitcnt
    614 ; GFX900-NEXT: v_mov_b32_e32 v0, v2
    615 ; GFX900-NEXT: s_setpc_b64
        ; Builds a <2 x i16> from two volatile i16 loads of adjacent elements
        ; through a generic (no addrspace) pointer. The gfx900 checks above
        ; expect a flat_load_ushort for the low half, then a
        ; flat_load_short_d16_hi into the same register (v2), with an s_waitcnt
        ; in between.
    616 define <2 x i16> @load_flat_v2i16_split(i16* %in) #0 {
    617 entry:
    618   %gep = getelementptr inbounds i16, i16* %in, i64 1
    619   %load0 = load volatile i16, i16* %in
    620   %load1 = load volatile i16, i16* %gep
    621   %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
    622   %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
    623   ret <2 x i16> %build1
    624 }
    625 
    626 ; FIXME: Remove waitcnt between reads
    627 ; GCN-LABEL: {{^}}load_constant_v2i16_split:
    628 ; GCN: s_waitcnt
    629 ; GFX900-NEXT: global_load_ushort v2
    630 ; GFX900-NEXT: s_waitcnt
    631 ; GFX900-NEXT: global_load_short_d16_hi v2
    632 ; GFX900-NEXT: s_waitcnt
    633 ; GFX900-NEXT: v_mov_b32_e32 v0, v2
    634 ; GFX900-NEXT: s_setpc_b64
        ; Builds a <2 x i16> from two volatile i16 loads of adjacent
        ; addrspace(4) elements (%in and %in + 1 element). The gfx900 checks
        ; above expect the same global_load_ushort / global_load_short_d16_hi
        ; pair that the addrspace(1) split test in this file expects.
    635 define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 {
    636 entry:
    637   %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1
    638   %load0 = load volatile i16, i16 addrspace(4)* %in
    639   %load1 = load volatile i16, i16 addrspace(4)* %gep
    640   %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
    641   %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
    642   ret <2 x i16> %build1
    643 }
    644 
    645 ; FIXME: Remove m0 init and waitcnt between reads
    646 ; FIXME: Is there a cost to using the extload over not?
    647 ; GCN-LABEL: {{^}}load_private_v2i16_split:
    648 ; GCN: s_waitcnt
    649 ; GFX900: buffer_load_ushort v0, off, s[0:3], s5 offset:4{{$}}
    650 ; GFX900-NEXT: s_waitcnt
    651 ; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:6
    652 ; GFX900-NEXT: s_waitcnt
    653 ; GFX900-NEXT: s_setpc_b64
        ; Builds a <2 x i16> from two volatile i16 loads of adjacent elements of
        ; a byval addrspace(5) argument. The gfx900 checks above expect a
        ; buffer_load_ushort at offset:4, then a buffer_load_short_d16_hi at
        ; offset:6 into the same register (v0), so no final copy is needed.
        ; NOTE(review): the "m0 init" part of the FIXME above looks copied from
        ; the addrspace(3) split test; no m0 use is checked here - confirm.
    654 define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 {
    655 entry:
    656   %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i32 1
    657   %load0 = load volatile i16, i16 addrspace(5)* %in
    658   %load1 = load volatile i16, i16 addrspace(5)* %gep
    659   %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
    660   %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
    661   ret <2 x i16> %build1
    662 }
    663 
    664 attributes #0 = { nounwind } ; attribute group #0, applied to every define visible in this file
    665