Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900 %s
      2 ; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX906,NO-D16-HI %s
      3 ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s
      4 
      5 ; GCN-LABEL: {{^}}store_global_hi_v2i16:
      6 ; GCN: s_waitcnt
      7 
      8 ; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off
      9 
     10 ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
     11 ; GFX803-NEXT: flat_store_short v[0:1], v2
     12 ; GFX906-NEXT: global_store_short v[0:1], v2, off
     13 
     14 ; GCN-NEXT: s_waitcnt
     15 ; GCN-NEXT: s_setpc_b64
     16 define void @store_global_hi_v2i16(i16 addrspace(1)* %out, i32 %arg) #0 {
     17 entry: ; write element 1 of %arg, viewed as <2 x i16>, through %out
     18   ; FIXME: ABI for pre-gfx9
     19   %pair = bitcast i32 %arg to <2 x i16>
     20   %upper = extractelement <2 x i16> %pair, i32 1
     21   store i16 %upper, i16 addrspace(1)* %out ; i16 store into addrspace(1)
     22   ret void
     23 }
     24 
     25 ; GCN-LABEL: {{^}}store_global_hi_v2f16:
     26 ; GCN: s_waitcnt
     27 
     28 ; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off
     29 
     30 ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
     31 ; GFX803-NEXT: flat_store_short v[0:1], v2
     32 ; GFX906-NEXT: global_store_short v[0:1], v2, off
     33 
     34 ; GCN-NEXT: s_waitcnt
     35 ; GCN-NEXT: s_setpc_b64
     36 define void @store_global_hi_v2f16(half addrspace(1)* %out, i32 %arg) #0 {
     37 entry: ; write element 1 of %arg, viewed as <2 x half>, through %out
     38   ; FIXME: ABI for pre-gfx9
     39   %pair = bitcast i32 %arg to <2 x half>
     40   %upper = extractelement <2 x half> %pair, i32 1
     41   store half %upper, half addrspace(1)* %out ; half store into addrspace(1)
     42   ret void
     43 }
     44 
     45 ; GCN-LABEL: {{^}}store_global_hi_i32_shift:
     46 ; GCN: s_waitcnt
     47 
     48 ; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off
     49 
     50 ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
     51 ; GFX803-NEXT: flat_store_short v[0:1], v2
     52 ; GFX906-NEXT: global_store_short v[0:1], v2, off
     53 
     54 ; GCN-NEXT: s_waitcnt
     55 ; GCN-NEXT: s_setpc_b64
     56 define void @store_global_hi_i32_shift(i16 addrspace(1)* %out, i32 %value) #0 {
     57 entry: ; same store as the vector form, but the upper half is taken with lshr + trunc
     58   %shifted = lshr i32 %value, 16 ; move bits 31:16 down to 15:0
     59   %upper = trunc i32 %shifted to i16
     60   store i16 %upper, i16 addrspace(1)* %out
     61   ret void
     62 }
     63 
     64 ; GCN-LABEL: {{^}}store_global_hi_v2i16_i8:
     65 ; GCN: s_waitcnt
     66 
     67 ; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off
     68 
     69 ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
     70 ; GFX803-NEXT: flat_store_byte v[0:1], v2
     71 ; GFX906-NEXT: global_store_byte v[0:1], v2, off
     72 
     73 ; GCN-NEXT: s_waitcnt
     74 ; GCN-NEXT: s_setpc_b64
     75 define void @store_global_hi_v2i16_i8(i8 addrspace(1)* %out, i32 %arg) #0 {
     76 entry: ; write the low byte of element 1 of %arg (as <2 x i16>) through %out
     77   %pair = bitcast i32 %arg to <2 x i16>
     78   %upper = extractelement <2 x i16> %pair, i32 1
     79   %byte = trunc i16 %upper to i8 ; keep only the low 8 bits of that element
     80   store i8 %byte, i8 addrspace(1)* %out
     81   ret void
     82 }
     83 
     84 ; GCN-LABEL: {{^}}store_global_hi_i8_shift:
     85 ; GCN: s_waitcnt
     86 
     87 ; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off
     88 
     89 ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
     90 ; GFX803-NEXT: flat_store_byte v[0:1], v2
     91 ; GFX906-NEXT: global_store_byte v[0:1], v2, off
     92 
     93 ; GCN-NEXT: s_waitcnt
     94 ; GCN-NEXT: s_setpc_b64
     95 define void @store_global_hi_i8_shift(i8 addrspace(1)* %out, i32 %value) #0 {
     96 entry: ; byte store of bits 23:16 of %value, produced with lshr + trunc
     97   %shifted = lshr i32 %value, 16 ; move bits 31:16 down to 15:0
     98   %byte = trunc i32 %shifted to i8
     99   store i8 %byte, i8 addrspace(1)* %out
    100   ret void
    101 }
    102 
    103 ; GCN-LABEL: {{^}}store_global_hi_v2i16_max_offset:
    104 ; GCN: s_waitcnt
    105 ; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:4094
    106 
    107 ; GFX803-DAG: v_add_u32_e32
    108 ; GFX803-DAG: v_addc_u32_e32
    109 ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
    110 ; GFX803: flat_store_short v[0:1], v2{{$}}
    111 
    112 ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
    113 ; GFX906-NEXT: global_store_short v[0:1], v2, off
    114 
    115 ; GCN-NEXT: s_waitcnt
    116 ; GCN-NEXT: s_setpc_b64
    117 define void @store_global_hi_v2i16_max_offset(i16 addrspace(1)* %out, i32 %arg) #0 {
    118 entry: ; element-1 store at a positive constant displacement from %out
    119   ; FIXME: ABI for pre-gfx9
    120   %pair = bitcast i32 %arg to <2 x i16>
    121   %upper = extractelement <2 x i16> %pair, i32 1
    122   %slot = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 2047 ; 2047 i16 elements = 4094 bytes past %out
    123   store i16 %upper, i16 addrspace(1)* %slot
    124   ret void
    125 }
    126 
    127 ; GCN-LABEL: {{^}}store_global_hi_v2i16_min_offset:
    128 ; GCN: s_waitcnt
    129 ; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:-4096{{$}}
    130 
    131 ; GFX803-DAG: v_add_u32_e32
    132 ; GFX803-DAG: v_addc_u32_e32
    133 ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
    134 ; GFX803: flat_store_short v[0:1], v{{[0-9]$}}
    135 
    136 ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
    137 ; GFX906-NEXT: global_store_short v[0:1], v2, off
    138 
    139 ; GCN-NEXT: s_waitcnt
    140 ; GCN-NEXT: s_setpc_b64
    141 define void @store_global_hi_v2i16_min_offset(i16 addrspace(1)* %out, i32 %arg) #0 {
    142 entry: ; element-1 store at a negative constant displacement from %out
    143   %pair = bitcast i32 %arg to <2 x i16>
    144   %upper = extractelement <2 x i16> %pair, i32 1
    145   %slot = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 -2048 ; -2048 i16 elements = 4096 bytes before %out
    146   store i16 %upper, i16 addrspace(1)* %slot
    147   ret void
    148 }
    149 
    150 ; GCN-LABEL: {{^}}store_global_hi_v2i16_i8_max_offset:
    151 ; GCN: s_waitcnt
    152 ; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:4095
    153 
    154 ; GFX803-DAG: v_add_u32_e32
    155 ; GFX803-DAG: v_addc_u32_e32
    156 ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
    157 ; GFX803: flat_store_byte v[0:1], v{{[0-9]$}}
    158 
    159 ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
    160 ; GFX906-NEXT: global_store_byte v[0:1], v2, off
    161 
    162 ; GCN-NEXT: s_waitcnt
    163 ; GCN-NEXT: s_setpc_b64
    164 define void @store_global_hi_v2i16_i8_max_offset(i8 addrspace(1)* %out, i32 %arg) #0 {
    165 entry: ; byte store of element 1's low 8 bits at a positive displacement from %out
    166   %pair = bitcast i32 %arg to <2 x i16>
    167   %upper = extractelement <2 x i16> %pair, i32 1
    168   %byte = trunc i16 %upper to i8
    169   %slot = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 4095 ; 4095 bytes past %out
    170   store i8 %byte, i8 addrspace(1)* %slot
    171   ret void
    172 }
    173 
    174 ; GCN-LABEL: {{^}}store_global_hi_v2i16_i8_min_offset:
    175 ; GCN: s_waitcnt
    176 ; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:-4095
    177 
    178 ; GFX803-DAG: v_add_u32_e32
    179 ; GFX803-DAG: v_addc_u32_e32
    180 ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
    181 ; GFX803: flat_store_byte v[0:1], v{{[0-9]$}}
    182 
    183 ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
    184 ; GFX906-NEXT: global_store_byte v[0:1], v2, off
    185 
    186 ; GCN-NEXT: s_waitcnt
    187 ; GCN-NEXT: s_setpc_b64
    188 define void @store_global_hi_v2i16_i8_min_offset(i8 addrspace(1)* %out, i32 %arg) #0 {
    189 entry: ; byte store of element 1's low 8 bits at a negative displacement from %out
    190   %pair = bitcast i32 %arg to <2 x i16>
    191   %upper = extractelement <2 x i16> %pair, i32 1
    192   %byte = trunc i16 %upper to i8
    193   %slot = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 -4095 ; 4095 bytes before %out
    194   store i8 %byte, i8 addrspace(1)* %slot
    195   ret void
    196 }
    197 
    198 ; GCN-LABEL: {{^}}store_flat_hi_v2i16:
    199 ; GCN: s_waitcnt
    200 
    201 ; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
    202 
    203 ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
    204 ; NO-D16-HI-NEXT: flat_store_short v[0:1], v2
    205 
    206 ; GCN-NEXT: s_waitcnt
    207 ; GCN-NEXT: s_setpc_b64
    208 define void @store_flat_hi_v2i16(i16* %out, i32 %arg) #0 {
    209 entry: ; write element 1 of %arg (as <2 x i16>) through a default-address-space pointer
    210   %pair = bitcast i32 %arg to <2 x i16>
    211   %upper = extractelement <2 x i16> %pair, i32 1
    212   store i16 %upper, i16* %out
    213   ret void
    214 }
    215 
    216 ; GCN-LABEL: {{^}}store_flat_hi_v2f16:
    217 ; GCN: s_waitcnt
    218 
    219 ; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
    220 
    221 ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
    222 ; NO-D16-HI-NEXT: flat_store_short v[0:1], v2
    223 
    224 ; GCN-NEXT: s_waitcnt
    225 ; GCN-NEXT: s_setpc_b64
    226 define void @store_flat_hi_v2f16(half* %out, i32 %arg) #0 {
    227 entry: ; write element 1 of %arg (as <2 x half>) through a default-address-space pointer
    228   %pair = bitcast i32 %arg to <2 x half>
    229   %upper = extractelement <2 x half> %pair, i32 1
    230   store half %upper, half* %out
    231   ret void
    232 }
    233 
    234 ; GCN-LABEL: {{^}}store_flat_hi_i32_shift:
    235 ; GCN: s_waitcnt
    236 
    237 ; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
    238 
    239 ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
    240 ; NO-D16-HI-NEXT: flat_store_short v[0:1], v2
    241 
    242 ; GCN-NEXT: s_waitcnt
    243 ; GCN-NEXT: s_setpc_b64
    244 define void @store_flat_hi_i32_shift(i16* %out, i32 %value) #0 {
    245 entry: ; i16 store of bits 31:16 of %value, produced with lshr + trunc
    246   %shifted = lshr i32 %value, 16 ; move bits 31:16 down to 15:0
    247   %upper = trunc i32 %shifted to i16
    248   store i16 %upper, i16* %out
    249   ret void
    250 }
    251 
    252 ; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8:
    253 ; GCN: s_waitcnt
    254 
    255 ; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}
    256 
    257 ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
    258 ; NO-D16-HI-NEXT: flat_store_byte v[0:1], v2
    259 
    260 ; GCN-NEXT: s_waitcnt
    261 ; GCN-NEXT: s_setpc_b64
    262 define void @store_flat_hi_v2i16_i8(i8* %out, i32 %arg) #0 {
    263 entry: ; write the low byte of element 1 of %arg (as <2 x i16>) through %out
    264   %pair = bitcast i32 %arg to <2 x i16>
    265   %upper = extractelement <2 x i16> %pair, i32 1
    266   %byte = trunc i16 %upper to i8 ; keep only the low 8 bits of that element
    267   store i8 %byte, i8* %out
    268   ret void
    269 }
    270 
    271 ; GCN-LABEL: {{^}}store_flat_hi_i8_shift:
    272 ; GCN: s_waitcnt
    273 
    274 ; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}
    275 
    276 ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
    277 ; NO-D16-HI-NEXT: flat_store_byte v[0:1], v2
    278 
    279 ; GCN-NEXT: s_waitcnt
    280 ; GCN-NEXT: s_setpc_b64
    281 define void @store_flat_hi_i8_shift(i8* %out, i32 %value) #0 {
    282 entry: ; byte store of bits 23:16 of %value, produced with lshr + trunc
    283   %shifted = lshr i32 %value, 16 ; move bits 31:16 down to 15:0
    284   %byte = trunc i32 %shifted to i8
    285   store i8 %byte, i8* %out
    286   ret void
    287 }
    288 
    289 ; GCN-LABEL: {{^}}store_flat_hi_v2i16_max_offset:
    290 ; GCN: s_waitcnt
    291 ; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2 offset:4094{{$}}
    292 
    293 ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
    294 ; GFX906-NEXT: flat_store_short v[0:1], v2 offset:4094
    295 
    296 ; GFX803-DAG: v_add_u32_e32
    297 ; GFX803-DAG: v_addc_u32_e32
    298 ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
    299 ; GFX803: flat_store_short v[0:1], v2{{$}}
    300 
    301 ; GCN-NEXT: s_waitcnt
    302 ; GCN-NEXT: s_setpc_b64
    303 define void @store_flat_hi_v2i16_max_offset(i16* %out, i32 %arg) #0 {
    304 entry: ; element-1 store at a positive constant displacement from %out
    305   %pair = bitcast i32 %arg to <2 x i16>
    306   %upper = extractelement <2 x i16> %pair, i32 1
    307   %slot = getelementptr inbounds i16, i16* %out, i64 2047 ; 2047 i16 elements = 4094 bytes past %out
    308   store i16 %upper, i16* %slot
    309   ret void
    310 }
    311 
    312 ; GCN-LABEL: {{^}}store_flat_hi_v2i16_neg_offset:
    313 ; GCN: s_waitcnt
    314 ; GCN: v_add{{(_co)?}}_{{i|u}}32_e32
    315 
    316 ; GFX803: v_addc_u32_e32
    317 ; GFX900: v_addc_co_u32_e32
    318 
    319 ; GFX906-NEXT: v_lshrrev_b32_e32
    320 ; GFX906-NEXT: v_addc_co_u32_e32
    321 ; GFX906: flat_store_short v[0:1], v2
    322 
    323 ; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
    324 ; GFX803: flat_store_short v[0:1], v2{{$}}
    325 ; GCN-NEXT: s_waitcnt
    326 ; GCN-NEXT: s_setpc_b64
    327 define void @store_flat_hi_v2i16_neg_offset(i16* %out, i32 %arg) #0 {
    328 entry: ; element-1 store at a negative constant displacement from %out
    329   %pair = bitcast i32 %arg to <2 x i16>
    330   %upper = extractelement <2 x i16> %pair, i32 1
    331   %slot = getelementptr inbounds i16, i16* %out, i64 -1023 ; -1023 i16 elements = 2046 bytes before %out
    332   store i16 %upper, i16* %slot
    333   ret void
    334 }
    335 
    336 ; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8_max_offset:
    337 ; GCN: s_waitcnt
    338 ; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:4095{{$}}
    339 
    340 ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
    341 ; GFX803-DAG: v_add_u32_e32
    342 ; GFX803-DAG: v_addc_u32_e32
    343 ; GFX803: flat_store_byte v[0:1], v2{{$}}
    344 
    345 ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
    346 ; GFX906-NEXT: flat_store_byte v[0:1], v2 offset:4095{{$}}
    347 
    348 ; GCN-NEXT: s_waitcnt
    349 ; GCN-NEXT: s_setpc_b64
    350 define void @store_flat_hi_v2i16_i8_max_offset(i8* %out, i32 %arg) #0 {
    351 entry: ; byte store of element 1's low 8 bits at a positive displacement from %out
    352   %pair = bitcast i32 %arg to <2 x i16>
    353   %upper = extractelement <2 x i16> %pair, i32 1
    354   %byte = trunc i16 %upper to i8
    355   %slot = getelementptr inbounds i8, i8* %out, i64 4095 ; 4095 bytes past %out
    356   store i8 %byte, i8* %slot
    357   ret void
    358 }
    359 
    360 ; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8_neg_offset:
    361 ; GCN: s_waitcnt
    362 ; GCN-DAG: v_add{{(_co)?}}_{{i|u}}32_e32
    363 
    364 ; GFX803-DAG: v_addc_u32_e32
    365 ; GFX900-DAG: v_addc_co_u32_e32
    366 ; GFX906-DAG: v_add_co_u32_e32
    367 
    368 ; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}
    369 
    370 ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
    371 ; GFX906-NEXT: v_addc_co_u32_e32
    372 ; GFX906-NEXT: flat_store_byte v[0:1], v2{{$}}
    373 
    374 ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
    375 ; GFX803: flat_store_byte v[0:1], v2{{$}}
    376 
    377 ; GCN-NEXT: s_waitcnt
    378 ; GCN-NEXT: s_setpc_b64
    379 define void @store_flat_hi_v2i16_i8_neg_offset(i8* %out, i32 %arg) #0 {
    380 entry: ; byte store of element 1's low 8 bits at a negative displacement from %out
    381   %pair = bitcast i32 %arg to <2 x i16>
    382   %upper = extractelement <2 x i16> %pair, i32 1
    383   %byte = trunc i16 %upper to i8
    384   %slot = getelementptr inbounds i8, i8* %out, i64 -4095 ; 4095 bytes before %out
    385   store i8 %byte, i8* %slot
    386   ret void
    387 }
    388 
    389 ; GCN-LABEL: {{^}}store_private_hi_v2i16:
    390 ; GCN: s_waitcnt
    391 
    392 ; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s4 offen{{$}}
    393 
    394 ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
    395 ; NO-D16-HI: buffer_store_short v1, v0, s[0:3], s4 offen{{$}}
    396 
    397 ; GCN-NEXT: s_waitcnt
    398 ; GCN-NEXT: s_setpc_b64
    399 define void @store_private_hi_v2i16(i16 addrspace(5)* %out, i32 %arg) #0 {
    400 entry: ; write element 1 of %arg, viewed as <2 x i16>, through %out
    401   ; FIXME: ABI for pre-gfx9
    402   %pair = bitcast i32 %arg to <2 x i16>
    403   %upper = extractelement <2 x i16> %pair, i32 1
    404   store i16 %upper, i16 addrspace(5)* %out ; i16 store into addrspace(5)
    405   ret void
    406 }
    407 
    408 ; GCN-LABEL: {{^}}store_private_hi_v2f16:
    409 ; GCN: s_waitcnt
    410 
    411 ; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s4 offen{{$}}
    412 
    413 ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
    414 ; NO-D16-HI: buffer_store_short v1, v0, s[0:3], s4 offen{{$}}
    415 
    416 ; GCN-NEXT: s_waitcnt
    417 ; GCN-NEXT: s_setpc_b64
    418 define void @store_private_hi_v2f16(half addrspace(5)* %out, i32 %arg) #0 {
    419 entry: ; write element 1 of %arg, viewed as <2 x half>, through %out
    420   ; FIXME: ABI for pre-gfx9
    421   %pair = bitcast i32 %arg to <2 x half>
    422   %upper = extractelement <2 x half> %pair, i32 1
    423   store half %upper, half addrspace(5)* %out ; half store into addrspace(5)
    424   ret void
    425 }
    426 
    427 ; GCN-LABEL: {{^}}store_private_hi_i32_shift:
    428 ; GCN: s_waitcnt
    429 
    430 ; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s4 offen{{$}}
    431 
    432 ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
    433 ; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], s4 offen{{$}}
    434 
    435 ; GCN-NEXT: s_waitcnt
    436 ; GCN-NEXT: s_setpc_b64
    437 define void @store_private_hi_i32_shift(i16 addrspace(5)* %out, i32 %value) #0 {
    438 entry: ; i16 store of bits 31:16 of %value, produced with lshr + trunc
    439   %shifted = lshr i32 %value, 16 ; move bits 31:16 down to 15:0
    440   %upper = trunc i32 %shifted to i16
    441   store i16 %upper, i16 addrspace(5)* %out
    442   ret void
    443 }
    444 
    445 ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8:
    446 ; GCN: s_waitcnt
    447 
    448 ; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s4 offen{{$}}
    449 
    450 ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
    451 ; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], s4 offen{{$}}
    452 
    453 ; GCN-NEXT: s_waitcnt
    454 ; GCN-NEXT: s_setpc_b64
    455 define void @store_private_hi_v2i16_i8(i8 addrspace(5)* %out, i32 %arg) #0 {
    456 entry: ; write the low byte of element 1 of %arg (as <2 x i16>) through %out
    457   %pair = bitcast i32 %arg to <2 x i16>
    458   %upper = extractelement <2 x i16> %pair, i32 1
    459   %byte = trunc i16 %upper to i8 ; keep only the low 8 bits of that element
    460   store i8 %byte, i8 addrspace(5)* %out
    461   ret void
    462 }
    463 
    464 ; GCN-LABEL: {{^}}store_private_hi_i8_shift:
    465 ; GCN: s_waitcnt
    466 
    467 ; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s4 offen{{$}}
    468 
    469 ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
    470 ; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], s4 offen{{$}}
    471 
    472 ; GCN-NEXT: s_waitcnt
    473 ; GCN-NEXT: s_setpc_b64
    474 define void @store_private_hi_i8_shift(i8 addrspace(5)* %out, i32 %value) #0 {
    475 entry: ; byte store of bits 23:16 of %value, produced with lshr + trunc
    476   %shifted = lshr i32 %value, 16 ; move bits 31:16 down to 15:0
    477   %byte = trunc i32 %shifted to i8
    478   store i8 %byte, i8 addrspace(5)* %out
    479   ret void
    480 }
    481 
    482 ; GCN-LABEL: {{^}}store_private_hi_v2i16_max_offset:
    483 ; GCN: s_waitcnt
    484 ; GFX900: buffer_store_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}}
    485 
    486 ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0
    487 ; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s5 offset:4094{{$}}
    488 
    489 ; GCN-NEXT: s_waitcnt
    490 ; GCN-NEXT: s_setpc_b64
    491 define void @store_private_hi_v2i16_max_offset(i16 addrspace(5)* byval %out, i32 %arg) #0 {
    492 entry: ; element-1 store at a positive displacement from the byval pointer %out
    493   %pair = bitcast i32 %arg to <2 x i16>
    494   %upper = extractelement <2 x i16> %pair, i32 1
    495   %slot = getelementptr inbounds i16, i16 addrspace(5)* %out, i64 2045 ; 2045 i16 elements = 4090 bytes past %out
    496   store i16 %upper, i16 addrspace(5)* %slot
    497   ret void
    498 }
    499 
    500 
    501 
    502 ; GCN-LABEL: {{^}}store_private_hi_v2i16_nooff:
    503 ; GCN: s_waitcnt
    504 
    505 ; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s4{{$}}
    506 
    507 ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
    508 ; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s4{{$}}
    509 
    510 ; GCN-NEXT: s_waitcnt
    511 ; GCN-NEXT: s_setpc_b64
    512 define void @store_private_hi_v2i16_nooff(i32 %arg) #0 {
    513 entry: ; element-1 store whose address is the constant null addrspace(5) pointer
    514   ; FIXME: ABI for pre-gfx9
    515   %pair = bitcast i32 %arg to <2 x i16>
    516   %upper = extractelement <2 x i16> %pair, i32 1
    517   store volatile i16 %upper, i16 addrspace(5)* null ; volatile store to null
    518   ret void
    519 }
    520 
    521 
    522 ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_nooff:
    523 ; GCN: s_waitcnt
    524 
    525 ; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s4{{$}}
    526 
    527 ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0
    528 ; NO-D16-HI: buffer_store_byte v0, off, s[0:3], s4{{$}}
    529 
    530 ; GCN-NEXT: s_waitcnt
    531 ; GCN-NEXT: s_setpc_b64
    532 define void @store_private_hi_v2i16_i8_nooff(i32 %arg) #0 {
    533 entry: ; byte store of element 1's low 8 bits to the constant null addrspace(5) pointer
    534   %pair = bitcast i32 %arg to <2 x i16>
    535   %upper = extractelement <2 x i16> %pair, i32 1
    536   %byte = trunc i16 %upper to i8
    537   store volatile i8 %byte, i8 addrspace(5)* null ; volatile store to null
    538   ret void
    539 }
    540 
    541 ; GCN-LABEL: {{^}}store_local_hi_v2i16:
    542 ; GCN: s_waitcnt
    543 
    544 ; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}}
    545 
    546 ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
    547 ; NO-D16-HI: ds_write_b16 v0, v1
    548 
    549 ; GCN-NEXT: s_waitcnt
    550 ; GCN-NEXT: s_setpc_b64
    551 define void @store_local_hi_v2i16(i16 addrspace(3)* %out, i32 %arg) #0 {
    552 entry: ; write element 1 of %arg, viewed as <2 x i16>, through %out
    553   ; FIXME: ABI for pre-gfx9
    554   %pair = bitcast i32 %arg to <2 x i16>
    555   %upper = extractelement <2 x i16> %pair, i32 1
    556   store i16 %upper, i16 addrspace(3)* %out ; i16 store into addrspace(3)
    557   ret void
    558 }
    559 
    560 ; GCN-LABEL: {{^}}store_local_hi_v2f16:
    561 ; GCN: s_waitcnt
    562 
    563 ; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}}
    564 
    565 ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
    566 ; NO-D16-HI: ds_write_b16 v0, v1
    567 
    568 ; GCN-NEXT: s_waitcnt
    569 ; GCN-NEXT: s_setpc_b64
    570 define void @store_local_hi_v2f16(half addrspace(3)* %out, i32 %arg) #0 {
    571 entry: ; write element 1 of %arg, viewed as <2 x half>, through %out
    572   ; FIXME: ABI for pre-gfx9
    573   %pair = bitcast i32 %arg to <2 x half>
    574   %upper = extractelement <2 x half> %pair, i32 1
    575   store half %upper, half addrspace(3)* %out ; half store into addrspace(3)
    576   ret void
    577 }
    578 
    579 ; GCN-LABEL: {{^}}store_local_hi_i32_shift:
    580 ; GCN: s_waitcnt
    581 
    582 ; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}}
    583 
    584 ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
    585 ; NO-D16-HI: ds_write_b16 v0, v1
    586 
    587 ; GCN-NEXT: s_waitcnt
    588 ; GCN-NEXT: s_setpc_b64
    589 define void @store_local_hi_i32_shift(i16 addrspace(3)* %out, i32 %value) #0 {
    590 entry: ; i16 store of bits 31:16 of %value, produced with lshr + trunc
    591   %shifted = lshr i32 %value, 16 ; move bits 31:16 down to 15:0
    592   %upper = trunc i32 %shifted to i16
    593   store i16 %upper, i16 addrspace(3)* %out
    594   ret void
    595 }
    596 
    597 ; GCN-LABEL: {{^}}store_local_hi_v2i16_i8:
    598 ; GCN: s_waitcnt
    599 
    600 ; GFX900-NEXT: ds_write_b8_d16_hi v0, v1{{$}}
    601 
    602 ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
    603 ; NO-D16-HI: ds_write_b8 v0, v1
    604 
    605 ; GCN-NEXT: s_waitcnt
    606 ; GCN-NEXT: s_setpc_b64
    607 define void @store_local_hi_v2i16_i8(i8 addrspace(3)* %out, i32 %arg) #0 {
    608 entry: ; write the low byte of element 1 of %arg (as <2 x i16>) through %out
    609   %pair = bitcast i32 %arg to <2 x i16>
    610   %upper = extractelement <2 x i16> %pair, i32 1
    611   %byte = trunc i16 %upper to i8 ; keep only the low 8 bits of that element
    612   store i8 %byte, i8 addrspace(3)* %out
    613   ret void
    614 }
    615 
    616 ; GCN-LABEL: {{^}}store_local_hi_v2i16_max_offset:
    617 ; GCN: s_waitcnt
    618 ; GFX900-NEXT: ds_write_b16_d16_hi v0, v1 offset:65534{{$}}
    619 
    620 ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
    621 ; NO-D16-HI: ds_write_b16 v0, v1 offset:65534{{$}}
    622 
    623 ; GCN-NEXT: s_waitcnt
    624 ; GCN-NEXT: s_setpc_b64
    625 define void @store_local_hi_v2i16_max_offset(i16 addrspace(3)* %out, i32 %arg) #0 {
    626 entry: ; element-1 store at a positive constant displacement from %out
    627   ; FIXME: ABI for pre-gfx9
    628   %pair = bitcast i32 %arg to <2 x i16>
    629   %upper = extractelement <2 x i16> %pair, i32 1
    630   %slot = getelementptr inbounds i16, i16 addrspace(3)* %out, i64 32767 ; 32767 i16 elements = 65534 bytes past %out
    631   store i16 %upper, i16 addrspace(3)* %slot
    632   ret void
    633 }
    634 
    635 ; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset:
    636 ; GCN: s_waitcnt
    637 ; GFX900: buffer_store_dword
    638 ; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s5 offset:4094
    639 define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 {
    640 entry: ; element-1 store into a function-local array that is allocated after a 40-byte array
    641   %pad = alloca [10 x i32], align 4, addrspace(5)
    642   %buf = alloca [4096 x i16], align 2, addrspace(5)
    643   %padptr = bitcast [10 x i32] addrspace(5)* %pad to i32 addrspace(5)*
    644   store volatile i32 123, i32 addrspace(5)* %padptr ; volatile write to the first i32 of %pad
    645   %pair = bitcast i32 %arg to <2 x i16>
    646   %upper = extractelement <2 x i16> %pair, i32 1
    647   %slot = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %buf, i32 0, i32 2025 ; element 2025 = byte 4050 of %buf
    648   store i16 %upper, i16 addrspace(5)* %slot
    649   ret void
    650 }
    651 
    652 ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset:
    653 ; GCN: s_waitcnt
    654 ; GFX900: buffer_store_dword
    655 ; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s5 offset:4095
    656 define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 {
    657 entry: ; byte store into a function-local array that is allocated after a 40-byte array
    658   %pad = alloca [10 x i32], align 4, addrspace(5)
    659   %buf = alloca [4096 x i8], align 2, addrspace(5)
    660   %padptr = bitcast [10 x i32] addrspace(5)* %pad to i32 addrspace(5)*
    661   store volatile i32 123, i32 addrspace(5)* %padptr ; volatile write to the first i32 of %pad
    662   %pair = bitcast i32 %arg to <2 x i16>
    663   %upper = extractelement <2 x i16> %pair, i32 1
    664   %slot = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %buf, i32 0, i32 4051 ; byte 4051 of %buf
    665   %byte = trunc i16 %upper to i8
    666   store i8 %byte, i8 addrspace(5)* %slot
    667   ret void
    668 }
    669 
    670 attributes #0 = { nounwind } ; attribute group that the function definitions in this file reference as #0
    671