Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
      2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
      3 
      4 ; half args should be promoted to float
      5 
      6 ; GCN-LABEL: {{^}}load_f16_arg:
      7 ; GCN: s_load_dword [[ARG:s[0-9]+]]
      8 ; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]]
      9 ; GCN: buffer_store_short [[CVT]]
        ; Stores the half argument %arg through %out. The checks above expect the
        ; argument to be loaded as a dword, converted f32 -> f16, and the converted
        ; register to be written with a short store.
     10 define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
     11   store half %arg, half addrspace(1)* %out
     12   ret void
     13 }
     14 
     15 ; GCN-LABEL: {{^}}load_v2f16_arg:
     16 ; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
     17 ; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
     18 ; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
     19 ; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
     20 ; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
     21 ; GCN: s_endpgm
        ; Stores the <2 x half> argument %arg through %out. The checks above expect
        ; two ushort loads (offsets 44 and 46, in either order), the second value
        ; shifted left by 16 and OR'd with the first, and one dword store of the
        ; packed result with no offset.
     22 define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
     23   store <2 x half> %arg, <2 x half> addrspace(1)* %out
     24   ret void
     25 }
     26 
     27 ; GCN-LABEL: {{^}}load_v3f16_arg:
     28 ; GCN: buffer_load_ushort
     29 ; GCN: buffer_load_ushort
     30 ; GCN: buffer_load_ushort
     31 ; GCN-NOT: buffer_load
     32 ; GCN-DAG: buffer_store_dword
     33 ; GCN-DAG: buffer_store_short
     34 ; GCN-NOT: buffer_store
     35 ; GCN: s_endpgm
        ; Stores the <3 x half> argument %arg through %out. The checks above expect
        ; three ushort loads with no further buffer load before the stores, then one
        ; dword store and one short store (either order) with no further buffer
        ; store before the end of the program.
     36 define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
     37   store <3 x half> %arg, <3 x half> addrspace(1)* %out
     38   ret void
     39 }
     40 
     41 ; GCN-LABEL: {{^}}load_v4f16_arg:
     42 ; GCN: buffer_load_ushort
     43 ; GCN: buffer_load_ushort
     44 ; GCN: buffer_load_ushort
     45 ; GCN: buffer_load_ushort
     46 ; GCN: buffer_store_dwordx2
     47 ; GCN: s_endpgm
        ; Stores the <4 x half> argument %arg through %out. The checks above expect
        ; four ushort loads followed by a single dwordx2 store.
     48 define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
     49   store <4 x half> %arg, <4 x half> addrspace(1)* %out
     50   ret void
     51 }
     52 
     53 ; GCN-LABEL: {{^}}load_v8f16_arg:
        ; Stores the <8 x half> argument %arg through %out. Only the label is
        ; checked; no instruction patterns are pinned for this function.
     54 define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
     55   store <8 x half> %arg, <8 x half> addrspace(1)* %out
     56   ret void
     57 }
     58 
     59 ; GCN-LABEL: {{^}}extload_v2f16_arg:
        ; Extends the <2 x half> argument %in to <2 x float> and stores it through
        ; %out. Only the label is checked; no instruction patterns are pinned.
     60 define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
     61   %fpext = fpext <2 x half> %in to <2 x float>
     62   store <2 x float> %fpext, <2 x float> addrspace(1)* %out
     63   ret void
     64 }
     65 
     66 ; GCN-LABEL: {{^}}extload_f16_to_f32_arg:
        ; Extends the half argument %arg to float and stores it through %out.
        ; Only the label is checked; no instruction patterns are pinned.
     67 define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
     68   %ext = fpext half %arg to float
     69   store float %ext, float addrspace(1)* %out
     70   ret void
     71 }
     72 
     73 ; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg:
        ; Extends the <2 x half> argument %arg to <2 x float> and stores it through
        ; %out. Only the label is checked; no instruction patterns are pinned.
     74 define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
     75   %ext = fpext <2 x half> %arg to <2 x float>
     76   store <2 x float> %ext, <2 x float> addrspace(1)* %out
     77   ret void
     78 }
     79 
     80 ; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
     81 ; GCN: buffer_load_ushort
     82 ; GCN: buffer_load_ushort
     83 ; GCN: buffer_load_ushort
     84 ; GCN-NOT: buffer_load
     85 ; GCN: v_cvt_f32_f16_e32
     86 ; GCN: v_cvt_f32_f16_e32
     87 ; GCN: v_cvt_f32_f16_e32
     88 ; GCN-NOT: v_cvt_f32_f16
     89 ; GCN-DAG: buffer_store_dword
     90 ; GCN-DAG: buffer_store_dwordx2
     91 ; GCN: s_endpgm
        ; Extends the <3 x half> argument %arg to <3 x float> and stores it through
        ; %out. The checks above expect three ushort loads (and no further buffer
        ; load before the conversions), three f16 -> f32 conversions (and no further
        ; one before the stores), then a dword store and a dwordx2 store in either
        ; order.
     92 define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
     93   %ext = fpext <3 x half> %arg to <3 x float>
     94   store <3 x float> %ext, <3 x float> addrspace(1)* %out
     95   ret void
     96 }
     97 
     98 ; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg:
        ; Extends the <4 x half> argument %arg to <4 x float> and stores it through
        ; %out. Only the label is checked; no instruction patterns are pinned.
     99 define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
    100   %ext = fpext <4 x half> %arg to <4 x float>
    101   store <4 x float> %ext, <4 x float> addrspace(1)* %out
    102   ret void
    103 }
    104 
    105 ; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
    106 ; GCN: buffer_load_ushort
    107 ; GCN: buffer_load_ushort
    108 ; GCN: buffer_load_ushort
    109 ; GCN: buffer_load_ushort
    110 ; GCN: buffer_load_ushort
    111 ; GCN: buffer_load_ushort
    112 ; GCN: buffer_load_ushort
    113 ; GCN: buffer_load_ushort
    114 
    115 ; GCN: v_cvt_f32_f16_e32
    116 ; GCN: v_cvt_f32_f16_e32
    117 ; GCN: v_cvt_f32_f16_e32
    118 ; GCN: v_cvt_f32_f16_e32
    119 ; GCN: v_cvt_f32_f16_e32
    120 ; GCN: v_cvt_f32_f16_e32
    121 ; GCN: v_cvt_f32_f16_e32
    122 ; GCN: v_cvt_f32_f16_e32
    123 
    124 ; GCN: buffer_store_dwordx4
    125 ; GCN: buffer_store_dwordx4
        ; Extends the <8 x half> argument %arg to <8 x float> and stores it through
        ; %out. The checks above expect, in order, eight ushort loads, eight
        ; f16 -> f32 conversions, and two dwordx4 stores.
    126 define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
    127   %ext = fpext <8 x half> %arg to <8 x float>
    128   store <8 x float> %ext, <8 x float> addrspace(1)* %out
    129   ret void
    130 }
    131 
    132 ; GCN-LABEL: {{^}}extload_f16_to_f64_arg:
    133 ; SI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
    134 ; VI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c{{$}}
    135 ; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[ARG]]
    136 ; GCN: buffer_store_dwordx2 [[RESULT]]
        ; Extends the half argument %arg to double and stores it through %out.
        ; The two runs differ only in the expected offset of the argument load
        ; (0xb for the SI prefix, 0x2c for the VI prefix). Both then expect a single
        ; f32 -> f64 conversion taking the loaded register directly, and a dwordx2
        ; store of the converted pair.
    137 define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
    138   %ext = fpext half %arg to double
    139   store double %ext, double addrspace(1)* %out
    140   ret void
    141 }
    142 
    143 ; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
    144 ; GCN-DAG: buffer_load_ushort v
    145 ; GCN-DAG: buffer_load_ushort v
    146 ; GCN-DAG: v_cvt_f32_f16_e32
    147 ; GCN-DAG: v_cvt_f32_f16_e32
    148 ; GCN-DAG: v_cvt_f64_f32_e32
    149 ; GCN-DAG: v_cvt_f64_f32_e32
    150 ; GCN: s_endpgm
        ; Extends the <2 x half> argument %arg to <2 x double> and stores it through
        ; %out. The checks above expect two ushort loads, two f16 -> f32 and two
        ; f32 -> f64 conversions, in any relative order; the store is not checked.
    151 define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
    152   %ext = fpext <2 x half> %arg to <2 x double>
    153   store <2 x double> %ext, <2 x double> addrspace(1)* %out
    154   ret void
    155 }
    156 
    157 ; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
    158 ; GCN-DAG: buffer_load_ushort v
    159 ; GCN-DAG: buffer_load_ushort v
    160 ; GCN-DAG: buffer_load_ushort v
    161 ; GCN-DAG: v_cvt_f32_f16_e32
    162 ; GCN-DAG: v_cvt_f32_f16_e32
    163 ; GCN-DAG: v_cvt_f32_f16_e32
    164 ; GCN-DAG: v_cvt_f64_f32_e32
    165 ; GCN-DAG: v_cvt_f64_f32_e32
    166 ; GCN-DAG: v_cvt_f64_f32_e32
    167 ; GCN: s_endpgm
        ; Extends the <3 x half> argument %arg to <3 x double> and stores it through
        ; %out. The checks above expect three ushort loads, three f16 -> f32 and
        ; three f32 -> f64 conversions, in any relative order; the store is not
        ; checked.
    168 define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
    169   %ext = fpext <3 x half> %arg to <3 x double>
    170   store <3 x double> %ext, <3 x double> addrspace(1)* %out
    171   ret void
    172 }
    173 
    174 ; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
    175 ; GCN-DAG: buffer_load_ushort v
    176 ; GCN-DAG: buffer_load_ushort v
    177 ; GCN-DAG: buffer_load_ushort v
    178 ; GCN-DAG: buffer_load_ushort v
    179 ; GCN-DAG: v_cvt_f32_f16_e32
    180 ; GCN-DAG: v_cvt_f32_f16_e32
    181 ; GCN-DAG: v_cvt_f32_f16_e32
    182 ; GCN-DAG: v_cvt_f32_f16_e32
    183 ; GCN-DAG: v_cvt_f64_f32_e32
    184 ; GCN-DAG: v_cvt_f64_f32_e32
    185 ; GCN-DAG: v_cvt_f64_f32_e32
    186 ; GCN-DAG: v_cvt_f64_f32_e32
    187 ; GCN: s_endpgm
        ; Extends the <4 x half> argument %arg to <4 x double> and stores it through
        ; %out. The checks above expect four ushort loads, four f16 -> f32 and four
        ; f32 -> f64 conversions, in any relative order; the store is not checked.
    188 define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
    189   %ext = fpext <4 x half> %arg to <4 x double>
    190   store <4 x double> %ext, <4 x double> addrspace(1)* %out
    191   ret void
    192 }
    193 
    194 ; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
    195 ; GCN-DAG: buffer_load_ushort v
    196 ; GCN-DAG: buffer_load_ushort v
    197 ; GCN-DAG: buffer_load_ushort v
    198 ; GCN-DAG: buffer_load_ushort v
    199 
    200 ; GCN-DAG: buffer_load_ushort v
    201 ; GCN-DAG: buffer_load_ushort v
    202 ; GCN-DAG: buffer_load_ushort v
    203 ; GCN-DAG: buffer_load_ushort v
    204 
    205 ; GCN-DAG: v_cvt_f32_f16_e32
    206 ; GCN-DAG: v_cvt_f32_f16_e32
    207 ; GCN-DAG: v_cvt_f32_f16_e32
    208 ; GCN-DAG: v_cvt_f32_f16_e32
    209 
    210 ; GCN-DAG: v_cvt_f32_f16_e32
    211 ; GCN-DAG: v_cvt_f32_f16_e32
    212 ; GCN-DAG: v_cvt_f32_f16_e32
    213 ; GCN-DAG: v_cvt_f32_f16_e32
    214 
    215 ; GCN-DAG: v_cvt_f64_f32_e32
    216 ; GCN-DAG: v_cvt_f64_f32_e32
    217 ; GCN-DAG: v_cvt_f64_f32_e32
    218 ; GCN-DAG: v_cvt_f64_f32_e32
    219 
    220 ; GCN-DAG: v_cvt_f64_f32_e32
    221 ; GCN-DAG: v_cvt_f64_f32_e32
    222 ; GCN-DAG: v_cvt_f64_f32_e32
    223 ; GCN-DAG: v_cvt_f64_f32_e32
    224 
    225 ; GCN: s_endpgm
        ; Extends the <8 x half> argument %arg to <8 x double> and stores it through
        ; %out. The checks above expect eight ushort loads, eight f16 -> f32 and
        ; eight f32 -> f64 conversions, in any relative order; the store is not
        ; checked.
    226 define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
    227   %ext = fpext <8 x half> %arg to <8 x double>
    228   store <8 x double> %ext, <8 x double> addrspace(1)* %out
    229   ret void
    230 }
    231 
    232 ; GCN-LABEL: {{^}}global_load_store_f16:
    233 ; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
    234 ; GCN: buffer_store_short [[TMP]]
        ; Copies one half from %in to %out (both addrspace(1) pointers). The checks
        ; above expect a ushort load and a short store of that same register, i.e.
        ; no intermediate instruction producing a different register is expected.
    235 define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
    236   %val = load half, half addrspace(1)* %in
    237   store half %val, half addrspace(1)* %out
    238   ret void
    239 }
    240 
    241 ; GCN-LABEL: {{^}}global_load_store_v2f16:
    242 ; GCN: buffer_load_dword [[TMP:v[0-9]+]]
    243 ; GCN: buffer_store_dword [[TMP]]
        ; Copies one <2 x half> from %in to %out. The checks above expect a single
        ; dword load and a dword store of that same register.
    244 define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
    245   %val = load <2 x half>, <2 x half> addrspace(1)* %in
    246   store <2 x half> %val, <2 x half> addrspace(1)* %out
    247   ret void
    248 }
    249 
    250 ; GCN-LABEL: {{^}}global_load_store_v4f16:
    251 ; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]]
    252 ; GCN: buffer_store_dwordx2 [[TMP]]
        ; Copies one <4 x half> from %in to %out. The checks above expect a single
        ; dwordx2 load and a dwordx2 store of that same register pair.
        ; Note that this function declares its parameters as (%in, %out), the
        ; reverse of the other global_load_store_* functions in this file.
    253 define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
    254   %val = load <4 x half>, <4 x half> addrspace(1)* %in
    255   store <4 x half> %val, <4 x half> addrspace(1)* %out
    256   ret void
    257 }
    258 
    259 ; GCN-LABEL: {{^}}global_load_store_v8f16:
    260 ; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]]
    261 ; GCN: buffer_store_dwordx4 [[TMP]]
    262 ; GCN: s_endpgm
        ; Copies one <8 x half> from %in to %out. The checks above expect a single
        ; dwordx4 load and a dwordx4 store of that same register tuple. The store
        ; pattern uses the TMP variable captured by the load (rather than capturing
        ; a fresh value), so a store of any other register tuple fails the test,
        ; matching the f16/v2f16/v4f16 variants of this test.
    263 define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
    264   %val = load <8 x half>, <8 x half> addrspace(1)* %in
    265   store <8 x half> %val, <8 x half> addrspace(1)* %out
    266   ret void
    267 }
    268 
    269 ; GCN-LABEL: {{^}}global_extload_f16_to_f32:
    270 ; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
    271 ; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]]
    272 ; GCN: buffer_store_dword [[CVT]]
        ; Loads a half from %in, extends it to float and stores it through %out.
        ; The checks above expect a ushort load, one f16 -> f32 conversion of the
        ; loaded register, and a dword store of the converted register.
    273 define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
    274   %val = load half, half addrspace(1)* %in
    275   %cvt = fpext half %val to float
    276   store float %cvt, float addrspace(1)* %out
    277   ret void
    278 }
    279 
    280 ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
    281 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
    282 ; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
    283 ; GCN: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
    284 ; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
    285 ; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
    286 ; GCN: s_endpgm
        ; Loads a <2 x half> from %in, extends it to <2 x float> and stores it
        ; through %out. The checks above expect one dword load, an f16 -> f32
        ; conversion taking the loaded register directly, a second conversion taking
        ; the loaded register shifted right by 16, and a dwordx2 store of the
        ; register range formed by the two conversion results.
    287 define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
    288   %val = load <2 x half>, <2 x half> addrspace(1)* %in
    289   %cvt = fpext <2 x half> %val to <2 x float>
    290   store <2 x float> %cvt, <2 x float> addrspace(1)* %out
    291   ret void
    292 }
    293 
    294 ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32:
        ; Loads a <3 x half> from %in, extends it to <3 x float> and stores it
        ; through %out. Only the label is checked; no instruction patterns are
        ; pinned.
    295 define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
    296   %val = load <3 x half>, <3 x half> addrspace(1)* %in
    297   %cvt = fpext <3 x half> %val to <3 x float>
    298   store <3 x float> %cvt, <3 x float> addrspace(1)* %out
    299   ret void
    300 }
    301 
    302 ; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32:
        ; Loads a <4 x half> from %in, extends it to <4 x float> and stores it
        ; through %out. Only the label is checked; no instruction patterns are
        ; pinned.
    303 define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
    304   %val = load <4 x half>, <4 x half> addrspace(1)* %in
    305   %cvt = fpext <4 x half> %val to <4 x float>
    306   store <4 x float> %cvt, <4 x float> addrspace(1)* %out
    307   ret void
    308 }
    309 
    310 ; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32:
        ; Loads an <8 x half> from %in, extends it to <8 x float> and stores it
        ; through %out. Only the label is checked; no instruction patterns are
        ; pinned.
    311 define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
    312   %val = load <8 x half>, <8 x half> addrspace(1)* %in
    313   %cvt = fpext <8 x half> %val to <8 x float>
    314   store <8 x float> %cvt, <8 x float> addrspace(1)* %out
    315   ret void
    316 }
    317 
    318 ; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32:
    319 ; GCN: buffer_load_dwordx4
    320 ; GCN: buffer_load_dwordx4
    321 
    322 ; GCN: v_cvt_f32_f16_e32
    323 ; GCN: v_cvt_f32_f16_e32
    324 ; GCN: v_cvt_f32_f16_e32
    325 ; GCN: v_cvt_f32_f16_e32
    326 ; GCN: v_cvt_f32_f16_e32
    327 ; GCN: v_cvt_f32_f16_e32
    328 ; GCN: v_cvt_f32_f16_e32
    329 ; GCN: v_cvt_f32_f16_e32
    330 ; GCN: v_cvt_f32_f16_e32
    331 ; GCN: v_cvt_f32_f16_e32
    332 ; GCN: v_cvt_f32_f16_e32
    333 ; GCN: v_cvt_f32_f16_e32
    334 ; GCN: v_cvt_f32_f16_e32
    335 ; GCN: v_cvt_f32_f16_e32
    336 ; GCN: v_cvt_f32_f16_e32
    337 ; GCN: v_cvt_f32_f16_e32
    338 
    339 ; GCN: buffer_store_dwordx4
    340 ; GCN: buffer_store_dwordx4
    341 ; GCN: buffer_store_dwordx4
    342 ; GCN: buffer_store_dwordx4
    343 
    344 ; GCN: s_endpgm
        ; Loads a <16 x half> from %in, extends it to <16 x float> and stores it
        ; through %out. The checks above expect, in order, two dwordx4 loads,
        ; sixteen f16 -> f32 conversions, and four dwordx4 stores.
    345 define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
    346   %val = load <16 x half>, <16 x half> addrspace(1)* %in
    347   %cvt = fpext <16 x half> %val to <16 x float>
    348   store <16 x float> %cvt, <16 x float> addrspace(1)* %out
    349   ret void
    350 }
    351 
    352 ; GCN-LABEL: {{^}}global_extload_f16_to_f64:
    353 ; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
    354 ; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]]
    355 ; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]]
    356 ; GCN: buffer_store_dwordx2 [[CVT1]]
        ; Loads a half from %in, extends it to double and stores it through %out.
        ; The checks above expect the extension to be done in two chained steps
        ; (f16 -> f32, then f32 -> f64) followed by a dwordx2 store of the result.
    357 define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
    358   %val = load half, half addrspace(1)* %in
    359   %cvt = fpext half %val to double
    360   store double %cvt, double addrspace(1)* %out
    361   ret void
    362 }
    363 
    364 ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64:
    365 ; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
    366 ; GCN-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
    367 ; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
    368 ; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
    369 ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]]
    370 ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]
    371 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}}
    372 ; GCN: s_endpgm
        ; Loads a <2 x half> from %in, extends it to <2 x double> and stores it
        ; through %out. The checks above expect one dword load; the low element is
        ; converted from the loaded register and the high element from that register
        ; shifted right by 16, each going f16 -> f32 -> f64. The dwordx4 store must
        ; cover the range from the first f64 result's low register to the second
        ; f64 result's high register.
    373 define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
    374   %val = load <2 x half>, <2 x half> addrspace(1)* %in
    375   %cvt = fpext <2 x half> %val to <2 x double>
    376   store <2 x double> %cvt, <2 x double> addrspace(1)* %out
    377   ret void
    378 }
    379 
    380 ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:
    381 
    382 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
    383 ; GCN-DAG: v_cvt_f32_f16_e32
    384 ; GCN-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
    385 ; GCN-DAG: v_cvt_f32_f16_e32
    386 ; GCN-DAG: v_cvt_f32_f16_e32
    387 
    388 ; GCN: v_cvt_f64_f32_e32
    389 ; GCN: v_cvt_f64_f32_e32
    390 ; GCN: v_cvt_f64_f32_e32
    391 ; GCN-NOT: v_cvt_f64_f32_e32
    392 
    393 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
    394 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
    395 ; GCN: s_endpgm
        ; Loads a <3 x half> from %in, extends it to <3 x double> and stores it
        ; through %out. The checks above expect one dwordx2 load, three f16 -> f32
        ; conversions plus a 16-bit right shift (any order), three f32 -> f64
        ; conversions with no further one before the stores, and the result written
        ; as a dwordx4 store at offset 0 and a dwordx2 store at offset 16.
    396 define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
    397   %val = load <3 x half>, <3 x half> addrspace(1)* %in
    398   %cvt = fpext <3 x half> %val to <3 x double>
    399   store <3 x double> %cvt, <3 x double> addrspace(1)* %out
    400   ret void
    401 }
    402 
    403 ; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64:
        ; Loads a <4 x half> from %in, extends it to <4 x double> and stores it
        ; through %out. Only the label is checked; no instruction patterns are
        ; pinned.
    404 define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
    405   %val = load <4 x half>, <4 x half> addrspace(1)* %in
    406   %cvt = fpext <4 x half> %val to <4 x double>
    407   store <4 x double> %cvt, <4 x double> addrspace(1)* %out
    408   ret void
    409 }
    410 
    411 ; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64:
        ; Loads an <8 x half> from %in, extends it to <8 x double> and stores it
        ; through %out. Only the label is checked; no instruction patterns are
        ; pinned.
    412 define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
    413   %val = load <8 x half>, <8 x half> addrspace(1)* %in
    414   %cvt = fpext <8 x half> %val to <8 x double>
    415   store <8 x double> %cvt, <8 x double> addrspace(1)* %out
    416   ret void
    417 }
    418 
    419 ; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64:
        ; Loads a <16 x half> from %in, extends it to <16 x double> and stores it
        ; through %out. Only the label is checked; no instruction patterns are
        ; pinned.
    420 define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
    421   %val = load <16 x half>, <16 x half> addrspace(1)* %in
    422   %cvt = fpext <16 x half> %val to <16 x double>
    423   store <16 x double> %cvt, <16 x double> addrspace(1)* %out
    424   ret void
    425 }
    426 
    427 ; GCN-LABEL: {{^}}global_truncstore_f32_to_f16:
    428 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
    429 ; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]]
    430 ; GCN: buffer_store_short [[CVT]]
        ; Loads a float from %in, truncates it to half and stores it through %out.
        ; The checks above expect a dword load, one f32 -> f16 conversion of the
        ; loaded register, and a short store of the converted register.
    431 define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
    432   %val = load float, float addrspace(1)* %in
    433   %cvt = fptrunc float %val to half
    434   store half %cvt, half addrspace(1)* %out
    435   ret void
    436 }
    437 
    438 ; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16:
    439 ; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
    440 ; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]]
    441 ; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
    442 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
    443 ; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]]
    444 ; GCN-DAG: buffer_store_dword [[PACKED]]
    445 ; GCN: s_endpgm
        ; Loads a <2 x float> from %in, truncates it to <2 x half> and stores it
        ; through %out. The checks above expect one dwordx2 load, an f32 -> f16
        ; conversion of each loaded register, the high result shifted left by 16 and
        ; OR'd with the low result, and a dword store of the packed register.
    446 define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
    447   %val = load <2 x float>, <2 x float> addrspace(1)* %in
    448   %cvt = fptrunc <2 x float> %val to <2 x half>
    449   store <2 x half> %cvt, <2 x half> addrspace(1)* %out
    450   ret void
    451 }
    452 
    453 ; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
    454 ; GCN: buffer_load_dwordx4
    455 ; GCN: v_cvt_f16_f32_e32
    456 ; GCN: v_cvt_f16_f32_e32
    457 ; GCN: v_cvt_f16_f32_e32
    458 ; GCN-NOT: v_cvt_f16_f32_e32
    459 ; GCN: buffer_store_short
    460 ; GCN: buffer_store_dword
    461 ; GCN: s_endpgm
        ; Loads a <3 x float> from %in, truncates it to <3 x half> and stores it
        ; through %out. The checks above expect a dwordx4 load, three f32 -> f16
        ; conversions with no further one before the stores, then a short store
        ; followed by a dword store.
    462 define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
    463   %val = load <3 x float>, <3 x float> addrspace(1)* %in
    464   %cvt = fptrunc <3 x float> %val to <3 x half>
    465   store <3 x half> %cvt, <3 x half> addrspace(1)* %out
    466   ret void
    467 }
    468 
    469 ; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16:
    470 ; GCN: buffer_load_dwordx4
    471 ; GCN: v_cvt_f16_f32_e32
    472 ; GCN: v_cvt_f16_f32_e32
    473 ; GCN: v_cvt_f16_f32_e32
    474 ; GCN: v_cvt_f16_f32_e32
    475 ; GCN: buffer_store_dwordx2
    476 ; GCN: s_endpgm
        ; Loads a <4 x float> from %in, truncates it to <4 x half> and stores it
        ; through %out. The checks above expect, in order, one dwordx4 load, four
        ; f32 -> f16 conversions, and one dwordx2 store.
    477 define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
    478   %val = load <4 x float>, <4 x float> addrspace(1)* %in
    479   %cvt = fptrunc <4 x float> %val to <4 x half>
    480   store <4 x half> %cvt, <4 x half> addrspace(1)* %out
    481   ret void
    482 }
    483 
    484 ; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16:
    485 ; GCN: buffer_load_dwordx4
    486 ; GCN: buffer_load_dwordx4
    487 ; GCN: v_cvt_f16_f32_e32
    488 ; GCN: v_cvt_f16_f32_e32
    489 ; GCN: v_cvt_f16_f32_e32
    490 ; GCN: v_cvt_f16_f32_e32
    491 ; GCN: v_cvt_f16_f32_e32
    492 ; GCN: v_cvt_f16_f32_e32
    493 ; GCN: v_cvt_f16_f32_e32
    494 ; GCN: v_cvt_f16_f32_e32
    495 ; GCN: buffer_store_dwordx4
    496 ; GCN: s_endpgm
        ; Loads an <8 x float> from %in, truncates it to <8 x half> and stores it
        ; through %out. The checks above expect, in order, two dwordx4 loads, eight
        ; f32 -> f16 conversions, and one dwordx4 store.
    497 define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
    498   %val = load <8 x float>, <8 x float> addrspace(1)* %in
    499   %cvt = fptrunc <8 x float> %val to <8 x half>
    500   store <8 x half> %cvt, <8 x half> addrspace(1)* %out
    501   ret void
    502 }
    503 
    504 ; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16:
    505 ; GCN: buffer_load_dwordx4
    506 ; GCN: buffer_load_dwordx4
    507 ; GCN: buffer_load_dwordx4
    508 ; GCN: buffer_load_dwordx4
    509 ; GCN-DAG: v_cvt_f16_f32_e32
    510 ; GCN-DAG: v_cvt_f16_f32_e32
    511 ; GCN-DAG: v_cvt_f16_f32_e32
    512 ; GCN-DAG: v_cvt_f16_f32_e32
    513 ; GCN-DAG: v_cvt_f16_f32_e32
    514 ; GCN-DAG: v_cvt_f16_f32_e32
    515 ; GCN-DAG: v_cvt_f16_f32_e32
    516 ; GCN-DAG: v_cvt_f16_f32_e32
    517 ; GCN-DAG: v_cvt_f16_f32_e32
    518 ; GCN-DAG: v_cvt_f16_f32_e32
    519 ; GCN-DAG: v_cvt_f16_f32_e32
    520 ; GCN-DAG: v_cvt_f16_f32_e32
    521 ; GCN-DAG: v_cvt_f16_f32_e32
    522 ; GCN-DAG: v_cvt_f16_f32_e32
    523 ; GCN-DAG: v_cvt_f16_f32_e32
    524 ; GCN-DAG: v_cvt_f16_f32_e32
    525 ; GCN-DAG: buffer_store_dwordx4
    526 ; GCN-DAG: buffer_store_dwordx4
    527 ; GCN: s_endpgm
        ; Loads a <16 x float> from %in, truncates it to <16 x half> and stores it
        ; through %out. The checks above expect four dwordx4 loads, then sixteen
        ; f32 -> f16 conversions and two dwordx4 stores, the conversions and stores
        ; being allowed in any relative order.
    528 define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
    529   %val = load <16 x float>, <16 x float> addrspace(1)* %in
    530   %cvt = fptrunc <16 x float> %val to <16 x half>
    531   store <16 x half> %cvt, <16 x half> addrspace(1)* %out
    532   ret void
    533 }
    534 
    535 ; FIXME: Unsafe math should fold conversions away
    536 ; GCN-LABEL: {{^}}fadd_f16:
    537 ; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
    538 ; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
    539 ; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
    540 ; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
    541 ; SI: v_add_f32
    542 ; GCN: s_endpgm
        ; Adds the two half arguments %a and %b and stores the sum through %out.
        ; Instruction patterns are pinned only for the run using the SI prefix,
        ; which expects four f16 -> f32 conversions and an f32 add; the other run
        ; only checks the label and the end of the program.
    543 define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
    544    %add = fadd half %a, %b
    545    store half %add, half addrspace(1)* %out, align 4
    546    ret void
    547 }
    548 
    549 ; GCN-LABEL: {{^}}fadd_v2f16:
    550 ; SI: v_add_f32
    551 ; SI: v_add_f32
    552 ; GCN: s_endpgm
        ; Adds the two <2 x half> arguments element-wise and stores the result
        ; through %out. The run using the SI prefix expects two f32 adds; the other
        ; run only checks the label and the end of the program.
    553 define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
    554   %add = fadd <2 x half> %a, %b
    555   store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
    556   ret void
    557 }
    558 
    559 ; GCN-LABEL: {{^}}fadd_v4f16:
    560 ; SI: v_add_f32
    561 ; SI: v_add_f32
    562 ; SI: v_add_f32
    563 ; SI: v_add_f32
    564 ; GCN: s_endpgm
        ; Loads two consecutive <4 x half> vectors from %in (index 0 and index 1),
        ; adds them element-wise and stores the result through %out. Unlike the
        ; other fadd_* tests, the operands come from memory rather than from
        ; arguments. The run using the SI prefix expects four f32 adds.
    565 define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
    566   %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
    567   %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
    568   %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
    569   %result = fadd <4 x half> %a, %b
    570   store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16
    571   ret void
    572 }
    573 
    574 ; GCN-LABEL: {{^}}fadd_v8f16:
    575 ; SI: v_add_f32
    576 ; SI: v_add_f32
    577 ; SI: v_add_f32
    578 ; SI: v_add_f32
    579 ; SI: v_add_f32
    580 ; SI: v_add_f32
    581 ; SI: v_add_f32
    582 ; SI: v_add_f32
    583 ; GCN: s_endpgm
        ; Adds the two <8 x half> arguments element-wise and stores the result
        ; through %out. The run using the SI prefix expects eight f32 adds; the
        ; other run only checks the label and the end of the program.
    584 define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
    585   %add = fadd <8 x half> %a, %b
    586   store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
    587   ret void
    588 }
    589 
    590 ; GCN-LABEL: {{^}}fsub_f16:
    591 ; GCN: v_subrev_f32_e32
    592 ; GCN: s_endpgm
        ; Loads two consecutive halves from %in (index 0 and index 1), computes
        ; %a - %b and stores the difference through %out. Both runs expect the
        ; subtraction to appear as a reversed-operand f32 subtract.
    593 define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
    594   %b_ptr = getelementptr half, half addrspace(1)* %in, i32 1
    595   %a = load half, half addrspace(1)* %in
    596   %b = load half, half addrspace(1)* %b_ptr
    597   %sub = fsub half %a, %b
    598   store half %sub, half addrspace(1)* %out
    599   ret void
    600 }
    601 
    602 ; GCN-LABEL: {{^}}test_bitcast_from_half:
    603 ; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
    604 ; GCN: buffer_store_short [[TMP]]
        ; Loads a half from %in, bitcasts it to i16 and stores it through %out.
        ; The checks above expect a ushort load and a short store of that same
        ; register, i.e. the bitcast is expected to need no separate result
        ; register. Note the parameter order here is (%in, %out).
    605 define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
    606   %val = load half, half addrspace(1)* %in
    607   %val_int = bitcast half %val to i16
    608   store i16 %val_int, i16 addrspace(1)* %out
    609   ret void
    610 }
    611 
    612 ; GCN-LABEL: {{^}}test_bitcast_to_half:
    613 ; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
    614 ; GCN: buffer_store_short [[TMP]]
        ; Loads an i16 from %in, bitcasts it to half and stores it through %out.
        ; The checks above expect a ushort load and a short store of that same
        ; register, i.e. the bitcast is expected to need no separate result
        ; register.
    615 define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
    616   %val = load i16, i16 addrspace(1)* %in
    617   %val_fp = bitcast i16 %val to half
    618   store half %val_fp, half addrspace(1)* %out
    619   ret void
    620 }
    621 
        ; Attribute group #0, referenced by the function definitions above; it
        ; marks them nounwind.
    622 attributes #0 = { nounwind }
    623