Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
      2 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
      3 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
      4 ; FIXME: Merge into imm.ll
      5 
      6 ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
      7 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
      8 ; GCN: buffer_store_dword [[REG]]
      9 define amdgpu_kernel void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 {
          ; Stores a <2 x i16> splat of -32768 (0x8000 in each 16-bit lane) to %out.
          ; The expected packed value, 0x80008000, is the same one the -0.0 <2 x half> test expects.
     10   store <2 x i16> <i16 -32768, i16 -32768>, <2 x i16> addrspace(1)* %out
     11   ret void
     12 }
     13 
     14 ; GCN-LABEL: {{^}}store_inline_imm_0.0_v2f16:
     15 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
     16 ; GCN: buffer_store_dword [[REG]]
     17 define amdgpu_kernel void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
          ; Stores a <2 x half> splat of +0.0 to %out; the checks expect the packed value to be plain 0.
     18   store <2 x half> <half 0.0, half 0.0>, <2 x half> addrspace(1)* %out
     19   ret void
     20 }
     21 
     22 ; GCN-LABEL: {{^}}store_imm_neg_0.0_v2f16:
     23 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
     24 ; GCN: buffer_store_dword [[REG]]
     25 define amdgpu_kernel void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
          ; Stores a <2 x half> splat of -0.0 to %out; the checks expect 0x8000 in each 16-bit lane.
     26   store <2 x half> <half -0.0, half -0.0>, <2 x half> addrspace(1)* %out
     27   ret void
     28 }
     29 
     30 ; GCN-LABEL: {{^}}store_inline_imm_0.5_v2f16:
     31 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x38003800{{$}}
     32 ; GCN: buffer_store_dword [[REG]]
     33 define amdgpu_kernel void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
          ; Stores a <2 x half> splat of 0.5 to %out; the checks expect 0x3800 in each 16-bit lane.
     34   store <2 x half> <half 0.5, half 0.5>, <2 x half> addrspace(1)* %out
     35   ret void
     36 }
     37 
     38 ; GCN-LABEL: {{^}}store_inline_imm_m_0.5_v2f16:
     39 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800b800{{$}}
     40 ; GCN: buffer_store_dword [[REG]]
     41 define amdgpu_kernel void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
          ; Stores a <2 x half> splat of -0.5 to %out; the checks expect 0xb800 in each 16-bit lane.
     42   store <2 x half> <half -0.5, half -0.5>, <2 x half> addrspace(1)* %out
     43   ret void
     44 }
     45 
     46 ; GCN-LABEL: {{^}}store_inline_imm_1.0_v2f16:
     47 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}}
     48 ; GCN: buffer_store_dword [[REG]]
     49 define amdgpu_kernel void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
          ; Stores a <2 x half> splat of 1.0 to %out; the checks expect 0x3c00 in each 16-bit lane.
     50   store <2 x half> <half 1.0, half 1.0>, <2 x half> addrspace(1)* %out
     51   ret void
     52 }
     53 
     54 ; GCN-LABEL: {{^}}store_inline_imm_m_1.0_v2f16:
     55 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}}
     56 ; GCN: buffer_store_dword [[REG]]
     57 define amdgpu_kernel void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
          ; Stores a <2 x half> splat of -1.0 to %out; the checks expect 0xbc00 in each 16-bit lane.
     58   store <2 x half> <half -1.0, half -1.0>, <2 x half> addrspace(1)* %out
     59   ret void
     60 }
     61 
     62 ; GCN-LABEL: {{^}}store_inline_imm_2.0_v2f16:
     63 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x40004000{{$}}
     64 ; GCN: buffer_store_dword [[REG]]
     65 define amdgpu_kernel void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
          ; Stores a <2 x half> splat of 2.0 to %out; the checks expect 0x4000 in each 16-bit lane.
     66   store <2 x half> <half 2.0, half 2.0>, <2 x half> addrspace(1)* %out
     67   ret void
     68 }
     69 
     70 ; GCN-LABEL: {{^}}store_inline_imm_m_2.0_v2f16:
     71 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000c000{{$}}
     72 ; GCN: buffer_store_dword [[REG]]
     73 define amdgpu_kernel void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
          ; Stores a <2 x half> splat of -2.0 to %out; the checks expect 0xc000 in each 16-bit lane.
     74   store <2 x half> <half -2.0, half -2.0>, <2 x half> addrspace(1)* %out
     75   ret void
     76 }
     77 
     78 ; GCN-LABEL: {{^}}store_inline_imm_4.0_v2f16:
     79 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x44004400{{$}}
     80 ; GCN: buffer_store_dword [[REG]]
     81 define amdgpu_kernel void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
          ; Stores a <2 x half> splat of 4.0 to %out; the checks expect 0x4400 in each 16-bit lane.
     82   store <2 x half> <half 4.0, half 4.0>, <2 x half> addrspace(1)* %out
     83   ret void
     84 }
     85 
     86 ; GCN-LABEL: {{^}}store_inline_imm_m_4.0_v2f16:
     87 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400c400{{$}}
     88 ; GCN: buffer_store_dword [[REG]]
     89 define amdgpu_kernel void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
          ; Stores a <2 x half> splat of -4.0 to %out; the checks expect 0xc400 in each 16-bit lane.
     90   store <2 x half> <half -4.0, half -4.0>, <2 x half> addrspace(1)* %out
     91   ret void
     92 }
     93 
     94 ; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_v2f16:
     95 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x31183118{{$}}
     96 ; GCN: buffer_store_dword [[REG]]
     97 define amdgpu_kernel void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
          ; Stores a <2 x half> splat with raw lane bits 0x3118 to %out.
          ; 0x3118 decodes to roughly 0.1592, the half closest to 1/(2*pi), hence the test name.
     98   store <2 x half> <half 0xH3118, half 0xH3118>, <2 x half> addrspace(1)* %out
     99   ret void
    100 }
    101 
    102 ; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_v2f16:
    103 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118b118{{$}}
    104 ; GCN: buffer_store_dword [[REG]]
    105 define amdgpu_kernel void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
          ; Stores a <2 x half> splat with raw lane bits 0xB118 to %out.
          ; 0xB118 is 0x3118 (about 1/(2*pi)) with the sign bit set.
    106   store <2 x half> <half 0xHB118, half 0xHB118>, <2 x half> addrspace(1)* %out
    107   ret void
    108 }
    109 
    110 ; GCN-LABEL: {{^}}store_literal_imm_v2f16:
    111 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c006c00{{$}}
    112 ; GCN: buffer_store_dword [[REG]]
    113 define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 {
          ; Stores a <2 x half> splat of 4096.0 to %out; the checks expect 0x6c00 in each 16-bit lane.
          ; The expected literal is anchored to end-of-line like every other store check in this file.
    114   store <2 x half> <half 4096.0, half 4096.0>, <2 x half> addrspace(1)* %out
    115   ret void
    116 }
    117 
    118 ; GCN-LABEL: {{^}}add_inline_imm_0.0_v2f16:
    119 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
    120 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0{{$}}
    121 ; GFX9: buffer_store_dword [[REG]]
    122 
    123 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
    124 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
    125 ; VI-DAG: v_mov_b32_e32 [[CONST0:v[0-9]+]], 0
    126 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
    127 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
    128 
    129 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
    130 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0
    131 ; VI: v_or_b32
    132 ; VI: buffer_store_dword
    133 define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
          ; Adds a +0.0 splat to the <2 x half> kernel argument %x and stores the sum to %out.
    134   %y = fadd <2 x half> %x, <half 0.0, half 0.0>
    135   store <2 x half> %y, <2 x half> addrspace(1)* %out
    136   ret void
    137 }
    138 
    139 ; GCN-LABEL: {{^}}add_inline_imm_0.5_v2f16:
    140 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
    141 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 op_sel_hi:[1,0]{{$}}
    142 ; GFX9: buffer_store_dword [[REG]]
    143 
    144 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
    145 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
    146 ; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
    147 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
    148 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
    149 
    150 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
    151 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0.5
    152 ; VI: v_or_b32
    153 ; VI: buffer_store_dword
    154 define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
          ; Adds a 0.5 splat to the <2 x half> kernel argument %x and stores the sum to %out.
    155   %y = fadd <2 x half> %x, <half 0.5, half 0.5>
    156   store <2 x half> %y, <2 x half> addrspace(1)* %out
    157   ret void
    158 }
    159 
    160 ; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_v2f16:
    161 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
    162 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5 op_sel_hi:[1,0]{{$}}
    163 ; GFX9: buffer_store_dword [[REG]]
    164 
    165 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
    166 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
    167 ; VI-DAG: v_mov_b32_e32 [[CONSTM05:v[0-9]+]], 0xb800
    168 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
    169 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
    170 
    171 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
    172 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -0.5
    173 ; VI: v_or_b32
    174 ; VI: buffer_store_dword
    175 define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
          ; Adds a -0.5 splat to the <2 x half> kernel argument %x and stores the sum to %out.
    176   %y = fadd <2 x half> %x, <half -0.5, half -0.5>
    177   store <2 x half> %y, <2 x half> addrspace(1)* %out
    178   ret void
    179 }
    180 
    181 ; GCN-LABEL: {{^}}add_inline_imm_1.0_v2f16:
    182 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
    183 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1.0 op_sel_hi:[1,0]{{$}}
    184 ; GFX9: buffer_store_dword [[REG]]
    185 
    186 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
    187 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
    188 ; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0x3c00
    189 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
    190 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
    191 
    192 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
    193 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1.0
    194 ; VI: v_or_b32
    195 ; VI: buffer_store_dword
    196 define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
          ; Adds a 1.0 splat to the <2 x half> kernel argument %x and stores the sum to %out.
    197   %y = fadd <2 x half> %x, <half 1.0, half 1.0>
    198   store <2 x half> %y, <2 x half> addrspace(1)* %out
    199   ret void
    200 }
    201 
    202 ; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_v2f16:
    203 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
    204 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1.0 op_sel_hi:[1,0]{{$}}
    205 ; GFX9: buffer_store_dword [[REG]]
    206 
    207 
    208 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
    209 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
    210 ; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
    211 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
    212 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
    213 
    214 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
    215 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -1.0
    216 ; VI: v_or_b32
    217 ; VI: buffer_store_dword
    218 define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
          ; Adds a -1.0 splat to the <2 x half> kernel argument %x and stores the sum to %out.
          ; The VI capture for the 0xbc00 constant is named CONSTM1, following the CONSTM05/CONSTM2/CONSTM4
          ; convention of the other negative-constant tests.
    219   %y = fadd <2 x half> %x, <half -1.0, half -1.0>
    220   store <2 x half> %y, <2 x half> addrspace(1)* %out
    221   ret void
    222 }
    223 
    224 ; GCN-LABEL: {{^}}add_inline_imm_2.0_v2f16:
    225 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
    226 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2.0 op_sel_hi:[1,0]{{$}}
    227 ; GFX9: buffer_store_dword [[REG]]
    228 
    229 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
    230 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
    231 ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
    232 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
    233 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
    234 
    235 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
    236 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2.0
    237 ; VI: v_or_b32
    238 ; VI: buffer_store_dword
    239 define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
          ; Adds a 2.0 splat to the <2 x half> kernel argument %x and stores the sum to %out.
    240   %y = fadd <2 x half> %x, <half 2.0, half 2.0>
    241   store <2 x half> %y, <2 x half> addrspace(1)* %out
    242   ret void
    243 }
    244 
    245 ; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_v2f16:
    246 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
    247 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2.0 op_sel_hi:[1,0]{{$}}
    248 ; GFX9: buffer_store_dword [[REG]]
    249 
    250 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
    251 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
    252 ; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xc000
    253 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
    254 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
    255 
    256 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
    257 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -2.0
    258 ; VI: v_or_b32
    259 ; VI: buffer_store_dword
    260 define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
          ; Adds a -2.0 splat to the <2 x half> kernel argument %x and stores the sum to %out.
    261   %y = fadd <2 x half> %x, <half -2.0, half -2.0>
    262   store <2 x half> %y, <2 x half> addrspace(1)* %out
    263   ret void
    264 }
    265 
    266 ; GCN-LABEL: {{^}}add_inline_imm_4.0_v2f16:
    267 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
    268 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 4.0 op_sel_hi:[1,0]{{$}}
    269 ; GFX9: buffer_store_dword [[REG]]
    270 
    271 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
    272 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
    273 ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
    274 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
    275 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
    276 
    277 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
    278 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 4.0
    279 ; VI: v_or_b32
    280 ; VI: buffer_store_dword
    281 define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
          ; Adds a 4.0 splat to the <2 x half> kernel argument %x and stores the sum to %out.
    282   %y = fadd <2 x half> %x, <half 4.0, half 4.0>
    283   store <2 x half> %y, <2 x half> addrspace(1)* %out
    284   ret void
    285 }
    286 
    287 ; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_v2f16:
    288 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
    289 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -4.0 op_sel_hi:[1,0]{{$}}
    290 ; GFX9: buffer_store_dword [[REG]]
    291 
    292 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
    293 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
    294 ; VI-DAG: v_mov_b32_e32 [[CONSTM4:v[0-9]+]], 0xc400
    295 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
    296 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
    297 
    298 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
    299 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -4.0
    300 ; VI: v_or_b32
    301 ; VI: buffer_store_dword
    302 define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
          ; Adds a -4.0 splat to the <2 x half> kernel argument %x and stores the sum to %out.
    303   %y = fadd <2 x half> %x, <half -4.0, half -4.0>
    304   store <2 x half> %y, <2 x half> addrspace(1)* %out
    305   ret void
    306 }
    307 
    308 ; GCN-LABEL: {{^}}commute_add_inline_imm_0.5_v2f16:
    309 ; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
    310 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
    311 ; GFX9: buffer_store_dword [[REG]]
    312 
    313 ; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
    314 ; VI-DAG: buffer_load_dword
    315 ; VI-NOT: and
    316 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
    317 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
    318 ; VI: v_or_b32
    319 ; VI: buffer_store_dword
    320 define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
          ; Loads %x from %in instead of taking it as a kernel argument, adds a 0.5 splat,
          ; and stores the sum to %out. The checks expect the loaded operand in a vector register.
    321   %x = load <2 x half>, <2 x half> addrspace(1)* %in
    322   %y = fadd <2 x half> %x, <half 0.5, half 0.5>
    323   store <2 x half> %y, <2 x half> addrspace(1)* %out
    324   ret void
    325 }
    326 
    327 ; GCN-LABEL: {{^}}commute_add_literal_v2f16:
    328 ; GFX9-DAG: buffer_load_dword [[VAL:v[0-9]+]]
    329 ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
    330 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0]{{$}}
    331 ; GFX9: buffer_store_dword [[REG]]
    332 
    333 ; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
    334 ; VI-DAG: buffer_load_dword
    335 ; VI-NOT: and
    336 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
    337 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
    338 ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
    339 ; VI: buffer_store_dword
    340 define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
          ; Loads %x from %in, adds a 1024.0 splat, and stores the sum to %out.
          ; The checks expect 1024.0 to be moved into a register first, as the 16-bit value 0x6400.
    341   %x = load <2 x half>, <2 x half> addrspace(1)* %in
    342   %y = fadd <2 x half> %x, <half 1024.0, half 1024.0>
    343   store <2 x half> %y, <2 x half> addrspace(1)* %out
    344   ret void
    345 }
    346 
    347 ; GCN-LABEL: {{^}}add_inline_imm_1_v2f16:
    348 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
    349 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1 op_sel_hi:[1,0]{{$}}
    350 ; GFX9: buffer_store_dword [[REG]]
    351 
    352 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
    353 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
    354 ; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1{{$}}
    355 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
    356 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
    357 
    358 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
    359 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1{{$}}
    360 ; VI: v_or_b32
    361 ; VI: buffer_store_dword
    362 define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
          ; Adds a splat whose lanes have raw half bits 0x0001 to %x and stores the sum to %out.
          ; The checks expect that operand to be printed as the integer constant 1.
    363   %y = fadd <2 x half> %x, <half 0xH0001, half 0xH0001>
    364   store <2 x half> %y, <2 x half> addrspace(1)* %out
    365   ret void
    366 }
    367 
    368 ; GCN-LABEL: {{^}}add_inline_imm_2_v2f16:
    369 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
    370 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2 op_sel_hi:[1,0]{{$}}
    371 ; GFX9: buffer_store_dword [[REG]]
    372 
    373 
    374 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
    375 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
    376 ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2{{$}}
    377 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
    378 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
    379 
    380 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
    381 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2{{$}}
    382 ; VI: v_or_b32
    383 ; VI: buffer_store_dword
    384 define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
          ; Adds a splat whose lanes have raw half bits 0x0002 to %x and stores the sum to %out.
          ; The checks expect that operand to be printed as the integer constant 2.
    385   %y = fadd <2 x half> %x, <half 0xH0002, half 0xH0002>
    386   store <2 x half> %y, <2 x half> addrspace(1)* %out
    387   ret void
    388 }
    389 
    390 ; GCN-LABEL: {{^}}add_inline_imm_16_v2f16:
    391 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
    392 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 16 op_sel_hi:[1,0]{{$}}
    393 ; GFX9: buffer_store_dword [[REG]]
    394 
    395 
    396 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
    397 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
    398 ; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16{{$}}
    399 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
    400 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
    401 
    402 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
    403 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 16{{$}}
    404 ; VI: v_or_b32
    405 ; VI: buffer_store_dword
    406 define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
          ; Adds a splat whose lanes have raw half bits 0x0010 to %x and stores the sum to %out.
          ; The checks expect that operand to be printed as the integer constant 16.
    407   %y = fadd <2 x half> %x, <half 0xH0010, half 0xH0010>
    408   store <2 x half> %y, <2 x half> addrspace(1)* %out
    409   ret void
    410 }
    411 
    412 ; GCN-LABEL: {{^}}add_inline_imm_neg_1_v2f16:
    413 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
        ; GFX9: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], -1{{$}}
    414 ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
    415 ; GFX9: buffer_store_dword [[REG]]
    416 
    417 ; VI: s_load_dword [[VAL:s[0-9]+]]
    418 ; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], -1{{$}}
    419 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
    420 ; VI: buffer_store_dword [[REG]]
    421 define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
          ; Integer test despite the name: reinterprets %x as i32, adds -1 (0xffff in both
          ; 16-bit lanes), and stores the result reinterpreted as <2 x half> to %out.
          ; The GFX9 checks capture the loaded SGPR rather than naming s4, so the test does
          ; not depend on register allocation and mirrors the VI checks.
    422   %xbc = bitcast <2 x half> %x to i32
    423   %y = add i32 %xbc, -1
    424   %ybc = bitcast i32 %y to <2 x half>
    425   store <2 x half> %ybc, <2 x half> addrspace(1)* %out
    426   ret void
    427 }
    428 
    429 ; GCN-LABEL: {{^}}add_inline_imm_neg_2_v2f16:
    430 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
        ; GFX9: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfffefffe{{$}}
    431 ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
    432 ; GFX9: buffer_store_dword [[REG]]
    433 
    434 ; VI: s_load_dword [[VAL:s[0-9]+]]
    435 ; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfffefffe{{$}}
    436 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
    437 ; VI: buffer_store_dword [[REG]]
    438 define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
          ; Integer test despite the name: reinterprets %x as i32, adds a constant whose two
          ; 16-bit halves are each 0xfffe (-2 as i16), and stores the result reinterpreted
          ; as <2 x half> to %out.
          ; The GFX9 checks capture the loaded SGPR rather than naming s4, so the test does
          ; not depend on register allocation and mirrors the VI checks.
    439   %xbc = bitcast <2 x half> %x to i32
    440   %y = add i32 %xbc, 4294901758 ; 0xfffefffe
    441   %ybc = bitcast i32 %y to <2 x half>
    442   store <2 x half> %ybc, <2 x half> addrspace(1)* %out
    443   ret void
    444 }
    445 
    446 ; GCN-LABEL: {{^}}add_inline_imm_neg_16_v2f16:
    447 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
        ; GFX9: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfff0fff0{{$}}
    448 ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
    449 ; GFX9: buffer_store_dword [[REG]]
    450 
    451 
    452 ; VI: s_load_dword [[VAL:s[0-9]+]]
    453 ; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfff0fff0{{$}}
    454 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
    455 ; VI: buffer_store_dword [[REG]]
    456 define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
          ; Integer test despite the name: reinterprets %x as i32, adds a constant whose two
          ; 16-bit halves are each 0xfff0 (-16 as i16), and stores the result reinterpreted
          ; as <2 x half> to %out.
          ; The GFX9 checks capture the loaded SGPR rather than naming s4, so the test does
          ; not depend on register allocation and mirrors the VI checks.
    457   %xbc = bitcast <2 x half> %x to i32
    458   %y = add i32 %xbc, 4293984240 ; 0xfff0fff0
    459   %ybc = bitcast i32 %y to <2 x half>
    460   store <2 x half> %ybc, <2 x half> addrspace(1)* %out
    461   ret void
    462 }
    463 
    464 ; GCN-LABEL: {{^}}add_inline_imm_63_v2f16:
    465 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
    466 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 63
    467 ; GFX9: buffer_store_dword [[REG]]
    468 
    469 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
    470 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
    471 ; VI-DAG: v_mov_b32_e32 [[CONST63:v[0-9]+]], 63
    472 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
    473 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
    474 
    475 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
    476 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 63
    477 ; VI: v_or_b32
    478 ; VI: buffer_store_dword
    479 define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
          ; Adds a splat whose lanes have raw half bits 0x003f to %x and stores the sum to %out.
          ; The checks expect that operand to be printed as the integer constant 63.
    480   %y = fadd <2 x half> %x, <half 0xH003F, half 0xH003F>
    481   store <2 x half> %y, <2 x half> addrspace(1)* %out
    482   ret void
    483 }
    484 
    485 ; GCN-LABEL: {{^}}add_inline_imm_64_v2f16:
    486 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
    487 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 64
    488 ; GFX9: buffer_store_dword [[REG]]
    489 
    490 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
    491 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
    492 ; VI-DAG: v_mov_b32_e32 [[CONST64:v[0-9]+]], 64
    493 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
    494 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
    495 
    496 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
    497 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 64
    498 ; VI: v_or_b32
    499 ; VI: buffer_store_dword
    500 define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
          ; Adds a splat whose lanes have raw half bits 0x0040 to %x and stores the sum to %out.
          ; The checks expect that operand to be printed as the integer constant 64.
    501   %y = fadd <2 x half> %x, <half 0xH0040, half 0xH0040>
    502   store <2 x half> %y, <2 x half> addrspace(1)* %out
    503   ret void
    504 }
    505 
    506 attributes #0 = { nounwind } ; attribute group #0, referenced by every kernel defined above
    507