; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; FIXME: Merge into imm.ll

; Codegen checks for <2 x half> (and one <2 x i16>) constants on amdgcn.
;
; Three groups of kernels follow:
;   * store_*       - a splat constant is stored; the checks expect one 32-bit
;                     v_mov of the packed bit pattern followed by a dword store.
;   * add_* (fadd)  - a splat constant is added to a kernel argument or a
;                     loaded value. The gfx900 checks expect one packed
;                     v_pk_add_f16; the fiji checks expect the two halves to
;                     be added separately (one SDWA add writing WORD_1) and
;                     then combined with v_or_b32.
;   * add_*_neg_N   - an i32 add on the bitcast vector; the checks expect a
;                     scalar s_add_i32.
;
; The kaveri run line is only covered by the checks shared by all targets;
; the tests below contain no checks specific to that run.
;
; Check variables are scoped per label (-enable-var-scope), so names such as
; VAL and REG are deliberately reused from test to test.

;------------------------------------------------------------------------------
; Stores of splat constants.
;------------------------------------------------------------------------------

; i16 -32768 is 0x8000 in each 16-bit lane.
; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 {
  store <2 x i16> <i16 -32768, i16 -32768>, <2 x i16> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_0.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0.0, half 0.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_imm_neg_0.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -0.0, half -0.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_0.5_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x38003800{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0.5, half 0.5>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_0.5_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800b800{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -0.5, half -0.5>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_1.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 1.0, half 1.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_1.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -1.0, half -1.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_2.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x40004000{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 2.0, half 2.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_2.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000c000{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -2.0, half -2.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_4.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x44004400{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 4.0, half 4.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_4.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400c400{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -4.0, half -4.0>, <2 x half> addrspace(1)* %out
  ret void
}

; The constant is given by its raw half bit pattern (0xH3118).
; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x31183118{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0xH3118, half 0xH3118>, <2 x half> addrspace(1)* %out
  ret void
}

; Same as above with the sign bit set (0xHB118).
; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118b118{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0xHB118, half 0xHB118>, <2 x half> addrspace(1)* %out
  ret void
}

; The literal is anchored at end of line like every other store test, so a
; longer literal that merely starts with these digits is rejected.
; GCN-LABEL: {{^}}store_literal_imm_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c006c00{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 4096.0, half 4096.0>, <2 x half> addrspace(1)* %out
  ret void
}

;------------------------------------------------------------------------------
; fadd of a kernel argument with a splat floating-point constant.
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}add_inline_imm_0.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST0:v[0-9]+]], 0
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0.0, half 0.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_0.5_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0.5
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0.5, half 0.5>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONSTM05:v[0-9]+]], 0xb800
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -0.5
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -0.5, half -0.5>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_1.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1.0 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0x3c00
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1.0
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 1.0, half 1.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1.0 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; The negative constant uses the CONSTM<n> naming shared by the other
; negative-constant tests in this file.
; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -1.0
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -1.0, half -1.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_2.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2.0 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2.0
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 2.0, half 2.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2.0 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xc000
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -2.0
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -2.0, half -2.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_4.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 4.0 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 4.0
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 4.0, half 4.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -4.0 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONSTM4:v[0-9]+]], 0xc400
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -4.0
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -4.0, half -4.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

;------------------------------------------------------------------------------
; fadd where the variable operand is loaded from memory, so it lives in a
; vector register rather than a scalar one.
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}commute_add_inline_imm_0.5_v2f16:
; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
; GFX9: buffer_store_dword [[REG]]

; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
; VI-DAG: buffer_load_dword
; VI-NOT: and
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %x = load <2 x half>, <2 x half> addrspace(1)* %in
  %y = fadd <2 x half> %x, <half 0.5, half 0.5>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; The checks expect the constant 1024.0 (0x6400) to be materialized into a
; register first.
; GCN-LABEL: {{^}}commute_add_literal_v2f16:
; GFX9-DAG: buffer_load_dword [[VAL:v[0-9]+]]
; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
; VI-DAG: buffer_load_dword
; VI-NOT: and
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: buffer_store_dword
define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %x = load <2 x half>, <2 x half> addrspace(1)* %in
  %y = fadd <2 x half> %x, <half 1024.0, half 1024.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

;------------------------------------------------------------------------------
; fadd with half constants whose bit patterns are small integers
; (0xH0001, 0xH0002, 0xH0010, ...); the checks expect them to be printed as
; integer operands.
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}add_inline_imm_1_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1{{$}}
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1{{$}}
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0001, half 0xH0001>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_2_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2{{$}}
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2{{$}}
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0002, half 0xH0002>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_16_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 16 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16{{$}}
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 16{{$}}
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0010, half 0xH0010>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

;------------------------------------------------------------------------------
; Integer add on the bitcast vector. The source register of the add is
; captured from the preceding scalar load instead of being spelled as a
; fixed physical register, so the checks do not depend on register
; allocation.
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}add_inline_imm_neg_1_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], -1{{$}}
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; GFX9: buffer_store_dword [[REG]]

; VI: s_load_dword [[VAL:s[0-9]+]]
; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], -1{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; VI: buffer_store_dword [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %xbc = bitcast <2 x half> %x to i32
  %y = add i32 %xbc, -1
  %ybc = bitcast i32 %y to <2 x half>
  store <2 x half> %ybc, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_2_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfffefffe{{$}}
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; GFX9: buffer_store_dword [[REG]]

; VI: s_load_dword [[VAL:s[0-9]+]]
; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfffefffe{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; VI: buffer_store_dword [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %xbc = bitcast <2 x half> %x to i32
  %y = add i32 %xbc, 4294901758 ; 0xfffefffe
  %ybc = bitcast i32 %y to <2 x half>
  store <2 x half> %ybc, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_16_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfff0fff0{{$}}
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; GFX9: buffer_store_dword [[REG]]

; VI: s_load_dword [[VAL:s[0-9]+]]
; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfff0fff0{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; VI: buffer_store_dword [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %xbc = bitcast <2 x half> %x to i32
  %y = add i32 %xbc, 4293984240 ; 0xfff0fff0
  %ybc = bitcast i32 %y to <2 x half>
  store <2 x half> %ybc, <2 x half> addrspace(1)* %out
  ret void
}

;------------------------------------------------------------------------------
; fadd with half constants whose bit patterns are 63 and 64.
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}add_inline_imm_63_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 63
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST63:v[0-9]+]], 63
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 63
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH003F, half 0xH003F>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_64_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 64
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST64:v[0-9]+]], 64
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 64
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0040, half 0xH0040>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind }