; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s

; GCN-LABEL: {{^}}mac_vvv:
; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4
; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
; GCN: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN: buffer_store_dword [[C]]
define amdgpu_kernel void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load volatile float, float addrspace(1)* %in
  %b = load volatile float, float addrspace(1)* %b_ptr
  %c = load volatile float, float addrspace(1)* %c_ptr

  %tmp0 = fmul float %a, %b
  %tmp1 = fadd float %tmp0, %c
  store float %tmp1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}mad_inline_sgpr_inline:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]}}, s{{[0-9]+}}, 0.5, 0.5
define amdgpu_kernel void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 {
entry:
  %tmp0 = fmul float 0.5, %in
  %tmp1 = fadd float %tmp0, 0.5
  store float %tmp1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}mad_vvs:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr

  %tmp0 = fmul float %a, %b
  %tmp1 = fadd float %tmp0, %c
  store float %tmp1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}mac_ssv:
; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 {
entry:
  %c = load float, float addrspace(1)* %in

  %tmp0 = fmul float %a, %a
  %tmp1 = fadd float %tmp0, %c
  store float %tmp1, float addrspace(1)* %out
  ret void
}
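
; Both FMAs in the next test share %c as the add operand. Since v_mac_f32
; overwrites its accumulator (the add operand), presumably only one of the
; two can be a v_mac into the shared register; the other is expected to
; stay a three-address v_mad_f32, which is what the checks below verify.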

; GCN-LABEL: {{^}}mac_mad_same_add:
; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
  %d_ptr = getelementptr float, float addrspace(1)* %in, i32 3
  %e_ptr = getelementptr float, float addrspace(1)* %in, i32 4

  %a = load volatile float, float addrspace(1)* %in
  %b = load volatile float, float addrspace(1)* %b_ptr
  %c = load volatile float, float addrspace(1)* %c_ptr
  %d = load volatile float, float addrspace(1)* %d_ptr
  %e = load volatile float, float addrspace(1)* %e_ptr

  %tmp0 = fmul float %a, %b
  %tmp1 = fadd float %tmp0, %c

  %tmp2 = fmul float %d, %e
  %tmp3 = fadd float %tmp2, %c

  %out1 = getelementptr float, float addrspace(1)* %out, i32 1
  store float %tmp1, float addrspace(1)* %out
  store float %tmp3, float addrspace(1)* %out1
  ret void
}

; There is no advantage to using v_mac when one of the operands is negated,
; since v_mad accepts more source modifiers and operand types.

; GCN-LABEL: {{^}}mad_neg_src0:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
define amdgpu_kernel void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  %neg_a = fsub float -0.0, %a
  %tmp0 = fmul float %neg_a, %b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}nsz_mad_sub0_src0:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
define amdgpu_kernel void @nsz_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  %neg_a = fsub float 0.0, %a
  %tmp0 = fmul float %neg_a, %b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}
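
; A note on the "safe" variant below: under the default
; "no-signed-zeros-fp-math"="false" (attribute #0), (0.0 - x) is not
; equivalent to -x when x is +0.0 (it yields +0.0, not -0.0), so the
; subtraction presumably cannot fold into a v_mad source negate modifier
; and should remain a separate v_sub_f32.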

; GCN-LABEL: {{^}}safe_mad_sub0_src0:
; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0,
; GCN: v_mac_f32_e32 v{{[0-9]+}}, [[SUB0]], v{{[0-9]+}}
define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  %neg_a = fsub float 0.0, %a
  %tmp0 = fmul float %neg_a, %b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}mad_neg_src1:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
define amdgpu_kernel void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  %neg_b = fsub float -0.0, %b
  %tmp0 = fmul float %a, %neg_b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}nsz_mad_sub0_src1:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
define amdgpu_kernel void @nsz_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  %neg_b = fsub float 0.0, %b
  %tmp0 = fmul float %a, %neg_b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}mad_neg_src2:
; GCN-NOT: v_mac
; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
define amdgpu_kernel void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  %neg_c = fsub float -0.0, %c
  %tmp0 = fmul float %a, %b
  %tmp1 = fadd float %tmp0, %neg_c

  store float %tmp1, float addrspace(1)* %out
  ret void
}

; Without special casing the inline constant check for v_mac_f32's
; src2, this fails to fold the 1.0 into a mad.

; GCN-LABEL: {{^}}fold_inline_imm_into_mac_src2_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN: v_add_f32_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
; GCN: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.a = getelementptr inbounds float, float addrspace(1)* %a, i64 %tid.ext
  %gep.b = getelementptr inbounds float, float addrspace(1)* %b, i64 %tid.ext
  %gep.out = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %tmp = load volatile float, float addrspace(1)* %gep.a
  %tmp1 = load volatile float, float addrspace(1)* %gep.b
  %tmp2 = fadd float %tmp, %tmp
  %tmp3 = fmul float %tmp2, 4.0
  %tmp4 = fsub float 1.0, %tmp3
  %tmp5 = fadd float %tmp4, %tmp1
  %tmp6 = fadd float %tmp1, %tmp1
  %tmp7 = fmul float %tmp6, %tmp
  %tmp8 = fsub float 1.0, %tmp7
  %tmp9 = fmul float %tmp8, 8.0
  %tmp10 = fadd float %tmp5, %tmp9
  store float %tmp10, float addrspace(1)* %gep.out
  ret void
}
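
; SI has no native 16-bit FMA instructions, so the f16 case below is
; presumably legalized by promoting to f32 (v_cvt_f32_f16 on each input)
; before the same inline-constant folding applies; VI with flushed
; denormals can use v_mad_f16 directly, as the VI-FLUSH checks expect.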

; GCN-LABEL: {{^}}fold_inline_imm_into_mac_src2_f16:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[B:v[0-9]+]]

; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[A]]
; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[B]]

; SI: v_add_f32_e32 [[TMP2:v[0-9]+]], [[CVT_A]], [[CVT_A]]
; SI: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
; SI: v_mac_f32_e32 v{{[0-9]+}}, 0x41000000, v{{[0-9]+}}

; VI-FLUSH: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.a = getelementptr inbounds half, half addrspace(1)* %a, i64 %tid.ext
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.out = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %tmp = load volatile half, half addrspace(1)* %gep.a
  %tmp1 = load volatile half, half addrspace(1)* %gep.b
  %tmp2 = fadd half %tmp, %tmp
  %tmp3 = fmul half %tmp2, 4.0
  %tmp4 = fsub half 1.0, %tmp3
  %tmp5 = fadd half %tmp4, %tmp1
  %tmp6 = fadd half %tmp1, %tmp1
  %tmp7 = fmul half %tmp6, %tmp
  %tmp8 = fsub half 1.0, %tmp7
  %tmp9 = fmul half %tmp8, 8.0
  %tmp10 = fadd half %tmp5, %tmp9
  store half %tmp10, half addrspace(1)* %gep.out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #2

attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }