; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s

; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s


; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -mattr=-fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX900 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -mattr=+fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX900 %s

; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -mattr=-fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-FMAC,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX906 %s

; FIXME: Should probably test this, but sometimes selecting fmac is painful to match.
; XUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -mattr=+fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX906 %s


; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.

target triple = "amdgcn--"


declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fmuladd.f32(float, float, float) #1
declare half @llvm.fmuladd.f16(half, half, half) #1
declare float @llvm.fabs.f32(float) #1

; GCN-LABEL: {{^}}fmuladd_f32:
; GCN-FLUSH-MAD: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GCN-FLUSH-FMAC: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                                       float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
  %r0 = load float, float addrspace(1)* %in1
  %r1 = load float, float addrspace(1)* %in2
  %r2 = load float, float addrspace(1)* %in3
  %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}fmul_fadd_f32:
; GCN-FLUSH: v_mac_f32

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32

; GCN-DENORM-STRICT: v_mul_f32_e32
; GCN-DENORM-STRICT: v_add_f32_e32
define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                                         float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
  %r0 = load volatile float, float addrspace(1)* %in1
  %r1 = load volatile float, float addrspace(1)* %in2
  %r2 = load volatile float, float addrspace(1)* %in3
  %mul = fmul float %r0, %r1
  %add = fadd float %mul, %r2
  store float %add, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fadd_a_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r0 = load volatile float, float addrspace(1)* %gep.0
  %r1 = load volatile float, float addrspace(1)* %gep.1

  %add.0 = fadd float %r0, %r0
  %add.1 = fadd float %add.0, %r1
  store float %add.1, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fadd_b_a_a_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r0 = load volatile float, float addrspace(1)* %gep.0
  %r1 = load volatile float, float addrspace(1)* %gep.1

  %add.0 = fadd float %r0, %r0
  %add.1 = fadd float %r1, %add.0
  store float %add.1, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; XXX
; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],

; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]]

; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN-FLUSH-FMAC: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; SI-FLUSH: buffer_store_dword [[RESULT]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r2.fneg = fsub float -0.000000e+00, %r2

  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}mad_sub_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %mul = fmul float %a, %b
  %sub = fsub float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}mad_sub_inv_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]

; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %mul = fmul float %a, %b
  %sub = fsub float %c, %mul
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}mad_sub_fabs_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %c.abs = call float @llvm.fabs.f32(float %c) #0
  %mul = fmul float %a, %b
  %sub = fsub float %mul, %c.abs
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
; GCN-FLUSH-FMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %c.abs = call float @llvm.fabs.f32(float %c) #0
  %mul = fmul float %a, %b
  %sub = fsub float %c.abs, %mul
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}neg_neg_mad_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]

; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGA]], [[REGB]]
; SI-FLUSH: buffer_store_dword [[REGC]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %nega = fsub float -0.000000e+00, %a
  %negb = fsub float -0.000000e+00, %b
  %mul = fmul float %nega, %negb
  %sub = fadd float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}mad_fabs_sub_f32:
; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %gep0, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %c = load volatile float, float addrspace(1)* %gep2, align 4
  %b.abs = call float @llvm.fabs.f32(float %b) #0
  %mul = fmul float %a, %b.abs
  %sub = fsub float %mul, %c
  store float %sub, float addrspace(1)* %outgep, align 4
  ret void
}

; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %add = fadd float %r1, %r1
  %r3 = fsub float %r2, %add

  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; SI: buffer_store_dword [[RESULT]]
; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %add = fadd float %r1, %r1
  %r3 = fsub float %add, %r2

  store float %r3, float addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }