; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma.
;
; Layout of every test below: the kernel loads its operands with volatile loads
; from consecutive floats starting at %in[workitem id], so the check variables
; A, B, C, D and E capture the registers of the loads at byte offsets 0, 4, 8,
; 12 and 16 respectively. The checks then pin which multiply/add instructions
; are selected for each llc configuration.

; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-STD,SI-STD-SAFE,FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-STD,SI-STD-SAFE,FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-STD,SI-STD-UNSAFE,FUNC %s

; Make sure we don't form mad with denormals
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-DENORM,SI-DENORM-FASTFMAF,FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-DENORM,SI-DENORM-SLOWFMAF,FUNC %s

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.fma.f32(float, float, float) #0
declare float @llvm.fmuladd.f32(float, float, float) #0

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]

; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF-NOT: v_fma
; SI-DENORM-SLOWFMAF-NOT: v_mad

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fadd (fmul x, y), z) -> (fma x, y, z)
; Same pattern as above, but the multiply feeds two separate adds.
; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fadd float %mul, %c
  %fma1 = fadd float %mul, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_mad_f32_1:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; Same pattern as above, but the multiply feeds two separate subtracts.
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fsub float %mul, %c
  %fma1 = fsub float %mul, %d
  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; Same pattern as above, but the multiply is subtracted from two values.
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fsub float %c, %mul
  %fma1 = fsub float %d, %mul
  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]

; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma = fsub float %mul.neg, %c

  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; Here the negated multiply (not the multiply itself) has two uses.
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], -[[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], -[[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul.neg, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; Here the multiply itself has two uses: once negated and once directly.
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
; This kernel uses an explicit llvm.fma call. Every configuration below expects
; the unfolded sequence: a separate multiply, fma and subtract.

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fma y, z, (fmul u, v)))
;   -> (fma (fneg y), z, (fma (fneg u), v, x))
; This kernel uses an explicit llvm.fma call. Every configuration below expects
; the unfolded sequence: a separate multiply, fma and subtract.

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub (fmuladd x, y, (fmul u, v)), z)
;   -> (fmuladd x, y, (fmuladd u, v, (fneg z)))
; This kernel uses llvm.fmuladd rather than llvm.fma, so the expected output
; differs between the safe, unsafe-fp-math and denormal configurations.

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[A]], [[B]]
; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[C]]

; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[A]], [[B]]

; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[C]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fmuladd y, z, (fmul u, v)))
;   -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[C]]
; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP0]]

; SI-STD-UNSAFE: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]

; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[C]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP2]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }