; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma.

; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s

; Make sure we don't form mad with denormals
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s

; Prefix legend (taken from the invocations above; written without a trailing
; colon on purpose so these lines are not parsed as directives):
;   SI-STD              tahiti, fp32 denormals not enabled
;   SI-DENORM           tahiti with +fp32-denormals
;   SI-DENORM-SLOWFMAF  verde with +fp32-denormals
;
; Every kernel loads consecutive floats starting at in[tid]; the captures
; A, B, C, D, E name the registers loaded from byte offsets 0, 4, 8, 12, 16.
; Pattern variables are not reset between functions, so every function must
; capture each variable it uses.

declare i32 @llvm.r600.read.tidig.x() #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.fma.f32(float, float, float) #0
declare float @llvm.fmuladd.f32(float, float, float) #0

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]

; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF-NOT: v_fma
; SI-DENORM-SLOWFMAF-NOT: v_mad

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %gep.0
  %b = load float, float addrspace(1)* %gep.1
  %c = load float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fadd (fmul x, y), z) -> (fma x, y, z)
; Variant where the fmul result has two fadd users.
; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]]
; SI-STD-DAG: v_mac_f32_e32 [[D]], [[B]], [[A]]

; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]

; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load float, float addrspace(1)* %gep.0
  %b = load float, float addrspace(1)* %gep.1
  %c = load float, float addrspace(1)* %gep.2
  %d = load float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fadd float %mul, %c
  %fma1 = fadd float %mul, %d

  store float %fma0, float addrspace(1)* %gep.out.0
  store float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_mad_f32_1:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %gep.0
  %b = load float, float addrspace(1)* %gep.1
  %c = load float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %gep.0
  %b = load float, float addrspace(1)* %gep.1
  %c = load float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; Variant where the fmul result has two fsub users.
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load float, float addrspace(1)* %gep.0
  %b = load float, float addrspace(1)* %gep.1
  %c = load float, float addrspace(1)* %gep.2
  %d = load float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fsub float %mul, %c
  %fma1 = fsub float %mul, %d
  store float %fma0, float addrspace(1)* %gep.out.0
  store float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %gep.0
  %b = load float, float addrspace(1)* %gep.1
  %c = load float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; Variant where the fmul result has two fsub users.
; The offset:12 load is captured here so that D below names this function's
; register instead of a binding left over from an earlier function.
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load float, float addrspace(1)* %gep.0
  %b = load float, float addrspace(1)* %gep.1
  %c = load float, float addrspace(1)* %gep.2
  %d = load float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fsub float %c, %mul
  %fma1 = fsub float %d, %mul
  store float %fma0, float addrspace(1)* %gep.out.0
  store float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]

; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %gep.0
  %b = load float, float addrspace(1)* %gep.1
  %c = load float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma = fsub float %mul.neg, %c

  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; Variant where the negated product has two fsub users.
; The offset:12 load is captured here so that D below names this function's
; register instead of a binding left over from an earlier function.
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]

; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load float, float addrspace(1)* %gep.0
  %b = load float, float addrspace(1)* %gep.1
  %c = load float, float addrspace(1)* %gep.2
  %d = load float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul.neg, %d

  store float %fma0, float addrspace(1)* %gep.out.0
  store float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; Variant where the product is used once negated and once directly.
; The offset:12 load is captured here so that D below names this function's
; register instead of a binding left over from an earlier function.
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load float, float addrspace(1)* %gep.0
  %b = load float, float addrspace(1)* %gep.1
  %c = load float, float addrspace(1)* %gep.2
  %d = load float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul, %d

  store float %fma0, float addrspace(1)* %gep.out.0
  store float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]

; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP0]]

; The final subtract must capture RESULT (not a differently named variable),
; because the shared store check below consumes RESULT for every run.
; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load float, float addrspace(1)* %gep.0
  %y = load float, float addrspace(1)* %gep.1
  %z = load float, float addrspace(1)* %gep.2
  %u = load float, float addrspace(1)* %gep.3
  %v = load float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fma y, z, (fmul u, v)))
;   -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]

; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP0]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load float, float addrspace(1)* %gep.0
  %y = load float, float addrspace(1)* %gep.1
  %z = load float, float addrspace(1)* %gep.2
  %u = load float, float addrspace(1)* %gep.3
  %v = load float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub (fmuladd x, y, (fmul u, v)), z)
;   -> (fmuladd x, y, (fmuladd u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-STD: v_mac_f32_e32 [[TMP]], [[B]], [[A]]

; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]]

; SI-DENORM: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-STD: buffer_store_dword [[TMP]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load float, float addrspace(1)* %gep.0
  %y = load float, float addrspace(1)* %gep.1
  %z = load float, float addrspace(1)* %gep.2
  %u = load float, float addrspace(1)* %gep.3
  %v = load float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fmuladd y, z, (fmul u, v)))
;   -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]

; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load float, float addrspace(1)* %gep.0
  %y = load float, float addrspace(1)* %gep.1
  %z = load float, float addrspace(1)* %gep.2
  %u = load float, float addrspace(1)* %gep.3
  %v = load float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }