; RUN: llc -march=amdgcn -mattr=+fast-fmaf,-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
; RUN: llc -march=amdgcn -mattr=-fast-fmaf,-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s

; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s
; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s

; FIXME: This should also fold when fma is actually fast if an FMA
; exists in the original program.

; (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z))

; GCN-LABEL: {{^}}fast_add_fmuladd_fmul:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[U]], [[V]]
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: buffer_store_dword [[Z]]

; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], [[Z]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
; GCN-FASTFMA: buffer_store_dword [[FMA1]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fadd fast float %fma, %z
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; Same fold with a subtract of z instead of an add.
; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-FLUSH: v_mad_f32 [[TMP:v[0-9]]], [[U]], [[V]], -[[Z]]
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: buffer_store_dword [[Z]]

; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
; GCN-FASTFMA: buffer_store_dword [[FMA1]]
define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fsub fast float %fma, %z
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; The mul result has a second use (volatile store), so the mul must be
; kept and the fold cannot consume it.
; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[U]], [[Z]]

; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  store volatile float %mul.u.v, float addrspace(1)* undef
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fadd fast float %fma, %z
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; As above, but with the fadd operands commuted.
; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul_commute:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[U]]

; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[FMA1]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  store volatile float %mul.u.v, float addrspace(1)* undef
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fadd fast float %z, %fma
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; The fmuladd result has a second use, blocking the fold.
; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_fmuladd:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  store volatile float %fma, float addrspace(1)* undef
  %add = fadd fast float %fma, %z
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; As above, with the fadd operands commuted.
; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_fmuladd_commute:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  store volatile float %fma, float addrspace(1)* undef
  %add = fadd fast float %z, %fma
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; fsub variant where the mul result is also stored.
; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_mul:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]

; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FLUSH: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]]

; GCN-FASTFMA: v_fma_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FASTFMA: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]]

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_sub_f32_e32 [[MAD:v[0-9]+]]

; GCN: buffer_store_dword [[MUL]]
; GCN: buffer_store_dword [[MAD]]
define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fsub fast float %fma, %z
  store volatile float %mul.u.v, float addrspace(1)* undef
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; fsub variant where the fmuladd result is also stored.
; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_fmuladd:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]

; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]

; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_sub_f32_e32
define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fsub fast float %fma, %z
  store volatile float %fma, float addrspace(1)* undef
  store volatile float %add, float addrspace(1)* undef
  ret void
}

declare float @llvm.fma.f32(float, float, float) #1
declare float @llvm.fmuladd.f32(float, float, float) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }