; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s

; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
; beneficial even without fp32 denormals, but they do require no-infs-fp-math
; for correctness.
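; For illustration (not a case checked below): with x = +inf and y = 0.0,
; x * (y + 1.0) evaluates to +inf, while the rewritten fma(x, y, x)
; computes inf * 0.0 + inf; the invalid inf * 0.0 product makes the whole
; result NaN. no-infs-fp-math rules such inputs out.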

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare double @llvm.fabs.f64(double) #0
declare double @llvm.fma.f64(double, double, double) #0
declare float @llvm.fma.f32(float, float, float) #0

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fadd double %mul, %c
  %fma1 = fadd double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}
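
; The fsub folds below rely on v_fma_f64's source modifiers: negating an
; operand is free, so each variant should still compile to a single fma.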

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %mul, %c
  %fma1 = fsub double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %c, %mul
  %fma1 = fsub double %d, %mul
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma = fsub double %mul.neg, %c

  store double %fma, double addrspace(1)* %gep.out
  ret void
}
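
; In the two tests below, 2uses_neg reuses the negated product, so both
; fmas get a negated first operand; 2uses_mul reuses the un-negated
; product for the second subtract, so that fma keeps [[A]] positive.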

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul.neg, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}
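
; For the aggressive folds below, the SI-SAFE runs must keep the final
; subtraction as a separate v_add_f64: folding z into the inner product,
; as the SI-UNSAFE run does, reassociates the expression and changes where
; rounding occurs (fma(u, v, -z) is not bit-identical to
; fma(x, y, u*v) - z).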

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}

; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[TMP0]]
; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP1]], -[[Z]]

; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]

; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
  %tmp2 = fsub double %tmp1, %z

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fma y, z, (fmul u, v)))
;   -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}

; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[Y]], [[Z]], [[TMP0]]
; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], -[[TMP1]]

; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]

; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
  %tmp2 = fsub double %x, %tmp1

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}

;
; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
;
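
; The NOFMA runs use the default fp32 flush-to-zero mode, where plain
; mul/add (or v_mac) is preferred; the FMA run enables +fp32-denormals, in
; which case v_mac/v_mad would flush denormal results and v_fma_f32 is
; used instead.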

; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY]]
define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
                                                    float addrspace(1)* %in1,
                                                    float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY]]
define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
                                                    float addrspace(1)* %in1,
                                                    float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY]]
define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
                                                       float addrspace(1)* %in1,
                                                       float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY]]
define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
                                                       float addrspace(1)* %in1,
                                                       float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY]]
define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
                                                    float addrspace(1)* %in1,
                                                    float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY]]
define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
                                                    float addrspace(1)* %in1,
                                                    float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY]]
define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
                                                       float addrspace(1)* %in1,
                                                       float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY]]
define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
                                                       float addrspace(1)* %in1,
                                                       float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}
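
; For sub(x, +/-1.0) the constant operand is canonicalized into an add
; first (x - 1.0 -> x + -1.0, x - -1.0 -> x + 1.0), so the NOFMA checks
; below expect v_add_f32 with the opposite-signed constant.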

; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY]]
define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
                                                    float addrspace(1)* %in1,
                                                    float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
                                                    float addrspace(1)* %in1,
                                                    float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY]]
define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
                                                       float addrspace(1)* %in1,
                                                       float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
                                                       float addrspace(1)* %in1,
                                                       float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

;
; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
;
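
; The FMA form rewrites y * (1 - t) as fma(-t, y, y), i.e. y - t*y, and
; then adds x * t with a second fma.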

; FUNC-LABEL: {{^}}test_f32_interp:
; SI-NOFMA: v_sub_f32_e32 [[VT1:v[0-9]]], 1.0, [[VT:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VY:v[0-9]]], [[VT1]]
; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VX:v[0-9]]], [[VT]]
;
; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
define amdgpu_kernel void @test_f32_interp(float addrspace(1)* %out,
                                           float addrspace(1)* %in1,
                                           float addrspace(1)* %in2,
                                           float addrspace(1)* %in3) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %t = load float, float addrspace(1)* %in3
  %t1 = fsub float 1.0, %t
  %tx = fmul float %x, %t
  %ty = fmul float %y, %t1
  %r = fadd float %tx, %ty
  store float %r, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f64_interp:
; SI-NOFMA: v_add_f64 [[VT1:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], 1.0
; SI-NOFMA: v_mul_f64 [[VTY:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VT1]]
; SI-NOFMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VTY]]
;
; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out,
                                           double addrspace(1)* %in1,
                                           double addrspace(1)* %in2,
                                           double addrspace(1)* %in3) {
  %x = load double, double addrspace(1)* %in1
  %y = load double, double addrspace(1)* %in2
  %t = load double, double addrspace(1)* %in3
  %t1 = fsub double 1.0, %t
  %tx = fmul double %x, %t
  %ty = fmul double %y, %t1
  %r = fadd double %tx, %ty
  store double %r, double addrspace(1)* %out
  ret void
}

; Make sure the negative constant cancels out the fneg:
; (fma -2.0, (fneg a), b) --> (fma a, 2.0, b)
; FUNC-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32:
; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI-NOT: [[A]]
; SI-NOT: [[B]]
; SI: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]]
define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fma.f32(float -2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

; FUNC-LABEL: {{^}}fma_2.0_neg_a_b_f32:
; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI-NOT: [[A]]
; SI-NOT: [[B]]
; SI: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]]
define amdgpu_kernel void @fma_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fma.f32(float 2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }