1 /**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file blend_jit.cpp 24 * 25 * @brief Implementation of the blend jitter 26 * 27 * Notes: 28 * 29 ******************************************************************************/ 30 #include "jit_pch.hpp" 31 #include "builder.h" 32 #include "jit_api.h" 33 #include "blend_jit.h" 34 #include "gen_state_llvm.h" 35 36 // components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized 37 #define QUANTIZE_THRESHOLD 2 38 39 using namespace llvm; 40 using namespace SwrJit; 41 42 ////////////////////////////////////////////////////////////////////////// 43 /// Interface to Jitting a blend shader 44 ////////////////////////////////////////////////////////////////////////// 45 struct BlendJit : public Builder 46 { 47 BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){}; 48 49 template<bool Color, bool Alpha> 50 void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4]) 51 { 52 Value* out[4]; 53 54 switch (factor) 55 { 56 case BLENDFACTOR_ONE: 57 out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f); 58 break; 59 case BLENDFACTOR_SRC_COLOR: 60 out[0] = src[0]; 61 out[1] = src[1]; 62 out[2] = src[2]; 63 out[3] = src[3]; 64 break; 65 case BLENDFACTOR_SRC_ALPHA: 66 out[0] = out[1] = out[2] = out[3] = src[3]; 67 break; 68 case BLENDFACTOR_DST_ALPHA: 69 out[0] = out[1] = out[2] = out[3] = dst[3]; 70 break; 71 case BLENDFACTOR_DST_COLOR: 72 out[0] = dst[0]; 73 out[1] = dst[1]; 74 out[2] = dst[2]; 75 out[3] = dst[3]; 76 break; 77 case BLENDFACTOR_SRC_ALPHA_SATURATE: 78 out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3])); 79 out[3] = VIMMED1(1.0f); 80 break; 81 case BLENDFACTOR_CONST_COLOR: 82 out[0] = constColor[0]; 83 out[1] = constColor[1]; 84 out[2] = constColor[2]; 85 out[3] = constColor[3]; 86 break; 87 case BLENDFACTOR_CONST_ALPHA: 88 out[0] = out[1] = out[2] = out[3] = constColor[3]; 89 break; 90 case BLENDFACTOR_SRC1_COLOR: 91 out[0] = src1[0]; 92 out[1] = src1[1]; 93 out[2] = src1[2]; 94 out[3] = src1[3]; 95 break; 96 case BLENDFACTOR_SRC1_ALPHA: 97 out[0] = out[1] = out[2] = out[3] = src1[3]; 98 break; 99 case BLENDFACTOR_ZERO: 100 out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); 101 break; 102 case BLENDFACTOR_INV_SRC_COLOR: 103 out[0] = FSUB(VIMMED1(1.0f), src[0]); 104 out[1] = FSUB(VIMMED1(1.0f), src[1]); 105 out[2] = FSUB(VIMMED1(1.0f), src[2]); 106 out[3] = FSUB(VIMMED1(1.0f), src[3]); 107 break; 108 case BLENDFACTOR_INV_SRC_ALPHA: 109 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]); 110 break; 111 case BLENDFACTOR_INV_DST_ALPHA: 112 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]); 113 break; 114 case BLENDFACTOR_INV_DST_COLOR: 115 out[0] = FSUB(VIMMED1(1.0f), dst[0]); 116 out[1] = FSUB(VIMMED1(1.0f), dst[1]); 117 out[2] = FSUB(VIMMED1(1.0f), dst[2]); 118 out[3] = FSUB(VIMMED1(1.0f), dst[3]); 119 break; 120 case BLENDFACTOR_INV_CONST_COLOR: 121 out[0] = FSUB(VIMMED1(1.0f), constColor[0]); 122 out[1] = FSUB(VIMMED1(1.0f), constColor[1]); 123 out[2] = FSUB(VIMMED1(1.0f), constColor[2]); 124 out[3] = FSUB(VIMMED1(1.0f), constColor[3]); 125 break; 126 case BLENDFACTOR_INV_CONST_ALPHA: 127 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]); 128 break; 129 case BLENDFACTOR_INV_SRC1_COLOR: 130 out[0] = FSUB(VIMMED1(1.0f), src1[0]); 131 out[1] = FSUB(VIMMED1(1.0f), src1[1]); 132 out[2] = FSUB(VIMMED1(1.0f), src1[2]); 133 out[3] = FSUB(VIMMED1(1.0f), src1[3]); 134 break; 135 case BLENDFACTOR_INV_SRC1_ALPHA: 136 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]); 137 break; 138 default: 139 SWR_INVALID("Unsupported blend factor: %d", factor); 140 out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); 141 break; 142 } 143 144 if (Color) 145 { 146 result[0] = out[0]; 147 result[1] = out[1]; 148 result[2] = out[2]; 149 } 150 151 if (Alpha) 152 { 153 result[3] = out[3]; 154 } 155 } 156 157 void Clamp(SWR_FORMAT format, Value* src[4]) 158 { 159 const SWR_FORMAT_INFO& info = GetFormatInfo(format); 160 SWR_TYPE type = info.type[0]; 161 162 switch (type) 163 { 164 default: 165 break; 166 167 case SWR_TYPE_UNORM: 168 src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f)); 169 src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f)); 170 src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f)); 171 src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f)); 172 break; 173 174 case SWR_TYPE_SNORM: 175 src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f)); 176 src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f)); 177 src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f)); 178 src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f)); 179 break; 180 181 case SWR_TYPE_UNKNOWN: SWR_INVALID("Unsupport format type: %d", type); 182 } 183 } 184 185 void ApplyDefaults(SWR_FORMAT format, Value* src[4]) 186 { 187 const SWR_FORMAT_INFO& info = GetFormatInfo(format); 188 189 bool valid[] = { false, false, false, false }; 190 for (uint32_t c = 0; c < info.numComps; ++c) 191 { 192 valid[info.swizzle[c]] = true; 193 } 194 195 for (uint32_t c = 0; c < 4; ++c) 196 { 197 if (!valid[c]) 198 { 199 src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty); 200 } 201 } 202 } 203 204 void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4]) 205 { 206 const SWR_FORMAT_INFO& info = GetFormatInfo(format); 207 208 for (uint32_t c = 0; c < info.numComps; ++c) 209 { 210 if (info.type[c] == SWR_TYPE_UNUSED) 211 { 212 src[info.swizzle[c]] = BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty); 213 } 214 } 215 } 216 217 void Quantize(SWR_FORMAT format, Value* src[4]) 218 { 219 const SWR_FORMAT_INFO& info = GetFormatInfo(format); 220 for (uint32_t c = 0; c < info.numComps; ++c) 221 { 222 if (info.bpc[c] <= QUANTIZE_THRESHOLD && info.type[c] != SWR_TYPE_UNUSED) 223 { 224 uint32_t swizComp = info.swizzle[c]; 225 float factor = (float)((1 << info.bpc[c]) - 1); 226 switch (info.type[c]) 227 { 228 case SWR_TYPE_UNORM: 229 src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f)); 230 src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO)); 231 src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f /factor)); 232 break; 233 default: SWR_INVALID("Unsupported format type: %d", info.type[c]); 234 } 235 } 236 } 237 } 238 239 template<bool Color, bool Alpha> 240 void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4]) 241 { 242 Value* out[4]; 243 Value* srcBlend[4]; 244 Value* dstBlend[4]; 245 for (uint32_t i = 0; i < 4; ++i) 246 { 247 srcBlend[i] = FMUL(src[i], srcFactor[i]); 248 dstBlend[i] = FMUL(dst[i], dstFactor[i]); 249 } 250 251 switch (blendOp) 252 { 253 case BLENDOP_ADD: 254 out[0] = FADD(srcBlend[0], dstBlend[0]); 255 out[1] = FADD(srcBlend[1], dstBlend[1]); 256 out[2] = FADD(srcBlend[2], dstBlend[2]); 257 out[3] = FADD(srcBlend[3], dstBlend[3]); 258 break; 259 260 case BLENDOP_SUBTRACT: 261 out[0] = FSUB(srcBlend[0], dstBlend[0]); 262 out[1] = FSUB(srcBlend[1], dstBlend[1]); 263 out[2] = FSUB(srcBlend[2], dstBlend[2]); 264 out[3] = FSUB(srcBlend[3], dstBlend[3]); 265 break; 266 267 case BLENDOP_REVSUBTRACT: 268 out[0] = FSUB(dstBlend[0], srcBlend[0]); 269 out[1] = FSUB(dstBlend[1], srcBlend[1]); 270 out[2] = FSUB(dstBlend[2], srcBlend[2]); 271 out[3] = FSUB(dstBlend[3], srcBlend[3]); 272 break; 273 274 case BLENDOP_MIN: 275 out[0] = VMINPS(src[0], dst[0]); 276 out[1] = VMINPS(src[1], dst[1]); 277 out[2] = VMINPS(src[2], dst[2]); 278 out[3] = VMINPS(src[3], dst[3]); 279 break; 280 281 case BLENDOP_MAX: 282 out[0] = VMAXPS(src[0], dst[0]); 283 out[1] = VMAXPS(src[1], dst[1]); 284 out[2] = VMAXPS(src[2], dst[2]); 285 out[3] = VMAXPS(src[3], dst[3]); 286 break; 287 288 default: 289 SWR_INVALID("Unsupported blend operation: %d", blendOp); 290 out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); 291 break; 292 } 293 294 if (Color) 295 { 296 result[0] = out[0]; 297 result[1] = out[1]; 298 result[2] = out[2]; 299 } 300 301 if (Alpha) 302 { 303 result[3] = out[3]; 304 } 305 } 306 307 void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4]) 308 { 309 // Op: (s == PS output, d = RT contents) 310 switch(logicOp) 311 { 312 case LOGICOP_CLEAR: 313 result[0] = VIMMED1(0); 314 result[1] = VIMMED1(0); 315 result[2] = VIMMED1(0); 316 result[3] = VIMMED1(0); 317 break; 318 319 case LOGICOP_NOR: 320 // ~(s | d) 321 result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF)); 322 result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF)); 323 result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF)); 324 result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF)); 325 break; 326 327 case LOGICOP_AND_INVERTED: 328 // ~s & d 329 // todo: use avx andnot instr when I can find the intrinsic to call 330 result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]); 331 result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]); 332 result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]); 333 result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]); 334 break; 335 336 case LOGICOP_COPY_INVERTED: 337 // ~s 338 result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF)); 339 result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF)); 340 result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF)); 341 result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF)); 342 break; 343 344 case LOGICOP_AND_REVERSE: 345 // s & ~d 346 // todo: use avx andnot instr when I can find the intrinsic to call 347 result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]); 348 result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]); 349 result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]); 350 result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]); 351 break; 352 353 case LOGICOP_INVERT: 354 // ~d 355 result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF)); 356 result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF)); 357 result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF)); 358 result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF)); 359 break; 360 361 case LOGICOP_XOR: 362 // s ^ d 363 result[0] = XOR(src[0], dst[0]); 364 result[1] = XOR(src[1], dst[1]); 365 result[2] = XOR(src[2], dst[2]); 366 result[3] = XOR(src[3], dst[3]); 367 break; 368 369 case LOGICOP_NAND: 370 // ~(s & d) 371 result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF)); 372 result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF)); 373 result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF)); 374 result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF)); 375 break; 376 377 case LOGICOP_AND: 378 // s & d 379 result[0] = AND(src[0], dst[0]); 380 result[1] = AND(src[1], dst[1]); 381 result[2] = AND(src[2], dst[2]); 382 result[3] = AND(src[3], dst[3]); 383 break; 384 385 case LOGICOP_EQUIV: 386 // ~(s ^ d) 387 result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF)); 388 result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF)); 389 result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF)); 390 result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF)); 391 break; 392 393 case LOGICOP_NOOP: 394 result[0] = dst[0]; 395 result[1] = dst[1]; 396 result[2] = dst[2]; 397 result[3] = dst[3]; 398 break; 399 400 case LOGICOP_OR_INVERTED: 401 // ~s | d 402 result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]); 403 result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]); 404 result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]); 405 result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]); 406 break; 407 408 case LOGICOP_COPY: 409 result[0] = src[0]; 410 result[1] = src[1]; 411 result[2] = src[2]; 412 result[3] = src[3]; 413 break; 414 415 case LOGICOP_OR_REVERSE: 416 // s | ~d 417 result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]); 418 result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]); 419 result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]); 420 result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]); 421 break; 422 423 case LOGICOP_OR: 424 // s | d 425 result[0] = OR(src[0], dst[0]); 426 result[1] = OR(src[1], dst[1]); 427 result[2] = OR(src[2], dst[2]); 428 result[3] = OR(src[3], dst[3]); 429 break; 430 431 case LOGICOP_SET: 432 result[0] = VIMMED1(0xFFFFFFFF); 433 result[1] = VIMMED1(0xFFFFFFFF); 434 result[2] = VIMMED1(0xFFFFFFFF); 435 result[3] = VIMMED1(0xFFFFFFFF); 436 break; 437 438 default: 439 SWR_INVALID("Unsupported logic operation: %d", logicOp); 440 result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f); 441 break; 442 } 443 } 444 445 void AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask) 446 { 447 // load uint32_t reference 448 Value* pRef = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_alphaTestReference })); 449 450 // load alpha 451 Value* pAlpha = LOAD(ppAlpha); 452 453 Value* pTest = nullptr; 454 if (state.alphaTestFormat == ALPHA_TEST_UNORM8) 455 { 456 // convert float alpha to unorm8 457 Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f)); 458 pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty); 459 460 // compare 461 switch (state.alphaTestFunction) 462 { 463 case ZFUNC_ALWAYS: pTest = VIMMED1(true); break; 464 case ZFUNC_NEVER: pTest = VIMMED1(false); break; 465 case ZFUNC_LT: pTest = ICMP_ULT(pAlphaU8, pRef); break; 466 case ZFUNC_EQ: pTest = ICMP_EQ(pAlphaU8, pRef); break; 467 case ZFUNC_LE: pTest = ICMP_ULE(pAlphaU8, pRef); break; 468 case ZFUNC_GT: pTest = ICMP_UGT(pAlphaU8, pRef); break; 469 case ZFUNC_NE: pTest = ICMP_NE(pAlphaU8, pRef); break; 470 case ZFUNC_GE: pTest = ICMP_UGE(pAlphaU8, pRef); break; 471 default: 472 SWR_INVALID("Invalid alpha test function"); 473 break; 474 } 475 } 476 else 477 { 478 // cast ref to float 479 pRef = BITCAST(pRef, mSimdFP32Ty); 480 481 // compare 482 switch (state.alphaTestFunction) 483 { 484 case ZFUNC_ALWAYS: pTest = VIMMED1(true); break; 485 case ZFUNC_NEVER: pTest = VIMMED1(false); break; 486 case ZFUNC_LT: pTest = FCMP_OLT(pAlpha, pRef); break; 487 case ZFUNC_EQ: pTest = FCMP_OEQ(pAlpha, pRef); break; 488 case ZFUNC_LE: pTest = FCMP_OLE(pAlpha, pRef); break; 489 case ZFUNC_GT: pTest = FCMP_OGT(pAlpha, pRef); break; 490 case ZFUNC_NE: pTest = FCMP_ONE(pAlpha, pRef); break; 491 case ZFUNC_GE: pTest = FCMP_OGE(pAlpha, pRef); break; 492 default: 493 SWR_INVALID("Invalid alpha test function"); 494 break; 495 } 496 } 497 498 // load current mask 499 Value* pMask = LOAD(ppMask); 500 501 // convert to int1 mask 502 pMask = MASK(pMask); 503 504 // and with alpha test result 505 pMask = AND(pMask, pTest); 506 507 // convert back to vector mask 508 pMask = VMASK(pMask); 509 510 // store new mask 511 STORE(pMask, ppMask); 512 } 513 514 Function* Create(const BLEND_COMPILE_STATE& state) 515 { 516 std::stringstream fnName("BLND_", std::ios_base::in | std::ios_base::out | std::ios_base::ate); 517 fnName << ComputeCRC(0, &state, sizeof(state)); 518 519 // blend function signature 520 //typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, uint8_t*, simdvector&, simdscalari*, simdscalari*); 521 522 std::vector<Type*> args{ 523 PointerType::get(Gen_SWR_BLEND_STATE(JM()), 0), // SWR_BLEND_STATE* 524 PointerType::get(mSimdFP32Ty, 0), // simdvector& src 525 PointerType::get(mSimdFP32Ty, 0), // simdvector& src1 526 PointerType::get(mSimdFP32Ty, 0), // src0alpha 527 Type::getInt32Ty(JM()->mContext), // sampleNum 528 PointerType::get(mSimdFP32Ty, 0), // uint8_t* pDst 529 PointerType::get(mSimdFP32Ty, 0), // simdvector& result 530 PointerType::get(mSimdInt32Ty, 0), // simdscalari* oMask 531 PointerType::get(mSimdInt32Ty, 0), // simdscalari* pMask 532 }; 533 534 FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); 535 Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); 536 blendFunc->getParent()->setModuleIdentifier(blendFunc->getName()); 537 538 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc); 539 540 IRB()->SetInsertPoint(entry); 541 542 // arguments 543 auto argitr = blendFunc->arg_begin(); 544 Value* pBlendState = &*argitr++; 545 pBlendState->setName("pBlendState"); 546 Value* pSrc = &*argitr++; 547 pSrc->setName("src"); 548 Value* pSrc1 = &*argitr++; 549 pSrc1->setName("src1"); 550 Value* pSrc0Alpha = &*argitr++; 551 pSrc0Alpha->setName("src0alpha"); 552 Value* sampleNum = &*argitr++; 553 sampleNum->setName("sampleNum"); 554 Value* pDst = &*argitr++; 555 pDst->setName("pDst"); 556 Value* pResult = &*argitr++; 557 pResult->setName("result"); 558 Value* ppoMask = &*argitr++; 559 ppoMask->setName("ppoMask"); 560 Value* ppMask = &*argitr++; 561 ppMask->setName("pMask"); 562 563 static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); 564 Value* dst[4]; 565 Value* constantColor[4]; 566 Value* src[4]; 567 Value* src1[4]; 568 Value* result[4]; 569 for (uint32_t i = 0; i < 4; ++i) 570 { 571 // load hot tile 572 dst[i] = LOAD(pDst, { i }); 573 574 // load constant color 575 constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i })); 576 577 // load src 578 src[i] = LOAD(pSrc, { i }); 579 580 // load src1 581 src1[i] = LOAD(pSrc1, { i }); 582 } 583 Value* currentSampleMask = VIMMED1(-1); 584 if (state.desc.alphaToCoverageEnable) 585 { 586 Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f); 587 uint32_t bits = (1 << state.desc.numSamples) - 1; 588 currentSampleMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits))); 589 currentSampleMask = FP_TO_SI(FADD(currentSampleMask, VIMMED1(0.5f)), mSimdInt32Ty); 590 } 591 592 // alpha test 593 if (state.desc.alphaTestEnable) 594 { 595 AlphaTest(state, pBlendState, pSrc0Alpha, ppMask); 596 } 597 598 // color blend 599 if (state.blendState.blendEnable) 600 { 601 // clamp sources 602 Clamp(state.format, src); 603 Clamp(state.format, src1); 604 Clamp(state.format, dst); 605 Clamp(state.format, constantColor); 606 607 // apply defaults to hottile contents to take into account missing components 608 ApplyDefaults(state.format, dst); 609 610 // Force defaults for unused 'X' components 611 ApplyUnusedDefaults(state.format, dst); 612 613 // Quantize low precision components 614 Quantize(state.format, dst); 615 616 // special case clamping for R11G11B10_float which has no sign bit 617 if (state.format == R11G11B10_FLOAT) 618 { 619 dst[0] = VMAXPS(dst[0], VIMMED1(0.0f)); 620 dst[1] = VMAXPS(dst[1], VIMMED1(0.0f)); 621 dst[2] = VMAXPS(dst[2], VIMMED1(0.0f)); 622 dst[3] = VMAXPS(dst[3], VIMMED1(0.0f)); 623 } 624 625 Value* srcFactor[4]; 626 Value* dstFactor[4]; 627 if (state.desc.independentAlphaBlendEnable) 628 { 629 GenerateBlendFactor<true, false>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor); 630 GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor); 631 632 GenerateBlendFactor<true, false>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor); 633 GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor); 634 635 BlendFunc<true, false>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result); 636 BlendFunc<false, true>(state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result); 637 } 638 else 639 { 640 GenerateBlendFactor<true, true>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor); 641 GenerateBlendFactor<true, true>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor); 642 643 BlendFunc<true, true>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result); 644 } 645 646 // store results out 647 for (uint32_t i = 0; i < 4; ++i) 648 { 649 STORE(result[i], pResult, { i }); 650 } 651 } 652 653 if(state.blendState.logicOpEnable) 654 { 655 const SWR_FORMAT_INFO& info = GetFormatInfo(state.format); 656 Value* vMask[4]; 657 float scale[4]; 658 659 if (!state.blendState.blendEnable) 660 { 661 Clamp(state.format, src); 662 Clamp(state.format, dst); 663 } 664 665 for(uint32_t i = 0; i < 4; i++) 666 { 667 if (info.type[i] == SWR_TYPE_UNUSED) 668 { 669 continue; 670 } 671 672 if (info.bpc[i] >= 32) 673 { 674 vMask[i] = VIMMED1(0xFFFFFFFF); 675 scale[i] = 0xFFFFFFFF; 676 } 677 else 678 { 679 vMask[i] = VIMMED1((1 << info.bpc[i]) - 1); 680 if (info.type[i] == SWR_TYPE_SNORM) 681 scale[i] = (1 << (info.bpc[i] - 1)) - 1; 682 else 683 scale[i] = (1 << info.bpc[i]) - 1; 684 } 685 686 switch (info.type[i]) 687 { 688 default: 689 SWR_INVALID("Unsupported type for logic op: %d", info.type[i]); 690 break; 691 692 case SWR_TYPE_UNKNOWN: 693 case SWR_TYPE_UNUSED: 694 // fallthrough 695 696 case SWR_TYPE_UINT: 697 case SWR_TYPE_SINT: 698 src[i] = BITCAST(src[i], mSimdInt32Ty); 699 dst[i] = BITCAST(dst[i], mSimdInt32Ty); 700 break; 701 case SWR_TYPE_SNORM: 702 src[i] = FP_TO_SI( 703 FMUL(src[i], VIMMED1(scale[i])), 704 mSimdInt32Ty); 705 dst[i] = FP_TO_SI( 706 FMUL(dst[i], VIMMED1(scale[i])), 707 mSimdInt32Ty); 708 break; 709 case SWR_TYPE_UNORM: 710 src[i] = FP_TO_UI( 711 FMUL(src[i], VIMMED1(scale[i])), 712 mSimdInt32Ty); 713 dst[i] = FP_TO_UI( 714 FMUL(dst[i], VIMMED1(scale[i])), 715 mSimdInt32Ty); 716 break; 717 } 718 } 719 720 LogicOpFunc(state.blendState.logicOpFunc, src, dst, result); 721 722 // store results out 723 for(uint32_t i = 0; i < 4; ++i) 724 { 725 if (info.type[i] == SWR_TYPE_UNUSED) 726 { 727 continue; 728 } 729 730 // clear upper bits from PS output not in RT format after doing logic op 731 result[i] = AND(result[i], vMask[i]); 732 733 switch (info.type[i]) 734 { 735 default: 736 SWR_INVALID("Unsupported type for logic op: %d", info.type[i]); 737 break; 738 739 case SWR_TYPE_UNKNOWN: 740 case SWR_TYPE_UNUSED: 741 // fallthrough 742 743 case SWR_TYPE_UINT: 744 case SWR_TYPE_SINT: 745 result[i] = BITCAST(result[i], mSimdFP32Ty); 746 break; 747 case SWR_TYPE_SNORM: 748 result[i] = SHL(result[i], C(32 - info.bpc[i])); 749 result[i] = ASHR(result[i], C(32 - info.bpc[i])); 750 result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty), 751 VIMMED1(1.0f / scale[i])); 752 break; 753 case SWR_TYPE_UNORM: 754 result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty), 755 VIMMED1(1.0f / scale[i])); 756 break; 757 } 758 759 STORE(result[i], pResult, {i}); 760 } 761 } 762 763 if(state.desc.oMaskEnable) 764 { 765 assert(!(state.desc.alphaToCoverageEnable)); 766 // load current mask 767 Value* oMask = LOAD(ppoMask); 768 currentSampleMask = AND(oMask, currentSampleMask); 769 } 770 771 if(state.desc.sampleMaskEnable) 772 { 773 Value* sampleMask = LOAD(pBlendState, { 0, SWR_BLEND_STATE_sampleMask}); 774 currentSampleMask = AND(VBROADCAST(sampleMask), currentSampleMask); 775 } 776 777 if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable || 778 state.desc.oMaskEnable) 779 { 780 // load coverage mask and mask off any lanes with no samples 781 Value* pMask = LOAD(ppMask); 782 Value* sampleMasked = SHL(C(1), sampleNum); 783 currentSampleMask = AND(currentSampleMask, VBROADCAST(sampleMasked)); 784 currentSampleMask = S_EXT(ICMP_UGT(currentSampleMask, VBROADCAST(C(0))), mSimdInt32Ty); 785 Value* outputMask = AND(pMask, currentSampleMask); 786 // store new mask 787 STORE(outputMask, GEP(ppMask, C(0))); 788 } 789 790 RET_VOID(); 791 792 JitManager::DumpToFile(blendFunc, ""); 793 794 ::FunctionPassManager passes(JM()->mpCurrentModule); 795 796 passes.add(createBreakCriticalEdgesPass()); 797 passes.add(createCFGSimplificationPass()); 798 passes.add(createEarlyCSEPass()); 799 passes.add(createPromoteMemoryToRegisterPass()); 800 passes.add(createCFGSimplificationPass()); 801 passes.add(createEarlyCSEPass()); 802 passes.add(createInstructionCombiningPass()); 803 passes.add(createInstructionSimplifierPass()); 804 passes.add(createConstantPropagationPass()); 805 passes.add(createSCCPPass()); 806 passes.add(createAggressiveDCEPass()); 807 808 passes.run(*blendFunc); 809 810 JitManager::DumpToFile(blendFunc, "optimized"); 811 812 return blendFunc; 813 } 814 }; 815 816 ////////////////////////////////////////////////////////////////////////// 817 /// @brief JITs from fetch shader IR 818 /// @param hJitMgr - JitManager handle 819 /// @param func - LLVM function IR 820 /// @return PFN_FETCH_FUNC - pointer to fetch code 821 PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc) 822 { 823 const llvm::Function *func = (const llvm::Function*)hFunc; 824 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); 825 PFN_BLEND_JIT_FUNC pfnBlend; 826 pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); 827 // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module 828 pJitMgr->mIsModuleFinalized = true; 829 830 return pfnBlend; 831 } 832 833 ////////////////////////////////////////////////////////////////////////// 834 /// @brief JIT compiles blend shader 835 /// @param hJitMgr - JitManager handle 836 /// @param state - blend state to build function from 837 extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state) 838 { 839 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); 840 841 pJitMgr->SetupNewModule(); 842 843 BlendJit theJit(pJitMgr); 844 HANDLE hFunc = theJit.Create(state); 845 846 return JitBlendFunc(hJitMgr, hFunc); 847 } 848