/****************************************************************************
 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file blend_jit.cpp
 *
 * @brief Implementation of the blend jitter
 *
 * Notes:
 *
 ******************************************************************************/
#include "jit_api.h"
#include "blend_jit.h"
#include "builder.h"
#include "state_llvm.h"

#include <sstream>

// components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized
#define QUANTIZE_THRESHOLD 2

using namespace llvm;
using namespace SwrJit;

//////////////////////////////////////////////////////////////////////////
/// Interface to Jitting a blend shader
//////////////////////////////////////////////////////////////////////////
struct BlendJit : public Builder
{
    BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};

    template<bool Color, bool Alpha>
    void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4])
    {
        Value* out[4];

        switch (factor)
        {
        case BLENDFACTOR_ONE:
            out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f);
            break;
        case BLENDFACTOR_SRC_COLOR:
            out[0] = src[0];
            out[1] = src[1];
            out[2] = src[2];
            out[3] = src[3];
            break;
        case BLENDFACTOR_SRC_ALPHA:
            out[0] = out[1] = out[2] = out[3] = src[3];
            break;
        case BLENDFACTOR_DST_ALPHA:
            out[0] = out[1] = out[2] = out[3] = dst[3];
            break;
        case BLENDFACTOR_DST_COLOR:
            out[0] = dst[0];
            out[1] = dst[1];
            out[2] = dst[2];
            out[3] = dst[3];
            break;
        case BLENDFACTOR_SRC_ALPHA_SATURATE:
            out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));
            out[3] = VIMMED1(1.0f);
            break;
        case BLENDFACTOR_CONST_COLOR:
            out[0] = constColor[0];
            out[1] = constColor[1];
            out[2] = constColor[2];
            out[3] = constColor[3];
            break;
        case BLENDFACTOR_CONST_ALPHA:
            out[0] = out[1] = out[2] = out[3] = constColor[3];
            break;
        case BLENDFACTOR_SRC1_COLOR:
            out[0] = src1[0];
            out[1] = src1[1];
            out[2] = src1[2];
            out[3] = src1[3];
            break;
        case BLENDFACTOR_SRC1_ALPHA:
            out[0] = out[1] = out[2] = out[3] = src1[3];
            break;
        case BLENDFACTOR_ZERO:
            out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
            break;
        case BLENDFACTOR_INV_SRC_COLOR:
            out[0] = FSUB(VIMMED1(1.0f), src[0]);
            out[1] = FSUB(VIMMED1(1.0f), src[1]);
            out[2] = FSUB(VIMMED1(1.0f), src[2]);
            out[3] = FSUB(VIMMED1(1.0f), src[3]);
            break;
        case BLENDFACTOR_INV_SRC_ALPHA:
            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]);
            break;
        case BLENDFACTOR_INV_DST_ALPHA:
            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]);
            break;
        case BLENDFACTOR_INV_DST_COLOR:
            out[0] = FSUB(VIMMED1(1.0f), dst[0]);
            out[1] = FSUB(VIMMED1(1.0f), dst[1]);
            out[2] = FSUB(VIMMED1(1.0f), dst[2]);
            out[3] = FSUB(VIMMED1(1.0f), dst[3]);
            break;
        case BLENDFACTOR_INV_CONST_COLOR:
            out[0] = FSUB(VIMMED1(1.0f), constColor[0]);
            out[1] = FSUB(VIMMED1(1.0f), constColor[1]);
            out[2] = FSUB(VIMMED1(1.0f), constColor[2]);
            out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
            break;
        case BLENDFACTOR_INV_CONST_ALPHA:
            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
            break;
        case BLENDFACTOR_INV_SRC1_COLOR:
            out[0] = FSUB(VIMMED1(1.0f), src1[0]);
            out[1] = FSUB(VIMMED1(1.0f), src1[1]);
            out[2] = FSUB(VIMMED1(1.0f), src1[2]);
            out[3] = FSUB(VIMMED1(1.0f), src1[3]);
            break;
        case BLENDFACTOR_INV_SRC1_ALPHA:
            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]);
            break;
        default:
            SWR_ASSERT(false, "Unsupported blend factor: %d", factor);
            out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
            break;
        }

        if (Color)
        {
            result[0] = out[0];
            result[1] = out[1];
            result[2] = out[2];
        }

        if (Alpha)
        {
            result[3] = out[3];
        }
    }
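    //////////////////////////////////////////////////////////////////////////
    /// The helpers below normalize data before blending: Clamp() restricts
    /// values to the range of the render-target format's type,
    /// ApplyDefaults()/ApplyUnusedDefaults() fill in components the format
    /// does not store, and Quantize() rounds low-precision UNORM components
    /// to the values they can actually represent.  Create() applies these to
    /// the sources and hot-tile contents before generating the blend ops.
    //////////////////////////////////////////////////////////////////////////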
    void Clamp(SWR_FORMAT format, Value* src[4])
    {
        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
        SWR_TYPE type = info.type[0];

        switch (type)
        {
        case SWR_TYPE_FLOAT:
            break;

        case SWR_TYPE_UNORM:
            src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));
            src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));
            src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));
            src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));
            break;

        case SWR_TYPE_SNORM:
            src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));
            src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));
            src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));
            src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
            break;

        default: SWR_ASSERT(false, "Unsupported format type: %d", type);
        }
    }

    void ApplyDefaults(SWR_FORMAT format, Value* src[4])
    {
        const SWR_FORMAT_INFO& info = GetFormatInfo(format);

        bool valid[] = { false, false, false, false };
        for (uint32_t c = 0; c < info.numComps; ++c)
        {
            valid[info.swizzle[c]] = true;
        }

        for (uint32_t c = 0; c < 4; ++c)
        {
            if (!valid[c])
            {
                src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);
            }
        }
    }

    void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
    {
        const SWR_FORMAT_INFO& info = GetFormatInfo(format);

        for (uint32_t c = 0; c < info.numComps; ++c)
        {
            if (info.type[c] == SWR_TYPE_UNUSED)
            {
                src[info.swizzle[c]] = BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
            }
        }
    }

    void Quantize(SWR_FORMAT format, Value* src[4])
    {
        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
        for (uint32_t c = 0; c < info.numComps; ++c)
        {
            if (info.bpc[c] <= QUANTIZE_THRESHOLD && info.type[c] != SWR_TYPE_UNUSED)
            {
                uint32_t swizComp = info.swizzle[c];
                float factor = (float)((1 << info.bpc[c]) - 1);
                switch (info.type[c])
                {
                case SWR_TYPE_UNORM:
                    src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
                    src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
                    src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f / factor));
                    break;
                default: SWR_ASSERT(false, "Unsupported format type: %d", info.type[c]);
                }
            }
        }
    }
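    //////////////////////////////////////////////////////////////////////////
    /// BlendFunc combines the factored source and destination colors:
    ///     result = (src * srcFactor) <op> (dst * dstFactor)
    /// for ADD/SUBTRACT/REVSUBTRACT.  For MIN and MAX the factors are not
    /// applied; the raw src and dst values are compared directly.  The
    /// Color/Alpha template parameters select whether the RGB channels, the
    /// alpha channel, or both are written to 'result', which lets Create()
    /// emit separate color and alpha ops when independent alpha blend is
    /// enabled.
    //////////////////////////////////////////////////////////////////////////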
    template<bool Color, bool Alpha>
    void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4])
    {
        Value* out[4];
        Value* srcBlend[4];
        Value* dstBlend[4];
        for (uint32_t i = 0; i < 4; ++i)
        {
            srcBlend[i] = FMUL(src[i], srcFactor[i]);
            dstBlend[i] = FMUL(dst[i], dstFactor[i]);
        }

        switch (blendOp)
        {
        case BLENDOP_ADD:
            out[0] = FADD(srcBlend[0], dstBlend[0]);
            out[1] = FADD(srcBlend[1], dstBlend[1]);
            out[2] = FADD(srcBlend[2], dstBlend[2]);
            out[3] = FADD(srcBlend[3], dstBlend[3]);
            break;

        case BLENDOP_SUBTRACT:
            out[0] = FSUB(srcBlend[0], dstBlend[0]);
            out[1] = FSUB(srcBlend[1], dstBlend[1]);
            out[2] = FSUB(srcBlend[2], dstBlend[2]);
            out[3] = FSUB(srcBlend[3], dstBlend[3]);
            break;

        case BLENDOP_REVSUBTRACT:
            out[0] = FSUB(dstBlend[0], srcBlend[0]);
            out[1] = FSUB(dstBlend[1], srcBlend[1]);
            out[2] = FSUB(dstBlend[2], srcBlend[2]);
            out[3] = FSUB(dstBlend[3], srcBlend[3]);
            break;

        case BLENDOP_MIN:
            out[0] = VMINPS(src[0], dst[0]);
            out[1] = VMINPS(src[1], dst[1]);
            out[2] = VMINPS(src[2], dst[2]);
            out[3] = VMINPS(src[3], dst[3]);
            break;

        case BLENDOP_MAX:
            out[0] = VMAXPS(src[0], dst[0]);
            out[1] = VMAXPS(src[1], dst[1]);
            out[2] = VMAXPS(src[2], dst[2]);
            out[3] = VMAXPS(src[3], dst[3]);
            break;

        default:
            SWR_ASSERT(false, "Unsupported blend operation: %d", blendOp);
            out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
            break;
        }

        if (Color)
        {
            result[0] = out[0];
            result[1] = out[1];
            result[2] = out[2];
        }

        if (Alpha)
        {
            result[3] = out[3];
        }
    }
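    //////////////////////////////////////////////////////////////////////////
    /// LogicOpFunc implements the render-target logic ops as bitwise AND/OR/
    /// XOR combinations of the source and destination bit patterns, with
    /// inversion expressed as XOR against all-ones.  The inputs are expected
    /// to already be in the integer domain; Create() converts UNORM/SNORM
    /// components to scaled integers before calling this and converts the
    /// results back afterwards.
    //////////////////////////////////////////////////////////////////////////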
    void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])
    {
        // Op: (s = PS output, d = RT contents)
        switch (logicOp)
        {
        case LOGICOP_CLEAR:
            result[0] = VIMMED1(0);
            result[1] = VIMMED1(0);
            result[2] = VIMMED1(0);
            result[3] = VIMMED1(0);
            break;

        case LOGICOP_NOR:
            // ~(s | d)
            result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
            result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
            result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
            result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
            break;

        case LOGICOP_AND_INVERTED:
            // ~s & d
            // todo: use avx andnot instr when I can find the intrinsic to call
            result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
            result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
            result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
            result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
            break;

        case LOGICOP_COPY_INVERTED:
            // ~s
            result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF));
            result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF));
            result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF));
            result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF));
            break;

        case LOGICOP_AND_REVERSE:
            // s & ~d
            // todo: use avx andnot instr when I can find the intrinsic to call
            result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
            result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
            result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
            result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
            break;

        case LOGICOP_INVERT:
            // ~d
            result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF));
            result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF));
            result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF));
            result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF));
            break;

        case LOGICOP_XOR:
            // s ^ d
            result[0] = XOR(src[0], dst[0]);
            result[1] = XOR(src[1], dst[1]);
            result[2] = XOR(src[2], dst[2]);
            result[3] = XOR(src[3], dst[3]);
            break;

        case LOGICOP_NAND:
            // ~(s & d)
            result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
            result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
            result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
            result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
            break;

        case LOGICOP_AND:
            // s & d
            result[0] = AND(src[0], dst[0]);
            result[1] = AND(src[1], dst[1]);
            result[2] = AND(src[2], dst[2]);
            result[3] = AND(src[3], dst[3]);
            break;

        case LOGICOP_EQUIV:
            // ~(s ^ d)
            result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
            result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
            result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
            result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
            break;

        case LOGICOP_NOOP:
            // d
            result[0] = dst[0];
            result[1] = dst[1];
            result[2] = dst[2];
            result[3] = dst[3];
            break;

        case LOGICOP_OR_INVERTED:
            // ~s | d
            result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
            result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
            result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
            result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
            break;

        case LOGICOP_COPY:
            // s
            result[0] = src[0];
            result[1] = src[1];
            result[2] = src[2];
            result[3] = src[3];
            break;

        case LOGICOP_OR_REVERSE:
            // s | ~d
            result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
            result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
            result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
            result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
            break;

        case LOGICOP_OR:
            // s | d
            result[0] = OR(src[0], dst[0]);
            result[1] = OR(src[1], dst[1]);
            result[2] = OR(src[2], dst[2]);
            result[3] = OR(src[3], dst[3]);
            break;

        case LOGICOP_SET:
            result[0] = VIMMED1(0xFFFFFFFF);
            result[1] = VIMMED1(0xFFFFFFFF);
            result[2] = VIMMED1(0xFFFFFFFF);
            result[3] = VIMMED1(0xFFFFFFFF);
            break;

        default:
            SWR_ASSERT(false, "Unsupported logic operation: %d", logicOp);
            result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f);
            break;
        }
    }
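    //////////////////////////////////////////////////////////////////////////
    /// AlphaTest compares the incoming pixel alpha against the reference
    /// value stored in SWR_BLEND_STATE.  Depending on the compile state the
    /// comparison is done either on UNORM8-converted integers or directly on
    /// floats, and the per-lane result is ANDed into the coverage mask at
    /// ppMask so failing pixels are discarded.
    //////////////////////////////////////////////////////////////////////////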
    void AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask)
    {
        // load uint32_t reference
        Value* pRef = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_alphaTestReference }));

        // load alpha
        Value* pAlpha = LOAD(ppAlpha);

        Value* pTest = nullptr;
        if (state.alphaTestFormat == ALPHA_TEST_UNORM8)
        {
            // convert float alpha to unorm8
            Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));
            pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty);

            // compare
            switch (state.alphaTestFunction)
            {
            case ZFUNC_ALWAYS: pTest = VIMMED1(true); break;
            case ZFUNC_NEVER:  pTest = VIMMED1(false); break;
            case ZFUNC_LT:     pTest = ICMP_ULT(pAlphaU8, pRef); break;
            case ZFUNC_EQ:     pTest = ICMP_EQ(pAlphaU8, pRef); break;
            case ZFUNC_LE:     pTest = ICMP_ULE(pAlphaU8, pRef); break;
            case ZFUNC_GT:     pTest = ICMP_UGT(pAlphaU8, pRef); break;
            case ZFUNC_NE:     pTest = ICMP_NE(pAlphaU8, pRef); break;
            case ZFUNC_GE:     pTest = ICMP_UGE(pAlphaU8, pRef); break;
            default:
                SWR_ASSERT(false, "Invalid alpha test function");
                break;
            }
        }
        else
        {
            // cast ref to float
            pRef = BITCAST(pRef, mSimdFP32Ty);

            // compare
            switch (state.alphaTestFunction)
            {
            case ZFUNC_ALWAYS: pTest = VIMMED1(true); break;
            case ZFUNC_NEVER:  pTest = VIMMED1(false); break;
            case ZFUNC_LT:     pTest = FCMP_OLT(pAlpha, pRef); break;
            case ZFUNC_EQ:     pTest = FCMP_OEQ(pAlpha, pRef); break;
            case ZFUNC_LE:     pTest = FCMP_OLE(pAlpha, pRef); break;
            case ZFUNC_GT:     pTest = FCMP_OGT(pAlpha, pRef); break;
            case ZFUNC_NE:     pTest = FCMP_ONE(pAlpha, pRef); break;
            case ZFUNC_GE:     pTest = FCMP_OGE(pAlpha, pRef); break;
            default:
                SWR_ASSERT(false, "Invalid alpha test function");
                break;
            }
        }

        // load current mask
        Value* pMask = LOAD(ppMask);

        // convert to int1 mask
        pMask = MASK(pMask);

        // and with alpha test result
        pMask = AND(pMask, pTest);

        // convert back to vector mask
        pMask = VMASK(pMask);

        // store new mask
        STORE(pMask, ppMask);
    }

    Function* Create(const BLEND_COMPILE_STATE& state)
    {
        static std::size_t jitNum = 0;

        std::stringstream fnName("BlendShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
        fnName << jitNum++;

        // blend function signature
        //typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*);

        std::vector<Type*> args{
            PointerType::get(Gen_SWR_BLEND_STATE(JM()), 0), // SWR_BLEND_STATE*
            PointerType::get(mSimdFP32Ty, 0),               // simdvector& src
            PointerType::get(mSimdFP32Ty, 0),               // simdvector& src1
            PointerType::get(mSimdFP32Ty, 0),               // src0alpha
            Type::getInt32Ty(JM()->mContext),               // sampleNum
            PointerType::get(mSimdFP32Ty, 0),               // uint8_t* pDst
            PointerType::get(mSimdFP32Ty, 0),               // simdvector& result
            PointerType::get(mSimdInt32Ty, 0),              // simdscalari* oMask
            PointerType::get(mSimdInt32Ty, 0),              // simdscalari* pMask
        };

        FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
        Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);

        BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);

        IRB()->SetInsertPoint(entry);

        // arguments
        auto argitr = blendFunc->arg_begin();
        Value* pBlendState = &*argitr++;
        pBlendState->setName("pBlendState");
        Value* pSrc = &*argitr++;
        pSrc->setName("src");
        Value* pSrc1 = &*argitr++;
        pSrc1->setName("src1");
        Value* pSrc0Alpha = &*argitr++;
        pSrc0Alpha->setName("src0alpha");
        Value* sampleNum = &*argitr++;
        sampleNum->setName("sampleNum");
        Value* pDst = &*argitr++;
        pDst->setName("pDst");
        Value* pResult = &*argitr++;
        pResult->setName("result");
        Value* ppoMask = &*argitr++;
        ppoMask->setName("ppoMask");
        Value* ppMask = &*argitr++;
        ppMask->setName("pMask");

        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
        Value* dst[4];
        Value* constantColor[4];
        Value* src[4];
        Value* src1[4];
        Value* result[4];
        for (uint32_t i = 0; i < 4; ++i)
        {
            // load hot tile
            dst[i] = LOAD(pDst, { i });

            // load constant color
            constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i }));

            // load src
            src[i] = LOAD(pSrc, { i });

            // load src1
            src1[i] = LOAD(pSrc1, { i });
        }
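        // When alpha-to-coverage is enabled, derive a per-lane coverage value
        // from the clamped source alpha by scaling it by (2^numSamples - 1)
        // and rounding to the nearest integer; this is folded into the pixel
        // mask near the end of the shader.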
        Value* currentMask = VIMMED1(-1);
        if (state.desc.alphaToCoverageEnable)
        {
            Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
            uint32_t bits = (1 << state.desc.numSamples) - 1;
            currentMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
            currentMask = FP_TO_SI(FADD(currentMask, VIMMED1(0.5f)), mSimdInt32Ty);
        }

        // alpha test
        if (state.desc.alphaTestEnable)
        {
            AlphaTest(state, pBlendState, pSrc0Alpha, ppMask);
        }

        // color blend
        if (state.blendState.blendEnable)
        {
            // clamp sources
            Clamp(state.format, src);
            Clamp(state.format, src1);
            Clamp(state.format, dst);
            Clamp(state.format, constantColor);

            // apply defaults to hottile contents to take into account missing components
            ApplyDefaults(state.format, dst);

            // Force defaults for unused 'X' components
            ApplyUnusedDefaults(state.format, dst);

            // Quantize low precision components
            Quantize(state.format, dst);

            // special case clamping for R11G11B10_float which has no sign bit
            if (state.format == R11G11B10_FLOAT)
            {
                dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));
                dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));
                dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));
                dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));
            }

            Value* srcFactor[4];
            Value* dstFactor[4];
            if (state.desc.independentAlphaBlendEnable)
            {
                GenerateBlendFactor<true, false>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
                GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor);

                GenerateBlendFactor<true, false>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
                GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor);

                BlendFunc<true, false>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
                BlendFunc<false, true>(state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
            }
            else
            {
                GenerateBlendFactor<true, true>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
                GenerateBlendFactor<true, true>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);

                BlendFunc<true, true>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
            }

            // store results out
            for (uint32_t i = 0; i < 4; ++i)
            {
                STORE(result[i], pResult, { i });
            }
        }

        if (state.blendState.logicOpEnable)
        {
            const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
            Value* vMask[4];
            float scale[4];

            if (!state.blendState.blendEnable)
            {
                Clamp(state.format, src);
                Clamp(state.format, dst);
            }
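            // Convert src and dst to the integer domain expected by the logic
            // op: UNORM/SNORM components are scaled by their per-component
            // precision and converted to integers, UINT/SINT are simply
            // bitcast, and a per-component mask is recorded so bits outside
            // the RT format can be cleared afterwards.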
            for (uint32_t i = 0; i < 4; i++)
            {
                if (info.type[i] == SWR_TYPE_UNUSED)
                {
                    continue;
                }

                if (info.bpc[i] >= 32) {
                    vMask[i] = VIMMED1(0xFFFFFFFF);
                    scale[i] = 0xFFFFFFFF;
                } else {
                    vMask[i] = VIMMED1((1 << info.bpc[i]) - 1);
                    if (info.type[i] == SWR_TYPE_SNORM)
                        scale[i] = (1 << (info.bpc[i] - 1)) - 1;
                    else
                        scale[i] = (1 << info.bpc[i]) - 1;
                }

                switch (info.type[i]) {
                default:
                    SWR_ASSERT(0, "Unsupported type for logic op\n");
                    /* fallthrough */
                case SWR_TYPE_UINT:
                case SWR_TYPE_SINT:
                    src[i] = BITCAST(src[i], mSimdInt32Ty);
                    dst[i] = BITCAST(dst[i], mSimdInt32Ty);
                    break;
                case SWR_TYPE_SNORM:
                    src[i] = FP_TO_SI(
                        FMUL(src[i], VIMMED1(scale[i])),
                        mSimdInt32Ty);
                    dst[i] = FP_TO_SI(
                        FMUL(dst[i], VIMMED1(scale[i])),
                        mSimdInt32Ty);
                    break;
                case SWR_TYPE_UNORM:
                    src[i] = FP_TO_UI(
                        FMUL(src[i], VIMMED1(scale[i])),
                        mSimdInt32Ty);
                    dst[i] = FP_TO_UI(
                        FMUL(dst[i], VIMMED1(scale[i])),
                        mSimdInt32Ty);
                    break;
                }
            }

            LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);

            // store results out
            for (uint32_t i = 0; i < 4; ++i)
            {
                if (info.type[i] == SWR_TYPE_UNUSED)
                {
                    continue;
                }

                // clear upper bits from PS output not in RT format after doing logic op
                result[i] = AND(result[i], vMask[i]);

                switch (info.type[i]) {
                default:
                    SWR_ASSERT(0, "Unsupported type for logic op\n");
                    /* fallthrough */
                case SWR_TYPE_UINT:
                case SWR_TYPE_SINT:
                    result[i] = BITCAST(result[i], mSimdFP32Ty);
                    break;
                case SWR_TYPE_SNORM:
                    result[i] = SHL(result[i], C(32 - info.bpc[i]));
                    result[i] = ASHR(result[i], C(32 - info.bpc[i]));
                    result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty),
                                     VIMMED1(1.0f / scale[i]));
                    break;
                case SWR_TYPE_UNORM:
                    result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty),
                                     VIMMED1(1.0f / scale[i]));
                    break;
                }

                STORE(result[i], pResult, { i });
            }
        }

        if (state.desc.oMaskEnable)
        {
            assert(!(state.desc.alphaToCoverageEnable));
            // load current mask
            Value* oMask = LOAD(ppoMask);
            Value* sampleMasked = VBROADCAST(SHL(C(1), sampleNum));
            oMask = AND(oMask, sampleMasked);
            currentMask = AND(oMask, currentMask);
        }

        if (state.desc.sampleMaskEnable)
        {
            Value* sampleMask = LOAD(pBlendState, { 0, SWR_BLEND_STATE_sampleMask });
            Value* sampleMasked = SHL(C(1), sampleNum);
            sampleMask = AND(sampleMask, sampleMasked);
            sampleMask = VBROADCAST(ICMP_SGT(sampleMask, C(0)));
            sampleMask = S_EXT(sampleMask, mSimdInt32Ty);
            currentMask = AND(sampleMask, currentMask);
        }

        if (state.desc.alphaToCoverageEnable)
        {
            Value* sampleMasked = SHL(C(1), sampleNum);
            currentMask = AND(currentMask, VBROADCAST(sampleMasked));
        }

        if (state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
            state.desc.oMaskEnable)
        {
            // load current mask
            Value* pMask = LOAD(ppMask);
            currentMask = S_EXT(ICMP_SGT(currentMask, VBROADCAST(C(0))), mSimdInt32Ty);
            Value* outputMask = AND(pMask, currentMask);
            // store new mask
            STORE(outputMask, GEP(ppMask, C(0)));
        }

        RET_VOID();

        JitManager::DumpToFile(blendFunc, "");
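        // Run a small set of standard LLVM cleanup/optimization passes over
        // the generated function before handing it back for JIT compilation.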
        ::FunctionPassManager passes(JM()->mpCurrentModule);

        passes.add(createBreakCriticalEdgesPass());
        passes.add(createCFGSimplificationPass());
        passes.add(createEarlyCSEPass());
        passes.add(createPromoteMemoryToRegisterPass());
        passes.add(createCFGSimplificationPass());
        passes.add(createEarlyCSEPass());
        passes.add(createInstructionCombiningPass());
        passes.add(createInstructionSimplifierPass());
        passes.add(createConstantPropagationPass());
        passes.add(createSCCPPass());
        passes.add(createAggressiveDCEPass());

        passes.run(*blendFunc);

        JitManager::DumpToFile(blendFunc, "optimized");

        return blendFunc;
    }
};

//////////////////////////////////////////////////////////////////////////
/// @brief JITs from blend shader IR
/// @param hJitMgr - JitManager handle
/// @param hFunc - LLVM function IR
/// @return PFN_BLEND_JIT_FUNC - pointer to blend shader code
PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
{
    const llvm::Function *func = (const llvm::Function*)hFunc;
    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
    PFN_BLEND_JIT_FUNC pfnBlend;
    pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
    // MCJIT finalizes modules the first time you JIT code from them. After finalization, you cannot add new IR to the module.
    pJitMgr->mIsModuleFinalized = true;

    return pfnBlend;
}

//////////////////////////////////////////////////////////////////////////
/// @brief JIT compiles blend shader
/// @param hJitMgr - JitManager handle
/// @param state - blend state to build function from
extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state)
{
    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);

    pJitMgr->SetupNewModule();

    BlendJit theJit(pJitMgr);
    HANDLE hFunc = theJit.Create(state);

    return JitBlendFunc(hJitMgr, hFunc);
}