1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #include "PixelRoutine.hpp" 16 17 #include "SamplerCore.hpp" 18 #include "Constants.hpp" 19 #include "Device/Renderer.hpp" 20 #include "Device/QuadRasterizer.hpp" 21 #include "Device/Surface.hpp" 22 #include "Device/Primitive.hpp" 23 #include "Vulkan/VkDebug.hpp" 24 25 namespace sw 26 { 27 extern bool complementaryDepthBuffer; 28 extern bool postBlendSRGB; 29 extern bool exactColorRounding; 30 extern bool forceClearRegisters; 31 32 PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) 33 : QuadRasterizer(state, shader), v(shader && shader->indirectAddressableInput) 34 { 35 if(!shader || shader->getShaderModel() < 0x0200 || forceClearRegisters) 36 { 37 for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++) 38 { 39 v[i].x = Float4(0.0f); 40 v[i].y = Float4(0.0f); 41 v[i].z = Float4(0.0f); 42 v[i].w = Float4(0.0f); 43 } 44 } 45 } 46 47 PixelRoutine::~PixelRoutine() 48 { 49 } 50 51 void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y) 52 { 53 #if PERF_PROFILE 54 Long pipeTime = Ticks(); 55 #endif 56 57 const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive(); 58 59 Int zMask[4]; // Depth mask 60 Int sMask[4]; // Stencil mask 61 62 for(unsigned int q = 0; q < state.multiSample; q++) 63 { 64 zMask[q] = cMask[q]; 65 sMask[q] = cMask[q]; 66 } 67 68 for(unsigned int q = 0; q < state.multiSample; q++) 69 { 70 stencilTest(sBuffer, q, x, sMask[q], cMask[q]); 71 } 72 73 Float4 f; 74 Float4 rhwCentroid; 75 76 Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16); 77 78 if(interpolateZ()) 79 { 80 for(unsigned int q = 0; q < state.multiSample; q++) 81 { 82 Float4 x = xxxx; 83 84 if(state.multiSample > 1) 85 { 86 x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4)); 87 } 88 89 z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false, state.depthClamp); 90 } 91 } 92 93 Bool depthPass = false; 94 95 if(earlyDepthTest) 96 { 97 for(unsigned int q = 0; q < state.multiSample; q++) 98 { 99 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]); 100 } 101 } 102 103 If(depthPass || Bool(!earlyDepthTest)) 104 { 105 #if PERF_PROFILE 106 Long interpTime = Ticks(); 107 #endif 108 109 Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16); 110 111 // Centroid locations 112 Float4 XXXX = Float4(0.0f); 113 Float4 YYYY = Float4(0.0f); 114 115 if(state.centroid) 116 { 117 Float4 WWWW(1.0e-9f); 118 119 for(unsigned int q = 0; q < state.multiSample; q++) 120 { 121 XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]); 122 YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]); 123 WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]); 124 } 125 126 WWWW = Rcp_pp(WWWW); 127 XXXX *= WWWW; 128 YYYY *= WWWW; 129 130 XXXX += xxxx; 131 YYYY += yyyy; 132 } 133 134 if(interpolateW()) 135 { 136 w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false, false); 137 rhw = reciprocal(w, false, false, true); 138 139 if(state.centroid) 140 { 141 rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false)); 142 } 143 } 144 145 for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++) 146 { 147 for(int component = 0; component < 4; component++) 148 { 149 if(state.interpolant[interpolant].component & (1 << component)) 150 { 151 if(!state.interpolant[interpolant].centroid) 152 { 153 v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective, false); 154 } 155 else 156 { 157 v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective); 158 } 159 } 160 } 161 162 Float4 rcp; 163 164 switch(state.interpolant[interpolant].project) 165 { 166 case 0: 167 break; 168 case 1: 169 rcp = reciprocal(v[interpolant].y); 170 v[interpolant].x = v[interpolant].x * rcp; 171 break; 172 case 2: 173 rcp = reciprocal(v[interpolant].z); 174 v[interpolant].x = v[interpolant].x * rcp; 175 v[interpolant].y = v[interpolant].y * rcp; 176 break; 177 case 3: 178 rcp = reciprocal(v[interpolant].w); 179 v[interpolant].x = v[interpolant].x * rcp; 180 v[interpolant].y = v[interpolant].y * rcp; 181 v[interpolant].z = v[interpolant].z * rcp; 182 break; 183 } 184 } 185 186 if(state.fog.component) 187 { 188 f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective, false); 189 } 190 191 setBuiltins(x, y, z, w); 192 193 #if PERF_PROFILE 194 cycles[PERF_INTERP] += Ticks() - interpTime; 195 #endif 196 197 Bool alphaPass = true; 198 199 if(colorUsed()) 200 { 201 #if PERF_PROFILE 202 Long shaderTime = Ticks(); 203 #endif 204 205 applyShader(cMask); 206 207 #if PERF_PROFILE 208 cycles[PERF_SHADER] += Ticks() - shaderTime; 209 #endif 210 211 alphaPass = alphaTest(cMask); 212 213 if((shader && shader->containsKill()) || state.alphaTestActive()) 214 { 215 for(unsigned int q = 0; q < state.multiSample; q++) 216 { 217 zMask[q] &= cMask[q]; 218 sMask[q] &= cMask[q]; 219 } 220 } 221 } 222 223 If(alphaPass) 224 { 225 if(!earlyDepthTest) 226 { 227 for(unsigned int q = 0; q < state.multiSample; q++) 228 { 229 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]); 230 } 231 } 232 233 #if PERF_PROFILE 234 Long ropTime = Ticks(); 235 #endif 236 237 If(depthPass || Bool(earlyDepthTest)) 238 { 239 for(unsigned int q = 0; q < state.multiSample; q++) 240 { 241 if(state.multiSampleMask & (1 << q)) 242 { 243 writeDepth(zBuffer, q, x, z[q], zMask[q]); 244 245 if(state.occlusionEnabled) 246 { 247 occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q])); 248 } 249 } 250 } 251 252 if(colorUsed()) 253 { 254 #if PERF_PROFILE 255 AddAtomic(Pointer<Long>(&profiler.ropOperations), 4); 256 #endif 257 258 rasterOperation(f, cBuffer, x, sMask, zMask, cMask); 259 } 260 } 261 262 #if PERF_PROFILE 263 cycles[PERF_ROP] += Ticks() - ropTime; 264 #endif 265 } 266 } 267 268 for(unsigned int q = 0; q < state.multiSample; q++) 269 { 270 if(state.multiSampleMask & (1 << q)) 271 { 272 writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]); 273 } 274 } 275 276 #if PERF_PROFILE 277 cycles[PERF_PIPE] += Ticks() - pipeTime; 278 #endif 279 } 280 281 Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective) 282 { 283 Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16); 284 285 if(!flat) 286 { 287 interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) + 288 y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16); 289 290 if(perspective) 291 { 292 interpolant *= rhw; 293 } 294 } 295 296 return interpolant; 297 } 298 299 void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask) 300 { 301 if(!state.stencilActive) 302 { 303 return; 304 } 305 306 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask) 307 308 Pointer<Byte> buffer = sBuffer + 2 * x; 309 310 if(q > 0) 311 { 312 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB)); 313 } 314 315 Byte8 value = *Pointer<Byte8>(buffer); 316 Byte8 valueCCW = value; 317 318 if(!state.noStencilMask) 319 { 320 value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ)); 321 } 322 323 stencilTest(value, state.stencilCompareMode, false); 324 325 if(state.twoSidedStencil) 326 { 327 if(!state.noStencilMaskCCW) 328 { 329 valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ)); 330 } 331 332 stencilTest(valueCCW, state.stencilCompareModeCCW, true); 333 334 value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)); 335 valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)); 336 value |= valueCCW; 337 } 338 339 sMask = SignMask(value) & cMask; 340 } 341 342 void PixelRoutine::stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool CCW) 343 { 344 Byte8 equal; 345 346 switch(stencilCompareMode) 347 { 348 case VK_COMPARE_OP_ALWAYS: 349 value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); 350 break; 351 case VK_COMPARE_OP_NEVER: 352 value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); 353 break; 354 case VK_COMPARE_OP_LESS: // a < b ~ b > a 355 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); 356 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ))); 357 break; 358 case VK_COMPARE_OP_EQUAL: 359 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ))); 360 break; 361 case VK_COMPARE_OP_NOT_EQUAL: // a != b ~ !(a == b) 362 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ))); 363 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); 364 break; 365 case VK_COMPARE_OP_LESS_OR_EQUAL: // a <= b ~ (b > a) || (a == b) 366 equal = value; 367 equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ))); 368 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); 369 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ))); 370 value |= equal; 371 break; 372 case VK_COMPARE_OP_GREATER: // a > b 373 equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)); 374 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); 375 equal = CmpGT(As<SByte8>(equal), As<SByte8>(value)); 376 value = equal; 377 break; 378 case VK_COMPARE_OP_GREATER_OR_EQUAL: // a >= b ~ !(a < b) ~ !(b > a) 379 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); 380 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ))); 381 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); 382 break; 383 default: 384 ASSERT(false); 385 } 386 } 387 388 Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask) 389 { 390 if(!state.depthTestActive) 391 { 392 return true; 393 } 394 395 Float4 Z = z; 396 397 if(shader && shader->depthOverride()) 398 { 399 if(complementaryDepthBuffer) 400 { 401 Z = Float4(1.0f) - oDepth; 402 } 403 else 404 { 405 Z = oDepth; 406 } 407 } 408 409 Pointer<Byte> buffer; 410 Int pitch; 411 412 if(!state.quadLayoutDepthBuffer) 413 { 414 buffer = zBuffer + 4 * x; 415 pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB)); 416 } 417 else 418 { 419 buffer = zBuffer + 8 * x; 420 } 421 422 if(q > 0) 423 { 424 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB)); 425 } 426 427 Float4 zValue; 428 429 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable)) 430 { 431 if(!state.quadLayoutDepthBuffer) 432 { 433 // FIXME: Properly optimizes? 434 zValue.xy = *Pointer<Float4>(buffer); 435 zValue.zw = *Pointer<Float4>(buffer + pitch - 8); 436 } 437 else 438 { 439 zValue = *Pointer<Float4>(buffer, 16); 440 } 441 } 442 443 Int4 zTest; 444 445 switch(state.depthCompareMode) 446 { 447 case VK_COMPARE_OP_ALWAYS: 448 // Optimized 449 break; 450 case VK_COMPARE_OP_NEVER: 451 // Optimized 452 break; 453 case VK_COMPARE_OP_EQUAL: 454 zTest = CmpEQ(zValue, Z); 455 break; 456 case VK_COMPARE_OP_NOT_EQUAL: 457 zTest = CmpNEQ(zValue, Z); 458 break; 459 case VK_COMPARE_OP_LESS: 460 if(complementaryDepthBuffer) 461 { 462 zTest = CmpLT(zValue, Z); 463 } 464 else 465 { 466 zTest = CmpNLE(zValue, Z); 467 } 468 break; 469 case VK_COMPARE_OP_GREATER_OR_EQUAL: 470 if(complementaryDepthBuffer) 471 { 472 zTest = CmpNLT(zValue, Z); 473 } 474 else 475 { 476 zTest = CmpLE(zValue, Z); 477 } 478 break; 479 case VK_COMPARE_OP_LESS_OR_EQUAL: 480 if(complementaryDepthBuffer) 481 { 482 zTest = CmpLE(zValue, Z); 483 } 484 else 485 { 486 zTest = CmpNLT(zValue, Z); 487 } 488 break; 489 case VK_COMPARE_OP_GREATER: 490 if(complementaryDepthBuffer) 491 { 492 zTest = CmpNLE(zValue, Z); 493 } 494 else 495 { 496 zTest = CmpLT(zValue, Z); 497 } 498 break; 499 default: 500 ASSERT(false); 501 } 502 503 switch(state.depthCompareMode) 504 { 505 case VK_COMPARE_OP_ALWAYS: 506 zMask = cMask; 507 break; 508 case VK_COMPARE_OP_NEVER: 509 zMask = 0x0; 510 break; 511 default: 512 zMask = SignMask(zTest) & cMask; 513 break; 514 } 515 516 if(state.stencilActive) 517 { 518 zMask &= sMask; 519 } 520 521 return zMask != 0; 522 } 523 524 void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha) 525 { 526 Short4 cmp; 527 Short4 equal; 528 529 switch(state.alphaCompareMode) 530 { 531 case VK_COMPARE_OP_ALWAYS: 532 aMask = 0xF; 533 break; 534 case VK_COMPARE_OP_NEVER: 535 aMask = 0x0; 536 break; 537 case VK_COMPARE_OP_EQUAL: 538 cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))); 539 aMask = SignMask(PackSigned(cmp, Short4(0x0000))); 540 break; 541 case VK_COMPARE_OP_NOT_EQUAL: // a != b ~ !(a == b) 542 cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu); // FIXME 543 aMask = SignMask(PackSigned(cmp, Short4(0x0000))); 544 break; 545 case VK_COMPARE_OP_LESS: // a < b ~ b > a 546 cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha); 547 aMask = SignMask(PackSigned(cmp, Short4(0x0000))); 548 break; 549 case VK_COMPARE_OP_GREATER_OR_EQUAL: // a >= b ~ (a > b) || (a == b) ~ !(b > a) // TODO: Approximate 550 equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))); 551 cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))); 552 cmp |= equal; 553 aMask = SignMask(PackSigned(cmp, Short4(0x0000))); 554 break; 555 case VK_COMPARE_OP_LESS_OR_EQUAL: // a <= b ~ !(a > b) 556 cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu); // FIXME 557 aMask = SignMask(PackSigned(cmp, Short4(0x0000))); 558 break; 559 case VK_COMPARE_OP_GREATER: // a > b 560 cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))); 561 aMask = SignMask(PackSigned(cmp, Short4(0x0000))); 562 break; 563 default: 564 ASSERT(false); 565 } 566 } 567 568 void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha) 569 { 570 Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0))); 571 Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1))); 572 Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2))); 573 Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3))); 574 575 Int aMask0 = SignMask(coverage0); 576 Int aMask1 = SignMask(coverage1); 577 Int aMask2 = SignMask(coverage2); 578 Int aMask3 = SignMask(coverage3); 579 580 cMask[0] &= aMask0; 581 cMask[1] &= aMask1; 582 cMask[2] &= aMask2; 583 cMask[3] &= aMask3; 584 } 585 586 void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask) 587 { 588 if(!state.depthWriteEnable) 589 { 590 return; 591 } 592 593 Float4 Z = z; 594 595 if(shader && shader->depthOverride()) 596 { 597 if(complementaryDepthBuffer) 598 { 599 Z = Float4(1.0f) - oDepth; 600 } 601 else 602 { 603 Z = oDepth; 604 } 605 } 606 607 Pointer<Byte> buffer; 608 Int pitch; 609 610 if(!state.quadLayoutDepthBuffer) 611 { 612 buffer = zBuffer + 4 * x; 613 pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB)); 614 } 615 else 616 { 617 buffer = zBuffer + 8 * x; 618 } 619 620 if(q > 0) 621 { 622 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB)); 623 } 624 625 Float4 zValue; 626 627 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable)) 628 { 629 if(!state.quadLayoutDepthBuffer) 630 { 631 // FIXME: Properly optimizes? 632 zValue.xy = *Pointer<Float4>(buffer); 633 zValue.zw = *Pointer<Float4>(buffer + pitch - 8); 634 } 635 else 636 { 637 zValue = *Pointer<Float4>(buffer, 16); 638 } 639 } 640 641 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16)); 642 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16)); 643 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue)); 644 645 if(!state.quadLayoutDepthBuffer) 646 { 647 // FIXME: Properly optimizes? 648 *Pointer<Float2>(buffer) = Float2(Z.xy); 649 *Pointer<Float2>(buffer + pitch) = Float2(Z.zw); 650 } 651 else 652 { 653 *Pointer<Float4>(buffer, 16) = Z; 654 } 655 } 656 657 void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask) 658 { 659 if(!state.stencilActive) 660 { 661 return; 662 } 663 664 if(state.stencilPassOperation == VK_STENCIL_OP_KEEP && state.stencilZFailOperation == VK_STENCIL_OP_KEEP && state.stencilFailOperation == VK_STENCIL_OP_KEEP) 665 { 666 if(!state.twoSidedStencil || (state.stencilPassOperationCCW == VK_STENCIL_OP_KEEP && state.stencilZFailOperationCCW == VK_STENCIL_OP_KEEP && state.stencilFailOperationCCW == VK_STENCIL_OP_KEEP)) 667 { 668 return; 669 } 670 } 671 672 if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW)) 673 { 674 return; 675 } 676 677 Pointer<Byte> buffer = sBuffer + 2 * x; 678 679 if(q > 0) 680 { 681 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB)); 682 } 683 684 Byte8 bufferValue = *Pointer<Byte8>(buffer); 685 686 Byte8 newValue; 687 stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask); 688 689 if(!state.noStencilWriteMask) 690 { 691 Byte8 maskedValue = bufferValue; 692 newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ)); 693 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ)); 694 newValue |= maskedValue; 695 } 696 697 if(state.twoSidedStencil) 698 { 699 Byte8 newValueCCW; 700 701 stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask); 702 703 if(!state.noStencilWriteMaskCCW) 704 { 705 Byte8 maskedValue = bufferValue; 706 newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ)); 707 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ)); 708 newValueCCW |= maskedValue; 709 } 710 711 newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)); 712 newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)); 713 newValue |= newValueCCW; 714 } 715 716 newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask); 717 bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask); 718 newValue |= bufferValue; 719 720 *Pointer<Byte4>(buffer) = Byte4(newValue); 721 } 722 723 void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, VkStencilOp stencilPassOperation, VkStencilOp stencilZFailOperation, VkStencilOp stencilFailOperation, bool CCW, Int &zMask, Int &sMask) 724 { 725 Byte8 &pass = newValue; 726 Byte8 fail; 727 Byte8 zFail; 728 729 stencilOperation(pass, bufferValue, stencilPassOperation, CCW); 730 731 if(stencilZFailOperation != stencilPassOperation) 732 { 733 stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW); 734 } 735 736 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation) 737 { 738 stencilOperation(fail, bufferValue, stencilFailOperation, CCW); 739 } 740 741 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation) 742 { 743 if(state.depthTestActive && stencilZFailOperation != stencilPassOperation) // zMask valid and values not the same 744 { 745 pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask); 746 zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask); 747 pass |= zFail; 748 } 749 750 pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask); 751 fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask); 752 pass |= fail; 753 } 754 } 755 756 void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, VkStencilOp operation, bool CCW) 757 { 758 switch(operation) 759 { 760 case VK_STENCIL_OP_KEEP: 761 output = bufferValue; 762 break; 763 case VK_STENCIL_OP_ZERO: 764 output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); 765 break; 766 case VK_STENCIL_OP_REPLACE: 767 output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ)); 768 break; 769 case VK_STENCIL_OP_INCREMENT_AND_CLAMP: 770 output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1)); 771 break; 772 case VK_STENCIL_OP_DECREMENT_AND_CLAMP: 773 output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1)); 774 break; 775 case VK_STENCIL_OP_INVERT: 776 output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); 777 break; 778 case VK_STENCIL_OP_INCREMENT_AND_WRAP: 779 output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1); 780 break; 781 case VK_STENCIL_OP_DECREMENT_AND_WRAP: 782 output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1); 783 break; 784 default: 785 ASSERT(false); 786 } 787 } 788 789 void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s ¤t, const Vector4s &pixel, VkBlendFactor blendFactorActive) 790 { 791 switch(blendFactorActive) 792 { 793 case VK_BLEND_FACTOR_ZERO: 794 // Optimized 795 break; 796 case VK_BLEND_FACTOR_ONE: 797 // Optimized 798 break; 799 case VK_BLEND_FACTOR_SRC_COLOR: 800 blendFactor.x = current.x; 801 blendFactor.y = current.y; 802 blendFactor.z = current.z; 803 break; 804 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR: 805 blendFactor.x = Short4(0xFFFFu) - current.x; 806 blendFactor.y = Short4(0xFFFFu) - current.y; 807 blendFactor.z = Short4(0xFFFFu) - current.z; 808 break; 809 case VK_BLEND_FACTOR_DST_COLOR: 810 blendFactor.x = pixel.x; 811 blendFactor.y = pixel.y; 812 blendFactor.z = pixel.z; 813 break; 814 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR: 815 blendFactor.x = Short4(0xFFFFu) - pixel.x; 816 blendFactor.y = Short4(0xFFFFu) - pixel.y; 817 blendFactor.z = Short4(0xFFFFu) - pixel.z; 818 break; 819 case VK_BLEND_FACTOR_SRC_ALPHA: 820 blendFactor.x = current.w; 821 blendFactor.y = current.w; 822 blendFactor.z = current.w; 823 break; 824 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA: 825 blendFactor.x = Short4(0xFFFFu) - current.w; 826 blendFactor.y = Short4(0xFFFFu) - current.w; 827 blendFactor.z = Short4(0xFFFFu) - current.w; 828 break; 829 case VK_BLEND_FACTOR_DST_ALPHA: 830 blendFactor.x = pixel.w; 831 blendFactor.y = pixel.w; 832 blendFactor.z = pixel.w; 833 break; 834 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA: 835 blendFactor.x = Short4(0xFFFFu) - pixel.w; 836 blendFactor.y = Short4(0xFFFFu) - pixel.w; 837 blendFactor.z = Short4(0xFFFFu) - pixel.w; 838 break; 839 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE: 840 blendFactor.x = Short4(0xFFFFu) - pixel.w; 841 blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w)); 842 blendFactor.y = blendFactor.x; 843 blendFactor.z = blendFactor.x; 844 break; 845 case VK_BLEND_FACTOR_CONSTANT_COLOR: 846 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0])); 847 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1])); 848 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2])); 849 break; 850 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR: 851 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0])); 852 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1])); 853 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2])); 854 break; 855 case VK_BLEND_FACTOR_CONSTANT_ALPHA: 856 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3])); 857 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3])); 858 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3])); 859 break; 860 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA: 861 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3])); 862 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3])); 863 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3])); 864 break; 865 default: 866 ASSERT(false); 867 } 868 } 869 870 void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s ¤t, const Vector4s &pixel, VkBlendFactor blendFactorAlphaActive) 871 { 872 switch(blendFactorAlphaActive) 873 { 874 case VK_BLEND_FACTOR_ZERO: 875 // Optimized 876 break; 877 case VK_BLEND_FACTOR_ONE: 878 // Optimized 879 break; 880 case VK_BLEND_FACTOR_SRC_COLOR: 881 blendFactor.w = current.w; 882 break; 883 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR: 884 blendFactor.w = Short4(0xFFFFu) - current.w; 885 break; 886 case VK_BLEND_FACTOR_DST_COLOR: 887 blendFactor.w = pixel.w; 888 break; 889 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR: 890 blendFactor.w = Short4(0xFFFFu) - pixel.w; 891 break; 892 case VK_BLEND_FACTOR_SRC_ALPHA: 893 blendFactor.w = current.w; 894 break; 895 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA: 896 blendFactor.w = Short4(0xFFFFu) - current.w; 897 break; 898 case VK_BLEND_FACTOR_DST_ALPHA: 899 blendFactor.w = pixel.w; 900 break; 901 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA: 902 blendFactor.w = Short4(0xFFFFu) - pixel.w; 903 break; 904 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE: 905 blendFactor.w = Short4(0xFFFFu); 906 break; 907 case VK_BLEND_FACTOR_CONSTANT_COLOR: 908 case VK_BLEND_FACTOR_CONSTANT_ALPHA: 909 blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3])); 910 break; 911 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR: 912 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA: 913 blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3])); 914 break; 915 default: 916 ASSERT(false); 917 } 918 } 919 920 bool PixelRoutine::isSRGB(int index) const 921 { 922 return Surface::isSRGBformat(state.targetFormat[index]); 923 } 924 925 void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel) 926 { 927 Short4 c01; 928 Short4 c23; 929 Pointer<Byte> buffer; 930 Pointer<Byte> buffer2; 931 932 switch(state.targetFormat[index]) 933 { 934 case VK_FORMAT_R5G6B5_UNORM_PACK16: 935 buffer = cBuffer + 2 * x; 936 buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 937 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2))); 938 939 pixel.x = c01 & Short4(0xF800u); 940 pixel.y = (c01 & Short4(0x07E0u)) << 5; 941 pixel.z = (c01 & Short4(0x001Fu)) << 11; 942 pixel.w = Short4(0xFFFFu); 943 break; 944 case VK_FORMAT_B8G8R8A8_UNORM: 945 buffer = cBuffer + 4 * x; 946 c01 = *Pointer<Short4>(buffer); 947 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 948 c23 = *Pointer<Short4>(buffer); 949 pixel.z = c01; 950 pixel.y = c01; 951 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23)); 952 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23)); 953 pixel.x = pixel.z; 954 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y)); 955 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y)); 956 pixel.y = pixel.z; 957 pixel.w = pixel.x; 958 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x)); 959 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y)); 960 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z)); 961 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w)); 962 break; 963 case VK_FORMAT_R8G8B8A8_UNORM: 964 case VK_FORMAT_R8G8B8A8_SRGB: 965 buffer = cBuffer + 4 * x; 966 c01 = *Pointer<Short4>(buffer); 967 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 968 c23 = *Pointer<Short4>(buffer); 969 pixel.z = c01; 970 pixel.y = c01; 971 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23)); 972 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23)); 973 pixel.x = pixel.z; 974 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y)); 975 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y)); 976 pixel.y = pixel.z; 977 pixel.w = pixel.x; 978 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z)); 979 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y)); 980 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w)); 981 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w)); 982 break; 983 case VK_FORMAT_R8_UNORM: 984 buffer = cBuffer + 1 * x; 985 pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0); 986 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 987 pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1); 988 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x)); 989 pixel.y = Short4(0x0000); 990 pixel.z = Short4(0x0000); 991 pixel.w = Short4(0xFFFFu); 992 break; 993 case VK_FORMAT_R8G8_UNORM: 994 buffer = cBuffer + 2 * x; 995 c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0)); 996 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 997 c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1)); 998 pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8); 999 pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8); 1000 pixel.z = Short4(0x0000u); 1001 pixel.w = Short4(0xFFFFu); 1002 break; 1003 case VK_FORMAT_R16G16B16A16_UNORM: 1004 buffer = cBuffer; 1005 pixel.x = *Pointer<Short4>(buffer + 8 * x); 1006 pixel.y = *Pointer<Short4>(buffer + 8 * x + 8); 1007 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 1008 pixel.z = *Pointer<Short4>(buffer + 8 * x); 1009 pixel.w = *Pointer<Short4>(buffer + 8 * x + 8); 1010 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w); 1011 break; 1012 case VK_FORMAT_R16G16_UNORM: 1013 buffer = cBuffer; 1014 pixel.x = *Pointer<Short4>(buffer + 4 * x); 1015 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 1016 pixel.y = *Pointer<Short4>(buffer + 4 * x); 1017 pixel.z = pixel.x; 1018 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y)); 1019 pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y)); 1020 pixel.y = pixel.z; 1021 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z)); 1022 pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z)); 1023 pixel.z = Short4(0xFFFFu); 1024 pixel.w = Short4(0xFFFFu); 1025 break; 1026 default: 1027 ASSERT(false); 1028 } 1029 1030 if((postBlendSRGB && state.writeSRGB) || isSRGB(index)) 1031 { 1032 sRGBtoLinear16_12_16(pixel); 1033 } 1034 } 1035 1036 void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s ¤t, Int &x) 1037 { 1038 if(!state.alphaBlendActive) 1039 { 1040 return; 1041 } 1042 1043 Vector4s pixel; 1044 readPixel(index, cBuffer, x, pixel); 1045 1046 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor 1047 Vector4s sourceFactor; 1048 Vector4s destFactor; 1049 1050 blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor); 1051 blendFactor(destFactor, current, pixel, state.destBlendFactor); 1052 1053 if(state.sourceBlendFactor != VK_BLEND_FACTOR_ONE && state.sourceBlendFactor != VK_BLEND_FACTOR_ZERO) 1054 { 1055 current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x)); 1056 current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y)); 1057 current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z)); 1058 } 1059 1060 if(state.destBlendFactor != VK_BLEND_FACTOR_ONE && state.destBlendFactor != VK_BLEND_FACTOR_ZERO) 1061 { 1062 pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x)); 1063 pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y)); 1064 pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z)); 1065 } 1066 1067 switch(state.blendOperation) 1068 { 1069 case VK_BLEND_OP_ADD: 1070 current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x)); 1071 current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y)); 1072 current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z)); 1073 break; 1074 case VK_BLEND_OP_SUBTRACT: 1075 current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x)); 1076 current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y)); 1077 current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z)); 1078 break; 1079 case VK_BLEND_OP_REVERSE_SUBTRACT: 1080 current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x)); 1081 current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y)); 1082 current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z)); 1083 break; 1084 case VK_BLEND_OP_MIN: 1085 current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x)); 1086 current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y)); 1087 current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z)); 1088 break; 1089 case VK_BLEND_OP_MAX: 1090 current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x)); 1091 current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y)); 1092 current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z)); 1093 break; 1094 case VK_BLEND_OP_SRC_EXT: 1095 // No operation 1096 break; 1097 case VK_BLEND_OP_DST_EXT: 1098 current.x = pixel.x; 1099 current.y = pixel.y; 1100 current.z = pixel.z; 1101 break; 1102 case VK_BLEND_OP_ZERO_EXT: 1103 current.x = Short4(0x0000); 1104 current.y = Short4(0x0000); 1105 current.z = Short4(0x0000); 1106 break; 1107 default: 1108 ASSERT(false); 1109 } 1110 1111 blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha); 1112 blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha); 1113 1114 if(state.sourceBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.sourceBlendFactorAlpha != VK_BLEND_FACTOR_ZERO) 1115 { 1116 current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w)); 1117 } 1118 1119 if(state.destBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.destBlendFactorAlpha != VK_BLEND_FACTOR_ZERO) 1120 { 1121 pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w)); 1122 } 1123 1124 switch(state.blendOperationAlpha) 1125 { 1126 case VK_BLEND_OP_ADD: 1127 current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w)); 1128 break; 1129 case VK_BLEND_OP_SUBTRACT: 1130 current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w)); 1131 break; 1132 case VK_BLEND_OP_REVERSE_SUBTRACT: 1133 current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w)); 1134 break; 1135 case VK_BLEND_OP_MIN: 1136 current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w)); 1137 break; 1138 case VK_BLEND_OP_MAX: 1139 current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w)); 1140 break; 1141 case VK_BLEND_OP_SRC_EXT: 1142 // No operation 1143 break; 1144 case VK_BLEND_OP_DST_EXT: 1145 current.w = pixel.w; 1146 break; 1147 case VK_BLEND_OP_ZERO_EXT: 1148 current.w = Short4(0x0000); 1149 break; 1150 default: 1151 ASSERT(false); 1152 } 1153 } 1154 1155 void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s ¤t, Int &x) 1156 { 1157 if(state.logicalOperation == VK_LOGIC_OP_COPY) 1158 { 1159 return; 1160 } 1161 1162 Vector4s pixel; 1163 readPixel(index, cBuffer, x, pixel); 1164 1165 switch(state.logicalOperation) 1166 { 1167 case VK_LOGIC_OP_CLEAR: 1168 current.x = UShort4(0); 1169 current.y = UShort4(0); 1170 current.z = UShort4(0); 1171 break; 1172 case VK_LOGIC_OP_SET: 1173 current.x = UShort4(0xFFFFu); 1174 current.y = UShort4(0xFFFFu); 1175 current.z = UShort4(0xFFFFu); 1176 break; 1177 case VK_LOGIC_OP_COPY: 1178 ASSERT(false); // Optimized out 1179 break; 1180 case VK_LOGIC_OP_COPY_INVERTED: 1181 current.x = ~current.x; 1182 current.y = ~current.y; 1183 current.z = ~current.z; 1184 break; 1185 case VK_LOGIC_OP_NO_OP: 1186 current.x = pixel.x; 1187 current.y = pixel.y; 1188 current.z = pixel.z; 1189 break; 1190 case VK_LOGIC_OP_INVERT: 1191 current.x = ~pixel.x; 1192 current.y = ~pixel.y; 1193 current.z = ~pixel.z; 1194 break; 1195 case VK_LOGIC_OP_AND: 1196 current.x = pixel.x & current.x; 1197 current.y = pixel.y & current.y; 1198 current.z = pixel.z & current.z; 1199 break; 1200 case VK_LOGIC_OP_NAND: 1201 current.x = ~(pixel.x & current.x); 1202 current.y = ~(pixel.y & current.y); 1203 current.z = ~(pixel.z & current.z); 1204 break; 1205 case VK_LOGIC_OP_OR: 1206 current.x = pixel.x | current.x; 1207 current.y = pixel.y | current.y; 1208 current.z = pixel.z | current.z; 1209 break; 1210 case VK_LOGIC_OP_NOR: 1211 current.x = ~(pixel.x | current.x); 1212 current.y = ~(pixel.y | current.y); 1213 current.z = ~(pixel.z | current.z); 1214 break; 1215 case VK_LOGIC_OP_XOR: 1216 current.x = pixel.x ^ current.x; 1217 current.y = pixel.y ^ current.y; 1218 current.z = pixel.z ^ current.z; 1219 break; 1220 case VK_LOGIC_OP_EQUIVALENT: 1221 current.x = ~(pixel.x ^ current.x); 1222 current.y = ~(pixel.y ^ current.y); 1223 current.z = ~(pixel.z ^ current.z); 1224 break; 1225 case VK_LOGIC_OP_AND_REVERSE: 1226 current.x = ~pixel.x & current.x; 1227 current.y = ~pixel.y & current.y; 1228 current.z = ~pixel.z & current.z; 1229 break; 1230 case VK_LOGIC_OP_AND_INVERTED: 1231 current.x = pixel.x & ~current.x; 1232 current.y = pixel.y & ~current.y; 1233 current.z = pixel.z & ~current.z; 1234 break; 1235 case VK_LOGIC_OP_OR_REVERSE: 1236 current.x = ~pixel.x | current.x; 1237 current.y = ~pixel.y | current.y; 1238 current.z = ~pixel.z | current.z; 1239 break; 1240 case VK_LOGIC_OP_OR_INVERTED: 1241 current.x = pixel.x | ~current.x; 1242 current.y = pixel.y | ~current.y; 1243 current.z = pixel.z | ~current.z; 1244 break; 1245 default: 1246 ASSERT(false); 1247 } 1248 } 1249 1250 void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s ¤t, Int &sMask, Int &zMask, Int &cMask) 1251 { 1252 if((postBlendSRGB && state.writeSRGB) || isSRGB(index)) 1253 { 1254 linearToSRGB16_12_16(current); 1255 } 1256 1257 if(exactColorRounding) 1258 { 1259 switch(state.targetFormat[index]) 1260 { 1261 case VK_FORMAT_R5G6B5_UNORM_PACK16: 1262 current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400)); 1263 current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200)); 1264 current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400)); 1265 break; 1266 case VK_FORMAT_B8G8R8A8_UNORM: 1267 case VK_FORMAT_R8G8B8A8_UNORM: 1268 case VK_FORMAT_R8G8B8A8_SRGB: 1269 case VK_FORMAT_R8G8_UNORM: 1270 case VK_FORMAT_R8_UNORM: 1271 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080); 1272 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080); 1273 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080); 1274 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080); 1275 break; 1276 default: 1277 break; 1278 } 1279 } 1280 1281 int rgbaWriteMask = state.colorWriteActive(index); 1282 int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2; 1283 1284 switch(state.targetFormat[index]) 1285 { 1286 case VK_FORMAT_R5G6B5_UNORM_PACK16: 1287 { 1288 current.x = current.x & Short4(0xF800u); 1289 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5; 1290 current.z = As<UShort4>(current.z) >> 11; 1291 1292 current.x = current.x | current.y | current.z; 1293 } 1294 break; 1295 case VK_FORMAT_B8G8R8A8_UNORM: 1296 if(rgbaWriteMask == 0x7) 1297 { 1298 current.x = As<Short4>(As<UShort4>(current.x) >> 8); 1299 current.y = As<Short4>(As<UShort4>(current.y) >> 8); 1300 current.z = As<Short4>(As<UShort4>(current.z) >> 8); 1301 1302 current.z = As<Short4>(PackUnsigned(current.z, current.x)); 1303 current.y = As<Short4>(PackUnsigned(current.y, current.y)); 1304 1305 current.x = current.z; 1306 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y)); 1307 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y)); 1308 current.y = current.z; 1309 current.z = As<Short4>(UnpackLow(current.z, current.x)); 1310 current.y = As<Short4>(UnpackHigh(current.y, current.x)); 1311 } 1312 else 1313 { 1314 current.x = As<Short4>(As<UShort4>(current.x) >> 8); 1315 current.y = As<Short4>(As<UShort4>(current.y) >> 8); 1316 current.z = As<Short4>(As<UShort4>(current.z) >> 8); 1317 current.w = As<Short4>(As<UShort4>(current.w) >> 8); 1318 1319 current.z = As<Short4>(PackUnsigned(current.z, current.x)); 1320 current.y = As<Short4>(PackUnsigned(current.y, current.w)); 1321 1322 current.x = current.z; 1323 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y)); 1324 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y)); 1325 current.y = current.z; 1326 current.z = As<Short4>(UnpackLow(current.z, current.x)); 1327 current.y = As<Short4>(UnpackHigh(current.y, current.x)); 1328 } 1329 break; 1330 case VK_FORMAT_R8G8B8A8_UNORM: 1331 case VK_FORMAT_R8G8B8A8_SRGB: 1332 if(rgbaWriteMask == 0x7) 1333 { 1334 current.x = As<Short4>(As<UShort4>(current.x) >> 8); 1335 current.y = As<Short4>(As<UShort4>(current.y) >> 8); 1336 current.z = As<Short4>(As<UShort4>(current.z) >> 8); 1337 1338 current.z = As<Short4>(PackUnsigned(current.x, current.z)); 1339 current.y = As<Short4>(PackUnsigned(current.y, current.y)); 1340 1341 current.x = current.z; 1342 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y)); 1343 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y)); 1344 current.y = current.z; 1345 current.z = As<Short4>(UnpackLow(current.z, current.x)); 1346 current.y = As<Short4>(UnpackHigh(current.y, current.x)); 1347 } 1348 else 1349 { 1350 current.x = As<Short4>(As<UShort4>(current.x) >> 8); 1351 current.y = As<Short4>(As<UShort4>(current.y) >> 8); 1352 current.z = As<Short4>(As<UShort4>(current.z) >> 8); 1353 current.w = As<Short4>(As<UShort4>(current.w) >> 8); 1354 1355 current.z = As<Short4>(PackUnsigned(current.x, current.z)); 1356 current.y = As<Short4>(PackUnsigned(current.y, current.w)); 1357 1358 current.x = current.z; 1359 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y)); 1360 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y)); 1361 current.y = current.z; 1362 current.z = As<Short4>(UnpackLow(current.z, current.x)); 1363 current.y = As<Short4>(UnpackHigh(current.y, current.x)); 1364 } 1365 break; 1366 case VK_FORMAT_R8G8_UNORM: 1367 current.x = As<Short4>(As<UShort4>(current.x) >> 8); 1368 current.y = As<Short4>(As<UShort4>(current.y) >> 8); 1369 current.x = As<Short4>(PackUnsigned(current.x, current.x)); 1370 current.y = As<Short4>(PackUnsigned(current.y, current.y)); 1371 current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y)); 1372 break; 1373 case VK_FORMAT_R8_UNORM: 1374 current.x = As<Short4>(As<UShort4>(current.x) >> 8); 1375 current.x = As<Short4>(PackUnsigned(current.x, current.x)); 1376 break; 1377 case VK_FORMAT_R16G16_UNORM: 1378 current.z = current.x; 1379 current.x = As<Short4>(UnpackLow(current.x, current.y)); 1380 current.z = As<Short4>(UnpackHigh(current.z, current.y)); 1381 current.y = current.z; 1382 break; 1383 case VK_FORMAT_R16G16B16A16_UNORM: 1384 transpose4x4(current.x, current.y, current.z, current.w); 1385 break; 1386 default: 1387 ASSERT(false); 1388 } 1389 1390 Short4 c01 = current.z; 1391 Short4 c23 = current.y; 1392 1393 Int xMask; // Combination of all masks 1394 1395 if(state.depthTestActive) 1396 { 1397 xMask = zMask; 1398 } 1399 else 1400 { 1401 xMask = cMask; 1402 } 1403 1404 if(state.stencilActive) 1405 { 1406 xMask &= sMask; 1407 } 1408 1409 switch(state.targetFormat[index]) 1410 { 1411 case VK_FORMAT_R5G6B5_UNORM_PACK16: 1412 { 1413 Pointer<Byte> buffer = cBuffer + 2 * x; 1414 Int value = *Pointer<Int>(buffer); 1415 1416 Int c01 = Extract(As<Int2>(current.x), 0); 1417 1418 if((bgraWriteMask & 0x00000007) != 0x00000007) 1419 { 1420 Int masked = value; 1421 c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0])); 1422 masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0])); 1423 c01 |= masked; 1424 } 1425 1426 c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8); 1427 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8); 1428 c01 |= value; 1429 *Pointer<Int>(buffer) = c01; 1430 1431 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 1432 value = *Pointer<Int>(buffer); 1433 1434 Int c23 = Extract(As<Int2>(current.x), 1); 1435 1436 if((bgraWriteMask & 0x00000007) != 0x00000007) 1437 { 1438 Int masked = value; 1439 c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0])); 1440 masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0])); 1441 c23 |= masked; 1442 } 1443 1444 c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8); 1445 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8); 1446 c23 |= value; 1447 *Pointer<Int>(buffer) = c23; 1448 } 1449 break; 1450 case VK_FORMAT_B8G8R8A8_UNORM: 1451 { 1452 Pointer<Byte> buffer = cBuffer + x * 4; 1453 Short4 value = *Pointer<Short4>(buffer); 1454 1455 if(state.targetFormat[index] == VK_FORMAT_B8G8R8A8_UNORM && bgraWriteMask != 0x0000000F) // FIXME: Need for masking when XRGB && Fh? 1456 { 1457 Short4 masked = value; 1458 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0])); 1459 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0])); 1460 c01 |= masked; 1461 } 1462 1463 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8); 1464 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8); 1465 c01 |= value; 1466 *Pointer<Short4>(buffer) = c01; 1467 1468 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 1469 value = *Pointer<Short4>(buffer); 1470 1471 if(state.targetFormat[index] == VK_FORMAT_B8G8R8A8_UNORM && bgraWriteMask != 0x0000000F) // FIXME: Need for masking when XRGB && Fh? 1472 { 1473 Short4 masked = value; 1474 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0])); 1475 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0])); 1476 c23 |= masked; 1477 } 1478 1479 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8); 1480 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8); 1481 c23 |= value; 1482 *Pointer<Short4>(buffer) = c23; 1483 } 1484 break; 1485 case VK_FORMAT_R8G8B8A8_UNORM: 1486 case VK_FORMAT_R8G8B8A8_SRGB: 1487 { 1488 Pointer<Byte> buffer = cBuffer + x * 4; 1489 Short4 value = *Pointer<Short4>(buffer); 1490 1491 bool masked = ((state.targetFormat[index] == VK_FORMAT_R8G8B8A8_UNORM || state.targetFormat[index] == VK_FORMAT_R8G8B8A8_SRGB) && rgbaWriteMask != 0x0000000F); // FIXME: Need for masking when XBGR && Fh? 1492 1493 if(masked) 1494 { 1495 Short4 masked = value; 1496 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0])); 1497 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0])); 1498 c01 |= masked; 1499 } 1500 1501 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8); 1502 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8); 1503 c01 |= value; 1504 *Pointer<Short4>(buffer) = c01; 1505 1506 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 1507 value = *Pointer<Short4>(buffer); 1508 1509 if(masked) 1510 { 1511 Short4 masked = value; 1512 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0])); 1513 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0])); 1514 c23 |= masked; 1515 } 1516 1517 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8); 1518 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8); 1519 c23 |= value; 1520 *Pointer<Short4>(buffer) = c23; 1521 } 1522 break; 1523 case VK_FORMAT_R8G8_UNORM: 1524 if((rgbaWriteMask & 0x00000003) != 0x0) 1525 { 1526 Pointer<Byte> buffer = cBuffer + 2 * x; 1527 Int2 value; 1528 value = Insert(value, *Pointer<Int>(buffer), 0); 1529 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 1530 value = Insert(value, *Pointer<Int>(buffer + pitch), 1); 1531 1532 Int2 packedCol = As<Int2>(current.x); 1533 1534 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8); 1535 if((rgbaWriteMask & 0x3) != 0x3) 1536 { 1537 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0])); 1538 UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask)); 1539 mergedMask &= rgbaMask; 1540 } 1541 1542 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask)); 1543 1544 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0)); 1545 *Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1)); 1546 } 1547 break; 1548 case VK_FORMAT_R8_UNORM: 1549 if(rgbaWriteMask & 0x00000001) 1550 { 1551 Pointer<Byte> buffer = cBuffer + 1 * x; 1552 Short4 value; 1553 value = Insert(value, *Pointer<Short>(buffer), 0); 1554 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 1555 value = Insert(value, *Pointer<Short>(buffer + pitch), 1); 1556 1557 current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask); 1558 value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask); 1559 current.x |= value; 1560 1561 *Pointer<Short>(buffer) = Extract(current.x, 0); 1562 *Pointer<Short>(buffer + pitch) = Extract(current.x, 1); 1563 } 1564 break; 1565 case VK_FORMAT_R16G16_UNORM: 1566 { 1567 Pointer<Byte> buffer = cBuffer + 4 * x; 1568 1569 Short4 value = *Pointer<Short4>(buffer); 1570 1571 if((rgbaWriteMask & 0x00000003) != 0x00000003) 1572 { 1573 Short4 masked = value; 1574 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0])); 1575 masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0])); 1576 current.x |= masked; 1577 } 1578 1579 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8); 1580 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8); 1581 current.x |= value; 1582 *Pointer<Short4>(buffer) = current.x; 1583 1584 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 1585 1586 value = *Pointer<Short4>(buffer); 1587 1588 if((rgbaWriteMask & 0x00000003) != 0x00000003) 1589 { 1590 Short4 masked = value; 1591 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0])); 1592 masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0])); 1593 current.y |= masked; 1594 } 1595 1596 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8); 1597 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8); 1598 current.y |= value; 1599 *Pointer<Short4>(buffer) = current.y; 1600 } 1601 break; 1602 case VK_FORMAT_R16G16B16A16_UNORM: 1603 { 1604 Pointer<Byte> buffer = cBuffer + 8 * x; 1605 1606 { 1607 Short4 value = *Pointer<Short4>(buffer); 1608 1609 if(rgbaWriteMask != 0x0000000F) 1610 { 1611 Short4 masked = value; 1612 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0])); 1613 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0])); 1614 current.x |= masked; 1615 } 1616 1617 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8); 1618 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8); 1619 current.x |= value; 1620 *Pointer<Short4>(buffer) = current.x; 1621 } 1622 1623 { 1624 Short4 value = *Pointer<Short4>(buffer + 8); 1625 1626 if(rgbaWriteMask != 0x0000000F) 1627 { 1628 Short4 masked = value; 1629 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0])); 1630 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0])); 1631 current.y |= masked; 1632 } 1633 1634 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8); 1635 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8); 1636 current.y |= value; 1637 *Pointer<Short4>(buffer + 8) = current.y; 1638 } 1639 1640 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 1641 1642 { 1643 Short4 value = *Pointer<Short4>(buffer); 1644 1645 if(rgbaWriteMask != 0x0000000F) 1646 { 1647 Short4 masked = value; 1648 current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0])); 1649 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0])); 1650 current.z |= masked; 1651 } 1652 1653 current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8); 1654 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8); 1655 current.z |= value; 1656 *Pointer<Short4>(buffer) = current.z; 1657 } 1658 1659 { 1660 Short4 value = *Pointer<Short4>(buffer + 8); 1661 1662 if(rgbaWriteMask != 0x0000000F) 1663 { 1664 Short4 masked = value; 1665 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0])); 1666 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0])); 1667 current.w |= masked; 1668 } 1669 1670 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8); 1671 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8); 1672 current.w |= value; 1673 *Pointer<Short4>(buffer + 8) = current.w; 1674 } 1675 } 1676 break; 1677 default: 1678 ASSERT(false); 1679 } 1680 } 1681 1682 void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorActive) 1683 { 1684 switch(blendFactorActive) 1685 { 1686 case VK_BLEND_FACTOR_ZERO: 1687 // Optimized 1688 break; 1689 case VK_BLEND_FACTOR_ONE: 1690 // Optimized 1691 break; 1692 case VK_BLEND_FACTOR_SRC_COLOR: 1693 blendFactor.x = oC.x; 1694 blendFactor.y = oC.y; 1695 blendFactor.z = oC.z; 1696 break; 1697 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR: 1698 blendFactor.x = Float4(1.0f) - oC.x; 1699 blendFactor.y = Float4(1.0f) - oC.y; 1700 blendFactor.z = Float4(1.0f) - oC.z; 1701 break; 1702 case VK_BLEND_FACTOR_DST_COLOR: 1703 blendFactor.x = pixel.x; 1704 blendFactor.y = pixel.y; 1705 blendFactor.z = pixel.z; 1706 break; 1707 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR: 1708 blendFactor.x = Float4(1.0f) - pixel.x; 1709 blendFactor.y = Float4(1.0f) - pixel.y; 1710 blendFactor.z = Float4(1.0f) - pixel.z; 1711 break; 1712 case VK_BLEND_FACTOR_SRC_ALPHA: 1713 blendFactor.x = oC.w; 1714 blendFactor.y = oC.w; 1715 blendFactor.z = oC.w; 1716 break; 1717 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA: 1718 blendFactor.x = Float4(1.0f) - oC.w; 1719 blendFactor.y = Float4(1.0f) - oC.w; 1720 blendFactor.z = Float4(1.0f) - oC.w; 1721 break; 1722 case VK_BLEND_FACTOR_DST_ALPHA: 1723 blendFactor.x = pixel.w; 1724 blendFactor.y = pixel.w; 1725 blendFactor.z = pixel.w; 1726 break; 1727 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA: 1728 blendFactor.x = Float4(1.0f) - pixel.w; 1729 blendFactor.y = Float4(1.0f) - pixel.w; 1730 blendFactor.z = Float4(1.0f) - pixel.w; 1731 break; 1732 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE: 1733 blendFactor.x = Float4(1.0f) - pixel.w; 1734 blendFactor.x = Min(blendFactor.x, oC.w); 1735 blendFactor.y = blendFactor.x; 1736 blendFactor.z = blendFactor.x; 1737 break; 1738 case VK_BLEND_FACTOR_CONSTANT_COLOR: 1739 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0])); 1740 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1])); 1741 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2])); 1742 break; 1743 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR: 1744 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0])); 1745 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1])); 1746 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2])); 1747 break; 1748 default: 1749 ASSERT(false); 1750 } 1751 } 1752 1753 void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorAlphaActive) 1754 { 1755 switch(blendFactorAlphaActive) 1756 { 1757 case VK_BLEND_FACTOR_ZERO: 1758 // Optimized 1759 break; 1760 case VK_BLEND_FACTOR_ONE: 1761 // Optimized 1762 break; 1763 case VK_BLEND_FACTOR_SRC_COLOR: 1764 blendFactor.w = oC.w; 1765 break; 1766 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR: 1767 blendFactor.w = Float4(1.0f) - oC.w; 1768 break; 1769 case VK_BLEND_FACTOR_DST_COLOR: 1770 blendFactor.w = pixel.w; 1771 break; 1772 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR: 1773 blendFactor.w = Float4(1.0f) - pixel.w; 1774 break; 1775 case VK_BLEND_FACTOR_SRC_ALPHA: 1776 blendFactor.w = oC.w; 1777 break; 1778 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA: 1779 blendFactor.w = Float4(1.0f) - oC.w; 1780 break; 1781 case VK_BLEND_FACTOR_DST_ALPHA: 1782 blendFactor.w = pixel.w; 1783 break; 1784 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA: 1785 blendFactor.w = Float4(1.0f) - pixel.w; 1786 break; 1787 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE: 1788 blendFactor.w = Float4(1.0f); 1789 break; 1790 case VK_BLEND_FACTOR_CONSTANT_COLOR: 1791 blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3])); 1792 break; 1793 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR: 1794 blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3])); 1795 break; 1796 default: 1797 ASSERT(false); 1798 } 1799 } 1800 1801 void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x) 1802 { 1803 if(!state.alphaBlendActive) 1804 { 1805 return; 1806 } 1807 1808 Pointer<Byte> buffer; 1809 Vector4f pixel; 1810 1811 Vector4s color; 1812 Short4 c01; 1813 Short4 c23; 1814 1815 Float4 one; 1816 if(Surface::isFloatFormat(state.targetFormat[index])) 1817 { 1818 one = Float4(1.0f); 1819 } 1820 else if(Surface::isNonNormalizedInteger(state.targetFormat[index])) 1821 { 1822 one = As<Float4>(Surface::isUnsignedComponent(state.targetFormat[index], 0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF)); 1823 } 1824 1825 switch(state.targetFormat[index]) 1826 { 1827 case VK_FORMAT_R32_SINT: 1828 case VK_FORMAT_R32_UINT: 1829 case VK_FORMAT_R32_SFLOAT: 1830 buffer = cBuffer; 1831 // FIXME: movlps 1832 pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0); 1833 pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4); 1834 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 1835 // FIXME: movhps 1836 pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0); 1837 pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4); 1838 pixel.y = pixel.z = pixel.w = one; 1839 break; 1840 case VK_FORMAT_R32G32_SINT: 1841 case VK_FORMAT_R32G32_UINT: 1842 case VK_FORMAT_R32G32_SFLOAT: 1843 buffer = cBuffer; 1844 pixel.x = *Pointer<Float4>(buffer + 8 * x, 16); 1845 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 1846 pixel.y = *Pointer<Float4>(buffer + 8 * x, 16); 1847 pixel.z = pixel.x; 1848 pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88); 1849 pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD); 1850 pixel.y = pixel.z; 1851 pixel.z = pixel.w = one; 1852 break; 1853 case VK_FORMAT_R32G32B32A32_SFLOAT: 1854 case VK_FORMAT_R32G32B32A32_SINT: 1855 case VK_FORMAT_R32G32B32A32_UINT: 1856 buffer = cBuffer; 1857 pixel.x = *Pointer<Float4>(buffer + 16 * x, 16); 1858 pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16); 1859 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 1860 pixel.z = *Pointer<Float4>(buffer + 16 * x, 16); 1861 pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16); 1862 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w); 1863 break; 1864 default: 1865 ASSERT(false); 1866 } 1867 1868 if((postBlendSRGB && state.writeSRGB) || isSRGB(index)) 1869 { 1870 sRGBtoLinear(pixel.x); 1871 sRGBtoLinear(pixel.y); 1872 sRGBtoLinear(pixel.z); 1873 } 1874 1875 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor 1876 Vector4f sourceFactor; 1877 Vector4f destFactor; 1878 1879 blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor); 1880 blendFactor(destFactor, oC, pixel, state.destBlendFactor); 1881 1882 if(state.sourceBlendFactor != VK_BLEND_FACTOR_ONE && state.sourceBlendFactor != VK_BLEND_FACTOR_ZERO) 1883 { 1884 oC.x *= sourceFactor.x; 1885 oC.y *= sourceFactor.y; 1886 oC.z *= sourceFactor.z; 1887 } 1888 1889 if(state.destBlendFactor != VK_BLEND_FACTOR_ONE && state.destBlendFactor != VK_BLEND_FACTOR_ZERO) 1890 { 1891 pixel.x *= destFactor.x; 1892 pixel.y *= destFactor.y; 1893 pixel.z *= destFactor.z; 1894 } 1895 1896 switch(state.blendOperation) 1897 { 1898 case VK_BLEND_OP_ADD: 1899 oC.x += pixel.x; 1900 oC.y += pixel.y; 1901 oC.z += pixel.z; 1902 break; 1903 case VK_BLEND_OP_SUBTRACT: 1904 oC.x -= pixel.x; 1905 oC.y -= pixel.y; 1906 oC.z -= pixel.z; 1907 break; 1908 case VK_BLEND_OP_REVERSE_SUBTRACT: 1909 oC.x = pixel.x - oC.x; 1910 oC.y = pixel.y - oC.y; 1911 oC.z = pixel.z - oC.z; 1912 break; 1913 case VK_BLEND_OP_MIN: 1914 oC.x = Min(oC.x, pixel.x); 1915 oC.y = Min(oC.y, pixel.y); 1916 oC.z = Min(oC.z, pixel.z); 1917 break; 1918 case VK_BLEND_OP_MAX: 1919 oC.x = Max(oC.x, pixel.x); 1920 oC.y = Max(oC.y, pixel.y); 1921 oC.z = Max(oC.z, pixel.z); 1922 break; 1923 case VK_BLEND_OP_SRC_EXT: 1924 // No operation 1925 break; 1926 case VK_BLEND_OP_DST_EXT: 1927 oC.x = pixel.x; 1928 oC.y = pixel.y; 1929 oC.z = pixel.z; 1930 break; 1931 case VK_BLEND_OP_ZERO_EXT: 1932 oC.x = Float4(0.0f); 1933 oC.y = Float4(0.0f); 1934 oC.z = Float4(0.0f); 1935 break; 1936 default: 1937 ASSERT(false); 1938 } 1939 1940 blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha); 1941 blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha); 1942 1943 if(state.sourceBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.sourceBlendFactorAlpha != VK_BLEND_FACTOR_ZERO) 1944 { 1945 oC.w *= sourceFactor.w; 1946 } 1947 1948 if(state.destBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.destBlendFactorAlpha != VK_BLEND_FACTOR_ZERO) 1949 { 1950 pixel.w *= destFactor.w; 1951 } 1952 1953 switch(state.blendOperationAlpha) 1954 { 1955 case VK_BLEND_OP_ADD: 1956 oC.w += pixel.w; 1957 break; 1958 case VK_BLEND_OP_SUBTRACT: 1959 oC.w -= pixel.w; 1960 break; 1961 case VK_BLEND_OP_REVERSE_SUBTRACT: 1962 pixel.w -= oC.w; 1963 oC.w = pixel.w; 1964 break; 1965 case VK_BLEND_OP_MIN: 1966 oC.w = Min(oC.w, pixel.w); 1967 break; 1968 case VK_BLEND_OP_MAX: 1969 oC.w = Max(oC.w, pixel.w); 1970 break; 1971 case VK_BLEND_OP_SRC_EXT: 1972 // No operation 1973 break; 1974 case VK_BLEND_OP_DST_EXT: 1975 oC.w = pixel.w; 1976 break; 1977 case VK_BLEND_OP_ZERO_EXT: 1978 oC.w = Float4(0.0f); 1979 break; 1980 default: 1981 ASSERT(false); 1982 } 1983 } 1984 1985 void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask) 1986 { 1987 switch(state.targetFormat[index]) 1988 { 1989 case VK_FORMAT_R32_SFLOAT: 1990 case VK_FORMAT_R32_SINT: 1991 case VK_FORMAT_R32_UINT: 1992 case VK_FORMAT_R16_SINT: 1993 case VK_FORMAT_R16_UINT: 1994 case VK_FORMAT_R8_SINT: 1995 case VK_FORMAT_R8_UINT: 1996 break; 1997 case VK_FORMAT_R32G32_SFLOAT: 1998 case VK_FORMAT_R32G32_SINT: 1999 case VK_FORMAT_R32G32_UINT: 2000 case VK_FORMAT_R16G16_SINT: 2001 case VK_FORMAT_R16G16_UINT: 2002 case VK_FORMAT_R8G8_SINT: 2003 case VK_FORMAT_R8G8_UINT: 2004 oC.z = oC.x; 2005 oC.x = UnpackLow(oC.x, oC.y); 2006 oC.z = UnpackHigh(oC.z, oC.y); 2007 oC.y = oC.z; 2008 break; 2009 case VK_FORMAT_R32G32B32A32_SFLOAT: 2010 case VK_FORMAT_R32G32B32A32_SINT: 2011 case VK_FORMAT_R32G32B32A32_UINT: 2012 case VK_FORMAT_R16G16B16A16_SINT: 2013 case VK_FORMAT_R16G16B16A16_UINT: 2014 case VK_FORMAT_R8G8B8A8_SINT: 2015 case VK_FORMAT_R8G8B8A8_UINT: 2016 transpose4x4(oC.x, oC.y, oC.z, oC.w); 2017 break; 2018 default: 2019 ASSERT(false); 2020 } 2021 2022 int rgbaWriteMask = state.colorWriteActive(index); 2023 2024 Int xMask; // Combination of all masks 2025 2026 if(state.depthTestActive) 2027 { 2028 xMask = zMask; 2029 } 2030 else 2031 { 2032 xMask = cMask; 2033 } 2034 2035 if(state.stencilActive) 2036 { 2037 xMask &= sMask; 2038 } 2039 2040 Pointer<Byte> buffer; 2041 Float4 value; 2042 2043 switch(state.targetFormat[index]) 2044 { 2045 case VK_FORMAT_R32_SFLOAT: 2046 case VK_FORMAT_R32_SINT: 2047 case VK_FORMAT_R32_UINT: 2048 if(rgbaWriteMask & 0x00000001) 2049 { 2050 buffer = cBuffer + 4 * x; 2051 2052 // FIXME: movlps 2053 value.x = *Pointer<Float>(buffer + 0); 2054 value.y = *Pointer<Float>(buffer + 4); 2055 2056 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 2057 2058 // FIXME: movhps 2059 value.z = *Pointer<Float>(buffer + 0); 2060 value.w = *Pointer<Float>(buffer + 4); 2061 2062 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16)); 2063 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16)); 2064 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value)); 2065 2066 // FIXME: movhps 2067 *Pointer<Float>(buffer + 0) = oC.x.z; 2068 *Pointer<Float>(buffer + 4) = oC.x.w; 2069 2070 buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 2071 2072 // FIXME: movlps 2073 *Pointer<Float>(buffer + 0) = oC.x.x; 2074 *Pointer<Float>(buffer + 4) = oC.x.y; 2075 } 2076 break; 2077 case VK_FORMAT_R16_SINT: 2078 case VK_FORMAT_R16_UINT: 2079 if(rgbaWriteMask & 0x00000001) 2080 { 2081 buffer = cBuffer + 2 * x; 2082 2083 UShort4 xyzw; 2084 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0)); 2085 2086 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 2087 2088 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1)); 2089 value = As<Float4>(Int4(xyzw)); 2090 2091 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16)); 2092 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16)); 2093 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value)); 2094 2095 if(state.targetFormat[index] == VK_FORMAT_R16_SINT) 2096 { 2097 Float component = oC.x.z; 2098 *Pointer<Short>(buffer + 0) = Short(As<Int>(component)); 2099 component = oC.x.w; 2100 *Pointer<Short>(buffer + 2) = Short(As<Int>(component)); 2101 2102 buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 2103 2104 component = oC.x.x; 2105 *Pointer<Short>(buffer + 0) = Short(As<Int>(component)); 2106 component = oC.x.y; 2107 *Pointer<Short>(buffer + 2) = Short(As<Int>(component)); 2108 } 2109 else // VK_FORMAT_R16_UINT 2110 { 2111 Float component = oC.x.z; 2112 *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component)); 2113 component = oC.x.w; 2114 *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component)); 2115 2116 buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 2117 2118 component = oC.x.x; 2119 *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component)); 2120 component = oC.x.y; 2121 *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component)); 2122 } 2123 } 2124 break; 2125 case VK_FORMAT_R8_SINT: 2126 case VK_FORMAT_R8_UINT: 2127 if(rgbaWriteMask & 0x00000001) 2128 { 2129 buffer = cBuffer + x; 2130 2131 UInt xyzw, packedCol; 2132 2133 xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF; 2134 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 2135 xyzw |= UInt(*Pointer<UShort>(buffer)) << 16; 2136 2137 Short4 tmpCol = Short4(As<Int4>(oC.x)); 2138 if(state.targetFormat[index] == VK_FORMAT_R8_SINT) 2139 { 2140 tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol)); 2141 } 2142 else 2143 { 2144 tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol)); 2145 } 2146 packedCol = Extract(As<Int2>(tmpCol), 0); 2147 2148 packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) | 2149 (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask)); 2150 2151 *Pointer<UShort>(buffer) = UShort(packedCol >> 16); 2152 buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 2153 *Pointer<UShort>(buffer) = UShort(packedCol); 2154 } 2155 break; 2156 case VK_FORMAT_R32G32_SFLOAT: 2157 case VK_FORMAT_R32G32_SINT: 2158 case VK_FORMAT_R32G32_UINT: 2159 buffer = cBuffer + 8 * x; 2160 2161 value = *Pointer<Float4>(buffer); 2162 2163 if((rgbaWriteMask & 0x00000003) != 0x00000003) 2164 { 2165 Float4 masked = value; 2166 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0]))); 2167 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0]))); 2168 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked)); 2169 } 2170 2171 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16)); 2172 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16)); 2173 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value)); 2174 *Pointer<Float4>(buffer) = oC.x; 2175 2176 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 2177 2178 value = *Pointer<Float4>(buffer); 2179 2180 if((rgbaWriteMask & 0x00000003) != 0x00000003) 2181 { 2182 Float4 masked; 2183 2184 masked = value; 2185 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0]))); 2186 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0]))); 2187 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked)); 2188 } 2189 2190 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16)); 2191 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16)); 2192 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value)); 2193 *Pointer<Float4>(buffer) = oC.y; 2194 break; 2195 case VK_FORMAT_R16G16_SINT: 2196 case VK_FORMAT_R16G16_UINT: 2197 if((rgbaWriteMask & 0x00000003) != 0x0) 2198 { 2199 buffer = cBuffer + 4 * x; 2200 2201 UInt2 rgbaMask; 2202 UShort4 packedCol = UShort4(As<Int4>(oC.x)); 2203 UShort4 value = *Pointer<UShort4>(buffer); 2204 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8); 2205 if((rgbaWriteMask & 0x3) != 0x3) 2206 { 2207 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0])); 2208 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask)); 2209 mergedMask &= rgbaMask; 2210 } 2211 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask); 2212 2213 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 2214 2215 packedCol = UShort4(As<Int4>(oC.y)); 2216 value = *Pointer<UShort4>(buffer); 2217 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8); 2218 if((rgbaWriteMask & 0x3) != 0x3) 2219 { 2220 mergedMask &= rgbaMask; 2221 } 2222 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask); 2223 } 2224 break; 2225 case VK_FORMAT_R8G8_SINT: 2226 case VK_FORMAT_R8G8_UINT: 2227 if((rgbaWriteMask & 0x00000003) != 0x0) 2228 { 2229 buffer = cBuffer + 2 * x; 2230 2231 Int2 xyzw, packedCol; 2232 2233 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0); 2234 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 2235 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1); 2236 2237 if(state.targetFormat[index] == VK_FORMAT_R8G8_SINT) 2238 { 2239 packedCol = As<Int2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y)))); 2240 } 2241 else 2242 { 2243 packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y)))); 2244 } 2245 2246 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8); 2247 if((rgbaWriteMask & 0x3) != 0x3) 2248 { 2249 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0])); 2250 UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask)); 2251 mergedMask &= rgbaMask; 2252 } 2253 2254 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask)); 2255 2256 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1)); 2257 buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 2258 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0)); 2259 } 2260 break; 2261 case VK_FORMAT_R32G32B32A32_SFLOAT: 2262 case VK_FORMAT_R32G32B32A32_SINT: 2263 case VK_FORMAT_R32G32B32A32_UINT: 2264 buffer = cBuffer + 16 * x; 2265 2266 { 2267 value = *Pointer<Float4>(buffer, 16); 2268 2269 if(rgbaWriteMask != 0x0000000F) 2270 { 2271 Float4 masked = value; 2272 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0]))); 2273 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0]))); 2274 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked)); 2275 } 2276 2277 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16)); 2278 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16)); 2279 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value)); 2280 *Pointer<Float4>(buffer, 16) = oC.x; 2281 } 2282 2283 { 2284 value = *Pointer<Float4>(buffer + 16, 16); 2285 2286 if(rgbaWriteMask != 0x0000000F) 2287 { 2288 Float4 masked = value; 2289 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0]))); 2290 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0]))); 2291 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked)); 2292 } 2293 2294 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16)); 2295 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16)); 2296 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value)); 2297 *Pointer<Float4>(buffer + 16, 16) = oC.y; 2298 } 2299 2300 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 2301 2302 { 2303 value = *Pointer<Float4>(buffer, 16); 2304 2305 if(rgbaWriteMask != 0x0000000F) 2306 { 2307 Float4 masked = value; 2308 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0]))); 2309 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0]))); 2310 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked)); 2311 } 2312 2313 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16)); 2314 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16)); 2315 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value)); 2316 *Pointer<Float4>(buffer, 16) = oC.z; 2317 } 2318 2319 { 2320 value = *Pointer<Float4>(buffer + 16, 16); 2321 2322 if(rgbaWriteMask != 0x0000000F) 2323 { 2324 Float4 masked = value; 2325 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0]))); 2326 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0]))); 2327 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked)); 2328 } 2329 2330 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16)); 2331 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16)); 2332 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value)); 2333 *Pointer<Float4>(buffer + 16, 16) = oC.w; 2334 } 2335 break; 2336 case VK_FORMAT_R16G16B16A16_SINT: 2337 case VK_FORMAT_R16G16B16A16_UINT: 2338 if((rgbaWriteMask & 0x0000000F) != 0x0) 2339 { 2340 buffer = cBuffer + 8 * x; 2341 2342 UInt4 rgbaMask; 2343 UShort8 value = *Pointer<UShort8>(buffer); 2344 UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))); 2345 UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16); 2346 if((rgbaWriteMask & 0xF) != 0xF) 2347 { 2348 UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0])); 2349 rgbaMask = UInt4(tmpMask, tmpMask); 2350 mergedMask &= rgbaMask; 2351 } 2352 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask); 2353 2354 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 2355 2356 value = *Pointer<UShort8>(buffer); 2357 packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w))); 2358 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16); 2359 if((rgbaWriteMask & 0xF) != 0xF) 2360 { 2361 mergedMask &= rgbaMask; 2362 } 2363 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask); 2364 } 2365 break; 2366 case VK_FORMAT_R8G8B8A8_SINT: 2367 case VK_FORMAT_R8G8B8A8_UINT: 2368 if((rgbaWriteMask & 0x0000000F) != 0x0) 2369 { 2370 UInt2 value, packedCol, mergedMask; 2371 2372 buffer = cBuffer + 4 * x; 2373 2374 if(state.targetFormat[index] == VK_FORMAT_R8G8B8A8_SINT) 2375 { 2376 packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y)))); 2377 } 2378 else 2379 { 2380 packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y)))); 2381 } 2382 value = *Pointer<UInt2>(buffer, 16); 2383 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8); 2384 if(rgbaWriteMask != 0xF) 2385 { 2386 mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0])); 2387 } 2388 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask); 2389 2390 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 2391 2392 if(state.targetFormat[index] == VK_FORMAT_R8G8B8A8_SINT) 2393 { 2394 packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w)))); 2395 } 2396 else 2397 { 2398 packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w)))); 2399 } 2400 value = *Pointer<UInt2>(buffer, 16); 2401 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8); 2402 if(rgbaWriteMask != 0xF) 2403 { 2404 mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0])); 2405 } 2406 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask); 2407 } 2408 break; 2409 default: 2410 ASSERT(false); 2411 } 2412 } 2413 2414 UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate) 2415 { 2416 return UShort4(cf * Float4(0xFFFF), saturate); 2417 } 2418 2419 void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c) 2420 { 2421 Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16); 2422 2423 c.x = As<UShort4>(c.x) >> 4; 2424 c.y = As<UShort4>(c.y) >> 4; 2425 c.z = As<UShort4>(c.z) >> 4; 2426 2427 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0); 2428 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1); 2429 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2); 2430 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3); 2431 2432 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0); 2433 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1); 2434 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2); 2435 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3); 2436 2437 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0); 2438 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1); 2439 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2); 2440 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3); 2441 } 2442 2443 void PixelRoutine::linearToSRGB16_12_16(Vector4s &c) 2444 { 2445 c.x = As<UShort4>(c.x) >> 4; 2446 c.y = As<UShort4>(c.y) >> 4; 2447 c.z = As<UShort4>(c.z) >> 4; 2448 2449 linearToSRGB12_16(c); 2450 } 2451 2452 void PixelRoutine::linearToSRGB12_16(Vector4s &c) 2453 { 2454 Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16); 2455 2456 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0); 2457 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1); 2458 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2); 2459 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3); 2460 2461 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0); 2462 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1); 2463 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2); 2464 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3); 2465 2466 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0); 2467 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1); 2468 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2); 2469 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3); 2470 } 2471 2472 Float4 PixelRoutine::sRGBtoLinear(const Float4 &x) // Approximates x^2.2 2473 { 2474 Float4 linear = x * x; 2475 linear = linear * Float4(0.73f) + linear * x * Float4(0.27f); 2476 2477 return Min(Max(linear, Float4(0.0f)), Float4(1.0f)); 2478 } 2479 2480 bool PixelRoutine::colorUsed() 2481 { 2482 return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill; 2483 } 2484 } 2485