Home | History | Annotate | Download | only in Pipeline
      1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //    http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #include "PixelRoutine.hpp"
     16 
     17 #include "SamplerCore.hpp"
     18 #include "Constants.hpp"
     19 #include "Device/Renderer.hpp"
     20 #include "Device/QuadRasterizer.hpp"
     21 #include "Device/Surface.hpp"
     22 #include "Device/Primitive.hpp"
     23 #include "Vulkan/VkDebug.hpp"
     24 
     25 namespace sw
     26 {
     27 	extern bool complementaryDepthBuffer;
     28 	extern bool postBlendSRGB;
     29 	extern bool exactColorRounding;
     30 	extern bool forceClearRegisters;
     31 
     32 	PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader)
     33 		: QuadRasterizer(state, shader), v(shader && shader->indirectAddressableInput)
     34 	{
     35 		if(!shader || shader->getShaderModel() < 0x0200 || forceClearRegisters)
     36 		{
     37 			for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++)
     38 			{
     39 				v[i].x = Float4(0.0f);
     40 				v[i].y = Float4(0.0f);
     41 				v[i].z = Float4(0.0f);
     42 				v[i].w = Float4(0.0f);
     43 			}
     44 		}
     45 	}
     46 
     47 	PixelRoutine::~PixelRoutine()
     48 	{
     49 	}
     50 
     51 	void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
     52 	{
     53 		#if PERF_PROFILE
     54 			Long pipeTime = Ticks();
     55 		#endif
     56 
     57 		const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
     58 
     59 		Int zMask[4];   // Depth mask
     60 		Int sMask[4];   // Stencil mask
     61 
     62 		for(unsigned int q = 0; q < state.multiSample; q++)
     63 		{
     64 			zMask[q] = cMask[q];
     65 			sMask[q] = cMask[q];
     66 		}
     67 
     68 		for(unsigned int q = 0; q < state.multiSample; q++)
     69 		{
     70 			stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
     71 		}
     72 
     73 		Float4 f;
     74 		Float4 rhwCentroid;
     75 
     76 		Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
     77 
     78 		if(interpolateZ())
     79 		{
     80 			for(unsigned int q = 0; q < state.multiSample; q++)
     81 			{
     82 				Float4 x = xxxx;
     83 
     84 				if(state.multiSample > 1)
     85 				{
     86 					x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
     87 				}
     88 
     89 				z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false, state.depthClamp);
     90 			}
     91 		}
     92 
     93 		Bool depthPass = false;
     94 
     95 		if(earlyDepthTest)
     96 		{
     97 			for(unsigned int q = 0; q < state.multiSample; q++)
     98 			{
     99 				depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
    100 			}
    101 		}
    102 
    103 		If(depthPass || Bool(!earlyDepthTest))
    104 		{
    105 			#if PERF_PROFILE
    106 				Long interpTime = Ticks();
    107 			#endif
    108 
    109 			Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
    110 
    111 			// Centroid locations
    112 			Float4 XXXX = Float4(0.0f);
    113 			Float4 YYYY = Float4(0.0f);
    114 
    115 			if(state.centroid)
    116 			{
    117 				Float4 WWWW(1.0e-9f);
    118 
    119 				for(unsigned int q = 0; q < state.multiSample; q++)
    120 				{
    121 					XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
    122 					YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
    123 					WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
    124 				}
    125 
    126 				WWWW = Rcp_pp(WWWW);
    127 				XXXX *= WWWW;
    128 				YYYY *= WWWW;
    129 
    130 				XXXX += xxxx;
    131 				YYYY += yyyy;
    132 			}
    133 
    134 			if(interpolateW())
    135 			{
    136 				w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false, false);
    137 				rhw = reciprocal(w, false, false, true);
    138 
    139 				if(state.centroid)
    140 				{
    141 					rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
    142 				}
    143 			}
    144 
    145 			for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
    146 			{
    147 				for(int component = 0; component < 4; component++)
    148 				{
    149 					if(state.interpolant[interpolant].component & (1 << component))
    150 					{
    151 						if(!state.interpolant[interpolant].centroid)
    152 						{
    153 							v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective, false);
    154 						}
    155 						else
    156 						{
    157 							v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
    158 						}
    159 					}
    160 				}
    161 
    162 				Float4 rcp;
    163 
    164 				switch(state.interpolant[interpolant].project)
    165 				{
    166 				case 0:
    167 					break;
    168 				case 1:
    169 					rcp = reciprocal(v[interpolant].y);
    170 					v[interpolant].x = v[interpolant].x * rcp;
    171 					break;
    172 				case 2:
    173 					rcp = reciprocal(v[interpolant].z);
    174 					v[interpolant].x = v[interpolant].x * rcp;
    175 					v[interpolant].y = v[interpolant].y * rcp;
    176 					break;
    177 				case 3:
    178 					rcp = reciprocal(v[interpolant].w);
    179 					v[interpolant].x = v[interpolant].x * rcp;
    180 					v[interpolant].y = v[interpolant].y * rcp;
    181 					v[interpolant].z = v[interpolant].z * rcp;
    182 					break;
    183 				}
    184 			}
    185 
    186 			if(state.fog.component)
    187 			{
    188 				f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective, false);
    189 			}
    190 
    191 			setBuiltins(x, y, z, w);
    192 
    193 			#if PERF_PROFILE
    194 				cycles[PERF_INTERP] += Ticks() - interpTime;
    195 			#endif
    196 
    197 			Bool alphaPass = true;
    198 
    199 			if(colorUsed())
    200 			{
    201 				#if PERF_PROFILE
    202 					Long shaderTime = Ticks();
    203 				#endif
    204 
    205 				applyShader(cMask);
    206 
    207 				#if PERF_PROFILE
    208 					cycles[PERF_SHADER] += Ticks() - shaderTime;
    209 				#endif
    210 
    211 				alphaPass = alphaTest(cMask);
    212 
    213 				if((shader && shader->containsKill()) || state.alphaTestActive())
    214 				{
    215 					for(unsigned int q = 0; q < state.multiSample; q++)
    216 					{
    217 						zMask[q] &= cMask[q];
    218 						sMask[q] &= cMask[q];
    219 					}
    220 				}
    221 			}
    222 
    223 			If(alphaPass)
    224 			{
    225 				if(!earlyDepthTest)
    226 				{
    227 					for(unsigned int q = 0; q < state.multiSample; q++)
    228 					{
    229 						depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
    230 					}
    231 				}
    232 
    233 				#if PERF_PROFILE
    234 					Long ropTime = Ticks();
    235 				#endif
    236 
    237 				If(depthPass || Bool(earlyDepthTest))
    238 				{
    239 					for(unsigned int q = 0; q < state.multiSample; q++)
    240 					{
    241 						if(state.multiSampleMask & (1 << q))
    242 						{
    243 							writeDepth(zBuffer, q, x, z[q], zMask[q]);
    244 
    245 							if(state.occlusionEnabled)
    246 							{
    247 								occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
    248 							}
    249 						}
    250 					}
    251 
    252 					if(colorUsed())
    253 					{
    254 						#if PERF_PROFILE
    255 							AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
    256 						#endif
    257 
    258 						rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
    259 					}
    260 				}
    261 
    262 				#if PERF_PROFILE
    263 					cycles[PERF_ROP] += Ticks() - ropTime;
    264 				#endif
    265 			}
    266 		}
    267 
    268 		for(unsigned int q = 0; q < state.multiSample; q++)
    269 		{
    270 			if(state.multiSampleMask & (1 << q))
    271 			{
    272 				writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
    273 			}
    274 		}
    275 
    276 		#if PERF_PROFILE
    277 			cycles[PERF_PIPE] += Ticks() - pipeTime;
    278 		#endif
    279 	}
    280 
    281 	Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
    282 	{
    283 		Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
    284 
    285 		if(!flat)
    286 		{
    287 			interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
    288 			               y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
    289 
    290 			if(perspective)
    291 			{
    292 				interpolant *= rhw;
    293 			}
    294 		}
    295 
    296 		return interpolant;
    297 	}
    298 
    299 	void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
    300 	{
    301 		if(!state.stencilActive)
    302 		{
    303 			return;
    304 		}
    305 
    306 		// (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
    307 
    308 		Pointer<Byte> buffer = sBuffer + 2 * x;
    309 
    310 		if(q > 0)
    311 		{
    312 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
    313 		}
    314 
    315 		Byte8 value = *Pointer<Byte8>(buffer);
    316 		Byte8 valueCCW = value;
    317 
    318 		if(!state.noStencilMask)
    319 		{
    320 			value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
    321 		}
    322 
    323 		stencilTest(value, state.stencilCompareMode, false);
    324 
    325 		if(state.twoSidedStencil)
    326 		{
    327 			if(!state.noStencilMaskCCW)
    328 			{
    329 				valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
    330 			}
    331 
    332 			stencilTest(valueCCW, state.stencilCompareModeCCW, true);
    333 
    334 			value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
    335 			valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
    336 			value |= valueCCW;
    337 		}
    338 
    339 		sMask = SignMask(value) & cMask;
    340 	}
    341 
    342 	void PixelRoutine::stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool CCW)
    343 	{
    344 		Byte8 equal;
    345 
    346 		switch(stencilCompareMode)
    347 		{
    348 		case VK_COMPARE_OP_ALWAYS:
    349 			value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    350 			break;
    351 		case VK_COMPARE_OP_NEVER:
    352 			value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
    353 			break;
    354 		case VK_COMPARE_OP_LESS:			// a < b ~ b > a
    355 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
    356 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
    357 			break;
    358 		case VK_COMPARE_OP_EQUAL:
    359 			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
    360 			break;
    361 		case VK_COMPARE_OP_NOT_EQUAL:		// a != b ~ !(a == b)
    362 			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
    363 			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    364 			break;
    365 		case VK_COMPARE_OP_LESS_OR_EQUAL:	// a <= b ~ (b > a) || (a == b)
    366 			equal = value;
    367 			equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
    368 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
    369 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
    370 			value |= equal;
    371 			break;
    372 		case VK_COMPARE_OP_GREATER:		// a > b
    373 			equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
    374 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
    375 			equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
    376 			value = equal;
    377 			break;
    378 		case VK_COMPARE_OP_GREATER_OR_EQUAL:	// a >= b ~ !(a < b) ~ !(b > a)
    379 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
    380 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
    381 			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    382 			break;
    383 		default:
    384 			ASSERT(false);
    385 		}
    386 	}
    387 
    388 	Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
    389 	{
    390 		if(!state.depthTestActive)
    391 		{
    392 			return true;
    393 		}
    394 
    395 		Float4 Z = z;
    396 
    397 		if(shader && shader->depthOverride())
    398 		{
    399 			if(complementaryDepthBuffer)
    400 			{
    401 				Z = Float4(1.0f) - oDepth;
    402 			}
    403 			else
    404 			{
    405 				Z = oDepth;
    406 			}
    407 		}
    408 
    409 		Pointer<Byte> buffer;
    410 		Int pitch;
    411 
    412 		if(!state.quadLayoutDepthBuffer)
    413 		{
    414 			buffer = zBuffer + 4 * x;
    415 			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
    416 		}
    417 		else
    418 		{
    419 			buffer = zBuffer + 8 * x;
    420 		}
    421 
    422 		if(q > 0)
    423 		{
    424 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
    425 		}
    426 
    427 		Float4 zValue;
    428 
    429 		if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
    430 		{
    431 			if(!state.quadLayoutDepthBuffer)
    432 			{
    433 				// FIXME: Properly optimizes?
    434 				zValue.xy = *Pointer<Float4>(buffer);
    435 				zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
    436 			}
    437 			else
    438 			{
    439 				zValue = *Pointer<Float4>(buffer, 16);
    440 			}
    441 		}
    442 
    443 		Int4 zTest;
    444 
    445 		switch(state.depthCompareMode)
    446 		{
    447 		case VK_COMPARE_OP_ALWAYS:
    448 			// Optimized
    449 			break;
    450 		case VK_COMPARE_OP_NEVER:
    451 			// Optimized
    452 			break;
    453 		case VK_COMPARE_OP_EQUAL:
    454 			zTest = CmpEQ(zValue, Z);
    455 			break;
    456 		case VK_COMPARE_OP_NOT_EQUAL:
    457 			zTest = CmpNEQ(zValue, Z);
    458 			break;
    459 		case VK_COMPARE_OP_LESS:
    460 			if(complementaryDepthBuffer)
    461 			{
    462 				zTest = CmpLT(zValue, Z);
    463 			}
    464 			else
    465 			{
    466 				zTest = CmpNLE(zValue, Z);
    467 			}
    468 			break;
    469 		case VK_COMPARE_OP_GREATER_OR_EQUAL:
    470 			if(complementaryDepthBuffer)
    471 			{
    472 				zTest = CmpNLT(zValue, Z);
    473 			}
    474 			else
    475 			{
    476 				zTest = CmpLE(zValue, Z);
    477 			}
    478 			break;
    479 		case VK_COMPARE_OP_LESS_OR_EQUAL:
    480 			if(complementaryDepthBuffer)
    481 			{
    482 				zTest = CmpLE(zValue, Z);
    483 			}
    484 			else
    485 			{
    486 				zTest = CmpNLT(zValue, Z);
    487 			}
    488 			break;
    489 		case VK_COMPARE_OP_GREATER:
    490 			if(complementaryDepthBuffer)
    491 			{
    492 				zTest = CmpNLE(zValue, Z);
    493 			}
    494 			else
    495 			{
    496 				zTest = CmpLT(zValue, Z);
    497 			}
    498 			break;
    499 		default:
    500 			ASSERT(false);
    501 		}
    502 
    503 		switch(state.depthCompareMode)
    504 		{
    505 		case VK_COMPARE_OP_ALWAYS:
    506 			zMask = cMask;
    507 			break;
    508 		case VK_COMPARE_OP_NEVER:
    509 			zMask = 0x0;
    510 			break;
    511 		default:
    512 			zMask = SignMask(zTest) & cMask;
    513 			break;
    514 		}
    515 
    516 		if(state.stencilActive)
    517 		{
    518 			zMask &= sMask;
    519 		}
    520 
    521 		return zMask != 0;
    522 	}
    523 
    524 	void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
    525 	{
    526 		Short4 cmp;
    527 		Short4 equal;
    528 
    529 		switch(state.alphaCompareMode)
    530 		{
    531 		case VK_COMPARE_OP_ALWAYS:
    532 			aMask = 0xF;
    533 			break;
    534 		case VK_COMPARE_OP_NEVER:
    535 			aMask = 0x0;
    536 			break;
    537 		case VK_COMPARE_OP_EQUAL:
    538 			cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
    539 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
    540 			break;
    541 		case VK_COMPARE_OP_NOT_EQUAL:       // a != b ~ !(a == b)
    542 			cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
    543 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
    544 			break;
    545 		case VK_COMPARE_OP_LESS:           // a < b ~ b > a
    546 			cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
    547 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
    548 			break;
    549 		case VK_COMPARE_OP_GREATER_OR_EQUAL:   // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
    550 			equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
    551 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
    552 			cmp |= equal;
    553 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
    554 			break;
    555 		case VK_COMPARE_OP_LESS_OR_EQUAL:      // a <= b ~ !(a > b)
    556 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
    557 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
    558 			break;
    559 		case VK_COMPARE_OP_GREATER:        // a > b
    560 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
    561 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
    562 			break;
    563 		default:
    564 			ASSERT(false);
    565 		}
    566 	}
    567 
    568 	void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
    569 	{
    570 		Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
    571 		Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
    572 		Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
    573 		Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
    574 
    575 		Int aMask0 = SignMask(coverage0);
    576 		Int aMask1 = SignMask(coverage1);
    577 		Int aMask2 = SignMask(coverage2);
    578 		Int aMask3 = SignMask(coverage3);
    579 
    580 		cMask[0] &= aMask0;
    581 		cMask[1] &= aMask1;
    582 		cMask[2] &= aMask2;
    583 		cMask[3] &= aMask3;
    584 	}
    585 
    586 	void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
    587 	{
    588 		if(!state.depthWriteEnable)
    589 		{
    590 			return;
    591 		}
    592 
    593 		Float4 Z = z;
    594 
    595 		if(shader && shader->depthOverride())
    596 		{
    597 			if(complementaryDepthBuffer)
    598 			{
    599 				Z = Float4(1.0f) - oDepth;
    600 			}
    601 			else
    602 			{
    603 				Z = oDepth;
    604 			}
    605 		}
    606 
    607 		Pointer<Byte> buffer;
    608 		Int pitch;
    609 
    610 		if(!state.quadLayoutDepthBuffer)
    611 		{
    612 			buffer = zBuffer + 4 * x;
    613 			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
    614 		}
    615 		else
    616 		{
    617 			buffer = zBuffer + 8 * x;
    618 		}
    619 
    620 		if(q > 0)
    621 		{
    622 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
    623 		}
    624 
    625 		Float4 zValue;
    626 
    627 		if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
    628 		{
    629 			if(!state.quadLayoutDepthBuffer)
    630 			{
    631 				// FIXME: Properly optimizes?
    632 				zValue.xy = *Pointer<Float4>(buffer);
    633 				zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
    634 			}
    635 			else
    636 			{
    637 				zValue = *Pointer<Float4>(buffer, 16);
    638 			}
    639 		}
    640 
    641 		Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
    642 		zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
    643 		Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
    644 
    645 		if(!state.quadLayoutDepthBuffer)
    646 		{
    647 			// FIXME: Properly optimizes?
    648 			*Pointer<Float2>(buffer) = Float2(Z.xy);
    649 			*Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
    650 		}
    651 		else
    652 		{
    653 			*Pointer<Float4>(buffer, 16) = Z;
    654 		}
    655 	}
    656 
    657 	void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
    658 	{
    659 		if(!state.stencilActive)
    660 		{
    661 			return;
    662 		}
    663 
    664 		if(state.stencilPassOperation == VK_STENCIL_OP_KEEP && state.stencilZFailOperation == VK_STENCIL_OP_KEEP && state.stencilFailOperation == VK_STENCIL_OP_KEEP)
    665 		{
    666 			if(!state.twoSidedStencil || (state.stencilPassOperationCCW == VK_STENCIL_OP_KEEP && state.stencilZFailOperationCCW == VK_STENCIL_OP_KEEP && state.stencilFailOperationCCW == VK_STENCIL_OP_KEEP))
    667 			{
    668 				return;
    669 			}
    670 		}
    671 
    672 		if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
    673 		{
    674 			return;
    675 		}
    676 
    677 		Pointer<Byte> buffer = sBuffer + 2 * x;
    678 
    679 		if(q > 0)
    680 		{
    681 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
    682 		}
    683 
    684 		Byte8 bufferValue = *Pointer<Byte8>(buffer);
    685 
    686 		Byte8 newValue;
    687 		stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
    688 
    689 		if(!state.noStencilWriteMask)
    690 		{
    691 			Byte8 maskedValue = bufferValue;
    692 			newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
    693 			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
    694 			newValue |= maskedValue;
    695 		}
    696 
    697 		if(state.twoSidedStencil)
    698 		{
    699 			Byte8 newValueCCW;
    700 
    701 			stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
    702 
    703 			if(!state.noStencilWriteMaskCCW)
    704 			{
    705 				Byte8 maskedValue = bufferValue;
    706 				newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
    707 				maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
    708 				newValueCCW |= maskedValue;
    709 			}
    710 
    711 			newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
    712 			newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
    713 			newValue |= newValueCCW;
    714 		}
    715 
    716 		newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
    717 		bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
    718 		newValue |= bufferValue;
    719 
    720 		*Pointer<Byte4>(buffer) = Byte4(newValue);
    721 	}
    722 
    723 	void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, VkStencilOp stencilPassOperation, VkStencilOp stencilZFailOperation, VkStencilOp stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
    724 	{
    725 		Byte8 &pass = newValue;
    726 		Byte8 fail;
    727 		Byte8 zFail;
    728 
    729 		stencilOperation(pass, bufferValue, stencilPassOperation, CCW);
    730 
    731 		if(stencilZFailOperation != stencilPassOperation)
    732 		{
    733 			stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);
    734 		}
    735 
    736 		if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
    737 		{
    738 			stencilOperation(fail, bufferValue, stencilFailOperation, CCW);
    739 		}
    740 
    741 		if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
    742 		{
    743 			if(state.depthTestActive && stencilZFailOperation != stencilPassOperation)   // zMask valid and values not the same
    744 			{
    745 				pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
    746 				zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
    747 				pass |= zFail;
    748 			}
    749 
    750 			pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
    751 			fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
    752 			pass |= fail;
    753 		}
    754 	}
    755 
    756 	void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, VkStencilOp operation, bool CCW)
    757 	{
    758 		switch(operation)
    759 		{
    760 		case VK_STENCIL_OP_KEEP:
    761 			output = bufferValue;
    762 			break;
    763 		case VK_STENCIL_OP_ZERO:
    764 			output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
    765 			break;
    766 		case VK_STENCIL_OP_REPLACE:
    767 			output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));
    768 			break;
    769 		case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
    770 			output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
    771 			break;
    772 		case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
    773 			output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
    774 			break;
    775 		case VK_STENCIL_OP_INVERT:
    776 			output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    777 			break;
    778 		case VK_STENCIL_OP_INCREMENT_AND_WRAP:
    779 			output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
    780 			break;
    781 		case VK_STENCIL_OP_DECREMENT_AND_WRAP:
    782 			output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
    783 			break;
    784 		default:
    785 			ASSERT(false);
    786 		}
    787 	}
    788 
    789 	void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, VkBlendFactor blendFactorActive)
    790 	{
    791 		switch(blendFactorActive)
    792 		{
    793 		case VK_BLEND_FACTOR_ZERO:
    794 			// Optimized
    795 			break;
    796 		case VK_BLEND_FACTOR_ONE:
    797 			// Optimized
    798 			break;
    799 		case VK_BLEND_FACTOR_SRC_COLOR:
    800 			blendFactor.x = current.x;
    801 			blendFactor.y = current.y;
    802 			blendFactor.z = current.z;
    803 			break;
    804 		case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
    805 			blendFactor.x = Short4(0xFFFFu) - current.x;
    806 			blendFactor.y = Short4(0xFFFFu) - current.y;
    807 			blendFactor.z = Short4(0xFFFFu) - current.z;
    808 			break;
    809 		case VK_BLEND_FACTOR_DST_COLOR:
    810 			blendFactor.x = pixel.x;
    811 			blendFactor.y = pixel.y;
    812 			blendFactor.z = pixel.z;
    813 			break;
    814 		case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
    815 			blendFactor.x = Short4(0xFFFFu) - pixel.x;
    816 			blendFactor.y = Short4(0xFFFFu) - pixel.y;
    817 			blendFactor.z = Short4(0xFFFFu) - pixel.z;
    818 			break;
    819 		case VK_BLEND_FACTOR_SRC_ALPHA:
    820 			blendFactor.x = current.w;
    821 			blendFactor.y = current.w;
    822 			blendFactor.z = current.w;
    823 			break;
    824 		case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
    825 			blendFactor.x = Short4(0xFFFFu) - current.w;
    826 			blendFactor.y = Short4(0xFFFFu) - current.w;
    827 			blendFactor.z = Short4(0xFFFFu) - current.w;
    828 			break;
    829 		case VK_BLEND_FACTOR_DST_ALPHA:
    830 			blendFactor.x = pixel.w;
    831 			blendFactor.y = pixel.w;
    832 			blendFactor.z = pixel.w;
    833 			break;
    834 		case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
    835 			blendFactor.x = Short4(0xFFFFu) - pixel.w;
    836 			blendFactor.y = Short4(0xFFFFu) - pixel.w;
    837 			blendFactor.z = Short4(0xFFFFu) - pixel.w;
    838 			break;
    839 		case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
    840 			blendFactor.x = Short4(0xFFFFu) - pixel.w;
    841 			blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
    842 			blendFactor.y = blendFactor.x;
    843 			blendFactor.z = blendFactor.x;
    844 			break;
    845 		case VK_BLEND_FACTOR_CONSTANT_COLOR:
    846 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
    847 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
    848 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
    849 			break;
    850 		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
    851 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
    852 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
    853 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
    854 			break;
    855 		case VK_BLEND_FACTOR_CONSTANT_ALPHA:
    856 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
    857 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
    858 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
    859 			break;
    860 		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
    861 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
    862 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
    863 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
    864 			break;
    865 		default:
    866 			ASSERT(false);
    867 		}
    868 	}
    869 
    870 	void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, VkBlendFactor blendFactorAlphaActive)
    871 	{
        		// Writes the alpha-channel blend factor selected by blendFactorAlphaActive
        		// into blendFactor.w. Only the w components of 'current' (source) and
        		// 'pixel' (destination) are read. For the alpha channel every *_COLOR
        		// factor yields the same value as its *_ALPHA counterpart, so each pair
        		// shares one case group.
    872 		switch(blendFactorAlphaActive)
    873 		{
    874 		case VK_BLEND_FACTOR_ZERO:
        		case VK_BLEND_FACTOR_ONE:
        			// Nothing to compute: alphaBlend() skips the factor multiply for ZERO and ONE.
    876 			break;
    880 		case VK_BLEND_FACTOR_SRC_COLOR:
        		case VK_BLEND_FACTOR_SRC_ALPHA:
    881 			blendFactor.w = current.w;
    882 			break;
    883 		case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
        		case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
    884 			blendFactor.w = Short4(0xFFFFu) - current.w;
    885 			break;
    886 		case VK_BLEND_FACTOR_DST_COLOR:
        		case VK_BLEND_FACTOR_DST_ALPHA:
    887 			blendFactor.w = pixel.w;
    888 			break;
    889 		case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
        		case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
    890 			blendFactor.w = Short4(0xFFFFu) - pixel.w;
    891 			break;
    904 		case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
        			// The saturate factor is full scale (0xFFFF) for the alpha channel.
    905 			blendFactor.w = Short4(0xFFFFu);
    906 			break;
    907 		case VK_BLEND_FACTOR_CONSTANT_COLOR:
    908 		case VK_BLEND_FACTOR_CONSTANT_ALPHA:
        			// Element [3] of the per-draw blend constant.
    909 			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
    910 			break;
    911 		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
    912 		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
        			// Element [3] of the per-draw inverted blend constant.
    913 			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
    914 			break;
    915 		default:
    916 			ASSERT(false);
    917 		}
    918 	}
    919 
    920 	bool PixelRoutine::isSRGB(int index) const
    921 	{
        		// Reports whether the format of render target 'index' is classified as sRGB
        		// by Surface::isSRGBformat().
        		const auto renderTargetFormat = state.targetFormat[index];
        		return Surface::isSRGBformat(renderTargetFormat);
    923 	}
    924 
    925 	void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
    926 	{
        		// Loads the existing color of render target 'index' at column x and expands it
        		// into 'pixel' as 16-bit channels (x, y, z, w = red, green, blue, alpha).
        		// Texels are fetched from two rows: cBuffer and cBuffer advanced by
        		// colorPitchB[index] bytes. Channels a format does not store are filled with
        		// constants, as noted in each case. For sRGB targets the result is passed
        		// through sRGBtoLinear16_12_16() before returning.
    927 		Short4 c01;   // Raw texel data (first row, or both rows for 16-bit formats)
    928 		Short4 c23;   // Raw texel data of the second row
    929 		Pointer<Byte> buffer;
    930 		Pointer<Byte> buffer2;
    931 
    932 		switch(state.targetFormat[index])
    933 		{
    934 		case VK_FORMAT_R5G6B5_UNORM_PACK16:
        			// One Int load per row fetches two adjacent 16-bit texels; both rows are
        			// combined into a single Short4.
    935 			buffer = cBuffer + 2 * x;
    936 			buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
    937 			c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
    938 
        			// Isolate each bit field and move it to the top of its 16-bit lane.
        			// The low bits stay zero (no bit replication). Alpha is not stored: full scale.
    939 			pixel.x = c01 & Short4(0xF800u);
    940 			pixel.y = (c01 & Short4(0x07E0u)) << 5;
    941 			pixel.z = (c01 & Short4(0x001Fu)) << 11;
    942 			pixel.w = Short4(0xFFFFu);
    943 			break;
    944 		case VK_FORMAT_B8G8R8A8_UNORM:
        			// One Short4 (8-byte) load per row fetches two adjacent 32-bit texels.
    945 			buffer = cBuffer + 4 * x;
    946 			c01 = *Pointer<Short4>(buffer);
    947 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
    948 			c23 = *Pointer<Short4>(buffer);
        			// Byte interleaves gather the four samples of each channel together.
        			// NOTE(review): assumes UnpackLow/UnpackHigh interleave the low/high halves of
        			// their operands (punpckl/punpckh style); pixel.x/y/z/w double as temporaries,
        			// so the statement order below is significant.
    949 			pixel.z = c01;
    950 			pixel.y = c01;
    951 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
    952 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
    953 			pixel.x = pixel.z;
    954 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
    955 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
    956 			pixel.y = pixel.z;
    957 			pixel.w = pixel.x;
        			// Interleaving a register with itself duplicates every byte, presumably
        			// widening each 8-bit sample v to the 16-bit value (v << 8) | v.
    958 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
    959 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
    960 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
    961 			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
    962 			break;
    963 		case VK_FORMAT_R8G8B8A8_UNORM:
    964 		case VK_FORMAT_R8G8B8A8_SRGB:
        			// Same load and gather sequence as the B8G8R8A8 case above; only the final
        			// widening step differs so that the first and third stored bytes end up
        			// in pixel.x and pixel.z respectively.
    965 			buffer = cBuffer + 4 * x;
    966 			c01 = *Pointer<Short4>(buffer);
    967 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
    968 			c23 = *Pointer<Short4>(buffer);
    969 			pixel.z = c01;
    970 			pixel.y = c01;
    971 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
    972 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
    973 			pixel.x = pixel.z;
    974 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
    975 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
    976 			pixel.y = pixel.z;
    977 			pixel.w = pixel.x;
        			// pixel.x is widened from pixel.z and pixel.z from pixel.w (the saved copy
        			// of the old pixel.x): this is the red/blue swap relative to the BGRA path.
    978 			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
    979 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
    980 			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
    981 			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
    982 			break;
    983 		case VK_FORMAT_R8_UNORM:
        			// One Short load per row fetches two adjacent 8-bit texels; the rows go into
        			// 16-bit lanes 0 and 1.
        			// NOTE(review): pixel.x is passed to Insert before it has been assigned, so
        			// lanes 2 and 3 are whatever it held on entry.
    984 			buffer = cBuffer + 1 * x;
    985 			pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
    986 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
    987 			pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
        			// Self-interleave duplicates each byte (presumably v -> (v << 8) | v).
    988 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
        			// Green and blue are not stored: zero. Alpha is not stored: full scale.
    989 			pixel.y = Short4(0x0000);
    990 			pixel.z = Short4(0x0000);
    991 			pixel.w = Short4(0xFFFFu);
    992 			break;
    993 		case VK_FORMAT_R8G8_UNORM:
        			// One Int load per row fetches two adjacent 16-bit texels; the rows go into
        			// Int2 elements 0 and 1 of c01.
    994 			buffer = cBuffer + 2 * x;
    995 			c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
    996 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
    997 			c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
        			// Replicate the low byte (red) and the high byte (green) of every texel into
        			// both bytes of the corresponding 16-bit channel.
    998 			pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
    999 			pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
        			// Blue is not stored: zero. Alpha is not stored: full scale.
   1000 			pixel.z = Short4(0x0000u);
   1001 			pixel.w = Short4(0xFFFFu);
   1002 			break;
   1003 		case VK_FORMAT_R16G16B16A16_UNORM:
        			// Each texel is 8 bytes, so every Short4 load is one whole texel:
        			// two from the first row, two from the second.
   1004 			buffer = cBuffer;
   1005 			pixel.x = *Pointer<Short4>(buffer + 8 * x);
   1006 			pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
   1007 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   1008 			pixel.z = *Pointer<Short4>(buffer + 8 * x);
   1009 			pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
        			// Presumably converts the four per-texel vectors into four per-channel vectors.
   1010 			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
   1011 			break;
   1012 		case VK_FORMAT_R16G16_UNORM:
        			// One Short4 load per row fetches two adjacent 32-bit (R16, G16) texels.
   1013 			buffer = cBuffer;
   1014 			pixel.x = *Pointer<Short4>(buffer + 4 * x);
   1015 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   1016 			pixel.y = *Pointer<Short4>(buffer + 4 * x);
        			// Two rounds of 16-bit interleaves separate the red and green samples;
        			// pixel.z is used as a temporary until it is overwritten below.
   1017 			pixel.z = pixel.x;
   1018 			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
   1019 			pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
   1020 			pixel.y = pixel.z;
   1021 			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
   1022 			pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
        			// NOTE(review): blue defaults to 0xFFFF here, whereas the R8 and R8G8 cases
        			// above default it to 0x0000 -- confirm this difference is intended.
   1023 			pixel.z = Short4(0xFFFFu);
   1024 			pixel.w = Short4(0xFFFFu);
   1025 			break;
   1026 		default:
   1027 			ASSERT(false);
   1028 		}
   1029 
        		// Mirrors the condition used by writeColor() for its linearToSRGB16_12_16() call.
   1030 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
   1031 		{
   1032 			sRGBtoLinear16_12_16(pixel);
   1033 		}
   1034 	}
   1035 
   1036 	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
   1037 	{
        		// Blends the source color 'current' with the color already stored in render
        		// target 'index' at column x, leaving the result in 'current' (16-bit
        		// channels). Color (x, y, z) and alpha (w) are processed separately, each with
        		// its own pair of blend factors and its own blend operation. Does nothing when
        		// state.alphaBlendActive is false.
   1038 		if(!state.alphaBlendActive)
   1039 		{
   1040 			return;
   1041 		}
   1042 
        		// Destination color, expanded to 16 bits per channel.
   1043 		Vector4s pixel;
   1044 		readPixel(index, cBuffer, x, pixel);
   1045 
   1046 		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
   1047 		Vector4s sourceFactor;
   1048 		Vector4s destFactor;
   1049 
        		// Both color factors are computed before either operand is scaled, because a
        		// factor may itself be derived from 'current' or 'pixel'.
   1050 		blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
   1051 		blendFactor(destFactor, current, pixel, state.destBlendFactor);
   1052 
        		// ONE and ZERO need no multiply here.
        		// NOTE(review): nothing in this function zeroes an operand for ZERO -- presumably
        		// the state setup or the choice of blend operation accounts for it; confirm.
   1053 		if(state.sourceBlendFactor != VK_BLEND_FACTOR_ONE && state.sourceBlendFactor != VK_BLEND_FACTOR_ZERO)
   1054 		{
        			// MulHigh on UShort4 presumably keeps the upper 16 bits of each 16x16 product.
   1055 			current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
   1056 			current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
   1057 			current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
   1058 		}
   1059 
   1060 		if(state.destBlendFactor != VK_BLEND_FACTOR_ONE && state.destBlendFactor != VK_BLEND_FACTOR_ZERO)
   1061 		{
   1062 			pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
   1063 			pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
   1064 			pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
   1065 		}
   1066 
        		// Combine the scaled source and destination colors. All arithmetic treats the
        		// channels as unsigned 16-bit values; ADD and the subtractions saturate.
   1067 		switch(state.blendOperation)
   1068 		{
   1069 		case VK_BLEND_OP_ADD:
   1070 			current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
   1071 			current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
   1072 			current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
   1073 			break;
   1074 		case VK_BLEND_OP_SUBTRACT:
        			// source - destination
   1075 			current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
   1076 			current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
   1077 			current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
   1078 			break;
   1079 		case VK_BLEND_OP_REVERSE_SUBTRACT:
        			// destination - source
   1080 			current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
   1081 			current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
   1082 			current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
   1083 			break;
   1084 		case VK_BLEND_OP_MIN:
   1085 			current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
   1086 			current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
   1087 			current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
   1088 			break;
   1089 		case VK_BLEND_OP_MAX:
   1090 			current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
   1091 			current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
   1092 			current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
   1093 			break;
   1094 		case VK_BLEND_OP_SRC_EXT:
   1095 			// No operation
   1096 			break;
   1097 		case VK_BLEND_OP_DST_EXT:
        			// Result is the (possibly factor-scaled) destination color.
   1098 			current.x = pixel.x;
   1099 			current.y = pixel.y;
   1100 			current.z = pixel.z;
   1101 			break;
   1102 		case VK_BLEND_OP_ZERO_EXT:
   1103 			current.x = Short4(0x0000);
   1104 			current.y = Short4(0x0000);
   1105 			current.z = Short4(0x0000);
   1106 			break;
   1107 		default:
   1108 			ASSERT(false);
   1109 		}
   1110 
        		// Alpha channel. blendFactorAlpha() reads only the w components, which have
        		// not been modified by the color blend above, and writes only the w component
        		// of the factor vectors.
   1111 		blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
   1112 		blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
   1113 
   1114 		if(state.sourceBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.sourceBlendFactorAlpha != VK_BLEND_FACTOR_ZERO)
   1115 		{
   1116 			current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
   1117 		}
   1118 
   1119 		if(state.destBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.destBlendFactorAlpha != VK_BLEND_FACTOR_ZERO)
   1120 		{
   1121 			pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
   1122 		}
   1123 
        		// Same set of operations as for color, applied to w only.
   1124 		switch(state.blendOperationAlpha)
   1125 		{
   1126 		case VK_BLEND_OP_ADD:
   1127 			current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
   1128 			break;
   1129 		case VK_BLEND_OP_SUBTRACT:
   1130 			current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
   1131 			break;
   1132 		case VK_BLEND_OP_REVERSE_SUBTRACT:
   1133 			current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
   1134 			break;
   1135 		case VK_BLEND_OP_MIN:
   1136 			current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
   1137 			break;
   1138 		case VK_BLEND_OP_MAX:
   1139 			current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
   1140 			break;
   1141 		case VK_BLEND_OP_SRC_EXT:
   1142 			// No operation
   1143 			break;
   1144 		case VK_BLEND_OP_DST_EXT:
   1145 			current.w = pixel.w;
   1146 			break;
   1147 		case VK_BLEND_OP_ZERO_EXT:
   1148 			current.w = Short4(0x0000);
   1149 			break;
   1150 		default:
   1151 			ASSERT(false);
   1152 		}
   1153 	}
   1154 
   1155 	void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
   1156 	{
        		// Applies the bitwise framebuffer logic operation state.logicalOperation
        		// between the source color 'current' (s) and the color already stored in
        		// render target 'index' at column x (d), leaving the result in 'current'.
        		//
        		// The operation is applied to all four components. Vulkan specifies that the
        		// logical operation acts independently on red, green, blue and alpha, so the
        		// w (alpha) component is processed exactly like x, y and z.
        		//
        		// VK_LOGIC_OP_COPY leaves the source unchanged, so it returns before the
        		// destination is read.
   1157 		if(state.logicalOperation == VK_LOGIC_OP_COPY)
   1158 		{
   1159 			return;
   1160 		}
   1161 
        		// Destination color, expanded to 16 bits per channel.
   1162 		Vector4s pixel;
   1163 		readPixel(index, cBuffer, x, pixel);
   1164 
   1165 		switch(state.logicalOperation)
   1166 		{
   1167 		case VK_LOGIC_OP_CLEAR:   // 0
   1168 			current.x = UShort4(0);
   1169 			current.y = UShort4(0);
   1170 			current.z = UShort4(0);
        			current.w = UShort4(0);
   1171 			break;
   1172 		case VK_LOGIC_OP_SET:   // all 1s
   1173 			current.x = UShort4(0xFFFFu);
   1174 			current.y = UShort4(0xFFFFu);
   1175 			current.z = UShort4(0xFFFFu);
        			current.w = UShort4(0xFFFFu);
   1176 			break;
   1177 		case VK_LOGIC_OP_COPY:
   1178 			ASSERT(false);   // Optimized out
   1179 			break;
   1180 		case VK_LOGIC_OP_COPY_INVERTED:   // ~s
   1181 			current.x = ~current.x;
   1182 			current.y = ~current.y;
   1183 			current.z = ~current.z;
        			current.w = ~current.w;
   1184 			break;
   1185 		case VK_LOGIC_OP_NO_OP:   // d
   1186 			current.x = pixel.x;
   1187 			current.y = pixel.y;
   1188 			current.z = pixel.z;
        			current.w = pixel.w;
   1189 			break;
   1190 		case VK_LOGIC_OP_INVERT:   // ~d
   1191 			current.x = ~pixel.x;
   1192 			current.y = ~pixel.y;
   1193 			current.z = ~pixel.z;
        			current.w = ~pixel.w;
   1194 			break;
   1195 		case VK_LOGIC_OP_AND:   // s & d
   1196 			current.x = pixel.x & current.x;
   1197 			current.y = pixel.y & current.y;
   1198 			current.z = pixel.z & current.z;
        			current.w = pixel.w & current.w;
   1199 			break;
   1200 		case VK_LOGIC_OP_NAND:   // ~(s & d)
   1201 			current.x = ~(pixel.x & current.x);
   1202 			current.y = ~(pixel.y & current.y);
   1203 			current.z = ~(pixel.z & current.z);
        			current.w = ~(pixel.w & current.w);
   1204 			break;
   1205 		case VK_LOGIC_OP_OR:   // s | d
   1206 			current.x = pixel.x | current.x;
   1207 			current.y = pixel.y | current.y;
   1208 			current.z = pixel.z | current.z;
        			current.w = pixel.w | current.w;
   1209 			break;
   1210 		case VK_LOGIC_OP_NOR:   // ~(s | d)
   1211 			current.x = ~(pixel.x | current.x);
   1212 			current.y = ~(pixel.y | current.y);
   1213 			current.z = ~(pixel.z | current.z);
        			current.w = ~(pixel.w | current.w);
   1214 			break;
   1215 		case VK_LOGIC_OP_XOR:   // s ^ d
   1216 			current.x = pixel.x ^ current.x;
   1217 			current.y = pixel.y ^ current.y;
   1218 			current.z = pixel.z ^ current.z;
        			current.w = pixel.w ^ current.w;
   1219 			break;
   1220 		case VK_LOGIC_OP_EQUIVALENT:   // ~(s ^ d)
   1221 			current.x = ~(pixel.x ^ current.x);
   1222 			current.y = ~(pixel.y ^ current.y);
   1223 			current.z = ~(pixel.z ^ current.z);
        			current.w = ~(pixel.w ^ current.w);
   1224 			break;
   1225 		case VK_LOGIC_OP_AND_REVERSE:   // s & ~d
   1226 			current.x = ~pixel.x & current.x;
   1227 			current.y = ~pixel.y & current.y;
   1228 			current.z = ~pixel.z & current.z;
        			current.w = ~pixel.w & current.w;
   1229 			break;
   1230 		case VK_LOGIC_OP_AND_INVERTED:   // ~s & d
   1231 			current.x = pixel.x & ~current.x;
   1232 			current.y = pixel.y & ~current.y;
   1233 			current.z = pixel.z & ~current.z;
        			current.w = pixel.w & ~current.w;
   1234 			break;
   1235 		case VK_LOGIC_OP_OR_REVERSE:   // s | ~d
   1236 			current.x = ~pixel.x | current.x;
   1237 			current.y = ~pixel.y | current.y;
   1238 			current.z = ~pixel.z | current.z;
        			current.w = ~pixel.w | current.w;
   1239 			break;
   1240 		case VK_LOGIC_OP_OR_INVERTED:   // ~s | d
   1241 			current.x = pixel.x | ~current.x;
   1242 			current.y = pixel.y | ~current.y;
   1243 			current.z = pixel.z | ~current.z;
        			current.w = pixel.w | ~current.w;
   1244 			break;
   1245 		default:
   1246 			ASSERT(false);
   1247 		}
   1248 	}
   1249 
   1250 	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
   1251 	{
        		// Converts the 16-bit-per-channel colors in 'current' to the storage layout of
        		// render target 'index' and merges them into the color buffer at column x.
        		// Two rows are written: cBuffer and cBuffer advanced by colorPitchB[index].
        		// Every store is a read-modify-write: existing data is kept for channels that
        		// the color write mask excludes and for pixels that the combined
        		// coverage/depth/stencil mask excludes. 'current' is modified in place.
        		//
        		// Steps: (1) optional linear-to-sRGB conversion, (2) optional rounding bias,
        		// (3) pack channels into the target layout, (4) build the pixel mask,
        		// (5) masked store per format.
   1252 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
   1253 		{
   1254 			linearToSRGB16_12_16(current);
   1255 		}
   1256 
   1257 		if(exactColorRounding)
   1258 		{
   1259 			switch(state.targetFormat[index])
   1260 			{
   1261 			case VK_FORMAT_R5G6B5_UNORM_PACK16:
        				// Add half of the step that the truncation below discards: 0x0400 for
        				// the 5-bit channels (11 bits dropped), 0x0200 for 6-bit green (10 dropped).
   1262 				current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
   1263 				current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
   1264 				current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
   1265 				break;
   1266 			case VK_FORMAT_B8G8R8A8_UNORM:
   1267 			case VK_FORMAT_R8G8B8A8_UNORM:
   1268 			case VK_FORMAT_R8G8B8A8_SRGB:
   1269 			case VK_FORMAT_R8G8_UNORM:
   1270 			case VK_FORMAT_R8_UNORM:
        				// v - (v >> 8) + 0x80: presumably rescales so that the later '>> 8'
        				// rounds to the nearest 8-bit value instead of truncating -- TODO confirm.
   1271 				current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
   1272 				current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
   1273 				current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
   1274 				current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
   1275 				break;
   1276 			default:
   1277 				break;
   1278 			}
   1279 		}
   1280 
        		// Per-channel write mask. bgraWriteMask is the same mask with bits 0 and 2
        		// exchanged (bits 1 and 3 unchanged), for formats that store blue first.
   1281 		int rgbaWriteMask = state.colorWriteActive(index);
   1282 		int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
   1283 
        		// Step 3: pack the channels into the memory layout of the target format.
   1284 		switch(state.targetFormat[index])
   1285 		{
   1286 		case VK_FORMAT_R5G6B5_UNORM_PACK16:
   1287 			{
        				// Red keeps bits 15..11, green's top 6 bits move to 10..5, blue's top
        				// 5 bits move to 4..0; the packed texels end up in current.x.
   1288 				current.x = current.x & Short4(0xF800u);
   1289 				current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
   1290 				current.z = As<UShort4>(current.z) >> 11;
   1291 
   1292 				current.x = current.x | current.y | current.z;
   1293 			}
   1294 			break;
   1295 		case VK_FORMAT_B8G8R8A8_UNORM:
        			// Reduce each channel to 8 bits, then pack and interleave so that current.z
        			// holds the two texels of the first row and current.y those of the second.
        			// When the write mask is exactly 0x7 (bit 3 clear) current.w is not read:
        			// current.y is packed in its place.
   1296 			if(rgbaWriteMask == 0x7)
   1297 			{
   1298 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1299 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1300 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
   1301 
   1302 				current.z = As<Short4>(PackUnsigned(current.z, current.x));
   1303 				current.y = As<Short4>(PackUnsigned(current.y, current.y));
   1304 
   1305 				current.x = current.z;
   1306 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
   1307 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
   1308 				current.y = current.z;
   1309 				current.z = As<Short4>(UnpackLow(current.z, current.x));
   1310 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
   1311 			}
   1312 			else
   1313 			{
   1314 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1315 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1316 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
   1317 				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
   1318 
   1319 				current.z = As<Short4>(PackUnsigned(current.z, current.x));
   1320 				current.y = As<Short4>(PackUnsigned(current.y, current.w));
   1321 
   1322 				current.x = current.z;
   1323 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
   1324 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
   1325 				current.y = current.z;
   1326 				current.z = As<Short4>(UnpackLow(current.z, current.x));
   1327 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
   1328 			}
   1329 			break;
   1330 		case VK_FORMAT_R8G8B8A8_UNORM:
   1331 		case VK_FORMAT_R8G8B8A8_SRGB:
        			// Identical to the B8G8R8A8 case except that x and z swap places in the
        			// first PackUnsigned, giving red-first byte order.
   1332 			if(rgbaWriteMask == 0x7)
   1333 			{
   1334 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1335 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1336 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
   1337 
   1338 				current.z = As<Short4>(PackUnsigned(current.x, current.z));
   1339 				current.y = As<Short4>(PackUnsigned(current.y, current.y));
   1340 
   1341 				current.x = current.z;
   1342 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
   1343 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
   1344 				current.y = current.z;
   1345 				current.z = As<Short4>(UnpackLow(current.z, current.x));
   1346 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
   1347 			}
   1348 			else
   1349 			{
   1350 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1351 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1352 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
   1353 				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
   1354 
   1355 				current.z = As<Short4>(PackUnsigned(current.x, current.z));
   1356 				current.y = As<Short4>(PackUnsigned(current.y, current.w));
   1357 
   1358 				current.x = current.z;
   1359 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
   1360 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
   1361 				current.y = current.z;
   1362 				current.z = As<Short4>(UnpackLow(current.z, current.x));
   1363 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
   1364 			}
   1365 			break;
   1366 		case VK_FORMAT_R8G8_UNORM:
        			// Reduce red and green to 8 bits and byte-interleave them into current.x.
   1367 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1368 			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1369 			current.x = As<Short4>(PackUnsigned(current.x, current.x));
   1370 			current.y = As<Short4>(PackUnsigned(current.y, current.y));
   1371 			current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
   1372 			break;
   1373 		case VK_FORMAT_R8_UNORM:
        			// Reduce red to 8 bits and pack the samples into bytes of current.x.
   1374 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1375 			current.x = As<Short4>(PackUnsigned(current.x, current.x));
   1376 			break;
   1377 		case VK_FORMAT_R16G16_UNORM:
        			// Interleave red and green words: current.x receives the low interleave and
        			// current.y the high one (current.z is used as a temporary). These are stored
        			// to the first and second row respectively below.
   1378 			current.z = current.x;
   1379 			current.x = As<Short4>(UnpackLow(current.x, current.y));
   1380 			current.z = As<Short4>(UnpackHigh(current.z, current.y));
   1381 			current.y = current.z;
   1382 			break;
   1383 		case VK_FORMAT_R16G16B16A16_UNORM:
        			// Presumably converts per-channel vectors into per-texel vectors; x, y, z, w
        			// are then stored as four separate 8-byte texels below.
   1384 			transpose4x4(current.x, current.y, current.z, current.w);
   1385 			break;
   1386 		default:
   1387 			ASSERT(false);
   1388 		}
   1389 
        		// Packed texels of the first and second row; used by the 32-bit RGBA/BGRA
        		// store paths. (The R5G6B5 path declares its own Int c01/c23 that shadow these.)
   1390 		Short4 c01 = current.z;
   1391 		Short4 c23 = current.y;
   1392 
   1393 		Int xMask;   // Combination of all masks
   1394 
        		// Start from zMask when depth testing is active, otherwise from cMask.
   1395 		if(state.depthTestActive)
   1396 		{
   1397 			xMask = zMask;
   1398 		}
   1399 		else
   1400 		{
   1401 			xMask = cMask;
   1402 		}
   1403 
   1404 		if(state.stencilActive)
   1405 		{
   1406 			xMask &= sMask;
   1407 		}
   1408 
        		// Step 5: masked read-modify-write. In every path xMask indexes a table of
        		// 8-byte mask entries in 'constants' (hence the 'xMask * 8' byte offset); the
        		// mask keeps the new value and the matching inverse mask keeps the old one.
   1409 		switch(state.targetFormat[index])
   1410 		{
   1411 		case VK_FORMAT_R5G6B5_UNORM_PACK16:
   1412 			{
        				// First row: two 16-bit texels in one Int.
   1413 				Pointer<Byte> buffer = cBuffer + 2 * x;
   1414 				Int value = *Pointer<Int>(buffer);
   1415 
   1416 				Int c01 = Extract(As<Int2>(current.x), 0);
   1417 
        				// Channel write mask: only needed when not all of the three color
        				// channels are enabled. The complement index selects the mask that
        				// keeps the old bits.
   1418 				if((bgraWriteMask & 0x00000007) != 0x00000007)
   1419 				{
   1420 					Int masked = value;
   1421 					c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
   1422 					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
   1423 					c01 |= masked;
   1424 				}
   1425 
   1426 				c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
   1427 				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
   1428 				c01 |= value;
   1429 				*Pointer<Int>(buffer) = c01;
   1430 
        				// Second row.
   1431 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1432 				value = *Pointer<Int>(buffer);
   1433 
   1434 				Int c23 = Extract(As<Int2>(current.x), 1);
   1435 
   1436 				if((bgraWriteMask & 0x00000007) != 0x00000007)
   1437 				{
   1438 					Int masked = value;
   1439 					c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
   1440 					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
   1441 					c23 |= masked;
   1442 				}
   1443 
        				// Element [0][2] starts 4 bytes into each 8-byte mask entry --
        				// presumably the half that covers the second row's two pixels.
   1444 				c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
   1445 				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
   1446 				c23 |= value;
   1447 				*Pointer<Int>(buffer) = c23;
   1448 			}
   1449 			break;
   1450 		case VK_FORMAT_B8G8R8A8_UNORM:
   1451 			{
        				// First row: two 32-bit texels in one Short4.
   1452 				Pointer<Byte> buffer = cBuffer + x * 4;
   1453 				Short4 value = *Pointer<Short4>(buffer);
   1454 
        				// NOTE(review): the format comparison is always true inside this case
        				// label; only the write-mask test is effective.
   1455 				if(state.targetFormat[index] == VK_FORMAT_B8G8R8A8_UNORM && bgraWriteMask != 0x0000000F)   // FIXME: Need for masking when XRGB && Fh?
   1456 				{
   1457 					Short4 masked = value;
   1458 					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
   1459 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
   1460 					c01 |= masked;
   1461 				}
   1462 
   1463 				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
   1464 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
   1465 				c01 |= value;
   1466 				*Pointer<Short4>(buffer) = c01;
   1467 
        				// Second row.
   1468 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1469 				value = *Pointer<Short4>(buffer);
   1470 
   1471 				if(state.targetFormat[index] == VK_FORMAT_B8G8R8A8_UNORM && bgraWriteMask != 0x0000000F)   // FIXME: Need for masking when XRGB && Fh?
   1472 				{
   1473 					Short4 masked = value;
   1474 					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
   1475 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
   1476 					c23 |= masked;
   1477 				}
   1478 
   1479 				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
   1480 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
   1481 				c23 |= value;
   1482 				*Pointer<Short4>(buffer) = c23;
   1483 			}
   1484 			break;
   1485 		case VK_FORMAT_R8G8B8A8_UNORM:
   1486 		case VK_FORMAT_R8G8B8A8_SRGB:
   1487 			{
        				// Same structure as the B8G8R8A8 path, but indexed with rgbaWriteMask
        				// because red is stored first.
   1488 				Pointer<Byte> buffer = cBuffer + x * 4;
   1489 				Short4 value = *Pointer<Short4>(buffer);
   1490 
        				// NOTE(review): the format comparisons are always true inside these case
        				// labels, so this reduces to (rgbaWriteMask != 0xF). The bool 'masked'
        				// is shadowed by the Short4 'masked' declared inside each if-body.
   1491 				bool masked = ((state.targetFormat[index] == VK_FORMAT_R8G8B8A8_UNORM || state.targetFormat[index] == VK_FORMAT_R8G8B8A8_SRGB) && rgbaWriteMask != 0x0000000F); // FIXME: Need for masking when XBGR && Fh?
   1492 
   1493 				if(masked)
   1494 				{
   1495 					Short4 masked = value;
   1496 					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
   1497 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
   1498 					c01 |= masked;
   1499 				}
   1500 
   1501 				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
   1502 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
   1503 				c01 |= value;
   1504 				*Pointer<Short4>(buffer) = c01;
   1505 
        				// Second row.
   1506 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1507 				value = *Pointer<Short4>(buffer);
   1508 
   1509 				if(masked)
   1510 				{
   1511 					Short4 masked = value;
   1512 					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
   1513 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
   1514 					c23 |= masked;
   1515 				}
   1516 
   1517 				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
   1518 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
   1519 				c23 |= value;
   1520 				*Pointer<Short4>(buffer) = c23;
   1521 			}
   1522 			break;
   1523 		case VK_FORMAT_R8G8_UNORM:
        			// Nothing is written unless at least one of the two low mask bits is set.
   1524 			if((rgbaWriteMask & 0x00000003) != 0x0)
   1525 			{
        				// Gather the existing texels of both rows into one Int2.
        				// NOTE(review): 'value' is passed to Insert before it has been assigned;
        				// both elements are overwritten by the two Insert calls.
   1526 				Pointer<Byte> buffer = cBuffer + 2 * x;
   1527 				Int2 value;
   1528 				value = Insert(value, *Pointer<Int>(buffer), 0);
   1529 				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   1530 				value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
   1531 
   1532 				Int2 packedCol = As<Int2>(current.x);
   1533 
        				// Pixel mask, narrowed by a byte mask when only one channel is enabled:
        				// 5 * 1 = 0b0101 and 5 * 2 = 0b1010 index the alternating-byte entries.
   1534 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
   1535 				if((rgbaWriteMask & 0x3) != 0x3)
   1536 				{
   1537 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
   1538 					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
   1539 					mergedMask &= rgbaMask;
   1540 				}
   1541 
        				// Select new bits where the mask is set, old bits elsewhere.
   1542 				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
   1543 
   1544 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
   1545 				*Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
   1546 			}
   1547 			break;
   1548 		case VK_FORMAT_R8_UNORM:
        			// Nothing is written unless mask bit 0 is set.
   1549 			if(rgbaWriteMask & 0x00000001)
   1550 			{
        				// Gather the existing texels: one Short (two 8-bit texels) per row,
        				// placed in lanes 0 and 1.
        				// NOTE(review): 'value' is passed to Insert before it has been assigned;
        				// only lanes 0 and 1 are extracted and stored below.
   1551 				Pointer<Byte> buffer = cBuffer + 1 * x;
   1552 				Short4 value;
   1553 				value = Insert(value, *Pointer<Short>(buffer), 0);
   1554 				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   1555 				value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
   1556 
   1557 				current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
   1558 				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
   1559 				current.x |= value;
   1560 
   1561 				*Pointer<Short>(buffer) = Extract(current.x, 0);
   1562 				*Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
   1563 			}
   1564 			break;
   1565 		case VK_FORMAT_R16G16_UNORM:
   1566 			{
        				// First row: current.x holds its two (R16, G16) texels.
   1567 				Pointer<Byte> buffer = cBuffer + 4 * x;
   1568 
   1569 				Short4 value = *Pointer<Short4>(buffer);
   1570 
        				// Channel mask: the complement index of the same table selects the
        				// mask that keeps the old words.
   1571 				if((rgbaWriteMask & 0x00000003) != 0x00000003)
   1572 				{
   1573 					Short4 masked = value;
   1574 					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
   1575 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
   1576 					current.x |= masked;
   1577 				}
   1578 
   1579 				current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
   1580 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
   1581 				current.x |= value;
   1582 				*Pointer<Short4>(buffer) = current.x;
   1583 
        				// Second row: current.y holds its two texels.
   1584 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1585 
   1586 				value = *Pointer<Short4>(buffer);
   1587 
   1588 				if((rgbaWriteMask & 0x00000003) != 0x00000003)
   1589 				{
   1590 					Short4 masked = value;
   1591 					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
   1592 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
   1593 					current.y |= masked;
   1594 				}
   1595 
   1596 				current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
   1597 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
   1598 				current.y |= value;
   1599 				*Pointer<Short4>(buffer) = current.y;
   1600 			}
   1601 			break;
   1602 		case VK_FORMAT_R16G16B16A16_UNORM:
   1603 			{
        				// Each texel is one 8-byte Short4. After the transpose above, current.x
        				// and current.y are the first row's two texels and current.z and
        				// current.w the second row's; each gets its own pixel mask table
        				// (maskQ0Q..maskQ3Q).
   1604 				Pointer<Byte> buffer = cBuffer + 8 * x;
   1605 
        				// First row, first texel.
   1606 				{
   1607 					Short4 value = *Pointer<Short4>(buffer);
   1608 
   1609 					if(rgbaWriteMask != 0x0000000F)
   1610 					{
   1611 						Short4 masked = value;
   1612 						current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
   1613 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
   1614 						current.x |= masked;
   1615 					}
   1616 
   1617 					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
   1618 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
   1619 					current.x |= value;
   1620 					*Pointer<Short4>(buffer) = current.x;
   1621 				}
   1622 
        				// First row, second texel.
   1623 				{
   1624 					Short4 value = *Pointer<Short4>(buffer + 8);
   1625 
   1626 					if(rgbaWriteMask != 0x0000000F)
   1627 					{
   1628 						Short4 masked = value;
   1629 						current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
   1630 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
   1631 						current.y |= masked;
   1632 					}
   1633 
   1634 					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
   1635 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
   1636 					current.y |= value;
   1637 					*Pointer<Short4>(buffer + 8) = current.y;
   1638 				}
   1639 
   1640 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1641 
        				// Second row, first texel.
   1642 				{
   1643 					Short4 value = *Pointer<Short4>(buffer);
   1644 
   1645 					if(rgbaWriteMask != 0x0000000F)
   1646 					{
   1647 						Short4 masked = value;
   1648 						current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
   1649 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
   1650 						current.z |= masked;
   1651 					}
   1652 
   1653 					current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
   1654 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
   1655 					current.z |= value;
   1656 					*Pointer<Short4>(buffer) = current.z;
   1657 				}
   1658 
        				// Second row, second texel.
   1659 				{
   1660 					Short4 value = *Pointer<Short4>(buffer + 8);
   1661 
   1662 					if(rgbaWriteMask != 0x0000000F)
   1663 					{
   1664 						Short4 masked = value;
   1665 						current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
   1666 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
   1667 						current.w |= masked;
   1668 					}
   1669 
   1670 					current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
   1671 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
   1672 					current.w |= value;
   1673 					*Pointer<Short4>(buffer + 8) = current.w;
   1674 				}
   1675 			}
   1676 			break;
   1677 		default:
   1678 			ASSERT(false);
   1679 		}
   1680 	}
   1681 
   1682 	void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorActive)
   1683 	{
        		// Fills the colour components (x, y, z) of blendFactor according to
        		// blendFactorActive. blendFactor.w is never written here; the alpha
        		// factor is produced by blendFactorAlpha().
        		//
        		//   blendFactor        - output; only x, y and z are assigned
        		//   oC                 - source (shader output) colour
        		//   pixel              - destination colour read from the render target
        		//   blendFactorActive  - factor to evaluate
        		//
        		// ZERO and ONE deliberately leave blendFactor untouched: the caller
        		// (alphaBlend) checks for these two values and skips the multiply.
   1684 		switch(blendFactorActive)
   1685 		{
   1686 		case VK_BLEND_FACTOR_ZERO:
   1687 			// Optimized
   1688 			break;
   1689 		case VK_BLEND_FACTOR_ONE:
   1690 			// Optimized
   1691 			break;
   1692 		case VK_BLEND_FACTOR_SRC_COLOR:
   1693 			blendFactor.x = oC.x;
   1694 			blendFactor.y = oC.y;
   1695 			blendFactor.z = oC.z;
   1696 			break;
   1697 		case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
   1698 			blendFactor.x = Float4(1.0f) - oC.x;
   1699 			blendFactor.y = Float4(1.0f) - oC.y;
   1700 			blendFactor.z = Float4(1.0f) - oC.z;
   1701 			break;
   1702 		case VK_BLEND_FACTOR_DST_COLOR:
   1703 			blendFactor.x = pixel.x;
   1704 			blendFactor.y = pixel.y;
   1705 			blendFactor.z = pixel.z;
   1706 			break;
   1707 		case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
   1708 			blendFactor.x = Float4(1.0f) - pixel.x;
   1709 			blendFactor.y = Float4(1.0f) - pixel.y;
   1710 			blendFactor.z = Float4(1.0f) - pixel.z;
   1711 			break;
   1712 		case VK_BLEND_FACTOR_SRC_ALPHA:
   1713 			blendFactor.x = oC.w;
   1714 			blendFactor.y = oC.w;
   1715 			blendFactor.z = oC.w;
   1716 			break;
   1717 		case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
   1718 			blendFactor.x = Float4(1.0f) - oC.w;
   1719 			blendFactor.y = Float4(1.0f) - oC.w;
   1720 			blendFactor.z = Float4(1.0f) - oC.w;
   1721 			break;
   1722 		case VK_BLEND_FACTOR_DST_ALPHA:
   1723 			blendFactor.x = pixel.w;
   1724 			blendFactor.y = pixel.w;
   1725 			blendFactor.z = pixel.w;
   1726 			break;
   1727 		case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
   1728 			blendFactor.x = Float4(1.0f) - pixel.w;
   1729 			blendFactor.y = Float4(1.0f) - pixel.w;
   1730 			blendFactor.z = Float4(1.0f) - pixel.w;
   1731 			break;
   1732 		case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
        			// f = min(As, 1 - Ad), replicated to all three colour channels.
   1733 			blendFactor.x = Float4(1.0f) - pixel.w;
   1734 			blendFactor.x = Min(blendFactor.x, oC.w);
   1735 			blendFactor.y = blendFactor.x;
   1736 			blendFactor.z = blendFactor.x;
   1737 			break;
   1738 		case VK_BLEND_FACTOR_CONSTANT_COLOR:
   1739 			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
   1740 			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
   1741 			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
   1742 			break;
   1743 		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
   1744 			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
   1745 			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
   1746 			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
   1747 			break;
        		case VK_BLEND_FACTOR_CONSTANT_ALPHA:
        			// Constant alpha applied to every colour channel. Entry [3] is the
        			// slot blendFactorAlpha() reads for the constant's alpha component.
        			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
        			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
        			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
        			break;
        		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
        			// (1 - constant alpha) applied to every colour channel.
        			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
        			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
        			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
        			break;
   1748 		default:
        			// Remaining factors (e.g. the dual-source SRC1 variants) are not handled.
   1749 			ASSERT(false);
   1750 		}
   1751 	}
   1752 
   1753 	void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorAlphaActive)
   1754 	{
        		// Fills the alpha component (w) of blendFactor according to
        		// blendFactorAlphaActive. x, y and z are never written here.
        		//
        		//   blendFactor             - output; only w is assigned
        		//   oC                      - source (shader output) colour
        		//   pixel                   - destination colour read from the render target
        		//   blendFactorAlphaActive  - factor to evaluate
        		//
        		// For the alpha channel the *_COLOR factors reduce to their *_ALPHA
        		// counterparts, so each pair produces the same value. ZERO and ONE
        		// leave blendFactor untouched; the caller skips the multiply for them.
   1755 		switch(blendFactorAlphaActive)
   1756 		{
   1757 		case VK_BLEND_FACTOR_ZERO:
   1758 			// Optimized
   1759 			break;
   1760 		case VK_BLEND_FACTOR_ONE:
   1761 			// Optimized
   1762 			break;
   1763 		case VK_BLEND_FACTOR_SRC_COLOR:
   1764 			blendFactor.w = oC.w;
   1765 			break;
   1766 		case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
   1767 			blendFactor.w = Float4(1.0f) - oC.w;
   1768 			break;
   1769 		case VK_BLEND_FACTOR_DST_COLOR:
   1770 			blendFactor.w = pixel.w;
   1771 			break;
   1772 		case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
   1773 			blendFactor.w = Float4(1.0f) - pixel.w;
   1774 			break;
   1775 		case VK_BLEND_FACTOR_SRC_ALPHA:
   1776 			blendFactor.w = oC.w;
   1777 			break;
   1778 		case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
   1779 			blendFactor.w = Float4(1.0f) - oC.w;
   1780 			break;
   1781 		case VK_BLEND_FACTOR_DST_ALPHA:
   1782 			blendFactor.w = pixel.w;
   1783 			break;
   1784 		case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
   1785 			blendFactor.w = Float4(1.0f) - pixel.w;
   1786 			break;
   1787 		case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
        			// The alpha factor for SRC_ALPHA_SATURATE is always 1.
   1788 			blendFactor.w = Float4(1.0f);
   1789 			break;
   1790 		case VK_BLEND_FACTOR_CONSTANT_COLOR:
        		case VK_BLEND_FACTOR_CONSTANT_ALPHA:
        			// Both constant factors select the constant's alpha for this channel.
   1791 			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
   1792 			break;
   1793 		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
        		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
   1794 			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
   1795 			break;
   1796 		default:
        			// Remaining factors (e.g. the dual-source SRC1 variants) are not handled.
   1797 			ASSERT(false);
   1798 		}
   1799 	}
   1800 
   1801 	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
   1802 	{
        		// Floating-point blend path: combines the shader output oC with the
        		// current contents of render target `index` and leaves the blended
        		// result in oC. Nothing is written to memory here.
        		//
        		//   index    - render target slot (selects state.targetFormat / colorPitchB)
        		//   cBuffer  - base pointer the destination pixels are read from
        		//   oC       - in: source colour; out: blended colour
        		//   x        - pixel x coordinate used to offset into cBuffer
        		//
        		// Returns immediately when state.alphaBlendActive is false.
   1803 		if(!state.alphaBlendActive)
   1804 		{
   1805 			return;
   1806 		}
   1807 
   1808 		Pointer<Byte> buffer;
   1809 		Vector4f pixel;
   1810 
        		// NOTE(review): color, c01 and c23 are declared but not used in this function.
   1811 		Vector4s color;
   1812 		Short4 c01;
   1813 		Short4 c23;
   1814 
        		// `one` fills the channels the target format does not store.
        		// Float formats use 1.0f; non-normalized integer formats use the bit
        		// pattern 0xFFFFFFFF (unsigned) or 0x7FFFFFFF (signed).
        		// It is left default-constructed for any other format.
   1815 		Float4 one;
   1816 		if(Surface::isFloatFormat(state.targetFormat[index]))
   1817 		{
   1818 			one = Float4(1.0f);
   1819 		}
   1820 		else if(Surface::isNonNormalizedInteger(state.targetFormat[index]))
   1821 		{
   1822 			one = As<Float4>(Surface::isUnsignedComponent(state.targetFormat[index], 0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
   1823 		}
   1824 
        		// Read the destination pixels: two adjacent pixels at x, then two more
        		// after advancing by colorPitchB[index] (presumably the next row of the
        		// 2x2 quad - confirm against DrawData). Only 32-bit-per-channel formats
        		// are handled; anything else asserts.
   1825 		switch(state.targetFormat[index])
   1826 		{
   1827 		case VK_FORMAT_R32_SINT:
   1828 		case VK_FORMAT_R32_UINT:
   1829 		case VK_FORMAT_R32_SFLOAT:
   1830 			buffer = cBuffer;
   1831 			// FIXME: movlps
   1832 			pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
   1833 			pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
   1834 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1835 			// FIXME: movhps
   1836 			pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
   1837 			pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
   1838 			pixel.y = pixel.z = pixel.w = one;
   1839 			break;
   1840 		case VK_FORMAT_R32G32_SINT:
   1841 		case VK_FORMAT_R32G32_UINT:
   1842 		case VK_FORMAT_R32G32_SFLOAT:
   1843 			buffer = cBuffer;
   1844 			pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
   1845 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1846 			pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
        			// The two loads hold interleaved two-channel data; the shuffles
        			// (0x88 / 0xDD) presumably gather the even lanes into x and the
        			// odd lanes into y - confirm against ShuffleLowHigh.
   1847 			pixel.z = pixel.x;
   1848 			pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
   1849 			pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
   1850 			pixel.y = pixel.z;
   1851 			pixel.z = pixel.w = one;
   1852 			break;
   1853 		case VK_FORMAT_R32G32B32A32_SFLOAT:
   1854 		case VK_FORMAT_R32G32B32A32_SINT:
   1855 		case VK_FORMAT_R32G32B32A32_UINT:
   1856 			buffer = cBuffer;
   1857 			pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
   1858 			pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
   1859 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1860 			pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
   1861 			pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
        			// Four per-pixel vectors were loaded; transpose them into
        			// per-channel vectors.
   1862 			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
   1863 			break;
   1864 		default:
   1865 			ASSERT(false);
   1866 		}
   1867 
        		// Convert the destination colour channels (not alpha) with sRGBtoLinear
        		// before blending when the target is sRGB.
   1868 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
   1869 		{
   1870 			sRGBtoLinear(pixel.x);
   1871 			sRGBtoLinear(pixel.y);
   1872 			sRGBtoLinear(pixel.z);
   1873 		}
   1874 
   1875 		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
   1876 		Vector4f sourceFactor;
   1877 		Vector4f destFactor;
   1878 
        		// Both colour factors are evaluated before oC or pixel is modified, so
        		// each sees the unscaled source and destination values.
   1879 		blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);
   1880 		blendFactor(destFactor, oC, pixel, state.destBlendFactor);
   1881 
        		// ONE and ZERO skip the multiply; blendFactor() leaves the factor
        		// unset for them.
        		// NOTE(review): a ZERO factor does not zero the term here; presumably
        		// that case is resolved before this routine is generated - confirm.
   1882 		if(state.sourceBlendFactor != VK_BLEND_FACTOR_ONE && state.sourceBlendFactor != VK_BLEND_FACTOR_ZERO)
   1883 		{
   1884 			oC.x *= sourceFactor.x;
   1885 			oC.y *= sourceFactor.y;
   1886 			oC.z *= sourceFactor.z;
   1887 		}
   1888 
   1889 		if(state.destBlendFactor != VK_BLEND_FACTOR_ONE && state.destBlendFactor != VK_BLEND_FACTOR_ZERO)
   1890 		{
   1891 			pixel.x *= destFactor.x;
   1892 			pixel.y *= destFactor.y;
   1893 			pixel.z *= destFactor.z;
   1894 		}
   1895 
        		// Combine the scaled source and destination colour channels.
   1896 		switch(state.blendOperation)
   1897 		{
   1898 		case VK_BLEND_OP_ADD:
   1899 			oC.x += pixel.x;
   1900 			oC.y += pixel.y;
   1901 			oC.z += pixel.z;
   1902 			break;
   1903 		case VK_BLEND_OP_SUBTRACT:
   1904 			oC.x -= pixel.x;
   1905 			oC.y -= pixel.y;
   1906 			oC.z -= pixel.z;
   1907 			break;
   1908 		case VK_BLEND_OP_REVERSE_SUBTRACT:
   1909 			oC.x = pixel.x - oC.x;
   1910 			oC.y = pixel.y - oC.y;
   1911 			oC.z = pixel.z - oC.z;
   1912 			break;
   1913 		case VK_BLEND_OP_MIN:
   1914 			oC.x = Min(oC.x, pixel.x);
   1915 			oC.y = Min(oC.y, pixel.y);
   1916 			oC.z = Min(oC.z, pixel.z);
   1917 			break;
   1918 		case VK_BLEND_OP_MAX:
   1919 			oC.x = Max(oC.x, pixel.x);
   1920 			oC.y = Max(oC.y, pixel.y);
   1921 			oC.z = Max(oC.z, pixel.z);
   1922 			break;
   1923 		case VK_BLEND_OP_SRC_EXT:
   1924 			// No operation
   1925 			break;
   1926 		case VK_BLEND_OP_DST_EXT:
   1927 			oC.x = pixel.x;
   1928 			oC.y = pixel.y;
   1929 			oC.z = pixel.z;
   1930 			break;
   1931 		case VK_BLEND_OP_ZERO_EXT:
   1932 			oC.x = Float4(0.0f);
   1933 			oC.y = Float4(0.0f);
   1934 			oC.z = Float4(0.0f);
   1935 			break;
   1936 		default:
   1937 			ASSERT(false);
   1938 		}
   1939 
        		// Alpha channel: same scheme as colour. blendFactorAlpha() reads only
        		// oC.w and pixel.w, which the colour stage above did not modify.
   1940 		blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
   1941 		blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);
   1942 
   1943 		if(state.sourceBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.sourceBlendFactorAlpha != VK_BLEND_FACTOR_ZERO)
   1944 		{
   1945 			oC.w *= sourceFactor.w;
   1946 		}
   1947 
   1948 		if(state.destBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.destBlendFactorAlpha != VK_BLEND_FACTOR_ZERO)
   1949 		{
   1950 			pixel.w *= destFactor.w;
   1951 		}
   1952 
   1953 		switch(state.blendOperationAlpha)
   1954 		{
   1955 		case VK_BLEND_OP_ADD:
   1956 			oC.w += pixel.w;
   1957 			break;
   1958 		case VK_BLEND_OP_SUBTRACT:
   1959 			oC.w -= pixel.w;
   1960 			break;
   1961 		case VK_BLEND_OP_REVERSE_SUBTRACT:
   1962 			pixel.w -= oC.w;
   1963 			oC.w = pixel.w;
   1964 			break;
   1965 		case VK_BLEND_OP_MIN:
   1966 			oC.w = Min(oC.w, pixel.w);
   1967 			break;
   1968 		case VK_BLEND_OP_MAX:
   1969 			oC.w = Max(oC.w, pixel.w);
   1970 			break;
   1971 		case VK_BLEND_OP_SRC_EXT:
   1972 			// No operation
   1973 			break;
   1974 		case VK_BLEND_OP_DST_EXT:
   1975 			oC.w = pixel.w;
   1976 			break;
   1977 		case VK_BLEND_OP_ZERO_EXT:
   1978 			oC.w = Float4(0.0f);
   1979 			break;
   1980 		default:
   1981 			ASSERT(false);
   1982 		}
   1983 	}
   1984 
   1985 	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
   1986 	{
        		// Writes the colour oC to render target `index` for the float and
        		// non-normalized integer formats listed in the switches below.
        		// Every store is a masked read-modify-write: destination data is
        		// loaded, merged with the new colour under the combined pixel mask
        		// and the per-channel write mask, and stored back.
        		//
        		//   index          - render target slot
        		//   cBuffer        - base pointer of the destination
        		//   x              - pixel x coordinate used to offset into cBuffer
        		//   oC             - colour to write; rearranged and masked in place
        		//   sMask / zMask / cMask - stencil, depth and coverage masks
        		//
        		// Step 1: rearrange oC for the store. Single-channel formats need no
        		// change, two-channel formats interleave x and y, four-channel formats
        		// are transposed (presumably per-channel to per-pixel - confirm).
   1987 		switch(state.targetFormat[index])
   1988 		{
   1989 		case VK_FORMAT_R32_SFLOAT:
   1990 		case VK_FORMAT_R32_SINT:
   1991 		case VK_FORMAT_R32_UINT:
   1992 		case VK_FORMAT_R16_SINT:
   1993 		case VK_FORMAT_R16_UINT:
   1994 		case VK_FORMAT_R8_SINT:
   1995 		case VK_FORMAT_R8_UINT:
   1996 			break;
   1997 		case VK_FORMAT_R32G32_SFLOAT:
   1998 		case VK_FORMAT_R32G32_SINT:
   1999 		case VK_FORMAT_R32G32_UINT:
   2000 		case VK_FORMAT_R16G16_SINT:
   2001 		case VK_FORMAT_R16G16_UINT:
   2002 		case VK_FORMAT_R8G8_SINT:
   2003 		case VK_FORMAT_R8G8_UINT:
   2004 			oC.z = oC.x;
   2005 			oC.x = UnpackLow(oC.x, oC.y);
   2006 			oC.z = UnpackHigh(oC.z, oC.y);
   2007 			oC.y = oC.z;
   2008 			break;
   2009 		case VK_FORMAT_R32G32B32A32_SFLOAT:
   2010 		case VK_FORMAT_R32G32B32A32_SINT:
   2011 		case VK_FORMAT_R32G32B32A32_UINT:
   2012 		case VK_FORMAT_R16G16B16A16_SINT:
   2013 		case VK_FORMAT_R16G16B16A16_UINT:
   2014 		case VK_FORMAT_R8G8B8A8_SINT:
   2015 		case VK_FORMAT_R8G8B8A8_UINT:
   2016 			transpose4x4(oC.x, oC.y, oC.z, oC.w);
   2017 			break;
   2018 		default:
   2019 			ASSERT(false);
   2020 		}
   2021 
        		// Per-channel write mask: bit 0 is tested for the first channel, and
        		// the low 2 / 4 bits for two- / four-channel formats.
   2022 		int rgbaWriteMask = state.colorWriteActive(index);
   2023 
   2024 		Int xMask;   // Combination of all masks
   2025 
        		// Start from the depth mask when depth testing is active, otherwise
        		// from the coverage mask, then AND in the stencil mask if enabled.
   2026 		if(state.depthTestActive)
   2027 		{
   2028 			xMask = zMask;
   2029 		}
   2030 		else
   2031 		{
   2032 			xMask = cMask;
   2033 		}
   2034 
   2035 		if(state.stencilActive)
   2036 		{
   2037 			xMask &= sMask;
   2038 		}
   2039 
   2040 		Pointer<Byte> buffer;
   2041 		Float4 value;
   2042 
        		// Step 2: per-format masked store. Each case touches two memory
        		// locations separated by colorPitchB[index] (presumably two rows of
        		// the quad - confirm against DrawData). xMask indexes the mask tables
        		// in Constants.
   2043 		switch(state.targetFormat[index])
   2044 		{
   2045 		case VK_FORMAT_R32_SFLOAT:
   2046 		case VK_FORMAT_R32_SINT:
   2047 		case VK_FORMAT_R32_UINT:
   2048 			if(rgbaWriteMask & 0x00000001)
   2049 			{
   2050 				buffer = cBuffer + 4 * x;
   2051 
   2052 				// FIXME: movlps
   2053 				value.x = *Pointer<Float>(buffer + 0);
   2054 				value.y = *Pointer<Float>(buffer + 4);
   2055 
   2056 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   2057 
   2058 				// FIXME: movhps
   2059 				value.z = *Pointer<Float>(buffer + 0);
   2060 				value.w = *Pointer<Float>(buffer + 4);
   2061 
        				// Keep the new colour where maskD4X[xMask] is set and the old
        				// destination value where invMaskD4X[xMask] is set.
   2062 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
   2063 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
   2064 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
   2065 
        				// buffer still points at the second location: store z/w there
        				// first, then step back by the pitch and store x/y.
   2066 				// FIXME: movhps
   2067 				*Pointer<Float>(buffer + 0) = oC.x.z;
   2068 				*Pointer<Float>(buffer + 4) = oC.x.w;
   2069 
   2070 				buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   2071 
   2072 				// FIXME: movlps
   2073 				*Pointer<Float>(buffer + 0) = oC.x.x;
   2074 				*Pointer<Float>(buffer + 4) = oC.x.y;
   2075 			}
   2076 			break;
   2077 		case VK_FORMAT_R16_SINT:
   2078 		case VK_FORMAT_R16_UINT:
   2079 			if(rgbaWriteMask & 0x00000001)
   2080 			{
   2081 				buffer = cBuffer + 2 * x;
   2082 
        				// Gather two 16-bit destination pixels from each location
        				// (one 32-bit load each) into the two halves of xyzw.
   2083 				UShort4 xyzw;
   2084 				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
   2085 
   2086 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2087 
   2088 				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
        				// Widen to 32-bit lanes so the merge uses the same 32-bit
        				// per-pixel masks as the R32 path.
   2089 				value = As<Float4>(Int4(xyzw));
   2090 
   2091 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
   2092 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
   2093 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
   2094 
        				// Narrow each merged 32-bit lane back to 16 bits and store it;
        				// the second location first, then the first.
   2095 				if(state.targetFormat[index] == VK_FORMAT_R16_SINT)
   2096 				{
   2097 					Float component = oC.x.z;
   2098 					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
   2099 					component = oC.x.w;
   2100 					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
   2101 
   2102 					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2103 
   2104 					component = oC.x.x;
   2105 					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
   2106 					component = oC.x.y;
   2107 					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
   2108 				}
   2109 				else // VK_FORMAT_R16_UINT
   2110 				{
   2111 					Float component = oC.x.z;
   2112 					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
   2113 					component = oC.x.w;
   2114 					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
   2115 
   2116 					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2117 
   2118 					component = oC.x.x;
   2119 					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
   2120 					component = oC.x.y;
   2121 					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
   2122 				}
   2123 			}
   2124 			break;
   2125 		case VK_FORMAT_R8_SINT:
   2126 		case VK_FORMAT_R8_UINT:
   2127 			if(rgbaWriteMask & 0x00000001)
   2128 			{
   2129 				buffer = cBuffer + x;
   2130 
   2131 				UInt xyzw, packedCol;
   2132 
        				// Two destination bytes from each location are combined into
        				// one 32-bit word: first location low half, second high half.
   2133 				xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
   2134 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2135 				xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
   2136 
        				// Narrow the four 32-bit lanes of oC.x to 16 bits, pack them to
        				// bytes (signed or unsigned) and keep the low 32 bits.
   2137 				Short4 tmpCol = Short4(As<Int4>(oC.x));
   2138 				if(state.targetFormat[index] == VK_FORMAT_R8_SINT)
   2139 				{
   2140 					tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
   2141 				}
   2142 				else
   2143 				{
   2144 					tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
   2145 				}
   2146 				packedCol = Extract(As<Int2>(tmpCol), 0);
   2147 
   2148 				packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
   2149 				            (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
   2150 
        				// High half goes to the second location (buffer still points
        				// there), low half to the first.
   2151 				*Pointer<UShort>(buffer) = UShort(packedCol >> 16);
   2152 				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2153 				*Pointer<UShort>(buffer) = UShort(packedCol);
   2154 			}
   2155 			break;
   2156 		case VK_FORMAT_R32G32_SFLOAT:
   2157 		case VK_FORMAT_R32G32_SINT:
   2158 		case VK_FORMAT_R32G32_UINT:
   2159 			buffer = cBuffer + 8 * x;
   2160 
   2161 			value = *Pointer<Float4>(buffer);
   2162 
        			// Channel mask: when not both of the two channels are enabled,
        			// take the disabled channels from the destination. The inverse
        			// mask is obtained by indexing maskD01X with the complemented bits.
   2163 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
   2164 			{
   2165 				Float4 masked = value;
   2166 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
   2167 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
   2168 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
   2169 			}
   2170 
        			// Pixel mask for the first location (maskQ01X / invMaskQ01X).
   2171 			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
   2172 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
   2173 			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
   2174 			*Pointer<Float4>(buffer) = oC.x;
   2175 
   2176 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   2177 
   2178 			value = *Pointer<Float4>(buffer);
   2179 
   2180 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
   2181 			{
   2182 				Float4 masked;
   2183 
   2184 				masked = value;
   2185 				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
   2186 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
   2187 				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
   2188 			}
   2189 
        			// Pixel mask for the second location (maskQ23X / invMaskQ23X).
   2190 			oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
   2191 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
   2192 			oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
   2193 			*Pointer<Float4>(buffer) = oC.y;
   2194 			break;
   2195 		case VK_FORMAT_R16G16_SINT:
   2196 		case VK_FORMAT_R16G16_UINT:
   2197 			if((rgbaWriteMask & 0x00000003) != 0x0)
   2198 			{
   2199 				buffer = cBuffer + 4 * x;
   2200 
        				// rgbaMask is assigned only when a channel is disabled and is
        				// reused for the second location under the same condition.
        				// Note: this local `value` shadows the Float4 declared above.
   2201 				UInt2 rgbaMask;
   2202 				UShort4 packedCol = UShort4(As<Int4>(oC.x));
   2203 				UShort4 value = *Pointer<UShort4>(buffer);
   2204 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
   2205 				if((rgbaWriteMask & 0x3) != 0x3)
   2206 				{
        					// One 32-bit channel mask from maskW4Q, replicated into
        					// both halves of the 64-bit mask.
   2207 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
   2208 					rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
   2209 					mergedMask &= rgbaMask;
   2210 				}
   2211 				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
   2212 
   2213 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2214 
   2215 				packedCol = UShort4(As<Int4>(oC.y));
   2216 				value = *Pointer<UShort4>(buffer);
   2217 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
   2218 				if((rgbaWriteMask & 0x3) != 0x3)
   2219 				{
   2220 					mergedMask &= rgbaMask;
   2221 				}
   2222 				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
   2223 			}
   2224 			break;
   2225 		case VK_FORMAT_R8G8_SINT:
   2226 		case VK_FORMAT_R8G8_UINT:
   2227 			if((rgbaWriteMask & 0x00000003) != 0x0)
   2228 			{
   2229 				buffer = cBuffer + 2 * x;
   2230 
   2231 				Int2 xyzw, packedCol;
   2232 
        				// One 32-bit load per location, gathered into the two halves
        				// of xyzw.
   2233 				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
   2234 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2235 				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
   2236 
   2237 				if(state.targetFormat[index] == VK_FORMAT_R8G8_SINT)
   2238 				{
   2239 					packedCol = As<Int2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
   2240 				}
   2241 				else
   2242 				{
   2243 					packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
   2244 				}
   2245 
   2246 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
   2247 				if((rgbaWriteMask & 0x3) != 0x3)
   2248 				{
        					// Multiplying the 2-bit channel mask by 5 repeats it
        					// (0b01 -> 0b0101, 0b10 -> 0b1010) to form the maskB4Q
        					// index, presumably covering two pixels' worth of bytes.
   2249 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
   2250 					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
   2251 					mergedMask &= rgbaMask;
   2252 				}
   2253 
   2254 				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
   2255 
        				// Second location first (buffer still points there), then the first.
   2256 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
   2257 				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2258 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
   2259 			}
   2260 			break;
   2261 		case VK_FORMAT_R32G32B32A32_SFLOAT:
   2262 		case VK_FORMAT_R32G32B32A32_SINT:
   2263 		case VK_FORMAT_R32G32B32A32_UINT:
   2264 			buffer = cBuffer + 16 * x;
   2265 
        			// Four 16-byte stores: oC.x at buffer, oC.y at buffer + 16, then,
        			// after advancing by the pitch, oC.z and oC.w. Each is merged
        			// first with the channel mask (maskD4X / invMaskD4X indexed by
        			// rgbaWriteMask), then with its pixel mask (maskX0X..maskX3X
        			// indexed by xMask).
   2266 			{
   2267 				value = *Pointer<Float4>(buffer, 16);
   2268 
   2269 				if(rgbaWriteMask != 0x0000000F)
   2270 				{
   2271 					Float4 masked = value;
   2272 					oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
   2273 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
   2274 					oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
   2275 				}
   2276 
   2277 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
   2278 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
   2279 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
   2280 				*Pointer<Float4>(buffer, 16) = oC.x;
   2281 			}
   2282 
   2283 			{
   2284 				value = *Pointer<Float4>(buffer + 16, 16);
   2285 
   2286 				if(rgbaWriteMask != 0x0000000F)
   2287 				{
   2288 					Float4 masked = value;
   2289 					oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
   2290 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
   2291 					oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
   2292 				}
   2293 
   2294 				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
   2295 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
   2296 				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
   2297 				*Pointer<Float4>(buffer + 16, 16) = oC.y;
   2298 			}
   2299 
   2300 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   2301 
   2302 			{
   2303 				value = *Pointer<Float4>(buffer, 16);
   2304 
   2305 				if(rgbaWriteMask != 0x0000000F)
   2306 				{
   2307 					Float4 masked = value;
   2308 					oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
   2309 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
   2310 					oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
   2311 				}
   2312 
   2313 				oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
   2314 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
   2315 				oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
   2316 				*Pointer<Float4>(buffer, 16) = oC.z;
   2317 			}
   2318 
   2319 			{
   2320 				value = *Pointer<Float4>(buffer + 16, 16);
   2321 
   2322 				if(rgbaWriteMask != 0x0000000F)
   2323 				{
   2324 					Float4 masked = value;
   2325 					oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
   2326 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
   2327 					oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
   2328 				}
   2329 
   2330 				oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
   2331 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
   2332 				oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
   2333 				*Pointer<Float4>(buffer + 16, 16) = oC.w;
   2334 			}
   2335 			break;
   2336 		case VK_FORMAT_R16G16B16A16_SINT:
   2337 		case VK_FORMAT_R16G16B16A16_UINT:
   2338 			if((rgbaWriteMask & 0x0000000F) != 0x0)
   2339 			{
   2340 				buffer = cBuffer + 8 * x;
   2341 
        				// Two 16-byte stores: oC.x/oC.y narrowed to 16-bit lanes for
        				// the first location, oC.z/oC.w for the second. rgbaMask is
        				// built once and reused for the second store.
        				// Note: this local `value` shadows the Float4 declared above.
   2342 				UInt4 rgbaMask;
   2343 				UShort8 value = *Pointer<UShort8>(buffer);
   2344 				UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
   2345 				UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
   2346 				if((rgbaWriteMask & 0xF) != 0xF)
   2347 				{
   2348 					UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
   2349 					rgbaMask = UInt4(tmpMask, tmpMask);
   2350 					mergedMask &= rgbaMask;
   2351 				}
   2352 				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
   2353 
   2354 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2355 
   2356 				value = *Pointer<UShort8>(buffer);
   2357 				packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
   2358 				mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
   2359 				if((rgbaWriteMask & 0xF) != 0xF)
   2360 				{
   2361 					mergedMask &= rgbaMask;
   2362 				}
   2363 				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
   2364 			}
   2365 			break;
   2366 		case VK_FORMAT_R8G8B8A8_SINT:
   2367 		case VK_FORMAT_R8G8B8A8_UINT:
   2368 			if((rgbaWriteMask & 0x0000000F) != 0x0)
   2369 			{
        				// Two 8-byte stores: oC.x/oC.y packed to bytes for the first
        				// location, oC.z/oC.w for the second. The pixel mask comes from
        				// maskD01Q / maskD23Q, the channel mask from maskB4Q.
        				// Note: this local `value` shadows the Float4 declared above.
   2370 				UInt2 value, packedCol, mergedMask;
   2371 
   2372 				buffer = cBuffer + 4 * x;
   2373 
   2374 				if(state.targetFormat[index] == VK_FORMAT_R8G8B8A8_SINT)
   2375 				{
   2376 					packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
   2377 				}
   2378 				else
   2379 				{
   2380 					packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
   2381 				}
        				// NOTE(review): the load passes an alignment argument of 16
        				// for an 8-byte access while the matching store passes none - confirm intended.
   2382 				value = *Pointer<UInt2>(buffer, 16);
   2383 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
   2384 				if(rgbaWriteMask != 0xF)
   2385 				{
   2386 					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
   2387 				}
   2388 				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
   2389 
   2390 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2391 
   2392 				if(state.targetFormat[index] == VK_FORMAT_R8G8B8A8_SINT)
   2393 				{
   2394 					packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
   2395 				}
   2396 				else
   2397 				{
   2398 					packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
   2399 				}
   2400 				value = *Pointer<UInt2>(buffer, 16);
   2401 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
   2402 				if(rgbaWriteMask != 0xF)
   2403 				{
   2404 					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
   2405 				}
   2406 				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
   2407 			}
   2408 			break;
   2409 		default:
   2410 			ASSERT(false);
   2411 		}
   2412 	}
   2413 
   2414 	UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
   2415 	{
        		// Scales each lane of cf by 0xFFFF (65535) and converts the result to
        		// a UShort4, forwarding `saturate` to the UShort4 conversion.
        		// cf itself is not modified.
        		// NOTE(review): presumably cf holds normalized [0, 1] values and
        		// `saturate` selects clamping on conversion - confirm against the
        		// UShort4(Float4, bool) constructor.
   2416 		return UShort4(cf * Float4(0xFFFF), saturate);
   2417 	}
   2418 
   2419 	void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
   2420 	{
        		// Replaces each lane of c.x, c.y and c.z with an entry from the
        		// Constants::sRGBtoLinear12_16 lookup table. c.w is left untouched.
        		// The name suggests 16-bit input, a 12-bit table index and 16-bit
        		// output.
   2421 		Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
   2422 
        		// Unsigned shift right by 4 turns each 16-bit lane into a table index.
   2423 		c.x = As<UShort4>(c.x) >> 4;
   2424 		c.y = As<UShort4>(c.y) >> 4;
   2425 		c.z = As<UShort4>(c.z) >> 4;
   2426 
        		// Per-lane lookup: extract the index, load the 16-bit table entry
        		// (2 bytes per entry) and insert it back into the same lane.
   2427 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
   2428 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
   2429 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
   2430 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
   2431 
   2432 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
   2433 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
   2434 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
   2435 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
   2436 
   2437 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
   2438 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
   2439 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
   2440 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
   2441 	}
   2442 
   2443 	void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
   2444 	{
   2445 		c.x = As<UShort4>(c.x) >> 4;
   2446 		c.y = As<UShort4>(c.y) >> 4;
   2447 		c.z = As<UShort4>(c.z) >> 4;
   2448 
   2449 		linearToSRGB12_16(c);
   2450 	}
   2451 
   2452 	void PixelRoutine::linearToSRGB12_16(Vector4s &c)
   2453 	{
   2454 		Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
   2455 
   2456 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
   2457 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
   2458 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
   2459 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
   2460 
   2461 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
   2462 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
   2463 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
   2464 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
   2465 
   2466 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
   2467 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
   2468 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
   2469 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
   2470 	}
   2471 
   2472 	Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)   // Approximates x^2.2
   2473 	{
   2474 		Float4 linear = x * x;
   2475 		linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
   2476 
   2477 		return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
   2478 	}
   2479 
   2480 	bool PixelRoutine::colorUsed()
   2481 	{
   2482 		return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;
   2483 	}
   2484 }
   2485