Home | History | Annotate | Download | only in Shader
      1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //    http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #include "PixelRoutine.hpp"
     16 
     17 #include "SamplerCore.hpp"
     18 #include "Constants.hpp"
     19 #include "Renderer/Renderer.hpp"
     20 #include "Renderer/QuadRasterizer.hpp"
     21 #include "Renderer/Surface.hpp"
     22 #include "Renderer/Primitive.hpp"
     23 #include "Common/Debug.hpp"
     24 
     25 namespace sw
     26 {
     27 	extern bool complementaryDepthBuffer;
     28 	extern bool postBlendSRGB;
     29 	extern bool exactColorRounding;
     30 	extern bool forceClearRegisters;
     31 
     32 	PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader)
     33 		: QuadRasterizer(state, shader), v(shader && shader->indirectAddressableInput)
     34 	{
     35 		if(!shader || shader->getShaderModel() < 0x0200 || forceClearRegisters)
     36 		{
     37 			for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++)
     38 			{
     39 				v[i].x = Float4(0.0f);
     40 				v[i].y = Float4(0.0f);
     41 				v[i].z = Float4(0.0f);
     42 				v[i].w = Float4(0.0f);
     43 			}
     44 		}
     45 	}
     46 
     47 	PixelRoutine::~PixelRoutine()
     48 	{
     49 	}
     50 
     51 	void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
     52 	{
     53 		#if PERF_PROFILE
     54 			Long pipeTime = Ticks();
     55 		#endif
     56 
     57 		const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
     58 
     59 		Int zMask[4];   // Depth mask
     60 		Int sMask[4];   // Stencil mask
     61 
     62 		for(unsigned int q = 0; q < state.multiSample; q++)
     63 		{
     64 			zMask[q] = cMask[q];
     65 			sMask[q] = cMask[q];
     66 		}
     67 
     68 		for(unsigned int q = 0; q < state.multiSample; q++)
     69 		{
     70 			stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
     71 		}
     72 
     73 		Float4 f;
     74 		Float4 rhwCentroid;
     75 
     76 		Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
     77 
     78 		if(interpolateZ())
     79 		{
     80 			for(unsigned int q = 0; q < state.multiSample; q++)
     81 			{
     82 				Float4 x = xxxx;
     83 
     84 				if(state.multiSample > 1)
     85 				{
     86 					x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
     87 				}
     88 
     89 				z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false, state.depthClamp);
     90 			}
     91 		}
     92 
     93 		Bool depthPass = false;
     94 
     95 		if(earlyDepthTest)
     96 		{
     97 			for(unsigned int q = 0; q < state.multiSample; q++)
     98 			{
     99 				depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
    100 			}
    101 		}
    102 
    103 		If(depthPass || Bool(!earlyDepthTest))
    104 		{
    105 			#if PERF_PROFILE
    106 				Long interpTime = Ticks();
    107 			#endif
    108 
    109 			Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
    110 
    111 			// Centroid locations
    112 			Float4 XXXX = Float4(0.0f);
    113 			Float4 YYYY = Float4(0.0f);
    114 
    115 			if(state.centroid)
    116 			{
    117 				Float4 WWWW(1.0e-9f);
    118 
    119 				for(unsigned int q = 0; q < state.multiSample; q++)
    120 				{
    121 					XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
    122 					YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
    123 					WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
    124 				}
    125 
    126 				WWWW = Rcp_pp(WWWW);
    127 				XXXX *= WWWW;
    128 				YYYY *= WWWW;
    129 
    130 				XXXX += xxxx;
    131 				YYYY += yyyy;
    132 			}
    133 
    134 			if(interpolateW())
    135 			{
    136 				w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false, false);
    137 				rhw = reciprocal(w, false, false, true);
    138 
    139 				if(state.centroid)
    140 				{
    141 					rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
    142 				}
    143 			}
    144 
    145 			for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
    146 			{
    147 				for(int component = 0; component < 4; component++)
    148 				{
    149 					if(state.interpolant[interpolant].component & (1 << component))
    150 					{
    151 						if(!state.interpolant[interpolant].centroid)
    152 						{
    153 							v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective, false);
    154 						}
    155 						else
    156 						{
    157 							v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
    158 						}
    159 					}
    160 				}
    161 
    162 				Float4 rcp;
    163 
    164 				switch(state.interpolant[interpolant].project)
    165 				{
    166 				case 0:
    167 					break;
    168 				case 1:
    169 					rcp = reciprocal(v[interpolant].y);
    170 					v[interpolant].x = v[interpolant].x * rcp;
    171 					break;
    172 				case 2:
    173 					rcp = reciprocal(v[interpolant].z);
    174 					v[interpolant].x = v[interpolant].x * rcp;
    175 					v[interpolant].y = v[interpolant].y * rcp;
    176 					break;
    177 				case 3:
    178 					rcp = reciprocal(v[interpolant].w);
    179 					v[interpolant].x = v[interpolant].x * rcp;
    180 					v[interpolant].y = v[interpolant].y * rcp;
    181 					v[interpolant].z = v[interpolant].z * rcp;
    182 					break;
    183 				}
    184 			}
    185 
    186 			if(state.fog.component)
    187 			{
    188 				f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective, false);
    189 			}
    190 
    191 			setBuiltins(x, y, z, w);
    192 
    193 			#if PERF_PROFILE
    194 				cycles[PERF_INTERP] += Ticks() - interpTime;
    195 			#endif
    196 
    197 			Bool alphaPass = true;
    198 
    199 			if(colorUsed())
    200 			{
    201 				#if PERF_PROFILE
    202 					Long shaderTime = Ticks();
    203 				#endif
    204 
    205 				applyShader(cMask);
    206 
    207 				#if PERF_PROFILE
    208 					cycles[PERF_SHADER] += Ticks() - shaderTime;
    209 				#endif
    210 
    211 				alphaPass = alphaTest(cMask);
    212 
    213 				if((shader && shader->containsKill()) || state.alphaTestActive())
    214 				{
    215 					for(unsigned int q = 0; q < state.multiSample; q++)
    216 					{
    217 						zMask[q] &= cMask[q];
    218 						sMask[q] &= cMask[q];
    219 					}
    220 				}
    221 			}
    222 
    223 			If(alphaPass)
    224 			{
    225 				if(!earlyDepthTest)
    226 				{
    227 					for(unsigned int q = 0; q < state.multiSample; q++)
    228 					{
    229 						depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
    230 					}
    231 				}
    232 
    233 				#if PERF_PROFILE
    234 					Long ropTime = Ticks();
    235 				#endif
    236 
    237 				If(depthPass || Bool(earlyDepthTest))
    238 				{
    239 					for(unsigned int q = 0; q < state.multiSample; q++)
    240 					{
    241 						if(state.multiSampleMask & (1 << q))
    242 						{
    243 							writeDepth(zBuffer, q, x, z[q], zMask[q]);
    244 
    245 							if(state.occlusionEnabled)
    246 							{
    247 								occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
    248 							}
    249 						}
    250 					}
    251 
    252 					if(colorUsed())
    253 					{
    254 						#if PERF_PROFILE
    255 							AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
    256 						#endif
    257 
    258 						rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
    259 					}
    260 				}
    261 
    262 				#if PERF_PROFILE
    263 					cycles[PERF_ROP] += Ticks() - ropTime;
    264 				#endif
    265 			}
    266 		}
    267 
    268 		for(unsigned int q = 0; q < state.multiSample; q++)
    269 		{
    270 			if(state.multiSampleMask & (1 << q))
    271 			{
    272 				writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
    273 			}
    274 		}
    275 
    276 		#if PERF_PROFILE
    277 			cycles[PERF_PIPE] += Ticks() - pipeTime;
    278 		#endif
    279 	}
    280 
    281 	Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
    282 	{
    283 		Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
    284 
    285 		if(!flat)
    286 		{
    287 			interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
    288 			               y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
    289 
    290 			if(perspective)
    291 			{
    292 				interpolant *= rhw;
    293 			}
    294 		}
    295 
    296 		return interpolant;
    297 	}
    298 
    299 	void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
    300 	{
    301 		if(!state.stencilActive)
    302 		{
    303 			return;
    304 		}
    305 
    306 		// (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
    307 
    308 		Pointer<Byte> buffer = sBuffer + 2 * x;
    309 
    310 		if(q > 0)
    311 		{
    312 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
    313 		}
    314 
    315 		Byte8 value = *Pointer<Byte8>(buffer);
    316 		Byte8 valueCCW = value;
    317 
    318 		if(!state.noStencilMask)
    319 		{
    320 			value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
    321 		}
    322 
    323 		stencilTest(value, state.stencilCompareMode, false);
    324 
    325 		if(state.twoSidedStencil)
    326 		{
    327 			if(!state.noStencilMaskCCW)
    328 			{
    329 				valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
    330 			}
    331 
    332 			stencilTest(valueCCW, state.stencilCompareModeCCW, true);
    333 
    334 			value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
    335 			valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
    336 			value |= valueCCW;
    337 		}
    338 
    339 		sMask = SignMask(value) & cMask;
    340 	}
    341 
    342 	void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
    343 	{
    344 		Byte8 equal;
    345 
    346 		switch(stencilCompareMode)
    347 		{
    348 		case STENCIL_ALWAYS:
    349 			value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    350 			break;
    351 		case STENCIL_NEVER:
    352 			value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
    353 			break;
    354 		case STENCIL_LESS:			// a < b ~ b > a
    355 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
    356 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
    357 			break;
    358 		case STENCIL_EQUAL:
    359 			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
    360 			break;
    361 		case STENCIL_NOTEQUAL:		// a != b ~ !(a == b)
    362 			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
    363 			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    364 			break;
    365 		case STENCIL_LESSEQUAL:	// a <= b ~ (b > a) || (a == b)
    366 			equal = value;
    367 			equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
    368 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
    369 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
    370 			value |= equal;
    371 			break;
    372 		case STENCIL_GREATER:		// a > b
    373 			equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
    374 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
    375 			equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
    376 			value = equal;
    377 			break;
    378 		case STENCIL_GREATEREQUAL:	// a >= b ~ !(a < b) ~ !(b > a)
    379 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
    380 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
    381 			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    382 			break;
    383 		default:
    384 			ASSERT(false);
    385 		}
    386 	}
    387 
    388 	Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
    389 	{
    390 		if(!state.depthTestActive)
    391 		{
    392 			return true;
    393 		}
    394 
    395 		Float4 Z = z;
    396 
    397 		if(shader && shader->depthOverride())
    398 		{
    399 			if(complementaryDepthBuffer)
    400 			{
    401 				Z = Float4(1.0f) - oDepth;
    402 			}
    403 			else
    404 			{
    405 				Z = oDepth;
    406 			}
    407 		}
    408 
    409 		Pointer<Byte> buffer;
    410 		Int pitch;
    411 
    412 		if(!state.quadLayoutDepthBuffer)
    413 		{
    414 			buffer = zBuffer + 4 * x;
    415 			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
    416 		}
    417 		else
    418 		{
    419 			buffer = zBuffer + 8 * x;
    420 		}
    421 
    422 		if(q > 0)
    423 		{
    424 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
    425 		}
    426 
    427 		Float4 zValue;
    428 
    429 		if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
    430 		{
    431 			if(!state.quadLayoutDepthBuffer)
    432 			{
    433 				// FIXME: Properly optimizes?
    434 				zValue.xy = *Pointer<Float4>(buffer);
    435 				zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
    436 			}
    437 			else
    438 			{
    439 				zValue = *Pointer<Float4>(buffer, 16);
    440 			}
    441 		}
    442 
    443 		Int4 zTest;
    444 
    445 		switch(state.depthCompareMode)
    446 		{
    447 		case DEPTH_ALWAYS:
    448 			// Optimized
    449 			break;
    450 		case DEPTH_NEVER:
    451 			// Optimized
    452 			break;
    453 		case DEPTH_EQUAL:
    454 			zTest = CmpEQ(zValue, Z);
    455 			break;
    456 		case DEPTH_NOTEQUAL:
    457 			zTest = CmpNEQ(zValue, Z);
    458 			break;
    459 		case DEPTH_LESS:
    460 			if(complementaryDepthBuffer)
    461 			{
    462 				zTest = CmpLT(zValue, Z);
    463 			}
    464 			else
    465 			{
    466 				zTest = CmpNLE(zValue, Z);
    467 			}
    468 			break;
    469 		case DEPTH_GREATEREQUAL:
    470 			if(complementaryDepthBuffer)
    471 			{
    472 				zTest = CmpNLT(zValue, Z);
    473 			}
    474 			else
    475 			{
    476 				zTest = CmpLE(zValue, Z);
    477 			}
    478 			break;
    479 		case DEPTH_LESSEQUAL:
    480 			if(complementaryDepthBuffer)
    481 			{
    482 				zTest = CmpLE(zValue, Z);
    483 			}
    484 			else
    485 			{
    486 				zTest = CmpNLT(zValue, Z);
    487 			}
    488 			break;
    489 		case DEPTH_GREATER:
    490 			if(complementaryDepthBuffer)
    491 			{
    492 				zTest = CmpNLE(zValue, Z);
    493 			}
    494 			else
    495 			{
    496 				zTest = CmpLT(zValue, Z);
    497 			}
    498 			break;
    499 		default:
    500 			ASSERT(false);
    501 		}
    502 
    503 		switch(state.depthCompareMode)
    504 		{
    505 		case DEPTH_ALWAYS:
    506 			zMask = cMask;
    507 			break;
    508 		case DEPTH_NEVER:
    509 			zMask = 0x0;
    510 			break;
    511 		default:
    512 			zMask = SignMask(zTest) & cMask;
    513 			break;
    514 		}
    515 
    516 		if(state.stencilActive)
    517 		{
    518 			zMask &= sMask;
    519 		}
    520 
    521 		return zMask != 0;
    522 	}
    523 
    524 	void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
    525 	{
    526 		Short4 cmp;
    527 		Short4 equal;
    528 
    529 		switch(state.alphaCompareMode)
    530 		{
    531 		case ALPHA_ALWAYS:
    532 			aMask = 0xF;
    533 			break;
    534 		case ALPHA_NEVER:
    535 			aMask = 0x0;
    536 			break;
    537 		case ALPHA_EQUAL:
    538 			cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
    539 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
    540 			break;
    541 		case ALPHA_NOTEQUAL:       // a != b ~ !(a == b)
    542 			cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
    543 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
    544 			break;
    545 		case ALPHA_LESS:           // a < b ~ b > a
    546 			cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
    547 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
    548 			break;
    549 		case ALPHA_GREATEREQUAL:   // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
    550 			equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
    551 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
    552 			cmp |= equal;
    553 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
    554 			break;
    555 		case ALPHA_LESSEQUAL:      // a <= b ~ !(a > b)
    556 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
    557 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
    558 			break;
    559 		case ALPHA_GREATER:        // a > b
    560 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
    561 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
    562 			break;
    563 		default:
    564 			ASSERT(false);
    565 		}
    566 	}
    567 
    568 	void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
    569 	{
    570 		Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
    571 		Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
    572 		Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
    573 		Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
    574 
    575 		Int aMask0 = SignMask(coverage0);
    576 		Int aMask1 = SignMask(coverage1);
    577 		Int aMask2 = SignMask(coverage2);
    578 		Int aMask3 = SignMask(coverage3);
    579 
    580 		cMask[0] &= aMask0;
    581 		cMask[1] &= aMask1;
    582 		cMask[2] &= aMask2;
    583 		cMask[3] &= aMask3;
    584 	}
    585 
    586 	void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog)
    587 	{
    588 		if(!state.fogActive)
    589 		{
    590 			return;
    591 		}
    592 
    593 		if(state.pixelFogMode != FOG_NONE)
    594 		{
    595 			pixelFog(fog);
    596 
    597 			fog = Min(fog, Float4(1.0f));
    598 			fog = Max(fog, Float4(0.0f));
    599 		}
    600 
    601 		c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
    602 		c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
    603 		c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
    604 
    605 		c0.x *= fog;
    606 		c0.y *= fog;
    607 		c0.z *= fog;
    608 
    609 		c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
    610 		c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
    611 		c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
    612 	}
    613 
    614 	void PixelRoutine::pixelFog(Float4 &visibility)
    615 	{
    616 		Float4 &zw = visibility;
    617 
    618 		if(state.pixelFogMode != FOG_NONE)
    619 		{
    620 			if(state.wBasedFog)
    621 			{
    622 				zw = rhw;
    623 			}
    624 			else
    625 			{
    626 				if(complementaryDepthBuffer)
    627 				{
    628 					zw = Float4(1.0f) - z[0];
    629 				}
    630 				else
    631 				{
    632 					zw = z[0];
    633 				}
    634 			}
    635 		}
    636 
    637 		switch(state.pixelFogMode)
    638 		{
    639 		case FOG_NONE:
    640 			break;
    641 		case FOG_LINEAR:
    642 			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale));
    643 			zw += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
    644 			break;
    645 		case FOG_EXP:
    646 			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE));
    647 			zw = exponential2(zw, true);
    648 			break;
    649 		case FOG_EXP2:
    650 			zw *= zw;
    651 			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E));
    652 			zw = exponential2(zw, true);
    653 			break;
    654 		default:
    655 			ASSERT(false);
    656 		}
    657 	}
    658 
    659 	void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
    660 	{
    661 		if(!state.depthWriteEnable)
    662 		{
    663 			return;
    664 		}
    665 
    666 		Float4 Z = z;
    667 
    668 		if(shader && shader->depthOverride())
    669 		{
    670 			if(complementaryDepthBuffer)
    671 			{
    672 				Z = Float4(1.0f) - oDepth;
    673 			}
    674 			else
    675 			{
    676 				Z = oDepth;
    677 			}
    678 		}
    679 
    680 		Pointer<Byte> buffer;
    681 		Int pitch;
    682 
    683 		if(!state.quadLayoutDepthBuffer)
    684 		{
    685 			buffer = zBuffer + 4 * x;
    686 			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
    687 		}
    688 		else
    689 		{
    690 			buffer = zBuffer + 8 * x;
    691 		}
    692 
    693 		if(q > 0)
    694 		{
    695 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
    696 		}
    697 
    698 		Float4 zValue;
    699 
    700 		if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
    701 		{
    702 			if(!state.quadLayoutDepthBuffer)
    703 			{
    704 				// FIXME: Properly optimizes?
    705 				zValue.xy = *Pointer<Float4>(buffer);
    706 				zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
    707 			}
    708 			else
    709 			{
    710 				zValue = *Pointer<Float4>(buffer, 16);
    711 			}
    712 		}
    713 
    714 		Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
    715 		zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
    716 		Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
    717 
    718 		if(!state.quadLayoutDepthBuffer)
    719 		{
    720 			// FIXME: Properly optimizes?
    721 			*Pointer<Float2>(buffer) = Float2(Z.xy);
    722 			*Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
    723 		}
    724 		else
    725 		{
    726 			*Pointer<Float4>(buffer, 16) = Z;
    727 		}
    728 	}
    729 
    730 	void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
    731 	{
    732 		if(!state.stencilActive)
    733 		{
    734 			return;
    735 		}
    736 
    737 		if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
    738 		{
    739 			if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
    740 			{
    741 				return;
    742 			}
    743 		}
    744 
    745 		if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
    746 		{
    747 			return;
    748 		}
    749 
    750 		Pointer<Byte> buffer = sBuffer + 2 * x;
    751 
    752 		if(q > 0)
    753 		{
    754 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
    755 		}
    756 
    757 		Byte8 bufferValue = *Pointer<Byte8>(buffer);
    758 
    759 		Byte8 newValue;
    760 		stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
    761 
    762 		if(!state.noStencilWriteMask)
    763 		{
    764 			Byte8 maskedValue = bufferValue;
    765 			newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
    766 			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
    767 			newValue |= maskedValue;
    768 		}
    769 
    770 		if(state.twoSidedStencil)
    771 		{
    772 			Byte8 newValueCCW;
    773 
    774 			stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
    775 
    776 			if(!state.noStencilWriteMaskCCW)
    777 			{
    778 				Byte8 maskedValue = bufferValue;
    779 				newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
    780 				maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
    781 				newValueCCW |= maskedValue;
    782 			}
    783 
    784 			newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
    785 			newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
    786 			newValue |= newValueCCW;
    787 		}
    788 
    789 		newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
    790 		bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
    791 		newValue |= bufferValue;
    792 
    793 		*Pointer<Byte4>(buffer) = Byte4(newValue);
    794 	}
    795 
    796 	void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
    797 	{
    798 		Byte8 &pass = newValue;
    799 		Byte8 fail;
    800 		Byte8 zFail;
    801 
    802 		stencilOperation(pass, bufferValue, stencilPassOperation, CCW);
    803 
    804 		if(stencilZFailOperation != stencilPassOperation)
    805 		{
    806 			stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);
    807 		}
    808 
    809 		if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
    810 		{
    811 			stencilOperation(fail, bufferValue, stencilFailOperation, CCW);
    812 		}
    813 
    814 		if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
    815 		{
    816 			if(state.depthTestActive && stencilZFailOperation != stencilPassOperation)   // zMask valid and values not the same
    817 			{
    818 				pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
    819 				zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
    820 				pass |= zFail;
    821 			}
    822 
    823 			pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
    824 			fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
    825 			pass |= fail;
    826 		}
    827 	}
    828 
    829 	void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
    830 	{
    831 		switch(operation)
    832 		{
    833 		case OPERATION_KEEP:
    834 			output = bufferValue;
    835 			break;
    836 		case OPERATION_ZERO:
    837 			output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
    838 			break;
    839 		case OPERATION_REPLACE:
    840 			output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));
    841 			break;
    842 		case OPERATION_INCRSAT:
    843 			output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
    844 			break;
    845 		case OPERATION_DECRSAT:
    846 			output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
    847 			break;
    848 		case OPERATION_INVERT:
    849 			output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    850 			break;
    851 		case OPERATION_INCR:
    852 			output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
    853 			break;
    854 		case OPERATION_DECR:
    855 			output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
    856 			break;
    857 		default:
    858 			ASSERT(false);
    859 		}
    860 	}
    861 
    862 	void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
    863 	{
    864 		switch(blendFactorActive)
    865 		{
    866 		case BLEND_ZERO:
    867 			// Optimized
    868 			break;
    869 		case BLEND_ONE:
    870 			// Optimized
    871 			break;
    872 		case BLEND_SOURCE:
    873 			blendFactor.x = current.x;
    874 			blendFactor.y = current.y;
    875 			blendFactor.z = current.z;
    876 			break;
    877 		case BLEND_INVSOURCE:
    878 			blendFactor.x = Short4(0xFFFFu) - current.x;
    879 			blendFactor.y = Short4(0xFFFFu) - current.y;
    880 			blendFactor.z = Short4(0xFFFFu) - current.z;
    881 			break;
    882 		case BLEND_DEST:
    883 			blendFactor.x = pixel.x;
    884 			blendFactor.y = pixel.y;
    885 			blendFactor.z = pixel.z;
    886 			break;
    887 		case BLEND_INVDEST:
    888 			blendFactor.x = Short4(0xFFFFu) - pixel.x;
    889 			blendFactor.y = Short4(0xFFFFu) - pixel.y;
    890 			blendFactor.z = Short4(0xFFFFu) - pixel.z;
    891 			break;
    892 		case BLEND_SOURCEALPHA:
    893 			blendFactor.x = current.w;
    894 			blendFactor.y = current.w;
    895 			blendFactor.z = current.w;
    896 			break;
    897 		case BLEND_INVSOURCEALPHA:
    898 			blendFactor.x = Short4(0xFFFFu) - current.w;
    899 			blendFactor.y = Short4(0xFFFFu) - current.w;
    900 			blendFactor.z = Short4(0xFFFFu) - current.w;
    901 			break;
    902 		case BLEND_DESTALPHA:
    903 			blendFactor.x = pixel.w;
    904 			blendFactor.y = pixel.w;
    905 			blendFactor.z = pixel.w;
    906 			break;
    907 		case BLEND_INVDESTALPHA:
    908 			blendFactor.x = Short4(0xFFFFu) - pixel.w;
    909 			blendFactor.y = Short4(0xFFFFu) - pixel.w;
    910 			blendFactor.z = Short4(0xFFFFu) - pixel.w;
    911 			break;
    912 		case BLEND_SRCALPHASAT:
    913 			blendFactor.x = Short4(0xFFFFu) - pixel.w;
    914 			blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
    915 			blendFactor.y = blendFactor.x;
    916 			blendFactor.z = blendFactor.x;
    917 			break;
    918 		case BLEND_CONSTANT:
    919 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
    920 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
    921 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
    922 			break;
    923 		case BLEND_INVCONSTANT:
    924 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
    925 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
    926 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
    927 			break;
    928 		case BLEND_CONSTANTALPHA:
    929 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
    930 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
    931 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
    932 			break;
    933 		case BLEND_INVCONSTANTALPHA:
    934 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
    935 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
    936 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
    937 			break;
    938 		default:
    939 			ASSERT(false);
    940 		}
    941 	}
    942 
    943 	void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
    944 	{
    945 		switch(blendFactorAlphaActive)
    946 		{
    947 		case BLEND_ZERO:
    948 			// Optimized
    949 			break;
    950 		case BLEND_ONE:
    951 			// Optimized
    952 			break;
    953 		case BLEND_SOURCE:
    954 			blendFactor.w = current.w;
    955 			break;
    956 		case BLEND_INVSOURCE:
    957 			blendFactor.w = Short4(0xFFFFu) - current.w;
    958 			break;
    959 		case BLEND_DEST:
    960 			blendFactor.w = pixel.w;
    961 			break;
    962 		case BLEND_INVDEST:
    963 			blendFactor.w = Short4(0xFFFFu) - pixel.w;
    964 			break;
    965 		case BLEND_SOURCEALPHA:
    966 			blendFactor.w = current.w;
    967 			break;
    968 		case BLEND_INVSOURCEALPHA:
    969 			blendFactor.w = Short4(0xFFFFu) - current.w;
    970 			break;
    971 		case BLEND_DESTALPHA:
    972 			blendFactor.w = pixel.w;
    973 			break;
    974 		case BLEND_INVDESTALPHA:
    975 			blendFactor.w = Short4(0xFFFFu) - pixel.w;
    976 			break;
    977 		case BLEND_SRCALPHASAT:
    978 			blendFactor.w = Short4(0xFFFFu);
    979 			break;
    980 		case BLEND_CONSTANT:
    981 		case BLEND_CONSTANTALPHA:
    982 			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
    983 			break;
    984 		case BLEND_INVCONSTANT:
    985 		case BLEND_INVCONSTANTALPHA:
    986 			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
    987 			break;
    988 		default:
    989 			ASSERT(false);
    990 		}
    991 	}
    992 
    993 	bool PixelRoutine::isSRGB(int index) const
    994 	{
    995 		return Surface::isSRGBformat(state.targetFormat[index]);
    996 	}
    997 
    998 	void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
    999 	{
   1000 		Short4 c01;
   1001 		Short4 c23;
   1002 		Pointer<Byte> buffer;
   1003 		Pointer<Byte> buffer2;
   1004 
   1005 		switch(state.targetFormat[index])
   1006 		{
   1007 		case FORMAT_R5G6B5:
   1008 			buffer = cBuffer + 2 * x;
   1009 			buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   1010 			c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
   1011 
   1012 			pixel.x = c01 & Short4(0xF800u);
   1013 			pixel.y = (c01 & Short4(0x07E0u)) << 5;
   1014 			pixel.z = (c01 & Short4(0x001Fu)) << 11;
   1015 			pixel.w = Short4(0xFFFFu);
   1016 			break;
   1017 		case FORMAT_A8R8G8B8:
   1018 			buffer = cBuffer + 4 * x;
   1019 			c01 = *Pointer<Short4>(buffer);
   1020 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   1021 			c23 = *Pointer<Short4>(buffer);
   1022 			pixel.z = c01;
   1023 			pixel.y = c01;
   1024 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
   1025 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
   1026 			pixel.x = pixel.z;
   1027 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
   1028 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
   1029 			pixel.y = pixel.z;
   1030 			pixel.w = pixel.x;
   1031 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
   1032 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
   1033 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
   1034 			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
   1035 			break;
   1036 		case FORMAT_A8B8G8R8:
   1037 		case FORMAT_SRGB8_A8:
   1038 			buffer = cBuffer + 4 * x;
   1039 			c01 = *Pointer<Short4>(buffer);
   1040 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   1041 			c23 = *Pointer<Short4>(buffer);
   1042 			pixel.z = c01;
   1043 			pixel.y = c01;
   1044 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
   1045 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
   1046 			pixel.x = pixel.z;
   1047 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
   1048 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
   1049 			pixel.y = pixel.z;
   1050 			pixel.w = pixel.x;
   1051 			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
   1052 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
   1053 			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
   1054 			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
   1055 			break;
   1056 		case FORMAT_A8:
   1057 			buffer = cBuffer + 1 * x;
   1058 			pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
   1059 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   1060 			pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
   1061 			pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
   1062 			pixel.x = Short4(0x0000);
   1063 			pixel.y = Short4(0x0000);
   1064 			pixel.z = Short4(0x0000);
   1065 			break;
   1066 		case FORMAT_R8:
   1067 			buffer = cBuffer + 1 * x;
   1068 			pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
   1069 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   1070 			pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
   1071 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
   1072 			pixel.y = Short4(0x0000);
   1073 			pixel.z = Short4(0x0000);
   1074 			pixel.w = Short4(0xFFFFu);
   1075 			break;
   1076 		case FORMAT_X8R8G8B8:
   1077 			buffer = cBuffer + 4 * x;
   1078 			c01 = *Pointer<Short4>(buffer);
   1079 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   1080 			c23 = *Pointer<Short4>(buffer);
   1081 			pixel.z = c01;
   1082 			pixel.y = c01;
   1083 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
   1084 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
   1085 			pixel.x = pixel.z;
   1086 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
   1087 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
   1088 			pixel.y = pixel.z;
   1089 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
   1090 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
   1091 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
   1092 			pixel.w = Short4(0xFFFFu);
   1093 			break;
   1094 		case FORMAT_G8R8:
   1095 			buffer = cBuffer + 2 * x;
   1096 			c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
   1097 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   1098 			c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
   1099 			pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
   1100 			pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
   1101 			pixel.z = Short4(0x0000u);
   1102 			pixel.w = Short4(0xFFFFu);
   1103 			break;
   1104 		case FORMAT_X8B8G8R8:
   1105 		case FORMAT_SRGB8_X8:
   1106 			buffer = cBuffer + 4 * x;
   1107 			c01 = *Pointer<Short4>(buffer);
   1108 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   1109 			c23 = *Pointer<Short4>(buffer);
   1110 			pixel.z = c01;
   1111 			pixel.y = c01;
   1112 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
   1113 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
   1114 			pixel.x = pixel.z;
   1115 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
   1116 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
   1117 			pixel.y = pixel.z;
   1118 			pixel.w = pixel.x;
   1119 			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
   1120 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
   1121 			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
   1122 			pixel.w = Short4(0xFFFFu);
   1123 			break;
   1124 		case FORMAT_A8G8R8B8Q:
   1125 			UNIMPLEMENTED();
   1126 		//	pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
   1127 		//	pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
   1128 		//	pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
   1129 		//	pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
   1130 			break;
   1131 		case FORMAT_X8G8R8B8Q:
   1132 			UNIMPLEMENTED();
   1133 		//	pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
   1134 		//	pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
   1135 		//	pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
   1136 		//	pixel.w = Short4(0xFFFFu);
   1137 			break;
   1138 		case FORMAT_A16B16G16R16:
   1139 			buffer = cBuffer;
   1140 			pixel.x = *Pointer<Short4>(buffer + 8 * x);
   1141 			pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
   1142 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   1143 			pixel.z = *Pointer<Short4>(buffer + 8 * x);
   1144 			pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
   1145 			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
   1146 			break;
   1147 		case FORMAT_G16R16:
   1148 			buffer = cBuffer;
   1149 			pixel.x = *Pointer<Short4>(buffer + 4 * x);
   1150 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   1151 			pixel.y = *Pointer<Short4>(buffer + 4 * x);
   1152 			pixel.z = pixel.x;
   1153 			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
   1154 			pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
   1155 			pixel.y = pixel.z;
   1156 			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
   1157 			pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
   1158 			pixel.z = Short4(0xFFFFu);
   1159 			pixel.w = Short4(0xFFFFu);
   1160 			break;
   1161 		default:
   1162 			ASSERT(false);
   1163 		}
   1164 
   1165 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
   1166 		{
   1167 			sRGBtoLinear16_12_16(pixel);
   1168 		}
   1169 	}
   1170 
   1171 	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
   1172 	{
   1173 		if(!state.alphaBlendActive)
   1174 		{
   1175 			return;
   1176 		}
   1177 
   1178 		Vector4s pixel;
   1179 		readPixel(index, cBuffer, x, pixel);
   1180 
   1181 		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
   1182 		Vector4s sourceFactor;
   1183 		Vector4s destFactor;
   1184 
   1185 		blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
   1186 		blendFactor(destFactor, current, pixel, state.destBlendFactor);
   1187 
   1188 		if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
   1189 		{
   1190 			current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
   1191 			current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
   1192 			current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
   1193 		}
   1194 
   1195 		if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
   1196 		{
   1197 			pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
   1198 			pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
   1199 			pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
   1200 		}
   1201 
   1202 		switch(state.blendOperation)
   1203 		{
   1204 		case BLENDOP_ADD:
   1205 			current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
   1206 			current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
   1207 			current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
   1208 			break;
   1209 		case BLENDOP_SUB:
   1210 			current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
   1211 			current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
   1212 			current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
   1213 			break;
   1214 		case BLENDOP_INVSUB:
   1215 			current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
   1216 			current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
   1217 			current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
   1218 			break;
   1219 		case BLENDOP_MIN:
   1220 			current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
   1221 			current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
   1222 			current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
   1223 			break;
   1224 		case BLENDOP_MAX:
   1225 			current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
   1226 			current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
   1227 			current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
   1228 			break;
   1229 		case BLENDOP_SOURCE:
   1230 			// No operation
   1231 			break;
   1232 		case BLENDOP_DEST:
   1233 			current.x = pixel.x;
   1234 			current.y = pixel.y;
   1235 			current.z = pixel.z;
   1236 			break;
   1237 		case BLENDOP_NULL:
   1238 			current.x = Short4(0x0000);
   1239 			current.y = Short4(0x0000);
   1240 			current.z = Short4(0x0000);
   1241 			break;
   1242 		default:
   1243 			ASSERT(false);
   1244 		}
   1245 
   1246 		blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
   1247 		blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
   1248 
   1249 		if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
   1250 		{
   1251 			current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
   1252 		}
   1253 
   1254 		if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
   1255 		{
   1256 			pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
   1257 		}
   1258 
   1259 		switch(state.blendOperationAlpha)
   1260 		{
   1261 		case BLENDOP_ADD:
   1262 			current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
   1263 			break;
   1264 		case BLENDOP_SUB:
   1265 			current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
   1266 			break;
   1267 		case BLENDOP_INVSUB:
   1268 			current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
   1269 			break;
   1270 		case BLENDOP_MIN:
   1271 			current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
   1272 			break;
   1273 		case BLENDOP_MAX:
   1274 			current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
   1275 			break;
   1276 		case BLENDOP_SOURCE:
   1277 			// No operation
   1278 			break;
   1279 		case BLENDOP_DEST:
   1280 			current.w = pixel.w;
   1281 			break;
   1282 		case BLENDOP_NULL:
   1283 			current.w = Short4(0x0000);
   1284 			break;
   1285 		default:
   1286 			ASSERT(false);
   1287 		}
   1288 	}
   1289 
   1290 	void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
   1291 	{
   1292 		if(state.logicalOperation == LOGICALOP_COPY)
   1293 		{
   1294 			return;
   1295 		}
   1296 
   1297 		Vector4s pixel;
   1298 		readPixel(index, cBuffer, x, pixel);
   1299 
   1300 		switch(state.logicalOperation)
   1301 		{
   1302 		case LOGICALOP_CLEAR:
   1303 			current.x = UShort4(0);
   1304 			current.y = UShort4(0);
   1305 			current.z = UShort4(0);
   1306 			break;
   1307 		case LOGICALOP_SET:
   1308 			current.x = UShort4(0xFFFFu);
   1309 			current.y = UShort4(0xFFFFu);
   1310 			current.z = UShort4(0xFFFFu);
   1311 			break;
   1312 		case LOGICALOP_COPY:
   1313 			ASSERT(false);   // Optimized out
   1314 			break;
   1315 		case LOGICALOP_COPY_INVERTED:
   1316 			current.x = ~current.x;
   1317 			current.y = ~current.y;
   1318 			current.z = ~current.z;
   1319 			break;
   1320 		case LOGICALOP_NOOP:
   1321 			current.x = pixel.x;
   1322 			current.y = pixel.y;
   1323 			current.z = pixel.z;
   1324 			break;
   1325 		case LOGICALOP_INVERT:
   1326 			current.x = ~pixel.x;
   1327 			current.y = ~pixel.y;
   1328 			current.z = ~pixel.z;
   1329 			break;
   1330 		case LOGICALOP_AND:
   1331 			current.x = pixel.x & current.x;
   1332 			current.y = pixel.y & current.y;
   1333 			current.z = pixel.z & current.z;
   1334 			break;
   1335 		case LOGICALOP_NAND:
   1336 			current.x = ~(pixel.x & current.x);
   1337 			current.y = ~(pixel.y & current.y);
   1338 			current.z = ~(pixel.z & current.z);
   1339 			break;
   1340 		case LOGICALOP_OR:
   1341 			current.x = pixel.x | current.x;
   1342 			current.y = pixel.y | current.y;
   1343 			current.z = pixel.z | current.z;
   1344 			break;
   1345 		case LOGICALOP_NOR:
   1346 			current.x = ~(pixel.x | current.x);
   1347 			current.y = ~(pixel.y | current.y);
   1348 			current.z = ~(pixel.z | current.z);
   1349 			break;
   1350 		case LOGICALOP_XOR:
   1351 			current.x = pixel.x ^ current.x;
   1352 			current.y = pixel.y ^ current.y;
   1353 			current.z = pixel.z ^ current.z;
   1354 			break;
   1355 		case LOGICALOP_EQUIV:
   1356 			current.x = ~(pixel.x ^ current.x);
   1357 			current.y = ~(pixel.y ^ current.y);
   1358 			current.z = ~(pixel.z ^ current.z);
   1359 			break;
   1360 		case LOGICALOP_AND_REVERSE:
   1361 			current.x = ~pixel.x & current.x;
   1362 			current.y = ~pixel.y & current.y;
   1363 			current.z = ~pixel.z & current.z;
   1364 			break;
   1365 		case LOGICALOP_AND_INVERTED:
   1366 			current.x = pixel.x & ~current.x;
   1367 			current.y = pixel.y & ~current.y;
   1368 			current.z = pixel.z & ~current.z;
   1369 			break;
   1370 		case LOGICALOP_OR_REVERSE:
   1371 			current.x = ~pixel.x | current.x;
   1372 			current.y = ~pixel.y | current.y;
   1373 			current.z = ~pixel.z | current.z;
   1374 			break;
   1375 		case LOGICALOP_OR_INVERTED:
   1376 			current.x = pixel.x | ~current.x;
   1377 			current.y = pixel.y | ~current.y;
   1378 			current.z = pixel.z | ~current.z;
   1379 			break;
   1380 		default:
   1381 			ASSERT(false);
   1382 		}
   1383 	}
   1384 
   1385 	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
   1386 	{
   1387 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
   1388 		{
   1389 			linearToSRGB16_12_16(current);
   1390 		}
   1391 
   1392 		if(exactColorRounding)
   1393 		{
   1394 			switch(state.targetFormat[index])
   1395 			{
   1396 			case FORMAT_R5G6B5:
   1397 				current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
   1398 				current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
   1399 				current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
   1400 				break;
   1401 			case FORMAT_X8G8R8B8Q:
   1402 			case FORMAT_A8G8R8B8Q:
   1403 			case FORMAT_X8R8G8B8:
   1404 			case FORMAT_X8B8G8R8:
   1405 			case FORMAT_A8R8G8B8:
   1406 			case FORMAT_A8B8G8R8:
   1407 			case FORMAT_SRGB8_X8:
   1408 			case FORMAT_SRGB8_A8:
   1409 			case FORMAT_G8R8:
   1410 			case FORMAT_R8:
   1411 				current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
   1412 				current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
   1413 				current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
   1414 				current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
   1415 				break;
   1416 			default:
   1417 				break;
   1418 			}
   1419 		}
   1420 
   1421 		int rgbaWriteMask = state.colorWriteActive(index);
   1422 		int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
   1423 
   1424 		switch(state.targetFormat[index])
   1425 		{
   1426 		case FORMAT_R5G6B5:
   1427 			{
   1428 				current.x = current.x & Short4(0xF800u);
   1429 				current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
   1430 				current.z = As<UShort4>(current.z) >> 11;
   1431 
   1432 				current.x = current.x | current.y | current.z;
   1433 			}
   1434 			break;
   1435 		case FORMAT_X8G8R8B8Q:
   1436 			UNIMPLEMENTED();
   1437 		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1438 		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1439 		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
   1440 
   1441 		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
   1442 		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
   1443 			break;
   1444 		case FORMAT_A8G8R8B8Q:
   1445 			UNIMPLEMENTED();
   1446 		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1447 		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1448 		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
   1449 		//	current.w = As<Short4>(As<UShort4>(current.w) >> 8);
   1450 
   1451 		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
   1452 		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
   1453 			break;
   1454 		case FORMAT_X8R8G8B8:
   1455 		case FORMAT_A8R8G8B8:
   1456 			if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
   1457 			{
   1458 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1459 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1460 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
   1461 
   1462 				current.z = As<Short4>(PackUnsigned(current.z, current.x));
   1463 				current.y = As<Short4>(PackUnsigned(current.y, current.y));
   1464 
   1465 				current.x = current.z;
   1466 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
   1467 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
   1468 				current.y = current.z;
   1469 				current.z = As<Short4>(UnpackLow(current.z, current.x));
   1470 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
   1471 			}
   1472 			else
   1473 			{
   1474 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1475 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1476 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
   1477 				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
   1478 
   1479 				current.z = As<Short4>(PackUnsigned(current.z, current.x));
   1480 				current.y = As<Short4>(PackUnsigned(current.y, current.w));
   1481 
   1482 				current.x = current.z;
   1483 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
   1484 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
   1485 				current.y = current.z;
   1486 				current.z = As<Short4>(UnpackLow(current.z, current.x));
   1487 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
   1488 			}
   1489 			break;
   1490 		case FORMAT_X8B8G8R8:
   1491 		case FORMAT_A8B8G8R8:
   1492 		case FORMAT_SRGB8_X8:
   1493 		case FORMAT_SRGB8_A8:
   1494 			if(state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8 || rgbaWriteMask == 0x7)
   1495 			{
   1496 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1497 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1498 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
   1499 
   1500 				current.z = As<Short4>(PackUnsigned(current.x, current.z));
   1501 				current.y = As<Short4>(PackUnsigned(current.y, current.y));
   1502 
   1503 				current.x = current.z;
   1504 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
   1505 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
   1506 				current.y = current.z;
   1507 				current.z = As<Short4>(UnpackLow(current.z, current.x));
   1508 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
   1509 			}
   1510 			else
   1511 			{
   1512 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1513 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1514 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
   1515 				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
   1516 
   1517 				current.z = As<Short4>(PackUnsigned(current.x, current.z));
   1518 				current.y = As<Short4>(PackUnsigned(current.y, current.w));
   1519 
   1520 				current.x = current.z;
   1521 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
   1522 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
   1523 				current.y = current.z;
   1524 				current.z = As<Short4>(UnpackLow(current.z, current.x));
   1525 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
   1526 			}
   1527 			break;
   1528 		case FORMAT_G8R8:
   1529 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1530 			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1531 			current.x = As<Short4>(PackUnsigned(current.x, current.x));
   1532 			current.y = As<Short4>(PackUnsigned(current.y, current.y));
   1533 			current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
   1534 			break;
   1535 		case FORMAT_R8:
   1536 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1537 			current.x = As<Short4>(PackUnsigned(current.x, current.x));
   1538 			break;
   1539 		case FORMAT_A8:
   1540 			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
   1541 			current.w = As<Short4>(PackUnsigned(current.w, current.w));
   1542 			break;
   1543 		case FORMAT_G16R16:
   1544 			current.z = current.x;
   1545 			current.x = As<Short4>(UnpackLow(current.x, current.y));
   1546 			current.z = As<Short4>(UnpackHigh(current.z, current.y));
   1547 			current.y = current.z;
   1548 			break;
   1549 		case FORMAT_A16B16G16R16:
   1550 			transpose4x4(current.x, current.y, current.z, current.w);
   1551 			break;
   1552 		default:
   1553 			ASSERT(false);
   1554 		}
   1555 
   1556 		Short4 c01 = current.z;
   1557 		Short4 c23 = current.y;
   1558 
   1559 		Int xMask;   // Combination of all masks
   1560 
   1561 		if(state.depthTestActive)
   1562 		{
   1563 			xMask = zMask;
   1564 		}
   1565 		else
   1566 		{
   1567 			xMask = cMask;
   1568 		}
   1569 
   1570 		if(state.stencilActive)
   1571 		{
   1572 			xMask &= sMask;
   1573 		}
   1574 
   1575 		switch(state.targetFormat[index])
   1576 		{
   1577 		case FORMAT_R5G6B5:
   1578 			{
   1579 				Pointer<Byte> buffer = cBuffer + 2 * x;
   1580 				Int value = *Pointer<Int>(buffer);
   1581 
   1582 				Int c01 = Extract(As<Int2>(current.x), 0);
   1583 
   1584 				if((bgraWriteMask & 0x00000007) != 0x00000007)
   1585 				{
   1586 					Int masked = value;
   1587 					c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
   1588 					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
   1589 					c01 |= masked;
   1590 				}
   1591 
   1592 				c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
   1593 				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
   1594 				c01 |= value;
   1595 				*Pointer<Int>(buffer) = c01;
   1596 
   1597 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1598 				value = *Pointer<Int>(buffer);
   1599 
   1600 				Int c23 = Extract(As<Int2>(current.x), 1);
   1601 
   1602 				if((bgraWriteMask & 0x00000007) != 0x00000007)
   1603 				{
   1604 					Int masked = value;
   1605 					c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
   1606 					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
   1607 					c23 |= masked;
   1608 				}
   1609 
   1610 				c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
   1611 				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
   1612 				c23 |= value;
   1613 				*Pointer<Int>(buffer) = c23;
   1614 			}
   1615 			break;
   1616 		case FORMAT_A8G8R8B8Q:
   1617 		case FORMAT_X8G8R8B8Q:   // FIXME: Don't touch alpha?
   1618 			UNIMPLEMENTED();
   1619 		//	value = *Pointer<Short4>(cBuffer + 8 * x + 0);
   1620 
   1621 		//	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
   1622 		//	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
   1623 		//	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
   1624 		//	{
   1625 		//		Short4 masked = value;
   1626 		//		c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
   1627 		//		masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
   1628 		//		c01 |= masked;
   1629 		//	}
   1630 
   1631 		//	c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
   1632 		//	value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
   1633 		//	c01 |= value;
   1634 		//	*Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
   1635 
   1636 		//	value = *Pointer<Short4>(cBuffer + 8 * x + 8);
   1637 
   1638 		//	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
   1639 		//	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
   1640 		//	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
   1641 		//	{
   1642 		//		Short4 masked = value;
   1643 		//		c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
   1644 		//		masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
   1645 		//		c23 |= masked;
   1646 		//	}
   1647 
   1648 		//	c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
   1649 		//	value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
   1650 		//	c23 |= value;
   1651 		//	*Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
   1652 			break;
   1653 		case FORMAT_A8R8G8B8:
   1654 		case FORMAT_X8R8G8B8:   // FIXME: Don't touch alpha?
   1655 			{
   1656 				Pointer<Byte> buffer = cBuffer + x * 4;
   1657 				Short4 value = *Pointer<Short4>(buffer);
   1658 
   1659 				if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
   1660 				   ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
   1661 					(state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
   1662 				{
   1663 					Short4 masked = value;
   1664 					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
   1665 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
   1666 					c01 |= masked;
   1667 				}
   1668 
   1669 				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
   1670 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
   1671 				c01 |= value;
   1672 				*Pointer<Short4>(buffer) = c01;
   1673 
   1674 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1675 				value = *Pointer<Short4>(buffer);
   1676 
   1677 				if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
   1678 				   ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
   1679 					(state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
   1680 				{
   1681 					Short4 masked = value;
   1682 					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
   1683 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
   1684 					c23 |= masked;
   1685 				}
   1686 
   1687 				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
   1688 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
   1689 				c23 |= value;
   1690 				*Pointer<Short4>(buffer) = c23;
   1691 			}
   1692 			break;
   1693 		case FORMAT_A8B8G8R8:
   1694 		case FORMAT_X8B8G8R8:   // FIXME: Don't touch alpha?
   1695 		case FORMAT_SRGB8_X8:
   1696 		case FORMAT_SRGB8_A8:
   1697 			{
   1698 				Pointer<Byte> buffer = cBuffer + x * 4;
   1699 				Short4 value = *Pointer<Short4>(buffer);
   1700 
   1701 				bool masked = (((state.targetFormat[index] == FORMAT_A8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_A8) && rgbaWriteMask != 0x0000000F) ||
   1702 				              (((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x00000007) &&
   1703 				               ((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x0000000F))); // FIXME: Need for masking when XBGR && Fh?
   1704 
   1705 				if(masked)
   1706 				{
   1707 					Short4 masked = value;
   1708 					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
   1709 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
   1710 					c01 |= masked;
   1711 				}
   1712 
   1713 				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
   1714 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
   1715 				c01 |= value;
   1716 				*Pointer<Short4>(buffer) = c01;
   1717 
   1718 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1719 				value = *Pointer<Short4>(buffer);
   1720 
   1721 				if(masked)
   1722 				{
   1723 					Short4 masked = value;
   1724 					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
   1725 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
   1726 					c23 |= masked;
   1727 				}
   1728 
   1729 				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
   1730 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
   1731 				c23 |= value;
   1732 				*Pointer<Short4>(buffer) = c23;
   1733 			}
   1734 			break;
   1735 		case FORMAT_G8R8:
   1736 			if((rgbaWriteMask & 0x00000003) != 0x0)
   1737 			{
   1738 				Pointer<Byte> buffer = cBuffer + 2 * x;
   1739 				Int2 value;
   1740 				value = Insert(value, *Pointer<Int>(buffer), 0);
   1741 				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   1742 				value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
   1743 
   1744 				Int2 packedCol = As<Int2>(current.x);
   1745 
   1746 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
   1747 				if((rgbaWriteMask & 0x3) != 0x3)
   1748 				{
   1749 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
   1750 					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
   1751 					mergedMask &= rgbaMask;
   1752 				}
   1753 
   1754 				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
   1755 
   1756 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
   1757 				*Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
   1758 			}
   1759 			break;
   1760 		case FORMAT_R8:
   1761 			if(rgbaWriteMask & 0x00000001)
   1762 			{
   1763 				Pointer<Byte> buffer = cBuffer + 1 * x;
   1764 				Short4 value;
   1765 				value = Insert(value, *Pointer<Short>(buffer), 0);
   1766 				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   1767 				value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
   1768 
   1769 				current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
   1770 				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
   1771 				current.x |= value;
   1772 
   1773 				*Pointer<Short>(buffer) = Extract(current.x, 0);
   1774 				*Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
   1775 			}
   1776 			break;
   1777 		case FORMAT_A8:
   1778 			if(rgbaWriteMask & 0x00000008)
   1779 			{
   1780 				Pointer<Byte> buffer = cBuffer + 1 * x;
   1781 				Short4 value;
   1782 				value = Insert(value, *Pointer<Short>(buffer), 0);
   1783 				Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1784 				value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
   1785 
   1786 				current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
   1787 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
   1788 				current.w |= value;
   1789 
   1790 				*Pointer<Short>(buffer) = Extract(current.w, 0);
   1791 				*Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
   1792 			}
   1793 			break;
   1794 		case FORMAT_G16R16:
   1795 			{
   1796 				Pointer<Byte> buffer = cBuffer + 4 * x;
   1797 
   1798 				Short4 value = *Pointer<Short4>(buffer);
   1799 
   1800 				if((rgbaWriteMask & 0x00000003) != 0x00000003)
   1801 				{
   1802 					Short4 masked = value;
   1803 					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
   1804 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
   1805 					current.x |= masked;
   1806 				}
   1807 
   1808 				current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
   1809 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
   1810 				current.x |= value;
   1811 				*Pointer<Short4>(buffer) = current.x;
   1812 
   1813 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1814 
   1815 				value = *Pointer<Short4>(buffer);
   1816 
   1817 				if((rgbaWriteMask & 0x00000003) != 0x00000003)
   1818 				{
   1819 					Short4 masked = value;
   1820 					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
   1821 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
   1822 					current.y |= masked;
   1823 				}
   1824 
   1825 				current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
   1826 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
   1827 				current.y |= value;
   1828 				*Pointer<Short4>(buffer) = current.y;
   1829 			}
   1830 			break;
   1831 		case FORMAT_A16B16G16R16:
   1832 			{
   1833 				Pointer<Byte> buffer = cBuffer + 8 * x;
   1834 
   1835 				{
   1836 					Short4 value = *Pointer<Short4>(buffer);
   1837 
   1838 					if(rgbaWriteMask != 0x0000000F)
   1839 					{
   1840 						Short4 masked = value;
   1841 						current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
   1842 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
   1843 						current.x |= masked;
   1844 					}
   1845 
   1846 					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
   1847 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
   1848 					current.x |= value;
   1849 					*Pointer<Short4>(buffer) = current.x;
   1850 				}
   1851 
   1852 				{
   1853 					Short4 value = *Pointer<Short4>(buffer + 8);
   1854 
   1855 					if(rgbaWriteMask != 0x0000000F)
   1856 					{
   1857 						Short4 masked = value;
   1858 						current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
   1859 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
   1860 						current.y |= masked;
   1861 					}
   1862 
   1863 					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
   1864 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
   1865 					current.y |= value;
   1866 					*Pointer<Short4>(buffer + 8) = current.y;
   1867 				}
   1868 
   1869 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1870 
   1871 				{
   1872 					Short4 value = *Pointer<Short4>(buffer);
   1873 
   1874 					if(rgbaWriteMask != 0x0000000F)
   1875 					{
   1876 						Short4 masked = value;
   1877 						current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
   1878 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
   1879 						current.z |= masked;
   1880 					}
   1881 
   1882 					current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
   1883 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
   1884 					current.z |= value;
   1885 					*Pointer<Short4>(buffer) = current.z;
   1886 				}
   1887 
   1888 				{
   1889 					Short4 value = *Pointer<Short4>(buffer + 8);
   1890 
   1891 					if(rgbaWriteMask != 0x0000000F)
   1892 					{
   1893 						Short4 masked = value;
   1894 						current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
   1895 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
   1896 						current.w |= masked;
   1897 					}
   1898 
   1899 					current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
   1900 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
   1901 					current.w |= value;
   1902 					*Pointer<Short4>(buffer + 8) = current.w;
   1903 				}
   1904 			}
   1905 			break;
   1906 		default:
   1907 			ASSERT(false);
   1908 		}
   1909 	}
   1910 
   1911 	void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
   1912 	{
   1913 		switch(blendFactorActive)
   1914 		{
   1915 		case BLEND_ZERO:
   1916 			// Optimized
   1917 			break;
   1918 		case BLEND_ONE:
   1919 			// Optimized
   1920 			break;
   1921 		case BLEND_SOURCE:
   1922 			blendFactor.x = oC.x;
   1923 			blendFactor.y = oC.y;
   1924 			blendFactor.z = oC.z;
   1925 			break;
   1926 		case BLEND_INVSOURCE:
   1927 			blendFactor.x = Float4(1.0f) - oC.x;
   1928 			blendFactor.y = Float4(1.0f) - oC.y;
   1929 			blendFactor.z = Float4(1.0f) - oC.z;
   1930 			break;
   1931 		case BLEND_DEST:
   1932 			blendFactor.x = pixel.x;
   1933 			blendFactor.y = pixel.y;
   1934 			blendFactor.z = pixel.z;
   1935 			break;
   1936 		case BLEND_INVDEST:
   1937 			blendFactor.x = Float4(1.0f) - pixel.x;
   1938 			blendFactor.y = Float4(1.0f) - pixel.y;
   1939 			blendFactor.z = Float4(1.0f) - pixel.z;
   1940 			break;
   1941 		case BLEND_SOURCEALPHA:
   1942 			blendFactor.x = oC.w;
   1943 			blendFactor.y = oC.w;
   1944 			blendFactor.z = oC.w;
   1945 			break;
   1946 		case BLEND_INVSOURCEALPHA:
   1947 			blendFactor.x = Float4(1.0f) - oC.w;
   1948 			blendFactor.y = Float4(1.0f) - oC.w;
   1949 			blendFactor.z = Float4(1.0f) - oC.w;
   1950 			break;
   1951 		case BLEND_DESTALPHA:
   1952 			blendFactor.x = pixel.w;
   1953 			blendFactor.y = pixel.w;
   1954 			blendFactor.z = pixel.w;
   1955 			break;
   1956 		case BLEND_INVDESTALPHA:
   1957 			blendFactor.x = Float4(1.0f) - pixel.w;
   1958 			blendFactor.y = Float4(1.0f) - pixel.w;
   1959 			blendFactor.z = Float4(1.0f) - pixel.w;
   1960 			break;
   1961 		case BLEND_SRCALPHASAT:
   1962 			blendFactor.x = Float4(1.0f) - pixel.w;
   1963 			blendFactor.x = Min(blendFactor.x, oC.w);
   1964 			blendFactor.y = blendFactor.x;
   1965 			blendFactor.z = blendFactor.x;
   1966 			break;
   1967 		case BLEND_CONSTANT:
   1968 			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
   1969 			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
   1970 			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
   1971 			break;
   1972 		case BLEND_INVCONSTANT:
   1973 			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
   1974 			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
   1975 			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
   1976 			break;
   1977 		default:
   1978 			ASSERT(false);
   1979 		}
   1980 	}
   1981 
   1982 	void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
   1983 	{
   1984 		switch(blendFactorAlphaActive)
   1985 		{
   1986 		case BLEND_ZERO:
   1987 			// Optimized
   1988 			break;
   1989 		case BLEND_ONE:
   1990 			// Optimized
   1991 			break;
   1992 		case BLEND_SOURCE:
   1993 			blendFactor.w = oC.w;
   1994 			break;
   1995 		case BLEND_INVSOURCE:
   1996 			blendFactor.w = Float4(1.0f) - oC.w;
   1997 			break;
   1998 		case BLEND_DEST:
   1999 			blendFactor.w = pixel.w;
   2000 			break;
   2001 		case BLEND_INVDEST:
   2002 			blendFactor.w = Float4(1.0f) - pixel.w;
   2003 			break;
   2004 		case BLEND_SOURCEALPHA:
   2005 			blendFactor.w = oC.w;
   2006 			break;
   2007 		case BLEND_INVSOURCEALPHA:
   2008 			blendFactor.w = Float4(1.0f) - oC.w;
   2009 			break;
   2010 		case BLEND_DESTALPHA:
   2011 			blendFactor.w = pixel.w;
   2012 			break;
   2013 		case BLEND_INVDESTALPHA:
   2014 			blendFactor.w = Float4(1.0f) - pixel.w;
   2015 			break;
   2016 		case BLEND_SRCALPHASAT:
   2017 			blendFactor.w = Float4(1.0f);
   2018 			break;
   2019 		case BLEND_CONSTANT:
   2020 			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
   2021 			break;
   2022 		case BLEND_INVCONSTANT:
   2023 			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
   2024 			break;
   2025 		default:
   2026 			ASSERT(false);
   2027 		}
   2028 	}
   2029 
   2030 	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
   2031 	{
   2032 		if(!state.alphaBlendActive)
   2033 		{
   2034 			return;
   2035 		}
   2036 
   2037 		Pointer<Byte> buffer;
   2038 		Vector4f pixel;
   2039 
   2040 		Vector4s color;
   2041 		Short4 c01;
   2042 		Short4 c23;
   2043 
   2044 		Float4 one;
   2045 		if(Surface::isFloatFormat(state.targetFormat[index]))
   2046 		{
   2047 			one = Float4(1.0f);
   2048 		}
   2049 		else if(Surface::isNonNormalizedInteger(state.targetFormat[index]))
   2050 		{
   2051 			one = As<Float4>(Surface::isUnsignedComponent(state.targetFormat[index], 0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
   2052 		}
   2053 
   2054 		switch(state.targetFormat[index])
   2055 		{
   2056 		case FORMAT_R32I:
   2057 		case FORMAT_R32UI:
   2058 		case FORMAT_R32F:
   2059 			buffer = cBuffer;
   2060 			// FIXME: movlps
   2061 			pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
   2062 			pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
   2063 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   2064 			// FIXME: movhps
   2065 			pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
   2066 			pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
   2067 			pixel.y = pixel.z = pixel.w = one;
   2068 			break;
   2069 		case FORMAT_G32R32I:
   2070 		case FORMAT_G32R32UI:
   2071 		case FORMAT_G32R32F:
   2072 			buffer = cBuffer;
   2073 			pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
   2074 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   2075 			pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
   2076 			pixel.z = pixel.x;
   2077 			pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
   2078 			pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
   2079 			pixel.y = pixel.z;
   2080 			pixel.z = pixel.w = one;
   2081 			break;
   2082 		case FORMAT_X32B32G32R32F:
   2083 		case FORMAT_A32B32G32R32F:
   2084 		case FORMAT_X32B32G32R32F_UNSIGNED:
   2085 		case FORMAT_A32B32G32R32I:
   2086 		case FORMAT_A32B32G32R32UI:
   2087 			buffer = cBuffer;
   2088 			pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
   2089 			pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
   2090 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   2091 			pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
   2092 			pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
   2093 			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
   2094 			if(state.targetFormat[index] == FORMAT_X32B32G32R32F ||
   2095 			   state.targetFormat[index] == FORMAT_X32B32G32R32F_UNSIGNED)
   2096 			{
   2097 				pixel.w = Float4(1.0f);
   2098 			}
   2099 			break;
   2100 		default:
   2101 			ASSERT(false);
   2102 		}
   2103 
   2104 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
   2105 		{
   2106 			sRGBtoLinear(pixel.x);
   2107 			sRGBtoLinear(pixel.y);
   2108 			sRGBtoLinear(pixel.z);
   2109 		}
   2110 
   2111 		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
   2112 		Vector4f sourceFactor;
   2113 		Vector4f destFactor;
   2114 
   2115 		blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);
   2116 		blendFactor(destFactor, oC, pixel, state.destBlendFactor);
   2117 
   2118 		if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
   2119 		{
   2120 			oC.x *= sourceFactor.x;
   2121 			oC.y *= sourceFactor.y;
   2122 			oC.z *= sourceFactor.z;
   2123 		}
   2124 
   2125 		if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
   2126 		{
   2127 			pixel.x *= destFactor.x;
   2128 			pixel.y *= destFactor.y;
   2129 			pixel.z *= destFactor.z;
   2130 		}
   2131 
   2132 		switch(state.blendOperation)
   2133 		{
   2134 		case BLENDOP_ADD:
   2135 			oC.x += pixel.x;
   2136 			oC.y += pixel.y;
   2137 			oC.z += pixel.z;
   2138 			break;
   2139 		case BLENDOP_SUB:
   2140 			oC.x -= pixel.x;
   2141 			oC.y -= pixel.y;
   2142 			oC.z -= pixel.z;
   2143 			break;
   2144 		case BLENDOP_INVSUB:
   2145 			oC.x = pixel.x - oC.x;
   2146 			oC.y = pixel.y - oC.y;
   2147 			oC.z = pixel.z - oC.z;
   2148 			break;
   2149 		case BLENDOP_MIN:
   2150 			oC.x = Min(oC.x, pixel.x);
   2151 			oC.y = Min(oC.y, pixel.y);
   2152 			oC.z = Min(oC.z, pixel.z);
   2153 			break;
   2154 		case BLENDOP_MAX:
   2155 			oC.x = Max(oC.x, pixel.x);
   2156 			oC.y = Max(oC.y, pixel.y);
   2157 			oC.z = Max(oC.z, pixel.z);
   2158 			break;
   2159 		case BLENDOP_SOURCE:
   2160 			// No operation
   2161 			break;
   2162 		case BLENDOP_DEST:
   2163 			oC.x = pixel.x;
   2164 			oC.y = pixel.y;
   2165 			oC.z = pixel.z;
   2166 			break;
   2167 		case BLENDOP_NULL:
   2168 			oC.x = Float4(0.0f);
   2169 			oC.y = Float4(0.0f);
   2170 			oC.z = Float4(0.0f);
   2171 			break;
   2172 		default:
   2173 			ASSERT(false);
   2174 		}
   2175 
   2176 		blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
   2177 		blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);
   2178 
   2179 		if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
   2180 		{
   2181 			oC.w *= sourceFactor.w;
   2182 		}
   2183 
   2184 		if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
   2185 		{
   2186 			pixel.w *= destFactor.w;
   2187 		}
   2188 
   2189 		switch(state.blendOperationAlpha)
   2190 		{
   2191 		case BLENDOP_ADD:
   2192 			oC.w += pixel.w;
   2193 			break;
   2194 		case BLENDOP_SUB:
   2195 			oC.w -= pixel.w;
   2196 			break;
   2197 		case BLENDOP_INVSUB:
   2198 			pixel.w -= oC.w;
   2199 			oC.w = pixel.w;
   2200 			break;
   2201 		case BLENDOP_MIN:
   2202 			oC.w = Min(oC.w, pixel.w);
   2203 			break;
   2204 		case BLENDOP_MAX:
   2205 			oC.w = Max(oC.w, pixel.w);
   2206 			break;
   2207 		case BLENDOP_SOURCE:
   2208 			// No operation
   2209 			break;
   2210 		case BLENDOP_DEST:
   2211 			oC.w = pixel.w;
   2212 			break;
   2213 		case BLENDOP_NULL:
   2214 			oC.w = Float4(0.0f);
   2215 			break;
   2216 		default:
   2217 			ASSERT(false);
   2218 		}
   2219 	}
   2220 
   2221 	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
   2222 	{
   2223 		switch(state.targetFormat[index])
   2224 		{
   2225 		case FORMAT_R32F:
   2226 		case FORMAT_R32I:
   2227 		case FORMAT_R32UI:
   2228 		case FORMAT_R16I:
   2229 		case FORMAT_R16UI:
   2230 		case FORMAT_R8I:
   2231 		case FORMAT_R8UI:
   2232 			break;
   2233 		case FORMAT_G32R32F:
   2234 		case FORMAT_G32R32I:
   2235 		case FORMAT_G32R32UI:
   2236 		case FORMAT_G16R16I:
   2237 		case FORMAT_G16R16UI:
   2238 		case FORMAT_G8R8I:
   2239 		case FORMAT_G8R8UI:
   2240 			oC.z = oC.x;
   2241 			oC.x = UnpackLow(oC.x, oC.y);
   2242 			oC.z = UnpackHigh(oC.z, oC.y);
   2243 			oC.y = oC.z;
   2244 			break;
   2245 		case FORMAT_X32B32G32R32F:
   2246 		case FORMAT_A32B32G32R32F:
   2247 		case FORMAT_X32B32G32R32F_UNSIGNED:
   2248 		case FORMAT_A32B32G32R32I:
   2249 		case FORMAT_A32B32G32R32UI:
   2250 		case FORMAT_A16B16G16R16I:
   2251 		case FORMAT_A16B16G16R16UI:
   2252 		case FORMAT_A8B8G8R8I:
   2253 		case FORMAT_A8B8G8R8UI:
   2254 			transpose4x4(oC.x, oC.y, oC.z, oC.w);
   2255 			break;
   2256 		default:
   2257 			ASSERT(false);
   2258 		}
   2259 
   2260 		int rgbaWriteMask = state.colorWriteActive(index);
   2261 
   2262 		Int xMask;   // Combination of all masks
   2263 
   2264 		if(state.depthTestActive)
   2265 		{
   2266 			xMask = zMask;
   2267 		}
   2268 		else
   2269 		{
   2270 			xMask = cMask;
   2271 		}
   2272 
   2273 		if(state.stencilActive)
   2274 		{
   2275 			xMask &= sMask;
   2276 		}
   2277 
   2278 		Pointer<Byte> buffer;
   2279 		Float4 value;
   2280 
   2281 		switch(state.targetFormat[index])
   2282 		{
   2283 		case FORMAT_R32F:
   2284 		case FORMAT_R32I:
   2285 		case FORMAT_R32UI:
   2286 			if(rgbaWriteMask & 0x00000001)
   2287 			{
   2288 				buffer = cBuffer + 4 * x;
   2289 
   2290 				// FIXME: movlps
   2291 				value.x = *Pointer<Float>(buffer + 0);
   2292 				value.y = *Pointer<Float>(buffer + 4);
   2293 
   2294 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   2295 
   2296 				// FIXME: movhps
   2297 				value.z = *Pointer<Float>(buffer + 0);
   2298 				value.w = *Pointer<Float>(buffer + 4);
   2299 
   2300 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
   2301 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
   2302 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
   2303 
   2304 				// FIXME: movhps
   2305 				*Pointer<Float>(buffer + 0) = oC.x.z;
   2306 				*Pointer<Float>(buffer + 4) = oC.x.w;
   2307 
   2308 				buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   2309 
   2310 				// FIXME: movlps
   2311 				*Pointer<Float>(buffer + 0) = oC.x.x;
   2312 				*Pointer<Float>(buffer + 4) = oC.x.y;
   2313 			}
   2314 			break;
   2315 		case FORMAT_R16I:
   2316 		case FORMAT_R16UI:
   2317 			if(rgbaWriteMask & 0x00000001)
   2318 			{
   2319 				buffer = cBuffer + 2 * x;
   2320 
   2321 				UShort4 xyzw;
   2322 				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
   2323 
   2324 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2325 
   2326 				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
   2327 				value = As<Float4>(Int4(xyzw));
   2328 
   2329 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
   2330 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
   2331 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
   2332 
   2333 				if(state.targetFormat[index] == FORMAT_R16I)
   2334 				{
   2335 					Float component = oC.x.z;
   2336 					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
   2337 					component = oC.x.w;
   2338 					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
   2339 
   2340 					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2341 
   2342 					component = oC.x.x;
   2343 					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
   2344 					component = oC.x.y;
   2345 					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
   2346 				}
   2347 				else // FORMAT_R16UI
   2348 				{
   2349 					Float component = oC.x.z;
   2350 					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
   2351 					component = oC.x.w;
   2352 					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
   2353 
   2354 					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2355 
   2356 					component = oC.x.x;
   2357 					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
   2358 					component = oC.x.y;
   2359 					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
   2360 				}
   2361 			}
   2362 			break;
   2363 		case FORMAT_R8I:
   2364 		case FORMAT_R8UI:
   2365 			if(rgbaWriteMask & 0x00000001)
   2366 			{
   2367 				buffer = cBuffer + x;
   2368 
   2369 				UInt xyzw, packedCol;
   2370 
   2371 				xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
   2372 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2373 				xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
   2374 
   2375 				Short4 tmpCol = Short4(As<Int4>(oC.x));
   2376 				if(state.targetFormat[index] == FORMAT_R8I)
   2377 				{
   2378 					tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
   2379 				}
   2380 				else
   2381 				{
   2382 					tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
   2383 				}
   2384 				packedCol = Extract(As<Int2>(tmpCol), 0);
   2385 
   2386 				packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
   2387 				            (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
   2388 
   2389 				*Pointer<UShort>(buffer) = UShort(packedCol >> 16);
   2390 				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2391 				*Pointer<UShort>(buffer) = UShort(packedCol);
   2392 			}
   2393 			break;
   2394 		case FORMAT_G32R32F:
   2395 		case FORMAT_G32R32I:
   2396 		case FORMAT_G32R32UI:
   2397 			buffer = cBuffer + 8 * x;
   2398 
   2399 			value = *Pointer<Float4>(buffer);
   2400 
   2401 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
   2402 			{
   2403 				Float4 masked = value;
   2404 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
   2405 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
   2406 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
   2407 			}
   2408 
   2409 			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
   2410 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
   2411 			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
   2412 			*Pointer<Float4>(buffer) = oC.x;
   2413 
   2414 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   2415 
   2416 			value = *Pointer<Float4>(buffer);
   2417 
   2418 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
   2419 			{
   2420 				Float4 masked;
   2421 
   2422 				masked = value;
   2423 				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
   2424 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
   2425 				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
   2426 			}
   2427 
   2428 			oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
   2429 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
   2430 			oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
   2431 			*Pointer<Float4>(buffer) = oC.y;
   2432 			break;
   2433 		case FORMAT_G16R16I:
   2434 		case FORMAT_G16R16UI:
   2435 			if((rgbaWriteMask & 0x00000003) != 0x0)
   2436 			{
   2437 				buffer = cBuffer + 4 * x;
   2438 
   2439 				UInt2 rgbaMask;
   2440 				UShort4 packedCol = UShort4(As<Int4>(oC.x));
   2441 				UShort4 value = *Pointer<UShort4>(buffer);
   2442 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
   2443 				if((rgbaWriteMask & 0x3) != 0x3)
   2444 				{
   2445 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
   2446 					rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
   2447 					mergedMask &= rgbaMask;
   2448 				}
   2449 				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
   2450 
   2451 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2452 
   2453 				packedCol = UShort4(As<Int4>(oC.y));
   2454 				value = *Pointer<UShort4>(buffer);
   2455 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
   2456 				if((rgbaWriteMask & 0x3) != 0x3)
   2457 				{
   2458 					mergedMask &= rgbaMask;
   2459 				}
   2460 				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
   2461 			}
   2462 			break;
   2463 		case FORMAT_G8R8I:
   2464 		case FORMAT_G8R8UI:
   2465 			if((rgbaWriteMask & 0x00000003) != 0x0)
   2466 			{
   2467 				buffer = cBuffer + 2 * x;
   2468 
   2469 				Int2 xyzw, packedCol;
   2470 
   2471 				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
   2472 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2473 				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
   2474 
   2475 				if(state.targetFormat[index] == FORMAT_G8R8I)
   2476 				{
   2477 					packedCol = As<Int2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
   2478 				}
   2479 				else
   2480 				{
   2481 					packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
   2482 				}
   2483 
   2484 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
   2485 				if((rgbaWriteMask & 0x3) != 0x3)
   2486 				{
   2487 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
   2488 					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
   2489 					mergedMask &= rgbaMask;
   2490 				}
   2491 
   2492 				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
   2493 
   2494 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
   2495 				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2496 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
   2497 			}
   2498 			break;
   2499 		case FORMAT_X32B32G32R32F:
   2500 		case FORMAT_A32B32G32R32F:
   2501 		case FORMAT_X32B32G32R32F_UNSIGNED:
   2502 		case FORMAT_A32B32G32R32I:
   2503 		case FORMAT_A32B32G32R32UI:
   2504 			buffer = cBuffer + 16 * x;
   2505 
   2506 			{
   2507 				value = *Pointer<Float4>(buffer, 16);
   2508 
   2509 				if(rgbaWriteMask != 0x0000000F)
   2510 				{
   2511 					Float4 masked = value;
   2512 					oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
   2513 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
   2514 					oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
   2515 				}
   2516 
   2517 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
   2518 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
   2519 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
   2520 				*Pointer<Float4>(buffer, 16) = oC.x;
   2521 			}
   2522 
   2523 			{
   2524 				value = *Pointer<Float4>(buffer + 16, 16);
   2525 
   2526 				if(rgbaWriteMask != 0x0000000F)
   2527 				{
   2528 					Float4 masked = value;
   2529 					oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
   2530 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
   2531 					oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
   2532 				}
   2533 
   2534 				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
   2535 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
   2536 				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
   2537 				*Pointer<Float4>(buffer + 16, 16) = oC.y;
   2538 			}
   2539 
   2540 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   2541 
   2542 			{
   2543 				value = *Pointer<Float4>(buffer, 16);
   2544 
   2545 				if(rgbaWriteMask != 0x0000000F)
   2546 				{
   2547 					Float4 masked = value;
   2548 					oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
   2549 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
   2550 					oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
   2551 				}
   2552 
   2553 				oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
   2554 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
   2555 				oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
   2556 				*Pointer<Float4>(buffer, 16) = oC.z;
   2557 			}
   2558 
   2559 			{
   2560 				value = *Pointer<Float4>(buffer + 16, 16);
   2561 
   2562 				if(rgbaWriteMask != 0x0000000F)
   2563 				{
   2564 					Float4 masked = value;
   2565 					oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
   2566 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
   2567 					oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
   2568 				}
   2569 
   2570 				oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
   2571 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
   2572 				oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
   2573 				*Pointer<Float4>(buffer + 16, 16) = oC.w;
   2574 			}
   2575 			break;
   2576 		case FORMAT_A16B16G16R16I:
   2577 		case FORMAT_A16B16G16R16UI:
   2578 			if((rgbaWriteMask & 0x0000000F) != 0x0)
   2579 			{
   2580 				buffer = cBuffer + 8 * x;
   2581 
   2582 				UInt4 rgbaMask;
   2583 				UShort8 value = *Pointer<UShort8>(buffer);
   2584 				UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
   2585 				UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
   2586 				if((rgbaWriteMask & 0xF) != 0xF)
   2587 				{
   2588 					UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
   2589 					rgbaMask = UInt4(tmpMask, tmpMask);
   2590 					mergedMask &= rgbaMask;
   2591 				}
   2592 				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
   2593 
   2594 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2595 
   2596 				value = *Pointer<UShort8>(buffer);
   2597 				packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
   2598 				mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
   2599 				if((rgbaWriteMask & 0xF) != 0xF)
   2600 				{
   2601 					mergedMask &= rgbaMask;
   2602 				}
   2603 				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
   2604 			}
   2605 			break;
   2606 		case FORMAT_A8B8G8R8I:
   2607 		case FORMAT_A8B8G8R8UI:
   2608 			if((rgbaWriteMask & 0x0000000F) != 0x0)
   2609 			{
   2610 				UInt2 value, packedCol, mergedMask;
   2611 
   2612 				buffer = cBuffer + 4 * x;
   2613 
   2614 				if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
   2615 				{
   2616 					packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
   2617 				}
   2618 				else
   2619 				{
   2620 					packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
   2621 				}
   2622 				value = *Pointer<UInt2>(buffer, 16);
   2623 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
   2624 				if(rgbaWriteMask != 0xF)
   2625 				{
   2626 					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
   2627 				}
   2628 				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
   2629 
   2630 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2631 
   2632 				if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
   2633 				{
   2634 					packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
   2635 				}
   2636 				else
   2637 				{
   2638 					packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
   2639 				}
   2640 				value = *Pointer<UInt2>(buffer, 16);
   2641 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
   2642 				if(rgbaWriteMask != 0xF)
   2643 				{
   2644 					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
   2645 				}
   2646 				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
   2647 			}
   2648 			break;
   2649 		default:
   2650 			ASSERT(false);
   2651 		}
   2652 	}
   2653 
   2654 	UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
   2655 	{
   2656 		return UShort4(cf * Float4(0xFFFF), saturate);
   2657 	}
   2658 
   2659 	void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
   2660 	{
   2661 		Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
   2662 
   2663 		c.x = As<UShort4>(c.x) >> 4;
   2664 		c.y = As<UShort4>(c.y) >> 4;
   2665 		c.z = As<UShort4>(c.z) >> 4;
   2666 
   2667 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
   2668 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
   2669 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
   2670 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
   2671 
   2672 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
   2673 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
   2674 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
   2675 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
   2676 
   2677 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
   2678 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
   2679 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
   2680 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
   2681 	}
   2682 
   2683 	void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
   2684 	{
   2685 		c.x = As<UShort4>(c.x) >> 4;
   2686 		c.y = As<UShort4>(c.y) >> 4;
   2687 		c.z = As<UShort4>(c.z) >> 4;
   2688 
   2689 		linearToSRGB12_16(c);
   2690 	}
   2691 
   2692 	void PixelRoutine::linearToSRGB12_16(Vector4s &c)
   2693 	{
   2694 		Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
   2695 
   2696 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
   2697 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
   2698 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
   2699 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
   2700 
   2701 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
   2702 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
   2703 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
   2704 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
   2705 
   2706 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
   2707 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
   2708 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
   2709 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
   2710 	}
   2711 
   2712 	Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)   // Approximates x^2.2
   2713 	{
   2714 		Float4 linear = x * x;
   2715 		linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
   2716 
   2717 		return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
   2718 	}
   2719 
   2720 	bool PixelRoutine::colorUsed()
   2721 	{
   2722 		return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;
   2723 	}
   2724 }
   2725