Home | History | Annotate | Download | only in Shader
      1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //    http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #include "PixelRoutine.hpp"
     16 
     17 #include "Renderer.hpp"
     18 #include "QuadRasterizer.hpp"
     19 #include "Surface.hpp"
     20 #include "Primitive.hpp"
     21 #include "SamplerCore.hpp"
     22 #include "Constants.hpp"
     23 #include "Debug.hpp"
     24 
     25 namespace sw
     26 {
     27 	extern bool complementaryDepthBuffer;
     28 	extern bool postBlendSRGB;
     29 	extern bool exactColorRounding;
     30 	extern bool forceClearRegisters;
     31 
     32 	PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : QuadRasterizer(state, shader), v(shader && shader->dynamicallyIndexedInput)
     33 	{
     34 		if(!shader || shader->getVersion() < 0x0200 || forceClearRegisters)
     35 		{
     36 			for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++)
     37 			{
     38 				v[i].x = Float4(0.0f);
     39 				v[i].y = Float4(0.0f);
     40 				v[i].z = Float4(0.0f);
     41 				v[i].w = Float4(0.0f);
     42 			}
     43 		}
     44 	}
     45 
     46 	PixelRoutine::~PixelRoutine()
     47 	{
     48 		for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
     49 		{
     50 			delete sampler[i];
     51 		}
     52 	}
     53 
     54 	void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
     55 	{
     56 		#if PERF_PROFILE
     57 			Long pipeTime = Ticks();
     58 		#endif
     59 
     60 		for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
     61 		{
     62 			sampler[i] = new SamplerCore(constants, state.sampler[i]);
     63 		}
     64 
     65 		const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
     66 
     67 		Int zMask[4];   // Depth mask
     68 		Int sMask[4];   // Stencil mask
     69 
     70 		for(unsigned int q = 0; q < state.multiSample; q++)
     71 		{
     72 			zMask[q] = cMask[q];
     73 			sMask[q] = cMask[q];
     74 		}
     75 
     76 		for(unsigned int q = 0; q < state.multiSample; q++)
     77 		{
     78 			stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
     79 		}
     80 
     81 		Float4 f;
     82 		Float4 rhwCentroid;
     83 
     84 		Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
     85 
     86 		if(interpolateZ())
     87 		{
     88 			for(unsigned int q = 0; q < state.multiSample; q++)
     89 			{
     90 				Float4 x = xxxx;
     91 
     92 				if(state.multiSample > 1)
     93 				{
     94 					x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
     95 				}
     96 
     97 				z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false);
     98 			}
     99 		}
    100 
    101 		Bool depthPass = false;
    102 
    103 		if(earlyDepthTest)
    104 		{
    105 			for(unsigned int q = 0; q < state.multiSample; q++)
    106 			{
    107 				depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
    108 			}
    109 		}
    110 
    111 		If(depthPass || Bool(!earlyDepthTest))
    112 		{
    113 			#if PERF_PROFILE
    114 				Long interpTime = Ticks();
    115 			#endif
    116 
    117 			Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
    118 
    119 			// Centroid locations
    120 			Float4 XXXX = Float4(0.0f);
    121 			Float4 YYYY = Float4(0.0f);
    122 
    123 			if(state.centroid)
    124 			{
    125 				Float4 WWWW(1.0e-9f);
    126 
    127 				for(unsigned int q = 0; q < state.multiSample; q++)
    128 				{
    129 					XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
    130 					YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
    131 					WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
    132 				}
    133 
    134 				WWWW = Rcp_pp(WWWW);
    135 				XXXX *= WWWW;
    136 				YYYY *= WWWW;
    137 
    138 				XXXX += xxxx;
    139 				YYYY += yyyy;
    140 			}
    141 
    142 			if(interpolateW())
    143 			{
    144 				w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false);
    145 				rhw = reciprocal(w, false, false, true);
    146 
    147 				if(state.centroid)
    148 				{
    149 					rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
    150 				}
    151 			}
    152 
    153 			for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
    154 			{
    155 				for(int component = 0; component < 4; component++)
    156 				{
    157 					if(state.interpolant[interpolant].component & (1 << component))
    158 					{
    159 						if(!state.interpolant[interpolant].centroid)
    160 						{
    161 							v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
    162 						}
    163 						else
    164 						{
    165 							v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
    166 						}
    167 					}
    168 				}
    169 
    170 				Float4 rcp;
    171 
    172 				switch(state.interpolant[interpolant].project)
    173 				{
    174 				case 0:
    175 					break;
    176 				case 1:
    177 					rcp = reciprocal(v[interpolant].y);
    178 					v[interpolant].x = v[interpolant].x * rcp;
    179 					break;
    180 				case 2:
    181 					rcp = reciprocal(v[interpolant].z);
    182 					v[interpolant].x = v[interpolant].x * rcp;
    183 					v[interpolant].y = v[interpolant].y * rcp;
    184 					break;
    185 				case 3:
    186 					rcp = reciprocal(v[interpolant].w);
    187 					v[interpolant].x = v[interpolant].x * rcp;
    188 					v[interpolant].y = v[interpolant].y * rcp;
    189 					v[interpolant].z = v[interpolant].z * rcp;
    190 					break;
    191 				}
    192 			}
    193 
    194 			if(state.fog.component)
    195 			{
    196 				f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective);
    197 			}
    198 
    199 			setBuiltins(x, y, z, w);
    200 
    201 			#if PERF_PROFILE
    202 				cycles[PERF_INTERP] += Ticks() - interpTime;
    203 			#endif
    204 
    205 			Bool alphaPass = true;
    206 
    207 			if(colorUsed())
    208 			{
    209 				#if PERF_PROFILE
    210 					Long shaderTime = Ticks();
    211 				#endif
    212 
    213 				applyShader(cMask);
    214 
    215 				#if PERF_PROFILE
    216 					cycles[PERF_SHADER] += Ticks() - shaderTime;
    217 				#endif
    218 
    219 				alphaPass = alphaTest(cMask);
    220 
    221 				if((shader && shader->containsKill()) || state.alphaTestActive())
    222 				{
    223 					for(unsigned int q = 0; q < state.multiSample; q++)
    224 					{
    225 						zMask[q] &= cMask[q];
    226 						sMask[q] &= cMask[q];
    227 					}
    228 				}
    229 			}
    230 
    231 			If(alphaPass)
    232 			{
    233 				if(!earlyDepthTest)
    234 				{
    235 					for(unsigned int q = 0; q < state.multiSample; q++)
    236 					{
    237 						depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
    238 					}
    239 				}
    240 
    241 				#if PERF_PROFILE
    242 					Long ropTime = Ticks();
    243 				#endif
    244 
    245 				If(depthPass || Bool(earlyDepthTest))
    246 				{
    247 					for(unsigned int q = 0; q < state.multiSample; q++)
    248 					{
    249 						if(state.multiSampleMask & (1 << q))
    250 						{
    251 							writeDepth(zBuffer, q, x, z[q], zMask[q]);
    252 
    253 							if(state.occlusionEnabled)
    254 							{
    255 								occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
    256 							}
    257 						}
    258 					}
    259 
    260 					if(colorUsed())
    261 					{
    262 						#if PERF_PROFILE
    263 							AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
    264 						#endif
    265 
    266 						rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
    267 					}
    268 				}
    269 
    270 				#if PERF_PROFILE
    271 					cycles[PERF_ROP] += Ticks() - ropTime;
    272 				#endif
    273 			}
    274 		}
    275 
    276 		for(unsigned int q = 0; q < state.multiSample; q++)
    277 		{
    278 			if(state.multiSampleMask & (1 << q))
    279 			{
    280 				writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
    281 			}
    282 		}
    283 
    284 		#if PERF_PROFILE
    285 			cycles[PERF_PIPE] += Ticks() - pipeTime;
    286 		#endif
    287 	}
    288 
    289 	Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
    290 	{
    291 		Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
    292 
    293 		if(!flat)
    294 		{
    295 			interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
    296 			               y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
    297 
    298 			if(perspective)
    299 			{
    300 				interpolant *= rhw;
    301 			}
    302 		}
    303 
    304 		return interpolant;
    305 	}
    306 
    307 	void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
    308 	{
    309 		if(!state.stencilActive)
    310 		{
    311 			return;
    312 		}
    313 
    314 		// (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
    315 
    316 		Pointer<Byte> buffer = sBuffer + 2 * x;
    317 
    318 		if(q > 0)
    319 		{
    320 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
    321 		}
    322 
    323 		Byte8 value = *Pointer<Byte8>(buffer);
    324 		Byte8 valueCCW = value;
    325 
    326 		if(!state.noStencilMask)
    327 		{
    328 			value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
    329 		}
    330 
    331 		stencilTest(value, state.stencilCompareMode, false);
    332 
    333 		if(state.twoSidedStencil)
    334 		{
    335 			if(!state.noStencilMaskCCW)
    336 			{
    337 				valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
    338 			}
    339 
    340 			stencilTest(valueCCW, state.stencilCompareModeCCW, true);
    341 
    342 			value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
    343 			valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
    344 			value |= valueCCW;
    345 		}
    346 
    347 		sMask = SignMask(value) & cMask;
    348 	}
    349 
    350 	void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
    351 	{
    352 		Byte8 equal;
    353 
    354 		switch(stencilCompareMode)
    355 		{
    356 		case STENCIL_ALWAYS:
    357 			value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    358 			break;
    359 		case STENCIL_NEVER:
    360 			value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
    361 			break;
    362 		case STENCIL_LESS:			// a < b ~ b > a
    363 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
    364 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
    365 			break;
    366 		case STENCIL_EQUAL:
    367 			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
    368 			break;
    369 		case STENCIL_NOTEQUAL:		// a != b ~ !(a == b)
    370 			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
    371 			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    372 			break;
    373 		case STENCIL_LESSEQUAL:	// a <= b ~ (b > a) || (a == b)
    374 			equal = value;
    375 			equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
    376 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
    377 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
    378 			value |= equal;
    379 			break;
    380 		case STENCIL_GREATER:		// a > b
    381 			equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
    382 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
    383 			equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
    384 			value = equal;
    385 			break;
    386 		case STENCIL_GREATEREQUAL:	// a >= b ~ !(a < b) ~ !(b > a)
    387 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
    388 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
    389 			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    390 			break;
    391 		default:
    392 			ASSERT(false);
    393 		}
    394 	}
    395 
    396 	Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
    397 	{
    398 		if(!state.depthTestActive)
    399 		{
    400 			return true;
    401 		}
    402 
    403 		Float4 Z = z;
    404 
    405 		if(shader && shader->depthOverride())
    406 		{
    407 			if(complementaryDepthBuffer)
    408 			{
    409 				Z = Float4(1.0f) - oDepth;
    410 			}
    411 			else
    412 			{
    413 				Z = oDepth;
    414 			}
    415 		}
    416 
    417 		Pointer<Byte> buffer;
    418 		Int pitch;
    419 
    420 		if(!state.quadLayoutDepthBuffer)
    421 		{
    422 			buffer = zBuffer + 4 * x;
    423 			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
    424 		}
    425 		else
    426 		{
    427 			buffer = zBuffer + 8 * x;
    428 		}
    429 
    430 		if(q > 0)
    431 		{
    432 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
    433 		}
    434 
    435 		Float4 zValue;
    436 
    437 		if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
    438 		{
    439 			if(!state.quadLayoutDepthBuffer)
    440 			{
    441 				// FIXME: Properly optimizes?
    442 				zValue.xy = *Pointer<Float4>(buffer);
    443 				zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
    444 			}
    445 			else
    446 			{
    447 				zValue = *Pointer<Float4>(buffer, 16);
    448 			}
    449 		}
    450 
    451 		Int4 zTest;
    452 
    453 		switch(state.depthCompareMode)
    454 		{
    455 		case DEPTH_ALWAYS:
    456 			// Optimized
    457 			break;
    458 		case DEPTH_NEVER:
    459 			// Optimized
    460 			break;
    461 		case DEPTH_EQUAL:
    462 			zTest = CmpEQ(zValue, Z);
    463 			break;
    464 		case DEPTH_NOTEQUAL:
    465 			zTest = CmpNEQ(zValue, Z);
    466 			break;
    467 		case DEPTH_LESS:
    468 			if(complementaryDepthBuffer)
    469 			{
    470 				zTest = CmpLT(zValue, Z);
    471 			}
    472 			else
    473 			{
    474 				zTest = CmpNLE(zValue, Z);
    475 			}
    476 			break;
    477 		case DEPTH_GREATEREQUAL:
    478 			if(complementaryDepthBuffer)
    479 			{
    480 				zTest = CmpNLT(zValue, Z);
    481 			}
    482 			else
    483 			{
    484 				zTest = CmpLE(zValue, Z);
    485 			}
    486 			break;
    487 		case DEPTH_LESSEQUAL:
    488 			if(complementaryDepthBuffer)
    489 			{
    490 				zTest = CmpLE(zValue, Z);
    491 			}
    492 			else
    493 			{
    494 				zTest = CmpNLT(zValue, Z);
    495 			}
    496 			break;
    497 		case DEPTH_GREATER:
    498 			if(complementaryDepthBuffer)
    499 			{
    500 				zTest = CmpNLE(zValue, Z);
    501 			}
    502 			else
    503 			{
    504 				zTest = CmpLT(zValue, Z);
    505 			}
    506 			break;
    507 		default:
    508 			ASSERT(false);
    509 		}
    510 
    511 		switch(state.depthCompareMode)
    512 		{
    513 		case DEPTH_ALWAYS:
    514 			zMask = cMask;
    515 			break;
    516 		case DEPTH_NEVER:
    517 			zMask = 0x0;
    518 			break;
    519 		default:
    520 			zMask = SignMask(zTest) & cMask;
    521 			break;
    522 		}
    523 
    524 		if(state.stencilActive)
    525 		{
    526 			zMask &= sMask;
    527 		}
    528 
    529 		return zMask != 0;
    530 	}
    531 
    532 	void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
    533 	{
    534 		Short4 cmp;
    535 		Short4 equal;
    536 
    537 		switch(state.alphaCompareMode)
    538 		{
    539 		case ALPHA_ALWAYS:
    540 			aMask = 0xF;
    541 			break;
    542 		case ALPHA_NEVER:
    543 			aMask = 0x0;
    544 			break;
    545 		case ALPHA_EQUAL:
    546 			cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
    547 			aMask = SignMask(Pack(cmp, Short4(0x0000)));
    548 			break;
    549 		case ALPHA_NOTEQUAL:       // a != b ~ !(a == b)
    550 			cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
    551 			aMask = SignMask(Pack(cmp, Short4(0x0000)));
    552 			break;
    553 		case ALPHA_LESS:           // a < b ~ b > a
    554 			cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
    555 			aMask = SignMask(Pack(cmp, Short4(0x0000)));
    556 			break;
    557 		case ALPHA_GREATEREQUAL:   // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
    558 			equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
    559 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
    560 			cmp |= equal;
    561 			aMask = SignMask(Pack(cmp, Short4(0x0000)));
    562 			break;
    563 		case ALPHA_LESSEQUAL:      // a <= b ~ !(a > b)
    564 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
    565 			aMask = SignMask(Pack(cmp, Short4(0x0000)));
    566 			break;
    567 		case ALPHA_GREATER:        // a > b
    568 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
    569 			aMask = SignMask(Pack(cmp, Short4(0x0000)));
    570 			break;
    571 		default:
    572 			ASSERT(false);
    573 		}
    574 	}
    575 
    576 	void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
    577 	{
    578 		Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
    579 		Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
    580 		Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
    581 		Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
    582 
    583 		Int aMask0 = SignMask(coverage0);
    584 		Int aMask1 = SignMask(coverage1);
    585 		Int aMask2 = SignMask(coverage2);
    586 		Int aMask3 = SignMask(coverage3);
    587 
    588 		cMask[0] &= aMask0;
    589 		cMask[1] &= aMask1;
    590 		cMask[2] &= aMask2;
    591 		cMask[3] &= aMask3;
    592 	}
    593 
    594 	void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog)
    595 	{
    596 		if(!state.fogActive)
    597 		{
    598 			return;
    599 		}
    600 
    601 		if(state.pixelFogMode != FOG_NONE)
    602 		{
    603 			pixelFog(fog);
    604 
    605 			fog = Min(fog, Float4(1.0f));
    606 			fog = Max(fog, Float4(0.0f));
    607 		}
    608 
    609 		c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
    610 		c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
    611 		c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
    612 
    613 		c0.x *= fog;
    614 		c0.y *= fog;
    615 		c0.z *= fog;
    616 
    617 		c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
    618 		c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
    619 		c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
    620 	}
    621 
    622 	void PixelRoutine::pixelFog(Float4 &visibility)
    623 	{
    624 		Float4 &zw = visibility;
    625 
    626 		if(state.pixelFogMode != FOG_NONE)
    627 		{
    628 			if(state.wBasedFog)
    629 			{
    630 				zw = rhw;
    631 			}
    632 			else
    633 			{
    634 				if(complementaryDepthBuffer)
    635 				{
    636 					zw = Float4(1.0f) - z[0];
    637 				}
    638 				else
    639 				{
    640 					zw = z[0];
    641 				}
    642 			}
    643 		}
    644 
    645 		switch(state.pixelFogMode)
    646 		{
    647 		case FOG_NONE:
    648 			break;
    649 		case FOG_LINEAR:
    650 			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale));
    651 			zw += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
    652 			break;
    653 		case FOG_EXP:
    654 			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE));
    655 			zw = exponential2(zw, true);
    656 			break;
    657 		case FOG_EXP2:
    658 			zw *= zw;
    659 			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E));
    660 			zw = exponential2(zw, true);
    661 			break;
    662 		default:
    663 			ASSERT(false);
    664 		}
    665 	}
    666 
    667 	void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
    668 	{
    669 		if(!state.depthWriteEnable)
    670 		{
    671 			return;
    672 		}
    673 
    674 		Float4 Z = z;
    675 
    676 		if(shader && shader->depthOverride())
    677 		{
    678 			if(complementaryDepthBuffer)
    679 			{
    680 				Z = Float4(1.0f) - oDepth;
    681 			}
    682 			else
    683 			{
    684 				Z = oDepth;
    685 			}
    686 		}
    687 
    688 		Pointer<Byte> buffer;
    689 		Int pitch;
    690 
    691 		if(!state.quadLayoutDepthBuffer)
    692 		{
    693 			buffer = zBuffer + 4 * x;
    694 			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
    695 		}
    696 		else
    697 		{
    698 			buffer = zBuffer + 8 * x;
    699 		}
    700 
    701 		if(q > 0)
    702 		{
    703 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
    704 		}
    705 
    706 		Float4 zValue;
    707 
    708 		if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
    709 		{
    710 			if(!state.quadLayoutDepthBuffer)
    711 			{
    712 				// FIXME: Properly optimizes?
    713 				zValue.xy = *Pointer<Float4>(buffer);
    714 				zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
    715 			}
    716 			else
    717 			{
    718 				zValue = *Pointer<Float4>(buffer, 16);
    719 			}
    720 		}
    721 
    722 		Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
    723 		zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
    724 		Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
    725 
    726 		if(!state.quadLayoutDepthBuffer)
    727 		{
    728 			// FIXME: Properly optimizes?
    729 			*Pointer<Float2>(buffer) = Float2(Z.xy);
    730 			*Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
    731 		}
    732 		else
    733 		{
    734 			*Pointer<Float4>(buffer, 16) = Z;
    735 		}
    736 	}
    737 
    738 	void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
    739 	{
    740 		if(!state.stencilActive)
    741 		{
    742 			return;
    743 		}
    744 
    745 		if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
    746 		{
    747 			if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
    748 			{
    749 				return;
    750 			}
    751 		}
    752 
    753 		if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
    754 		{
    755 			return;
    756 		}
    757 
    758 		Pointer<Byte> buffer = sBuffer + 2 * x;
    759 
    760 		if(q > 0)
    761 		{
    762 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
    763 		}
    764 
    765 		Byte8 bufferValue = *Pointer<Byte8>(buffer);
    766 
    767 		Byte8 newValue;
    768 		stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
    769 
    770 		if(!state.noStencilWriteMask)
    771 		{
    772 			Byte8 maskedValue = bufferValue;
    773 			newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
    774 			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
    775 			newValue |= maskedValue;
    776 		}
    777 
    778 		if(state.twoSidedStencil)
    779 		{
    780 			Byte8 newValueCCW;
    781 
    782 			stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
    783 
    784 			if(!state.noStencilWriteMaskCCW)
    785 			{
    786 				Byte8 maskedValue = bufferValue;
    787 				newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
    788 				maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
    789 				newValueCCW |= maskedValue;
    790 			}
    791 
    792 			newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
    793 			newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
    794 			newValue |= newValueCCW;
    795 		}
    796 
    797 		newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
    798 		bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
    799 		newValue |= bufferValue;
    800 
    801 		*Pointer<Byte4>(buffer) = Byte4(newValue);
    802 	}
    803 
    804 	void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
    805 	{
    806 		Byte8 &pass = newValue;
    807 		Byte8 fail;
    808 		Byte8 zFail;
    809 
    810 		stencilOperation(pass, bufferValue, stencilPassOperation, CCW);
    811 
    812 		if(stencilZFailOperation != stencilPassOperation)
    813 		{
    814 			stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);
    815 		}
    816 
    817 		if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
    818 		{
    819 			stencilOperation(fail, bufferValue, stencilFailOperation, CCW);
    820 		}
    821 
    822 		if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
    823 		{
    824 			if(state.depthTestActive && stencilZFailOperation != stencilPassOperation)   // zMask valid and values not the same
    825 			{
    826 				pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
    827 				zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
    828 				pass |= zFail;
    829 			}
    830 
    831 			pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
    832 			fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
    833 			pass |= fail;
    834 		}
    835 	}
    836 
    837 	void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
    838 	{
    839 		switch(operation)
    840 		{
    841 		case OPERATION_KEEP:
    842 			output = bufferValue;
    843 			break;
    844 		case OPERATION_ZERO:
    845 			output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
    846 			break;
    847 		case OPERATION_REPLACE:
    848 			output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));
    849 			break;
    850 		case OPERATION_INCRSAT:
    851 			output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
    852 			break;
    853 		case OPERATION_DECRSAT:
    854 			output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
    855 			break;
    856 		case OPERATION_INVERT:
    857 			output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    858 			break;
    859 		case OPERATION_INCR:
    860 			output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
    861 			break;
    862 		case OPERATION_DECR:
    863 			output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
    864 			break;
    865 		default:
    866 			ASSERT(false);
    867 		}
    868 	}
    869 
    870 	void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
    871 	{
    872 		switch(blendFactorActive)
    873 		{
    874 		case BLEND_ZERO:
    875 			// Optimized
    876 			break;
    877 		case BLEND_ONE:
    878 			// Optimized
    879 			break;
    880 		case BLEND_SOURCE:
    881 			blendFactor.x = current.x;
    882 			blendFactor.y = current.y;
    883 			blendFactor.z = current.z;
    884 			break;
    885 		case BLEND_INVSOURCE:
    886 			blendFactor.x = Short4(0xFFFFu) - current.x;
    887 			blendFactor.y = Short4(0xFFFFu) - current.y;
    888 			blendFactor.z = Short4(0xFFFFu) - current.z;
    889 			break;
    890 		case BLEND_DEST:
    891 			blendFactor.x = pixel.x;
    892 			blendFactor.y = pixel.y;
    893 			blendFactor.z = pixel.z;
    894 			break;
    895 		case BLEND_INVDEST:
    896 			blendFactor.x = Short4(0xFFFFu) - pixel.x;
    897 			blendFactor.y = Short4(0xFFFFu) - pixel.y;
    898 			blendFactor.z = Short4(0xFFFFu) - pixel.z;
    899 			break;
    900 		case BLEND_SOURCEALPHA:
    901 			blendFactor.x = current.w;
    902 			blendFactor.y = current.w;
    903 			blendFactor.z = current.w;
    904 			break;
    905 		case BLEND_INVSOURCEALPHA:
    906 			blendFactor.x = Short4(0xFFFFu) - current.w;
    907 			blendFactor.y = Short4(0xFFFFu) - current.w;
    908 			blendFactor.z = Short4(0xFFFFu) - current.w;
    909 			break;
    910 		case BLEND_DESTALPHA:
    911 			blendFactor.x = pixel.w;
    912 			blendFactor.y = pixel.w;
    913 			blendFactor.z = pixel.w;
    914 			break;
    915 		case BLEND_INVDESTALPHA:
    916 			blendFactor.x = Short4(0xFFFFu) - pixel.w;
    917 			blendFactor.y = Short4(0xFFFFu) - pixel.w;
    918 			blendFactor.z = Short4(0xFFFFu) - pixel.w;
    919 			break;
    920 		case BLEND_SRCALPHASAT:
    921 			blendFactor.x = Short4(0xFFFFu) - pixel.w;
    922 			blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
    923 			blendFactor.y = blendFactor.x;
    924 			blendFactor.z = blendFactor.x;
    925 			break;
    926 		case BLEND_CONSTANT:
    927 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
    928 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
    929 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
    930 			break;
    931 		case BLEND_INVCONSTANT:
    932 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
    933 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
    934 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
    935 			break;
    936 		case BLEND_CONSTANTALPHA:
    937 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
    938 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
    939 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
    940 			break;
    941 		case BLEND_INVCONSTANTALPHA:
    942 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
    943 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
    944 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
    945 			break;
    946 		default:
    947 			ASSERT(false);
    948 		}
    949 	}
    950 
     951 	void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
     952 	{
         		// Selects the blend factor for the alpha channel and stores it in blendFactor.w.
         		// Only the .w component of 'blendFactor' is written; .x/.y/.z are left untouched.
         		//
         		//   blendFactor            - receives the factor in its .w component
         		//   current                - source (shader output) color; only .w is read
         		//   pixel                  - destination color; only .w is read
         		//   blendFactorAlphaActive - which factor to produce
         		//
         		// Factors are 16-bit values that alphaBlend() applies with an unsigned MulHigh;
         		// 0xFFFF presumably stands for 1.0 in that fixed-point scheme (TODO confirm).
     953 		switch(blendFactorAlphaActive)
     954 		{
     955 		case BLEND_ZERO:
     956 			// Optimized: no factor is written; alphaBlend() skips the multiplication for this factor
     957 			break;
     958 		case BLEND_ONE:
     959 			// Optimized: no factor is written; alphaBlend() skips the multiplication for this factor
     960 			break;
         		// For the alpha channel the "color" factors and the "alpha" factors read the same
         		// component, so BLEND_SOURCE/BLEND_SOURCEALPHA (and the DEST/INV variants) are identical.
     961 		case BLEND_SOURCE:
     962 			blendFactor.w = current.w;
     963 			break;
     964 		case BLEND_INVSOURCE:
     965 			blendFactor.w = Short4(0xFFFFu) - current.w;
     966 			break;
     967 		case BLEND_DEST:
     968 			blendFactor.w = pixel.w;
     969 			break;
     970 		case BLEND_INVDEST:
     971 			blendFactor.w = Short4(0xFFFFu) - pixel.w;
     972 			break;
     973 		case BLEND_SOURCEALPHA:
     974 			blendFactor.w = current.w;
     975 			break;
     976 		case BLEND_INVSOURCEALPHA:
     977 			blendFactor.w = Short4(0xFFFFu) - current.w;
     978 			break;
     979 		case BLEND_DESTALPHA:
     980 			blendFactor.w = pixel.w;
     981 			break;
     982 		case BLEND_INVDESTALPHA:
     983 			blendFactor.w = Short4(0xFFFFu) - pixel.w;
     984 			break;
     985 		case BLEND_SRCALPHASAT:
         			// Unlike the color channels (min of source alpha and inverse destination alpha
         			// in blendFactor()), the alpha channel uses the constant 0xFFFF here.
     986 			blendFactor.w = Short4(0xFFFFu);
     987 			break;
     988 		case BLEND_CONSTANT:
     989 		case BLEND_CONSTANTALPHA:
         			// Both constant variants read component [3] of the blend constant from the draw data.
     990 			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
     991 			break;
     992 		case BLEND_INVCONSTANT:
     993 		case BLEND_INVCONSTANTALPHA:
         			// Component [3] of the separately stored inverse blend constant.
     994 			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
     995 			break;
     996 		default:
         			// Unknown blend factor value.
     997 			ASSERT(false);
     998 		}
     999 	}
   1000 
    1001 	bool PixelRoutine::isSRGB(int index) const
    1002 	{
         		// Returns true when render target 'index' uses one of the two sRGB formats
         		// (FORMAT_SRGB8_A8 or FORMAT_SRGB8_X8). readPixel() uses this to decide
         		// whether to call sRGBtoLinear16_12_16 on the values it has read.
    1003 		return state.targetFormat[index] == FORMAT_SRGB8_A8 || state.targetFormat[index] == FORMAT_SRGB8_X8;
    1004 	}
   1005 
    1006 	void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
    1007 	{
         		// Loads the current contents of render target 'index' at column 'x' and expands
         		// them into one 16-bit-per-lane vector per channel (pixel.x/.y/.z/.w).
         		//
         		//   index   - render target index; selects state.targetFormat[] and colorPitchB[]
         		//   cBuffer - pointer to the start of the first row to read
         		//   x       - column of the first pixel; scaled by the format's bytes per pixel
         		//   pixel   - receives the unpacked channels
         		//
         		// Every implemented format reads from two rows: the one at cBuffer and the one
         		// colorPitchB[index] bytes after it. With two pixels per row this gives four
         		// pixels, one per lane of the Short4 channel vectors.
    1008 		Short4 c01;
    1009 		Short4 c23;
    1010 		Pointer<Byte> buffer;
    1011 		Pointer<Byte> buffer2;
    1012 
    1013 		switch(state.targetFormat[index])
    1014 		{
    1015 		case FORMAT_R5G6B5:
         			// One Int load per row covers two 16-bit pixels; both rows are packed into c01.
    1016 			buffer = cBuffer + 2 * x;
    1017 			buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
    1018 			c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
    1019 
         			// Isolate each bit field and shift it up to the top of its 16-bit lane:
         			// the upper 5 bits go to x, the middle 6 bits to y, the lower 5 bits to z.
    1020 			pixel.x = c01 & Short4(0xF800u);
    1021 			pixel.y = (c01 & Short4(0x07E0u)) << 5;
    1022 			pixel.z = (c01 & Short4(0x001Fu)) << 11;
         			// The format stores no alpha; use the maximum value.
    1023 			pixel.w = Short4(0xFFFFu);
    1024 			break;
    1025 		case FORMAT_A8R8G8B8:
         			// c01 / c23 each hold 8 bytes (two 4-byte pixels) from the first / second row.
    1026 			buffer = cBuffer + 4 * x;
    1027 			c01 = *Pointer<Short4>(buffer);
    1028 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
    1029 			c23 = *Pointer<Short4>(buffer);
         			// Byte-interleave the two rows and then the two halves, so that the bytes of
         			// each channel end up grouped together (presumably a 4x4 byte transpose built
         			// from UnpackLow/UnpackHigh - TODO confirm against Reactor's unpack semantics).
    1030 			pixel.z = c01;
    1031 			pixel.y = c01;
    1032 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
    1033 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
    1034 			pixel.x = pixel.z;
    1035 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
    1036 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
    1037 			pixel.y = pixel.z;
    1038 			pixel.w = pixel.x;
         			// Unpacking a register with itself presumably duplicates every byte into both
         			// halves of a 16-bit lane, widening 8-bit values to the full 16-bit range.
    1039 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
    1040 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
    1041 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
    1042 			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
    1043 			break;
    1044 		case FORMAT_A8B8G8R8:
    1045 		case FORMAT_SRGB8_A8:
         			// Same load and interleave as FORMAT_A8R8G8B8; only the final widening step
         			// differs, exchanging which intermediate feeds pixel.x and pixel.z.
    1046 			buffer = cBuffer + 4 * x;
    1047 			c01 = *Pointer<Short4>(buffer);
    1048 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
    1049 			c23 = *Pointer<Short4>(buffer);
    1050 			pixel.z = c01;
    1051 			pixel.y = c01;
    1052 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
    1053 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
    1054 			pixel.x = pixel.z;
    1055 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
    1056 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
    1057 			pixel.y = pixel.z;
         			// pixel.w keeps a copy of the old pixel.x, which is overwritten on the next line
         			// and is still needed for both the new pixel.z and pixel.w.
    1058 			pixel.w = pixel.x;
    1059 			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
    1060 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
    1061 			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
    1062 			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
    1063 			break;
    1064 		case FORMAT_A8:
         			// One Short load per row covers two 1-byte pixels; they are placed in
         			// lanes 0 and 1 of pixel.w and then widened by unpacking with itself.
    1065 			buffer = cBuffer + 1 * x;
    1066 			pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
    1067 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
    1068 			pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
    1069 			pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
         			// The format stores no color channels; report them as zero.
    1070 			pixel.x = Short4(0x0000);
    1071 			pixel.y = Short4(0x0000);
    1072 			pixel.z = Short4(0x0000);
    1073 			break;
    1074 		case FORMAT_X8R8G8B8:
         			// Same as FORMAT_A8R8G8B8 for x/y/z; the fourth byte is ignored and
         			// pixel.w is forced to the maximum value.
    1075 			buffer = cBuffer + 4 * x;
    1076 			c01 = *Pointer<Short4>(buffer);
    1077 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
    1078 			c23 = *Pointer<Short4>(buffer);
    1079 			pixel.z = c01;
    1080 			pixel.y = c01;
    1081 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
    1082 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
    1083 			pixel.x = pixel.z;
    1084 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
    1085 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
    1086 			pixel.y = pixel.z;
    1087 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
    1088 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
    1089 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
    1090 			pixel.w = Short4(0xFFFFu);
    1091 			break;
    1092 		case FORMAT_X8B8G8R8:
    1093 		case FORMAT_SRGB8_X8:
         			// Same as FORMAT_A8B8G8R8 for x/y/z; pixel.w is only used as a temporary
         			// before being forced to the maximum value.
    1094 			buffer = cBuffer + 4 * x;
    1095 			c01 = *Pointer<Short4>(buffer);
    1096 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
    1097 			c23 = *Pointer<Short4>(buffer);
    1098 			pixel.z = c01;
    1099 			pixel.y = c01;
    1100 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
    1101 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
    1102 			pixel.x = pixel.z;
    1103 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
    1104 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
    1105 			pixel.y = pixel.z;
    1106 			pixel.w = pixel.x;
    1107 			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
    1108 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
    1109 			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
    1110 			pixel.w = Short4(0xFFFFu);
    1111 			break;
    1112 		case FORMAT_A8G8R8B8Q:
         			// Not implemented: 'pixel' is left unwritten for this format.
    1113 			UNIMPLEMENTED();
    1114 		//	pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
    1115 		//	pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
    1116 		//	pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
    1117 		//	pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
    1118 			break;
    1119 		case FORMAT_X8G8R8B8Q:
         			// Not implemented: 'pixel' is left unwritten for this format.
    1120 			UNIMPLEMENTED();
    1121 		//	pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
    1122 		//	pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
    1123 		//	pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
    1124 		//	pixel.w = Short4(0xFFFFu);
    1125 			break;
    1126 		case FORMAT_A16B16G16R16:
         			// 8 bytes per pixel: x/y receive the two pixels of the first row and z/w the
         			// two pixels of the second row. transpose4x4 then presumably regroups the
         			// lanes from per-pixel to per-channel order (TODO confirm).
    1127 			buffer = cBuffer;
    1128 			pixel.x = *Pointer<Short4>(buffer + 8 * x);
    1129 			pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
    1130 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
    1131 			pixel.z = *Pointer<Short4>(buffer + 8 * x);
    1132 			pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
    1133 			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
    1134 			break;
    1135 		case FORMAT_G16R16:
         			// 4 bytes per pixel: one Short4 per row holds two two-channel pixels.
         			// Two rounds of 16-bit interleaving separate the two channels into x and y.
    1136 			buffer = cBuffer;
    1137 			pixel.x = *Pointer<Short4>(buffer + 4 * x);
    1138 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
    1139 			pixel.y = *Pointer<Short4>(buffer + 4 * x);
    1140 			pixel.z = pixel.x;
    1141 			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
    1142 			pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
    1143 			pixel.y = pixel.z;
    1144 			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
    1145 			pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
         			// Channels the format does not store are set to the maximum value.
    1146 			pixel.z = Short4(0xFFFFu);
    1147 			pixel.w = Short4(0xFFFFu);
    1148 			break;
    1149 		default:
         			// Format not supported by this read path.
    1150 			ASSERT(false);
    1151 		}
    1152 
         		// Same condition writeColor() uses before calling linearToSRGB16_12_16, so a value
         		// read here and written back passes through both conversions. Judging by the
         		// name, this converts the stored sRGB values to linear (TODO confirm).
    1153 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
    1154 		{
    1155 			sRGBtoLinear16_12_16(pixel);
    1156 		}
    1157 	}
   1158 
    1159 	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
    1160 	{
         		// Blends the source color 'current' with the color already stored in render
         		// target 'index' and leaves the result in 'current'. Color (x/y/z) and alpha (w)
         		// are processed separately, each with its own pair of factors and its own
         		// blend operation. Does nothing when state.alphaBlendActive is false.
         		//
         		//   index   - render target index, forwarded to readPixel()
         		//   cBuffer - color buffer pointer, forwarded to readPixel()
         		//   current - in: source color; out: blended color
         		//   x       - column, forwarded to readPixel()
    1161 		if(!state.alphaBlendActive)
    1162 		{
    1163 			return;
    1164 		}
    1165 
         		// Destination color currently in the render target.
    1166 		Vector4s pixel;
    1167 		readPixel(index, cBuffer, x, pixel);
    1168 
    1169 		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
    1170 		Vector4s sourceFactor;
    1171 		Vector4s destFactor;
    1172 
         		// Both color factors are computed before 'current' or 'pixel' is scaled, so each
         		// factor is derived from the unmodified source and destination values.
    1173 		blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
    1174 		blendFactor(destFactor, current, pixel, state.destBlendFactor);
    1175 
         		// BLEND_ONE and BLEND_ZERO produce no factor (see the "Optimized" cases), so the
         		// multiplication is skipped for them. MulHigh on UShort4 presumably keeps the
         		// upper 16 bits of the product, i.e. a 16-bit fixed-point multiply (TODO confirm).
    1176 		if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
    1177 		{
    1178 			current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
    1179 			current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
    1180 			current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
    1181 		}
    1182 
    1183 		if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
    1184 		{
    1185 			pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
    1186 			pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
    1187 			pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
    1188 		}
    1189 
         		// Combine the scaled source and destination colors. All arithmetic treats the
         		// lanes as unsigned 16-bit values; add and subtract use the saturating forms.
    1190 		switch(state.blendOperation)
    1191 		{
    1192 		case BLENDOP_ADD:
    1193 			current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
    1194 			current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
    1195 			current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
    1196 			break;
    1197 		case BLENDOP_SUB:
         			// source - destination
    1198 			current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
    1199 			current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
    1200 			current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
    1201 			break;
    1202 		case BLENDOP_INVSUB:
         			// destination - source
    1203 			current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
    1204 			current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
    1205 			current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
    1206 			break;
    1207 		case BLENDOP_MIN:
    1208 			current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
    1209 			current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
    1210 			current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
    1211 			break;
    1212 		case BLENDOP_MAX:
    1213 			current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
    1214 			current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
    1215 			current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
    1216 			break;
    1217 		case BLENDOP_SOURCE:
    1218 			// No operation: the (possibly scaled) source color is the result
    1219 			break;
    1220 		case BLENDOP_DEST:
         			// The (possibly scaled) destination color is the result.
    1221 			current.x = pixel.x;
    1222 			current.y = pixel.y;
    1223 			current.z = pixel.z;
    1224 			break;
    1225 		case BLENDOP_NULL:
         			// The result is zero regardless of source and destination.
    1226 			current.x = Short4(0x0000);
    1227 			current.y = Short4(0x0000);
    1228 			current.z = Short4(0x0000);
    1229 			break;
    1230 		default:
    1231 			ASSERT(false);
    1232 		}
    1233 
         		// Alpha channel. current.x/y/z and pixel.x/y/z have been modified above, but
         		// blendFactorAlpha() only reads the .w components, which are still unmodified.
         		// The results are stored in the .w component of the same factor vectors.
    1234 		blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
    1235 		blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
    1236 
    1237 		if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
    1238 		{
    1239 			current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
    1240 		}
    1241 
    1242 		if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
    1243 		{
    1244 			pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
    1245 		}
    1246 
         		// Same set of operations as for color, applied to the w component only.
    1247 		switch(state.blendOperationAlpha)
    1248 		{
    1249 		case BLENDOP_ADD:
    1250 			current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
    1251 			break;
    1252 		case BLENDOP_SUB:
    1253 			current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
    1254 			break;
    1255 		case BLENDOP_INVSUB:
    1256 			current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
    1257 			break;
    1258 		case BLENDOP_MIN:
    1259 			current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
    1260 			break;
    1261 		case BLENDOP_MAX:
    1262 			current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
    1263 			break;
    1264 		case BLENDOP_SOURCE:
    1265 			// No operation: the (possibly scaled) source alpha is the result
    1266 			break;
    1267 		case BLENDOP_DEST:
    1268 			current.w = pixel.w;
    1269 			break;
    1270 		case BLENDOP_NULL:
    1271 			current.w = Short4(0x0000);
    1272 			break;
    1273 		default:
    1274 			ASSERT(false);
    1275 		}
    1276 	}
   1277 
    1278 	void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
    1279 	{
         		// Applies the bitwise logical operation selected by state.logicalOperation
         		// between the source color 'current' and the color already stored in render
         		// target 'index', leaving the result in 'current'.
         		// Only the x/y/z components are processed; current.w is never modified here.
         		//
         		//   index   - render target index, forwarded to readPixel()
         		//   cBuffer - color buffer pointer, forwarded to readPixel()
         		//   current - in: source color; out: result of the logical operation
         		//   x       - column, forwarded to readPixel()
         		//
         		// LOGICALOP_COPY leaves the source unchanged, so it returns before the
         		// destination is read.
    1280 		if(state.logicalOperation == LOGICALOP_COPY)
    1281 		{
    1282 			return;
    1283 		}
    1284 
         		// Destination color; read for every remaining operation, including those
         		// (CLEAR, SET, COPY_INVERTED) that do not use it.
    1285 		Vector4s pixel;
    1286 		readPixel(index, cBuffer, x, pixel);
    1287 
         		// In the cases below 'current' is the source (s) and 'pixel' the destination (d).
    1288 		switch(state.logicalOperation)
    1289 		{
    1290 		case LOGICALOP_CLEAR:
         			// 0
    1291 			current.x = UShort4(0);
    1292 			current.y = UShort4(0);
    1293 			current.z = UShort4(0);
    1294 			break;
    1295 		case LOGICALOP_SET:
         			// all bits set
    1296 			current.x = UShort4(0xFFFFu);
    1297 			current.y = UShort4(0xFFFFu);
    1298 			current.z = UShort4(0xFFFFu);
    1299 			break;
    1300 		case LOGICALOP_COPY:
    1301 			ASSERT(false);   // Optimized out
    1302 			break;
    1303 		case LOGICALOP_COPY_INVERTED:
         			// ~s
    1304 			current.x = ~current.x;
    1305 			current.y = ~current.y;
    1306 			current.z = ~current.z;
    1307 			break;
    1308 		case LOGICALOP_NOOP:
         			// d: the result is the existing destination value
    1309 			current.x = pixel.x;
    1310 			current.y = pixel.y;
    1311 			current.z = pixel.z;
    1312 			break;
    1313 		case LOGICALOP_INVERT:
         			// ~d
    1314 			current.x = ~pixel.x;
    1315 			current.y = ~pixel.y;
    1316 			current.z = ~pixel.z;
    1317 			break;
    1318 		case LOGICALOP_AND:
         			// d & s
    1319 			current.x = pixel.x & current.x;
    1320 			current.y = pixel.y & current.y;
    1321 			current.z = pixel.z & current.z;
    1322 			break;
    1323 		case LOGICALOP_NAND:
         			// ~(d & s)
    1324 			current.x = ~(pixel.x & current.x);
    1325 			current.y = ~(pixel.y & current.y);
    1326 			current.z = ~(pixel.z & current.z);
    1327 			break;
    1328 		case LOGICALOP_OR:
         			// d | s
    1329 			current.x = pixel.x | current.x;
    1330 			current.y = pixel.y | current.y;
    1331 			current.z = pixel.z | current.z;
    1332 			break;
    1333 		case LOGICALOP_NOR:
         			// ~(d | s)
    1334 			current.x = ~(pixel.x | current.x);
    1335 			current.y = ~(pixel.y | current.y);
    1336 			current.z = ~(pixel.z | current.z);
    1337 			break;
    1338 		case LOGICALOP_XOR:
         			// d ^ s
    1339 			current.x = pixel.x ^ current.x;
    1340 			current.y = pixel.y ^ current.y;
    1341 			current.z = pixel.z ^ current.z;
    1342 			break;
    1343 		case LOGICALOP_EQUIV:
         			// ~(d ^ s)
    1344 			current.x = ~(pixel.x ^ current.x);
    1345 			current.y = ~(pixel.y ^ current.y);
    1346 			current.z = ~(pixel.z ^ current.z);
    1347 			break;
    1348 		case LOGICALOP_AND_REVERSE:
         			// ~d & s: the destination is the inverted operand
    1349 			current.x = ~pixel.x & current.x;
    1350 			current.y = ~pixel.y & current.y;
    1351 			current.z = ~pixel.z & current.z;
    1352 			break;
    1353 		case LOGICALOP_AND_INVERTED:
         			// d & ~s: the source is the inverted operand
    1354 			current.x = pixel.x & ~current.x;
    1355 			current.y = pixel.y & ~current.y;
    1356 			current.z = pixel.z & ~current.z;
    1357 			break;
    1358 		case LOGICALOP_OR_REVERSE:
         			// ~d | s: the destination is the inverted operand
    1359 			current.x = ~pixel.x | current.x;
    1360 			current.y = ~pixel.y | current.y;
    1361 			current.z = ~pixel.z | current.z;
    1362 			break;
    1363 		case LOGICALOP_OR_INVERTED:
         			// d | ~s: the source is the inverted operand
    1364 			current.x = pixel.x | ~current.x;
    1365 			current.y = pixel.y | ~current.y;
    1366 			current.z = pixel.z | ~current.z;
    1367 			break;
    1368 		default:
         			// Unknown logical operation value.
    1369 			ASSERT(false);
    1370 		}
    1371 	}
   1372 
   1373 	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
   1374 	{
   1375 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
   1376 		{
   1377 			linearToSRGB16_12_16(current);
   1378 		}
   1379 
   1380 		if(exactColorRounding)
   1381 		{
   1382 			switch(state.targetFormat[index])
   1383 			{
   1384 			case FORMAT_R5G6B5:
   1385 				current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
   1386 				current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
   1387 				current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
   1388 				break;
   1389 			case FORMAT_X8G8R8B8Q:
   1390 			case FORMAT_A8G8R8B8Q:
   1391 			case FORMAT_X8R8G8B8:
   1392 			case FORMAT_X8B8G8R8:
   1393 			case FORMAT_A8R8G8B8:
   1394 			case FORMAT_A8B8G8R8:
   1395 			case FORMAT_SRGB8_X8:
   1396 			case FORMAT_SRGB8_A8:
   1397 			case FORMAT_G8R8:
   1398 			case FORMAT_R8:
   1399 				current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
   1400 				current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
   1401 				current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
   1402 				current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
   1403 				break;
   1404 			default:
   1405 				break;
   1406 			}
   1407 		}
   1408 
   1409 		int rgbaWriteMask = state.colorWriteActive(index);
   1410 		int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
   1411 
   1412 		switch(state.targetFormat[index])
   1413 		{
   1414 		case FORMAT_R5G6B5:
   1415 			{
   1416 				current.x = current.x & Short4(0xF800u);
   1417 				current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
   1418 				current.z = As<UShort4>(current.z) >> 11;
   1419 
   1420 				current.x = current.x | current.y | current.z;
   1421 			}
   1422 			break;
   1423 		case FORMAT_X8G8R8B8Q:
   1424 			UNIMPLEMENTED();
   1425 		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1426 		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1427 		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
   1428 
   1429 		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
   1430 		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
   1431 			break;
   1432 		case FORMAT_A8G8R8B8Q:
   1433 			UNIMPLEMENTED();
   1434 		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1435 		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1436 		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
   1437 		//	current.w = As<Short4>(As<UShort4>(current.w) >> 8);
   1438 
   1439 		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
   1440 		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
   1441 			break;
   1442 		case FORMAT_X8R8G8B8:
   1443 		case FORMAT_A8R8G8B8:
   1444 			if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
   1445 			{
   1446 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1447 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1448 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
   1449 
   1450 				current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
   1451 				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
   1452 
   1453 				current.x = current.z;
   1454 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
   1455 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
   1456 				current.y = current.z;
   1457 				current.z = As<Short4>(UnpackLow(current.z, current.x));
   1458 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
   1459 			}
   1460 			else
   1461 			{
   1462 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1463 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1464 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
   1465 				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
   1466 
   1467 				current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
   1468 				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
   1469 
   1470 				current.x = current.z;
   1471 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
   1472 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
   1473 				current.y = current.z;
   1474 				current.z = As<Short4>(UnpackLow(current.z, current.x));
   1475 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
   1476 			}
   1477 			break;
   1478 		case FORMAT_X8B8G8R8:
   1479 		case FORMAT_A8B8G8R8:
   1480 		case FORMAT_SRGB8_X8:
   1481 		case FORMAT_SRGB8_A8:
   1482 			if(state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8 || rgbaWriteMask == 0x7)
   1483 			{
   1484 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1485 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1486 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
   1487 
   1488 				current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
   1489 				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
   1490 
   1491 				current.x = current.z;
   1492 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
   1493 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
   1494 				current.y = current.z;
   1495 				current.z = As<Short4>(UnpackLow(current.z, current.x));
   1496 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
   1497 			}
   1498 			else
   1499 			{
   1500 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1501 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1502 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
   1503 				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
   1504 
   1505 				current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
   1506 				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
   1507 
   1508 				current.x = current.z;
   1509 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
   1510 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
   1511 				current.y = current.z;
   1512 				current.z = As<Short4>(UnpackLow(current.z, current.x));
   1513 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
   1514 			}
   1515 			break;
   1516 		case FORMAT_G8R8:
   1517 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1518 			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
   1519 			current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
   1520 			current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
   1521 			current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
   1522 			break;
   1523 		case FORMAT_R8:
   1524 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
   1525 			current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
   1526 			break;
   1527 		case FORMAT_A8:
   1528 			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
   1529 			current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w)));
   1530 			break;
   1531 		case FORMAT_G16R16:
   1532 			current.z = current.x;
   1533 			current.x = As<Short4>(UnpackLow(current.x, current.y));
   1534 			current.z = As<Short4>(UnpackHigh(current.z, current.y));
   1535 			current.y = current.z;
   1536 			break;
   1537 		case FORMAT_A16B16G16R16:
   1538 			transpose4x4(current.x, current.y, current.z, current.w);
   1539 			break;
   1540 		default:
   1541 			ASSERT(false);
   1542 		}
   1543 
   1544 		Short4 c01 = current.z;
   1545 		Short4 c23 = current.y;
   1546 
   1547 		Int xMask;   // Combination of all masks
   1548 
   1549 		if(state.depthTestActive)
   1550 		{
   1551 			xMask = zMask;
   1552 		}
   1553 		else
   1554 		{
   1555 			xMask = cMask;
   1556 		}
   1557 
   1558 		if(state.stencilActive)
   1559 		{
   1560 			xMask &= sMask;
   1561 		}
   1562 
   1563 		switch(state.targetFormat[index])
   1564 		{
   1565 		case FORMAT_R5G6B5:
   1566 			{
   1567 				Pointer<Byte> buffer = cBuffer + 2 * x;
   1568 				Int value = *Pointer<Int>(buffer);
   1569 
   1570 				Int c01 = Extract(As<Int2>(current.x), 0);
   1571 
   1572 				if((bgraWriteMask & 0x00000007) != 0x00000007)
   1573 				{
   1574 					Int masked = value;
   1575 					c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
   1576 					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
   1577 					c01 |= masked;
   1578 				}
   1579 
   1580 				c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
   1581 				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
   1582 				c01 |= value;
   1583 				*Pointer<Int>(buffer) = c01;
   1584 
   1585 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1586 				value = *Pointer<Int>(buffer);
   1587 
   1588 				Int c23 = Extract(As<Int2>(current.x), 1);
   1589 
   1590 				if((bgraWriteMask & 0x00000007) != 0x00000007)
   1591 				{
   1592 					Int masked = value;
   1593 					c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
   1594 					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
   1595 					c23 |= masked;
   1596 				}
   1597 
   1598 				c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
   1599 				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
   1600 				c23 |= value;
   1601 				*Pointer<Int>(buffer) = c23;
   1602 			}
   1603 			break;
   1604 		case FORMAT_A8G8R8B8Q:
   1605 		case FORMAT_X8G8R8B8Q:   // FIXME: Don't touch alpha?
   1606 			UNIMPLEMENTED();
   1607 		//	value = *Pointer<Short4>(cBuffer + 8 * x + 0);
   1608 
   1609 		//	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
   1610 		//	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
   1611 		//	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
   1612 		//	{
   1613 		//		Short4 masked = value;
   1614 		//		c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
   1615 		//		masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
   1616 		//		c01 |= masked;
   1617 		//	}
   1618 
   1619 		//	c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
   1620 		//	value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
   1621 		//	c01 |= value;
   1622 		//	*Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
   1623 
   1624 		//	value = *Pointer<Short4>(cBuffer + 8 * x + 8);
   1625 
   1626 		//	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
   1627 		//	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
   1628 		//	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
   1629 		//	{
   1630 		//		Short4 masked = value;
   1631 		//		c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
   1632 		//		masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
   1633 		//		c23 |= masked;
   1634 		//	}
   1635 
   1636 		//	c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
   1637 		//	value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
   1638 		//	c23 |= value;
   1639 		//	*Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
   1640 			break;
   1641 		case FORMAT_A8R8G8B8:
   1642 		case FORMAT_X8R8G8B8:   // FIXME: Don't touch alpha?
   1643 			{
   1644 				Pointer<Byte> buffer = cBuffer + x * 4;
   1645 				Short4 value = *Pointer<Short4>(buffer);
   1646 
   1647 				if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
   1648 				   ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
   1649 					(state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
   1650 				{
   1651 					Short4 masked = value;
   1652 					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
   1653 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
   1654 					c01 |= masked;
   1655 				}
   1656 
   1657 				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
   1658 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
   1659 				c01 |= value;
   1660 				*Pointer<Short4>(buffer) = c01;
   1661 
   1662 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1663 				value = *Pointer<Short4>(buffer);
   1664 
   1665 				if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
   1666 				   ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
   1667 					(state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
   1668 				{
   1669 					Short4 masked = value;
   1670 					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
   1671 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
   1672 					c23 |= masked;
   1673 				}
   1674 
   1675 				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
   1676 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
   1677 				c23 |= value;
   1678 				*Pointer<Short4>(buffer) = c23;
   1679 			}
   1680 			break;
   1681 		case FORMAT_A8B8G8R8:
   1682 		case FORMAT_X8B8G8R8:   // FIXME: Don't touch alpha?
   1683 		case FORMAT_SRGB8_X8:
   1684 		case FORMAT_SRGB8_A8:
   1685 			{
   1686 				Pointer<Byte> buffer = cBuffer + x * 4;
   1687 				Short4 value = *Pointer<Short4>(buffer);
   1688 
   1689 				bool masked = (((state.targetFormat[index] == FORMAT_A8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_A8) && rgbaWriteMask != 0x0000000F) ||
   1690 				              (((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x00000007) &&
   1691 				               ((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x0000000F))); // FIXME: Need for masking when XBGR && Fh?
   1692 
   1693 				if(masked)
   1694 				{
   1695 					Short4 masked = value;
   1696 					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
   1697 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
   1698 					c01 |= masked;
   1699 				}
   1700 
   1701 				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
   1702 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
   1703 				c01 |= value;
   1704 				*Pointer<Short4>(buffer) = c01;
   1705 
   1706 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1707 				value = *Pointer<Short4>(buffer);
   1708 
   1709 				if(masked)
   1710 				{
   1711 					Short4 masked = value;
   1712 					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
   1713 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
   1714 					c23 |= masked;
   1715 				}
   1716 
   1717 				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
   1718 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
   1719 				c23 |= value;
   1720 				*Pointer<Short4>(buffer) = c23;
   1721 			}
   1722 			break;
   1723 		case FORMAT_G8R8:
   1724 			if((rgbaWriteMask & 0x00000003) != 0x0)
   1725 			{
   1726 				Pointer<Byte> buffer = cBuffer + 2 * x;
   1727 				Int2 value;
   1728 				value = Insert(value, *Pointer<Int>(buffer), 0);
   1729 				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   1730 				value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
   1731 
   1732 				Int2 packedCol = As<Int2>(current.x);
   1733 
   1734 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
   1735 				if((rgbaWriteMask & 0x3) != 0x3)
   1736 				{
   1737 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
   1738 					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
   1739 					mergedMask &= rgbaMask;
   1740 				}
   1741 
   1742 				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
   1743 
   1744 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
   1745 				*Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
   1746 			}
   1747 			break;
   1748 		case FORMAT_R8:
   1749 			if(rgbaWriteMask & 0x00000001)
   1750 			{
   1751 				Pointer<Byte> buffer = cBuffer + 1 * x;
   1752 				Short4 value;
   1753 				value = Insert(value, *Pointer<Short>(buffer), 0);
   1754 				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   1755 				value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
   1756 				value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
   1757 
   1758 				current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
   1759 				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
   1760 				current.x |= value;
   1761 
   1762 				*Pointer<Short>(buffer) = Extract(current.x, 0);
   1763 				*Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
   1764 			}
   1765 			break;
   1766 		case FORMAT_A8:
   1767 			if(rgbaWriteMask & 0x00000008)
   1768 			{
   1769 				Pointer<Byte> buffer = cBuffer + 1 * x;
   1770 				Short4 value;
   1771 				value = Insert(value, *Pointer<Short>(buffer), 0);
   1772 				Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1773 				value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
   1774 				value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
   1775 
   1776 				current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
   1777 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
   1778 				current.w |= value;
   1779 
   1780 				*Pointer<Short>(buffer) = Extract(current.w, 0);
   1781 				*Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
   1782 			}
   1783 			break;
   1784 		case FORMAT_G16R16:
   1785 			{
   1786 				Pointer<Byte> buffer = cBuffer + 4 * x;
   1787 
   1788 				Short4 value = *Pointer<Short4>(buffer);
   1789 
   1790 				if((rgbaWriteMask & 0x00000003) != 0x00000003)
   1791 				{
   1792 					Short4 masked = value;
   1793 					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
   1794 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
   1795 					current.x |= masked;
   1796 				}
   1797 
   1798 				current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
   1799 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
   1800 				current.x |= value;
   1801 				*Pointer<Short4>(buffer) = current.x;
   1802 
   1803 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1804 
   1805 				value = *Pointer<Short4>(buffer);
   1806 
   1807 				if((rgbaWriteMask & 0x00000003) != 0x00000003)
   1808 				{
   1809 					Short4 masked = value;
   1810 					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
   1811 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
   1812 					current.y |= masked;
   1813 				}
   1814 
   1815 				current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
   1816 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
   1817 				current.y |= value;
   1818 				*Pointer<Short4>(buffer) = current.y;
   1819 			}
   1820 			break;
   1821 		case FORMAT_A16B16G16R16:
   1822 			{
   1823 				Pointer<Byte> buffer = cBuffer + 8 * x;
   1824 
   1825 				{
   1826 					Short4 value = *Pointer<Short4>(buffer);
   1827 
   1828 					if(rgbaWriteMask != 0x0000000F)
   1829 					{
   1830 						Short4 masked = value;
   1831 						current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
   1832 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
   1833 						current.x |= masked;
   1834 					}
   1835 
   1836 					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
   1837 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
   1838 					current.x |= value;
   1839 					*Pointer<Short4>(buffer) = current.x;
   1840 				}
   1841 
   1842 				{
   1843 					Short4 value = *Pointer<Short4>(buffer + 8);
   1844 
   1845 					if(rgbaWriteMask != 0x0000000F)
   1846 					{
   1847 						Short4 masked = value;
   1848 						current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
   1849 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
   1850 						current.y |= masked;
   1851 					}
   1852 
   1853 					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
   1854 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
   1855 					current.y |= value;
   1856 					*Pointer<Short4>(buffer + 8) = current.y;
   1857 				}
   1858 
   1859 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   1860 
   1861 				{
   1862 					Short4 value = *Pointer<Short4>(buffer);
   1863 
   1864 					if(rgbaWriteMask != 0x0000000F)
   1865 					{
   1866 						Short4 masked = value;
   1867 						current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
   1868 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
   1869 						current.z |= masked;
   1870 					}
   1871 
   1872 					current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
   1873 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
   1874 					current.z |= value;
   1875 					*Pointer<Short4>(buffer) = current.z;
   1876 				}
   1877 
   1878 				{
   1879 					Short4 value = *Pointer<Short4>(buffer + 8);
   1880 
   1881 					if(rgbaWriteMask != 0x0000000F)
   1882 					{
   1883 						Short4 masked = value;
   1884 						current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
   1885 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
   1886 						current.w |= masked;
   1887 					}
   1888 
   1889 					current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
   1890 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
   1891 					current.w |= value;
   1892 					*Pointer<Short4>(buffer + 8) = current.w;
   1893 				}
   1894 			}
   1895 			break;
   1896 		default:
   1897 			ASSERT(false);
   1898 		}
   1899 	}
   1900 
   1901 	// Emits the RGB blend factor selected by blendFactorActive into blendFactor.x/.y/.z, taken from the
   1902 	// source color oC, the destination color pixel, or the DrawData blend constants; .w is never written.
   1903 	void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
   1904 	{
   1905 		Vector4f &out = blendFactor;   // Short alias for the output vector
   1906 		switch(blendFactorActive)
   1907 		{
   1908 		case BLEND_ZERO:
   1909 		case BLEND_ONE:
   1910 			break;   // Optimized: nothing is written; alphaBlend() skips the multiply for these two factors
   1911 		case BLEND_SOURCE:   // Source RGB
   1912 			out.x = oC.x;
   1913 			out.y = oC.y;
   1914 			out.z = oC.z;
   1915 			break;
   1916 		case BLEND_INVSOURCE:   // 1 - source RGB
   1917 			out.x = Float4(1.0f) - oC.x;
   1918 			out.y = Float4(1.0f) - oC.y;
   1919 			out.z = Float4(1.0f) - oC.z;
   1920 			break;
   1921 		case BLEND_DEST:   // Destination RGB
   1922 			out.x = pixel.x;
   1923 			out.y = pixel.y;
   1924 			out.z = pixel.z;
   1925 			break;
   1926 		case BLEND_INVDEST:   // 1 - destination RGB
   1927 			out.x = Float4(1.0f) - pixel.x;
   1928 			out.y = Float4(1.0f) - pixel.y;
   1929 			out.z = Float4(1.0f) - pixel.z;
   1930 			break;
   1931 		case BLEND_SOURCEALPHA:   // Source alpha, replicated to all three channels
   1932 			out.x = oC.w;
   1933 			out.y = oC.w;
   1934 			out.z = oC.w;
   1935 			break;
   1936 		case BLEND_INVSOURCEALPHA:   // 1 - source alpha
   1937 			out.x = Float4(1.0f) - oC.w;
   1938 			out.y = Float4(1.0f) - oC.w;
   1939 			out.z = Float4(1.0f) - oC.w;
   1940 			break;
   1941 		case BLEND_DESTALPHA:   // Destination alpha, replicated to all three channels
   1942 			out.x = pixel.w;
   1943 			out.y = pixel.w;
   1944 			out.z = pixel.w;
   1945 			break;
   1946 		case BLEND_INVDESTALPHA:   // 1 - destination alpha
   1947 			out.x = Float4(1.0f) - pixel.w;
   1948 			out.y = Float4(1.0f) - pixel.w;
   1949 			out.z = Float4(1.0f) - pixel.w;
   1950 			break;
   1951 		case BLEND_SRCALPHASAT:   // min(source alpha, 1 - destination alpha)
   1952 			out.x = Float4(1.0f) - pixel.w;
   1953 			out.x = Min(out.x, oC.w);
   1954 			out.y = out.x;
   1955 			out.z = out.x;
   1956 			break;
   1957 		case BLEND_CONSTANT:   // factor.blendConstant4F[0..2] read from DrawData
   1958 			out.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
   1959 			out.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
   1960 			out.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
   1961 			break;
   1962 		case BLEND_INVCONSTANT:   // factor.invBlendConstant4F[0..2] read from DrawData
   1963 			out.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
   1964 			out.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
   1965 			out.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
   1966 			break;
   1967 		default:
   1968 			ASSERT(false);
   1969 		}
   1970 	}
   1971 
   1972 	// Emits the alpha blend factor selected by blendFactorAlphaActive into blendFactor.w only.
   1973 	// A color-named factor and its *ALPHA counterpart produce the same value here (for example
   1974 	// BLEND_SOURCE and BLEND_SOURCEALPHA both yield oC.w), so each such pair shares one case body.
   1975 	void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
   1976 	{
   1977 		Float4 &alpha = blendFactor.w;   // The only component written by this function
   1978 
   1979 		switch(blendFactorAlphaActive)
   1980 		{
   1981 		case BLEND_ZERO:
   1982 		case BLEND_ONE:
   1983 			// Optimized: nothing is written; alphaBlend() skips the multiply for these two factors
   1984 			break;
   1985 		// Source alpha
   1986 		case BLEND_SOURCE:
   1987 		case BLEND_SOURCEALPHA:
   1988 			alpha = oC.w;
   1989 			break;
   1990 		// One minus source alpha
   1991 		case BLEND_INVSOURCE:
   1992 		case BLEND_INVSOURCEALPHA:
   1993 			alpha = Float4(1.0f) - oC.w;
   1994 			break;
   1995 		// Destination alpha
   1996 		case BLEND_DEST:
   1997 		case BLEND_DESTALPHA:
   1998 			alpha = pixel.w;
   1999 			break;
   2000 		// One minus destination alpha
   2001 		case BLEND_INVDEST:
   2002 		case BLEND_INVDESTALPHA:
   2003 			alpha = Float4(1.0f) - pixel.w;
   2004 			break;
   2005 		case BLEND_SRCALPHASAT:
   2006 			alpha = Float4(1.0f);   // The saturate factor is a constant 1 for the alpha channel
   2007 			break;
   2008 		// Element [3] of the blend constant arrays stored in DrawData
   2009 		case BLEND_CONSTANT:
   2010 			alpha = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
   2011 			break;
   2012 		case BLEND_INVCONSTANT:
   2013 			alpha = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
   2014 			break;
   2015 		default:
   2016 			ASSERT(false);
   2017 		}
   2018 	}
   2019 
   2020 	// Emits code that blends the shader output oC (source) with the render target contents (destination)
   2021 	// for four pixels: columns x and x+1 on the row at cBuffer and on the row colorPitchB[index] bytes
   2022 	// after it. Only the 32-bit-per-channel formats in the switch below are handled. The blended result
   2023 	// is left in oC; nothing is stored to the buffer here. Does nothing if state.alphaBlendActive is false.
   2024 	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
   2025 	{
   2026 		if(!state.alphaBlendActive)
   2027 		{
   2028 			return;
   2029 		}
   2030 
   2031 		Pointer<Byte> buffer;
   2032 		Vector4f pixel;   // Destination (render target) color, one Float4 per channel after loading
   2033 
   2034 		Float4 one;   // Fill value for channels the target format does not store
   2035 		if(Surface::isFloatFormat(state.targetFormat[index]))
   2036 		{
   2037 			one = Float4(1.0f);
   2038 		}
   2039 		else if(Surface::isNonNormalizedInteger(state.targetFormat[index]))
   2040 		{
   2041 			one = As<Float4>(Surface::isUnsignedComponent(state.targetFormat[index], 0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));   // Integer bit pattern, reinterpreted
   2042 		}
   2043 
   2044 		switch(state.targetFormat[index])
   2045 		{
   2046 		case FORMAT_R32I:
   2047 		case FORMAT_R32UI:
   2048 		case FORMAT_R32F:
   2049 			buffer = cBuffer;
   2050 			// First row, pixels x and x+1 (FIXME: movlps)
   2051 			pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
   2052 			pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
   2053 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   2054 			// Next row, pixels x and x+1 (FIXME: movhps)
   2055 			pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
   2056 			pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
   2057 			pixel.y = pixel.z = pixel.w = one;   // Only .x is stored by these formats
   2058 			break;
   2059 		case FORMAT_G32R32I:
   2060 		case FORMAT_G32R32UI:
   2061 		case FORMAT_G32R32F:
   2062 			buffer = cBuffer;
   2063 			pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);   // First row: two pixels of two channels each
   2064 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   2065 			pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);   // Next row
   2066 			pixel.z = pixel.x;   // Keep a copy of the first row; pixel.x is overwritten on the next line
   2067 			pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);   // NOTE(review): 0x88 looks like a shufps-style selector for lanes 0 and 2 of each row - confirm
   2068 			pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);   // NOTE(review): 0xDD would then select lanes 1 and 3 of each row - confirm
   2069 			pixel.y = pixel.z;
   2070 			pixel.z = pixel.w = one;   // Only .x and .y are stored by these formats
   2071 			break;
   2072 		case FORMAT_X32B32G32R32F:
   2073 		case FORMAT_A32B32G32R32F:
   2074 		case FORMAT_A32B32G32R32I:
   2075 		case FORMAT_A32B32G32R32UI:
   2076 			buffer = cBuffer;
   2077 			pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);   // One whole pixel (16 bytes) per load
   2078 			pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
   2079 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   2080 			pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
   2081 			pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
   2082 			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);   // Per-pixel vectors -> per-channel vectors
   2083 			if(state.targetFormat[index] == FORMAT_X32B32G32R32F)
   2084 			{
   2085 				pixel.w = Float4(1.0f);   // The X channel is not treated as alpha; use 1 instead
   2086 			}
   2087 			break;
   2088 		default:
   2089 			ASSERT(false);
   2090 		}
   2091 
   2092 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))   // Destination RGB goes through sRGBtoLinear(); alpha is left as is
   2093 		{
   2094 			sRGBtoLinear(pixel.x);
   2095 			sRGBtoLinear(pixel.y);
   2096 			sRGBtoLinear(pixel.z);
   2097 		}
   2098 
   2099 		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
   2100 		Vector4f sourceFactor;
   2101 		Vector4f destFactor;
   2102 
   2103 		blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);   // Both factors are computed before oC or pixel are modified
   2104 		blendFactor(destFactor, oC, pixel, state.destBlendFactor);
   2105 
   2106 		if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)   // blendFactor() writes nothing for ONE/ZERO
   2107 		{
   2108 			oC.x *= sourceFactor.x;
   2109 			oC.y *= sourceFactor.y;
   2110 			oC.z *= sourceFactor.z;
   2111 		}
   2112 
   2113 		if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)   // blendFactor() writes nothing for ONE/ZERO
   2114 		{
   2115 			pixel.x *= destFactor.x;
   2116 			pixel.y *= destFactor.y;
   2117 			pixel.z *= destFactor.z;
   2118 		}
   2119 
   2120 		switch(state.blendOperation)   // Combine the weighted source (oC) and destination (pixel) RGB
   2121 		{
   2122 		case BLENDOP_ADD:
   2123 			oC.x += pixel.x;
   2124 			oC.y += pixel.y;
   2125 			oC.z += pixel.z;
   2126 			break;
   2127 		case BLENDOP_SUB:   // Source minus destination
   2128 			oC.x -= pixel.x;
   2129 			oC.y -= pixel.y;
   2130 			oC.z -= pixel.z;
   2131 			break;
   2132 		case BLENDOP_INVSUB:   // Destination minus source
   2133 			oC.x = pixel.x - oC.x;
   2134 			oC.y = pixel.y - oC.y;
   2135 			oC.z = pixel.z - oC.z;
   2136 			break;
   2137 		case BLENDOP_MIN:
   2138 			oC.x = Min(oC.x, pixel.x);
   2139 			oC.y = Min(oC.y, pixel.y);
   2140 			oC.z = Min(oC.z, pixel.z);
   2141 			break;
   2142 		case BLENDOP_MAX:
   2143 			oC.x = Max(oC.x, pixel.x);
   2144 			oC.y = Max(oC.y, pixel.y);
   2145 			oC.z = Max(oC.z, pixel.z);
   2146 			break;
   2147 		case BLENDOP_SOURCE:
   2148 			// No operation: oC already holds the source term
   2149 			break;
   2150 		case BLENDOP_DEST:   // Result is the destination term only
   2151 			oC.x = pixel.x;
   2152 			oC.y = pixel.y;
   2153 			oC.z = pixel.z;
   2154 			break;
   2155 		case BLENDOP_NULL:   // Result is zero
   2156 			oC.x = Float4(0.0f);
   2157 			oC.y = Float4(0.0f);
   2158 			oC.z = Float4(0.0f);
   2159 			break;
   2160 		default:
   2161 			ASSERT(false);
   2162 		}
   2163 
   2164 		blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);   // Alpha has its own factors and operation; only .w is used from here on
   2165 		blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);
   2166 
   2167 		if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
   2168 		{
   2169 			oC.w *= sourceFactor.w;
   2170 		}
   2171 
   2172 		if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
   2173 		{
   2174 			pixel.w *= destFactor.w;
   2175 		}
   2176 
   2177 		switch(state.blendOperationAlpha)   // Same operations as the RGB switch above, applied to .w
   2178 		{
   2179 		case BLENDOP_ADD:
   2180 			oC.w += pixel.w;
   2181 			break;
   2182 		case BLENDOP_SUB:
   2183 			oC.w -= pixel.w;
   2184 			break;
   2185 		case BLENDOP_INVSUB:   // Destination minus source
   2186 			pixel.w -= oC.w;
   2187 			oC.w = pixel.w;
   2188 			break;
   2189 		case BLENDOP_MIN:
   2190 			oC.w = Min(oC.w, pixel.w);
   2191 			break;
   2192 		case BLENDOP_MAX:
   2193 			oC.w = Max(oC.w, pixel.w);
   2194 			break;
   2195 		case BLENDOP_SOURCE:
   2196 			// No operation: oC.w already holds the source term
   2197 			break;
   2198 		case BLENDOP_DEST:
   2199 			oC.w = pixel.w;
   2200 			break;
   2201 		case BLENDOP_NULL:
   2202 			oC.w = Float4(0.0f);
   2203 			break;
   2204 		default:
   2205 			ASSERT(false);
   2206 		}
   2207 	}
   2208 
   2209 	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
   2210 	{
   2211 		switch(state.targetFormat[index])
   2212 		{
   2213 		case FORMAT_R32F:
   2214 		case FORMAT_R32I:
   2215 		case FORMAT_R32UI:
   2216 		case FORMAT_R16I:
   2217 		case FORMAT_R16UI:
   2218 		case FORMAT_R8I:
   2219 		case FORMAT_R8UI:
   2220 			break;
   2221 		case FORMAT_G32R32F:
   2222 		case FORMAT_G32R32I:
   2223 		case FORMAT_G32R32UI:
   2224 		case FORMAT_G16R16I:
   2225 		case FORMAT_G16R16UI:
   2226 		case FORMAT_G8R8I:
   2227 		case FORMAT_G8R8UI:
   2228 			oC.z = oC.x;
   2229 			oC.x = UnpackLow(oC.x, oC.y);
   2230 			oC.z = UnpackHigh(oC.z, oC.y);
   2231 			oC.y = oC.z;
   2232 			break;
   2233 		case FORMAT_X32B32G32R32F:
   2234 		case FORMAT_A32B32G32R32F:
   2235 		case FORMAT_A32B32G32R32I:
   2236 		case FORMAT_A32B32G32R32UI:
   2237 		case FORMAT_A16B16G16R16I:
   2238 		case FORMAT_A16B16G16R16UI:
   2239 		case FORMAT_A8B8G8R8I:
   2240 		case FORMAT_A8B8G8R8UI:
   2241 			transpose4x4(oC.x, oC.y, oC.z, oC.w);
   2242 			break;
   2243 		default:
   2244 			ASSERT(false);
   2245 		}
   2246 
   2247 		int rgbaWriteMask = state.colorWriteActive(index);
   2248 
   2249 		Int xMask;   // Combination of all masks
   2250 
   2251 		if(state.depthTestActive)
   2252 		{
   2253 			xMask = zMask;
   2254 		}
   2255 		else
   2256 		{
   2257 			xMask = cMask;
   2258 		}
   2259 
   2260 		if(state.stencilActive)
   2261 		{
   2262 			xMask &= sMask;
   2263 		}
   2264 
   2265 		Pointer<Byte> buffer;
   2266 		Float4 value;
   2267 
   2268 		switch(state.targetFormat[index])
   2269 		{
   2270 		case FORMAT_R32F:
   2271 		case FORMAT_R32I:
   2272 		case FORMAT_R32UI:
   2273 			if(rgbaWriteMask & 0x00000001)
   2274 			{
   2275 				buffer = cBuffer + 4 * x;
   2276 
   2277 				// FIXME: movlps
   2278 				value.x = *Pointer<Float>(buffer + 0);
   2279 				value.y = *Pointer<Float>(buffer + 4);
   2280 
   2281 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   2282 
   2283 				// FIXME: movhps
   2284 				value.z = *Pointer<Float>(buffer + 0);
   2285 				value.w = *Pointer<Float>(buffer + 4);
   2286 
   2287 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
   2288 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
   2289 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
   2290 
   2291 				// FIXME: movhps
   2292 				*Pointer<Float>(buffer + 0) = oC.x.z;
   2293 				*Pointer<Float>(buffer + 4) = oC.x.w;
   2294 
   2295 				buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   2296 
   2297 				// FIXME: movlps
   2298 				*Pointer<Float>(buffer + 0) = oC.x.x;
   2299 				*Pointer<Float>(buffer + 4) = oC.x.y;
   2300 			}
   2301 			break;
   2302 		case FORMAT_R16I:
   2303 		case FORMAT_R16UI:
   2304 			if(rgbaWriteMask & 0x00000001)
   2305 			{
   2306 				buffer = cBuffer + 2 * x;
   2307 
   2308 				UShort4 xyzw;
   2309 				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
   2310 
   2311 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2312 
   2313 				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
   2314 				value = As<Float4>(Int4(xyzw));
   2315 
   2316 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
   2317 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
   2318 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
   2319 
   2320 				if(state.targetFormat[index] == FORMAT_R16I)
   2321 				{
   2322 					Float component = oC.x.z;
   2323 					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
   2324 					component = oC.x.w;
   2325 					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
   2326 
   2327 					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2328 
   2329 					component = oC.x.x;
   2330 					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
   2331 					component = oC.x.y;
   2332 					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
   2333 				}
   2334 				else // FORMAT_R16UI
   2335 				{
   2336 					Float component = oC.x.z;
   2337 					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
   2338 					component = oC.x.w;
   2339 					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
   2340 
   2341 					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2342 
   2343 					component = oC.x.x;
   2344 					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
   2345 					component = oC.x.y;
   2346 					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
   2347 				}
   2348 			}
   2349 			break;
   2350 		case FORMAT_R8I:
   2351 		case FORMAT_R8UI:
   2352 			if(rgbaWriteMask & 0x00000001)
   2353 			{
   2354 				buffer = cBuffer + x;
   2355 
   2356 				UInt xyzw, packedCol;
   2357 
   2358 				xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
   2359 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2360 				xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
   2361 
   2362 				Short4 tmpCol = Short4(As<Int4>(oC.x));
   2363 				if(state.targetFormat[index] == FORMAT_R8I)
   2364 				{
   2365 					tmpCol = As<Short4>(Pack(tmpCol, tmpCol));
   2366 				}
   2367 				else
   2368 				{
   2369 					tmpCol = As<Short4>(Pack(As<UShort4>(tmpCol), As<UShort4>(tmpCol)));
   2370 				}
   2371 				packedCol = Extract(As<Int2>(tmpCol), 0);
   2372 
   2373 				packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
   2374 				            (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
   2375 
   2376 				*Pointer<UShort>(buffer) = UShort(packedCol >> 16);
   2377 				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2378 				*Pointer<UShort>(buffer) = UShort(packedCol);
   2379 			}
   2380 			break;
   2381 		case FORMAT_G32R32F:
   2382 		case FORMAT_G32R32I:
   2383 		case FORMAT_G32R32UI:
   2384 			buffer = cBuffer + 8 * x;
   2385 
   2386 			value = *Pointer<Float4>(buffer);
   2387 
   2388 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
   2389 			{
   2390 				Float4 masked = value;
   2391 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
   2392 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
   2393 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
   2394 			}
   2395 
   2396 			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
   2397 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
   2398 			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
   2399 			*Pointer<Float4>(buffer) = oC.x;
   2400 
   2401 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   2402 
   2403 			value = *Pointer<Float4>(buffer);
   2404 
   2405 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
   2406 			{
   2407 				Float4 masked;
   2408 
   2409 				masked = value;
   2410 				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
   2411 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
   2412 				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
   2413 			}
   2414 
   2415 			oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
   2416 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
   2417 			oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
   2418 			*Pointer<Float4>(buffer) = oC.y;
   2419 			break;
   2420 		case FORMAT_G16R16I:
   2421 		case FORMAT_G16R16UI:
   2422 			if((rgbaWriteMask & 0x00000003) != 0x0)
   2423 			{
   2424 				buffer = cBuffer + 4 * x;
   2425 
   2426 				UInt2 rgbaMask;
   2427 				UShort4 packedCol = UShort4(As<Int4>(oC.x));
   2428 				UShort4 value = *Pointer<UShort4>(buffer);
   2429 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
   2430 				if((rgbaWriteMask & 0x3) != 0x3)
   2431 				{
   2432 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
   2433 					rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
   2434 					mergedMask &= rgbaMask;
   2435 				}
   2436 				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
   2437 
   2438 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2439 
   2440 				packedCol = UShort4(As<Int4>(oC.y));
   2441 				value = *Pointer<UShort4>(buffer);
   2442 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
   2443 				if((rgbaWriteMask & 0x3) != 0x3)
   2444 				{
   2445 					mergedMask &= rgbaMask;
   2446 				}
   2447 				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
   2448 			}
   2449 			break;
   2450 		case FORMAT_G8R8I:
   2451 		case FORMAT_G8R8UI:
   2452 			if((rgbaWriteMask & 0x00000003) != 0x0)
   2453 			{
   2454 				buffer = cBuffer + 2 * x;
   2455 
   2456 				Int2 xyzw, packedCol;
   2457 
   2458 				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
   2459 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2460 				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
   2461 
   2462 				if(state.targetFormat[index] == FORMAT_G8R8I)
   2463 				{
   2464 					packedCol = As<Int2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
   2465 				}
   2466 				else
   2467 				{
   2468 					packedCol = As<Int2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
   2469 				}
   2470 
   2471 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
   2472 				if((rgbaWriteMask & 0x3) != 0x3)
   2473 				{
   2474 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
   2475 					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
   2476 					mergedMask &= rgbaMask;
   2477 				}
   2478 
   2479 				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
   2480 
   2481 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
   2482 				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2483 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
   2484 			}
   2485 			break;
   2486 		case FORMAT_X32B32G32R32F:
   2487 		case FORMAT_A32B32G32R32F:
   2488 		case FORMAT_A32B32G32R32I:
   2489 		case FORMAT_A32B32G32R32UI:
   2490 			buffer = cBuffer + 16 * x;
   2491 
   2492 			{
   2493 				value = *Pointer<Float4>(buffer, 16);
   2494 
   2495 				if(rgbaWriteMask != 0x0000000F)
   2496 				{
   2497 					Float4 masked = value;
   2498 					oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
   2499 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
   2500 					oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
   2501 				}
   2502 
   2503 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
   2504 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
   2505 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
   2506 				*Pointer<Float4>(buffer, 16) = oC.x;
   2507 			}
   2508 
   2509 			{
   2510 				value = *Pointer<Float4>(buffer + 16, 16);
   2511 
   2512 				if(rgbaWriteMask != 0x0000000F)
   2513 				{
   2514 					Float4 masked = value;
   2515 					oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
   2516 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
   2517 					oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
   2518 				}
   2519 
   2520 				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
   2521 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
   2522 				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
   2523 				*Pointer<Float4>(buffer + 16, 16) = oC.y;
   2524 			}
   2525 
   2526 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
   2527 
   2528 			{
   2529 				value = *Pointer<Float4>(buffer, 16);
   2530 
   2531 				if(rgbaWriteMask != 0x0000000F)
   2532 				{
   2533 					Float4 masked = value;
   2534 					oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
   2535 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
   2536 					oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
   2537 				}
   2538 
   2539 				oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
   2540 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
   2541 				oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
   2542 				*Pointer<Float4>(buffer, 16) = oC.z;
   2543 			}
   2544 
   2545 			{
   2546 				value = *Pointer<Float4>(buffer + 16, 16);
   2547 
   2548 				if(rgbaWriteMask != 0x0000000F)
   2549 				{
   2550 					Float4 masked = value;
   2551 					oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
   2552 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
   2553 					oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
   2554 				}
   2555 
   2556 				oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
   2557 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
   2558 				oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
   2559 				*Pointer<Float4>(buffer + 16, 16) = oC.w;
   2560 			}
   2561 			break;
   2562 		case FORMAT_A16B16G16R16I:
   2563 		case FORMAT_A16B16G16R16UI:
   2564 			if((rgbaWriteMask & 0x0000000F) != 0x0)
   2565 			{
   2566 				buffer = cBuffer + 8 * x;
   2567 
   2568 				UInt4 rgbaMask;
   2569 				UShort8 value = *Pointer<UShort8>(buffer);
   2570 				UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
   2571 				UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
   2572 				if((rgbaWriteMask & 0xF) != 0xF)
   2573 				{
   2574 					UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
   2575 					rgbaMask = UInt4(tmpMask, tmpMask);
   2576 					mergedMask &= rgbaMask;
   2577 				}
   2578 				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
   2579 
   2580 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2581 
   2582 				value = *Pointer<UShort8>(buffer);
   2583 				packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
   2584 				mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
   2585 				if((rgbaWriteMask & 0xF) != 0xF)
   2586 				{
   2587 					mergedMask &= rgbaMask;
   2588 				}
   2589 				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
   2590 			}
   2591 			break;
   2592 		case FORMAT_A8B8G8R8I:
   2593 		case FORMAT_A8B8G8R8UI:
   2594 			if((rgbaWriteMask & 0x0000000F) != 0x0)
   2595 			{
   2596 				UInt2 value, packedCol, mergedMask;
   2597 
   2598 				buffer = cBuffer + 4 * x;
   2599 
   2600 				if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
   2601 				{
   2602 					packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
   2603 				}
   2604 				else
   2605 				{
   2606 					packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
   2607 				}
   2608 				value = *Pointer<UInt2>(buffer, 16);
   2609 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
   2610 				if(rgbaWriteMask != 0xF)
   2611 				{
   2612 					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
   2613 				}
   2614 				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
   2615 
   2616 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
   2617 
   2618 				if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
   2619 				{
   2620 					packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
   2621 				}
   2622 				else
   2623 				{
   2624 					packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w))));
   2625 				}
   2626 				value = *Pointer<UInt2>(buffer, 16);
   2627 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
   2628 				if(rgbaWriteMask != 0xF)
   2629 				{
   2630 					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
   2631 				}
   2632 				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
   2633 			}
   2634 			break;
   2635 		default:
   2636 			ASSERT(false);
   2637 		}
   2638 	}
   2639 
   2640 	UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
   2641 	{
   2642 		return UShort4(cf * Float4(0xFFFF), saturate);
   2643 	}
   2644 
   2645 	void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
   2646 	{
   2647 		c.x = As<UShort4>(c.x) >> 4;
   2648 		c.y = As<UShort4>(c.y) >> 4;
   2649 		c.z = As<UShort4>(c.z) >> 4;
   2650 
   2651 		sRGBtoLinear12_16(c);
   2652 	}
   2653 
   2654 	void PixelRoutine::sRGBtoLinear12_16(Vector4s &c)
   2655 	{
   2656 		Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
   2657 
   2658 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
   2659 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
   2660 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
   2661 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
   2662 
   2663 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
   2664 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
   2665 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
   2666 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
   2667 
   2668 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
   2669 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
   2670 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
   2671 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
   2672 	}
   2673 
   2674 	void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
   2675 	{
   2676 		c.x = As<UShort4>(c.x) >> 4;
   2677 		c.y = As<UShort4>(c.y) >> 4;
   2678 		c.z = As<UShort4>(c.z) >> 4;
   2679 
   2680 		linearToSRGB12_16(c);
   2681 	}
   2682 
   2683 	void PixelRoutine::linearToSRGB12_16(Vector4s &c)
   2684 	{
   2685 		Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
   2686 
   2687 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
   2688 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
   2689 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
   2690 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
   2691 
   2692 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
   2693 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
   2694 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
   2695 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
   2696 
   2697 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
   2698 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
   2699 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
   2700 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
   2701 	}
   2702 
   2703 	Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)   // Approximates x^2.2
   2704 	{
   2705 		Float4 linear = x * x;
   2706 		linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
   2707 
   2708 		return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
   2709 	}
   2710 
   2711 	bool PixelRoutine::colorUsed()
   2712 	{
   2713 		return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;
   2714 	}
   2715 }
   2716