Home | History | Annotate | Download | only in Shader
      1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //    http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #include "VertexRoutine.hpp"
     16 
     17 #include "VertexShader.hpp"
     18 #include "Constants.hpp"
     19 #include "Renderer/Vertex.hpp"
     20 #include "Renderer/Renderer.hpp"
     21 #include "Common/Half.hpp"
     22 #include "Common/Debug.hpp"
     23 
     24 namespace sw
     25 {
     26 	extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates
     27 	extern bool symmetricNormalizedDepth;   // [-1, 1] instead of [0, 1]
     28 
     29 	VertexRoutine::VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader)
     30 		: v(shader && shader->dynamicallyIndexedInput),
     31 		  o(shader && shader->dynamicallyIndexedOutput),
     32 		  state(state)
     33 	{
     34 	}
     35 
     36 	VertexRoutine::~VertexRoutine()
     37 	{
     38 	}
     39 
     40 	void VertexRoutine::generate()
     41 	{
     42 		const bool textureSampling = state.textureSampling;
     43 
     44 		Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
     45 		Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
     46 		Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
     47 
     48 		UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
     49 		UInt primitiveNumber = *Pointer<UInt>(task + OFFSET(VertexTask, primitiveStart));
     50 		UInt indexInPrimitive = 0;
     51 
     52 		constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
     53 
     54 		Do
     55 		{
     56 			UInt index = *Pointer<UInt>(batch);
     57 			UInt tagIndex = index & 0x0000003C;
     58 			UInt indexQ = !textureSampling ? UInt(index & 0xFFFFFFFC) : index;   // FIXME: TEXLDL hack to have independent LODs, hurts performance.
     59 
     60 			If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
     61 			{
     62 				*Pointer<UInt>(tagCache + tagIndex) = indexQ;
     63 
     64 				readInput(indexQ);
     65 				pipeline(indexQ);
     66 				postTransform();
     67 				computeClipFlags();
     68 
     69 				Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
     70 				writeCache(cacheLine0);
     71 			}
     72 
     73 			UInt cacheIndex = index & 0x0000003F;
     74 			Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
     75 			writeVertex(vertex, cacheLine);
     76 
     77 			if(state.transformFeedbackEnabled != 0)
     78 			{
     79 				transformFeedback(vertex, primitiveNumber, indexInPrimitive);
     80 
     81 				indexInPrimitive++;
     82 				If(indexInPrimitive == 3)
     83 				{
     84 					primitiveNumber++;
     85 					indexInPrimitive = 0;
     86 				}
     87 			}
     88 
     89 			vertex += sizeof(Vertex);
     90 			batch += sizeof(unsigned int);
     91 			vertexCount--;
     92 		}
     93 		Until(vertexCount == 0)
     94 
     95 		Return();
     96 	}
     97 
     98 	void VertexRoutine::readInput(UInt &index)
     99 	{
    100 		for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
    101 		{
    102 			Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,input) + sizeof(void*) * i);
    103 			UInt stride = *Pointer<UInt>(data + OFFSET(DrawData,stride) + sizeof(unsigned int) * i);
    104 
    105 			v[i] = readStream(input, stride, state.input[i], index);
    106 		}
    107 	}
    108 
    109 	void VertexRoutine::computeClipFlags()
    110 	{
    111 		int pos = state.positionRegister;
    112 
    113 		Int4 maxX = CmpLT(o[pos].w, o[pos].x);
    114 		Int4 maxY = CmpLT(o[pos].w, o[pos].y);
    115 		Int4 maxZ = CmpLT(o[pos].w, o[pos].z);
    116 		Int4 minX = CmpNLE(-o[pos].w, o[pos].x);
    117 		Int4 minY = CmpNLE(-o[pos].w, o[pos].y);
    118 		Int4 minZ = symmetricNormalizedDepth ? CmpNLE(-o[pos].w, o[pos].z) : CmpNLE(Float4(0.0f), o[pos].z);
    119 
    120 		clipFlags = *Pointer<Int>(constants + OFFSET(Constants,maxX) + SignMask(maxX) * 4);   // FIXME: Array indexing
    121 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxY) + SignMask(maxY) * 4);
    122 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxZ) + SignMask(maxZ) * 4);
    123 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minX) + SignMask(minX) * 4);
    124 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minY) + SignMask(minY) * 4);
    125 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minZ) + SignMask(minZ) * 4);
    126 
    127 		Int4 finiteX = CmpLE(Abs(o[pos].x), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
    128 		Int4 finiteY = CmpLE(Abs(o[pos].y), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
    129 		Int4 finiteZ = CmpLE(Abs(o[pos].z), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
    130 
    131 		Int4 finiteXYZ = finiteX & finiteY & finiteZ;
    132 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,fini) + SignMask(finiteXYZ) * 4);
    133 
    134 		if(state.preTransformed)
    135 		{
    136 			clipFlags &= 0xFBFBFBFB;   // Don't clip against far clip plane
    137 		}
    138 	}
    139 
    140 	Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
    141 	{
    142 		const bool textureSampling = state.textureSampling;
    143 
    144 		Vector4f v;
    145 
    146 		Pointer<Byte> source0 = buffer + index * stride;
    147 		Pointer<Byte> source1 = source0 + (!textureSampling ? stride : 0);
    148 		Pointer<Byte> source2 = source1 + (!textureSampling ? stride : 0);
    149 		Pointer<Byte> source3 = source2 + (!textureSampling ? stride : 0);
    150 
    151 		bool isNativeFloatAttrib = (stream.attribType == VertexShader::ATTRIBTYPE_FLOAT) || stream.normalized;
    152 
    153 		switch(stream.type)
    154 		{
    155 		case STREAMTYPE_FLOAT:
    156 			{
    157 				if(stream.count == 0)
    158 				{
    159 					// Null stream, all default components
    160 				}
    161 				else
    162 				{
    163 					if(stream.count == 1)
    164 					{
    165 						v.x.x = *Pointer<Float>(source0);
    166 						v.x.y = *Pointer<Float>(source1);
    167 						v.x.z = *Pointer<Float>(source2);
    168 						v.x.w = *Pointer<Float>(source3);
    169 					}
    170 					else
    171 					{
    172 						v.x = *Pointer<Float4>(source0);
    173 						v.y = *Pointer<Float4>(source1);
    174 						v.z = *Pointer<Float4>(source2);
    175 						v.w = *Pointer<Float4>(source3);
    176 
    177 						transpose4xN(v.x, v.y, v.z, v.w, stream.count);
    178 					}
    179 
    180 					switch(stream.attribType)
    181 					{
    182 					case VertexShader::ATTRIBTYPE_INT:
    183 						if(stream.count >= 1) v.x = As<Float4>(Int4(v.x));
    184 						if(stream.count >= 2) v.x = As<Float4>(Int4(v.y));
    185 						if(stream.count >= 3) v.x = As<Float4>(Int4(v.z));
    186 						if(stream.count >= 4) v.x = As<Float4>(Int4(v.w));
    187 						break;
    188 					case VertexShader::ATTRIBTYPE_UINT:
    189 						if(stream.count >= 1) v.x = As<Float4>(UInt4(v.x));
    190 						if(stream.count >= 2) v.x = As<Float4>(UInt4(v.y));
    191 						if(stream.count >= 3) v.x = As<Float4>(UInt4(v.z));
    192 						if(stream.count >= 4) v.x = As<Float4>(UInt4(v.w));
    193 						break;
    194 					default:
    195 						break;
    196 					}
    197 				}
    198 			}
    199 			break;
    200 		case STREAMTYPE_BYTE:
    201 			if(isNativeFloatAttrib) // Stream: UByte, Shader attrib: Float
    202 			{
    203 				v.x = Float4(*Pointer<Byte4>(source0));
    204 				v.y = Float4(*Pointer<Byte4>(source1));
    205 				v.z = Float4(*Pointer<Byte4>(source2));
    206 				v.w = Float4(*Pointer<Byte4>(source3));
    207 
    208 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
    209 
    210 				if(stream.normalized)
    211 				{
    212 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
    213 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
    214 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
    215 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
    216 				}
    217 			}
    218 			else // Stream: UByte, Shader attrib: Int / UInt
    219 			{
    220 				v.x = As<Float4>(Int4(*Pointer<Byte4>(source0)));
    221 				v.y = As<Float4>(Int4(*Pointer<Byte4>(source1)));
    222 				v.z = As<Float4>(Int4(*Pointer<Byte4>(source2)));
    223 				v.w = As<Float4>(Int4(*Pointer<Byte4>(source3)));
    224 
    225 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
    226 			}
    227 			break;
    228 		case STREAMTYPE_SBYTE:
    229 			if(isNativeFloatAttrib) // Stream: SByte, Shader attrib: Float
    230 			{
    231 				v.x = Float4(*Pointer<SByte4>(source0));
    232 				v.y = Float4(*Pointer<SByte4>(source1));
    233 				v.z = Float4(*Pointer<SByte4>(source2));
    234 				v.w = Float4(*Pointer<SByte4>(source3));
    235 
    236 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
    237 
    238 				if(stream.normalized)
    239 				{
    240 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
    241 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
    242 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
    243 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
    244 				}
    245 			}
    246 			else // Stream: SByte, Shader attrib: Int / UInt
    247 			{
    248 				v.x = As<Float4>(Int4(*Pointer<SByte4>(source0)));
    249 				v.y = As<Float4>(Int4(*Pointer<SByte4>(source1)));
    250 				v.z = As<Float4>(Int4(*Pointer<SByte4>(source2)));
    251 				v.w = As<Float4>(Int4(*Pointer<SByte4>(source3)));
    252 
    253 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
    254 			}
    255 			break;
    256 		case STREAMTYPE_COLOR:
    257 			{
    258 				v.x = Float4(*Pointer<Byte4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
    259 				v.y = Float4(*Pointer<Byte4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
    260 				v.z = Float4(*Pointer<Byte4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
    261 				v.w = Float4(*Pointer<Byte4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
    262 
    263 				transpose4x4(v.x, v.y, v.z, v.w);
    264 
    265 				// Swap red and blue
    266 				Float4 t = v.x;
    267 				v.x = v.z;
    268 				v.z = t;
    269 			}
    270 			break;
    271 		case STREAMTYPE_SHORT:
    272 			if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
    273 			{
    274 				v.x = Float4(*Pointer<Short4>(source0));
    275 				v.y = Float4(*Pointer<Short4>(source1));
    276 				v.z = Float4(*Pointer<Short4>(source2));
    277 				v.w = Float4(*Pointer<Short4>(source3));
    278 
    279 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
    280 
    281 				if(stream.normalized)
    282 				{
    283 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
    284 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
    285 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
    286 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
    287 				}
    288 			}
    289 			else // Stream: Short, Shader attrib: Int/UInt, no type conversion
    290 			{
    291 				v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
    292 				v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
    293 				v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
    294 				v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));
    295 
    296 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
    297 			}
    298 			break;
    299 		case STREAMTYPE_USHORT:
    300 			if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
    301 			{
    302 				v.x = Float4(*Pointer<UShort4>(source0));
    303 				v.y = Float4(*Pointer<UShort4>(source1));
    304 				v.z = Float4(*Pointer<UShort4>(source2));
    305 				v.w = Float4(*Pointer<UShort4>(source3));
    306 
    307 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
    308 
    309 				if(stream.normalized)
    310 				{
    311 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
    312 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
    313 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
    314 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
    315 				}
    316 			}
    317 			else // Stream: UShort, Shader attrib: Int/UInt, no type conversion
    318 			{
    319 				v.x = As<Float4>(Int4(*Pointer<UShort4>(source0)));
    320 				v.y = As<Float4>(Int4(*Pointer<UShort4>(source1)));
    321 				v.z = As<Float4>(Int4(*Pointer<UShort4>(source2)));
    322 				v.w = As<Float4>(Int4(*Pointer<UShort4>(source3)));
    323 
    324 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
    325 			}
    326 			break;
    327 		case STREAMTYPE_INT:
    328 			if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
    329 			{
    330 				v.x = Float4(*Pointer<Int4>(source0));
    331 				v.y = Float4(*Pointer<Int4>(source1));
    332 				v.z = Float4(*Pointer<Int4>(source2));
    333 				v.w = Float4(*Pointer<Int4>(source3));
    334 
    335 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
    336 
    337 				if(stream.normalized)
    338 				{
    339 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
    340 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
    341 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
    342 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
    343 				}
    344 			}
    345 			else // Stream: Int, Shader attrib: Int/UInt, no type conversion
    346 			{
    347 				v.x = *Pointer<Float4>(source0);
    348 				v.y = *Pointer<Float4>(source1);
    349 				v.z = *Pointer<Float4>(source2);
    350 				v.w = *Pointer<Float4>(source3);
    351 
    352 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
    353 			}
    354 			break;
    355 		case STREAMTYPE_UINT:
    356 			if(isNativeFloatAttrib) // Stream: UInt, Shader attrib: Float
    357 			{
    358 				v.x = Float4(*Pointer<UInt4>(source0));
    359 				v.y = Float4(*Pointer<UInt4>(source1));
    360 				v.z = Float4(*Pointer<UInt4>(source2));
    361 				v.w = Float4(*Pointer<UInt4>(source3));
    362 
    363 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
    364 
    365 				if(stream.normalized)
    366 				{
    367 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
    368 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
    369 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
    370 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
    371 				}
    372 			}
    373 			else // Stream: UInt, Shader attrib: Int/UInt, no type conversion
    374 			{
    375 				v.x = *Pointer<Float4>(source0);
    376 				v.y = *Pointer<Float4>(source1);
    377 				v.z = *Pointer<Float4>(source2);
    378 				v.w = *Pointer<Float4>(source3);
    379 
    380 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
    381 			}
    382 			break;
    383 		case STREAMTYPE_UDEC3:
    384 			{
    385 				// FIXME: Vectorize
    386 				{
    387 					Int x, y, z;
    388 
    389 					x = y = z = *Pointer<Int>(source0);
    390 
    391 					v.x.x = Float(x & 0x000003FF);
    392 					v.x.y = Float(y & 0x000FFC00);
    393 					v.x.z = Float(z & 0x3FF00000);
    394 				}
    395 
    396 				{
    397 					Int x, y, z;
    398 
    399 					x = y = z = *Pointer<Int>(source1);
    400 
    401 					v.y.x = Float(x & 0x000003FF);
    402 					v.y.y = Float(y & 0x000FFC00);
    403 					v.y.z = Float(z & 0x3FF00000);
    404 				}
    405 
    406 				{
    407 					Int x, y, z;
    408 
    409 					x = y = z = *Pointer<Int>(source2);
    410 
    411 					v.z.x = Float(x & 0x000003FF);
    412 					v.z.y = Float(y & 0x000FFC00);
    413 					v.z.z = Float(z & 0x3FF00000);
    414 				}
    415 
    416 				{
    417 					Int x, y, z;
    418 
    419 					x = y = z = *Pointer<Int>(source3);
    420 
    421 					v.w.x = Float(x & 0x000003FF);
    422 					v.w.y = Float(y & 0x000FFC00);
    423 					v.w.z = Float(z & 0x3FF00000);
    424 				}
    425 
    426 				transpose4x3(v.x, v.y, v.z, v.w);
    427 
    428 				v.y *= Float4(1.0f / 0x00000400);
    429 				v.z *= Float4(1.0f / 0x00100000);
    430 			}
    431 			break;
    432 		case STREAMTYPE_DEC3N:
    433 			{
    434 				// FIXME: Vectorize
    435 				{
    436 					Int x, y, z;
    437 
    438 					x = y = z = *Pointer<Int>(source0);
    439 
    440 					v.x.x = Float((x << 22) & 0xFFC00000);
    441 					v.x.y = Float((y << 12) & 0xFFC00000);
    442 					v.x.z = Float((z << 2)  & 0xFFC00000);
    443 				}
    444 
    445 				{
    446 					Int x, y, z;
    447 
    448 					x = y = z = *Pointer<Int>(source1);
    449 
    450 					v.y.x = Float((x << 22) & 0xFFC00000);
    451 					v.y.y = Float((y << 12) & 0xFFC00000);
    452 					v.y.z = Float((z << 2)  & 0xFFC00000);
    453 				}
    454 
    455 				{
    456 					Int x, y, z;
    457 
    458 					x = y = z = *Pointer<Int>(source2);
    459 
    460 					v.z.x = Float((x << 22) & 0xFFC00000);
    461 					v.z.y = Float((y << 12) & 0xFFC00000);
    462 					v.z.z = Float((z << 2)  & 0xFFC00000);
    463 				}
    464 
    465 				{
    466 					Int x, y, z;
    467 
    468 					x = y = z = *Pointer<Int>(source3);
    469 
    470 					v.w.x = Float((x << 22) & 0xFFC00000);
    471 					v.w.y = Float((y << 12) & 0xFFC00000);
    472 					v.w.z = Float((z << 2)  & 0xFFC00000);
    473 				}
    474 
    475 				transpose4x3(v.x, v.y, v.z, v.w);
    476 
    477 				v.x *= Float4(1.0f / 0x00400000 / 511.0f);
    478 				v.y *= Float4(1.0f / 0x00400000 / 511.0f);
    479 				v.z *= Float4(1.0f / 0x00400000 / 511.0f);
    480 			}
    481 			break;
    482 		case STREAMTYPE_FIXED:
    483 			{
    484 				v.x = Float4(*Pointer<Int4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
    485 				v.y = Float4(*Pointer<Int4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
    486 				v.z = Float4(*Pointer<Int4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
    487 				v.w = Float4(*Pointer<Int4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
    488 
    489 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
    490 			}
    491 			break;
    492 		case STREAMTYPE_HALF:
    493 			{
    494 				if(stream.count >= 1)
    495 				{
    496 					UShort x0 = *Pointer<UShort>(source0 + 0);
    497 					UShort x1 = *Pointer<UShort>(source1 + 0);
    498 					UShort x2 = *Pointer<UShort>(source2 + 0);
    499 					UShort x3 = *Pointer<UShort>(source3 + 0);
    500 
    501 					v.x.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x0) * 4);
    502 					v.x.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x1) * 4);
    503 					v.x.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x2) * 4);
    504 					v.x.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x3) * 4);
    505 				}
    506 
    507 				if(stream.count >= 2)
    508 				{
    509 					UShort y0 = *Pointer<UShort>(source0 + 2);
    510 					UShort y1 = *Pointer<UShort>(source1 + 2);
    511 					UShort y2 = *Pointer<UShort>(source2 + 2);
    512 					UShort y3 = *Pointer<UShort>(source3 + 2);
    513 
    514 					v.y.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y0) * 4);
    515 					v.y.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y1) * 4);
    516 					v.y.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y2) * 4);
    517 					v.y.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y3) * 4);
    518 				}
    519 
    520 				if(stream.count >= 3)
    521 				{
    522 					UShort z0 = *Pointer<UShort>(source0 + 4);
    523 					UShort z1 = *Pointer<UShort>(source1 + 4);
    524 					UShort z2 = *Pointer<UShort>(source2 + 4);
    525 					UShort z3 = *Pointer<UShort>(source3 + 4);
    526 
    527 					v.z.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z0) * 4);
    528 					v.z.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z1) * 4);
    529 					v.z.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z2) * 4);
    530 					v.z.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z3) * 4);
    531 				}
    532 
    533 				if(stream.count >= 4)
    534 				{
    535 					UShort w0 = *Pointer<UShort>(source0 + 6);
    536 					UShort w1 = *Pointer<UShort>(source1 + 6);
    537 					UShort w2 = *Pointer<UShort>(source2 + 6);
    538 					UShort w3 = *Pointer<UShort>(source3 + 6);
    539 
    540 					v.w.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w0) * 4);
    541 					v.w.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w1) * 4);
    542 					v.w.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w2) * 4);
    543 					v.w.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w3) * 4);
    544 				}
    545 			}
    546 			break;
    547 		case STREAMTYPE_INDICES:
    548 			{
    549 				v.x.x = *Pointer<Float>(source0);
    550 				v.x.y = *Pointer<Float>(source1);
    551 				v.x.z = *Pointer<Float>(source2);
    552 				v.x.w = *Pointer<Float>(source3);
    553 			}
    554 			break;
    555 		case STREAMTYPE_2_10_10_10_INT:
    556 			{
    557 				Int4 src;
    558 				src = Insert(src, *Pointer<Int>(source0), 0);
    559 				src = Insert(src, *Pointer<Int>(source1), 1);
    560 				src = Insert(src, *Pointer<Int>(source2), 2);
    561 				src = Insert(src, *Pointer<Int>(source3), 3);
    562 
    563 				v.x = Float4((src << 22) >> 22);
    564 				v.y = Float4((src << 12) >> 22);
    565 				v.z = Float4((src << 02) >> 22);
    566 				v.w = Float4(src >> 30);
    567 
    568 				if(stream.normalized)
    569 				{
    570 					v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
    571 					v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
    572 					v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
    573 					v.w = Max(v.w, Float4(-1.0f));
    574 				}
    575 			}
    576 			break;
    577 		case STREAMTYPE_2_10_10_10_UINT:
    578 			{
    579 				Int4 src;
    580 				src = Insert(src, *Pointer<Int>(source0), 0);
    581 				src = Insert(src, *Pointer<Int>(source1), 1);
    582 				src = Insert(src, *Pointer<Int>(source2), 2);
    583 				src = Insert(src, *Pointer<Int>(source3), 3);
    584 
    585 				v.x = Float4(src & Int4(0x3FF));
    586 				v.y = Float4((src >> 10) & Int4(0x3FF));
    587 				v.z = Float4((src >> 20) & Int4(0x3FF));
    588 				v.w = Float4((src >> 30) & Int4(0x3));
    589 
    590 				if(stream.normalized)
    591 				{
    592 					v.x *= Float4(1.0f / 0x3FF);
    593 					v.y *= Float4(1.0f / 0x3FF);
    594 					v.z *= Float4(1.0f / 0x3FF);
    595 					v.w *= Float4(1.0f / 0x3);
    596 				}
    597 			}
    598 			break;
    599 		default:
    600 			ASSERT(false);
    601 		}
    602 
    603 		if(stream.count < 1) v.x = Float4(0.0f);
    604 		if(stream.count < 2) v.y = Float4(0.0f);
    605 		if(stream.count < 3) v.z = Float4(0.0f);
    606 		if(stream.count < 4) v.w = isNativeFloatAttrib ? As<Float4>(Float4(1.0f)) : As<Float4>(Int4(0));
    607 
    608 		return v;
    609 	}
    610 
    611 	void VertexRoutine::postTransform()
    612 	{
    613 		int pos = state.positionRegister;
    614 
    615 		// Backtransform
    616 		if(state.preTransformed)
    617 		{
    618 			Float4 rhw = Float4(1.0f) / o[pos].w;
    619 
    620 			Float4 W = *Pointer<Float4>(data + OFFSET(DrawData,Wx16)) * Float4(1.0f / 16.0f);
    621 			Float4 H = *Pointer<Float4>(data + OFFSET(DrawData,Hx16)) * Float4(1.0f / 16.0f);
    622 			Float4 L = *Pointer<Float4>(data + OFFSET(DrawData,X0x16)) * Float4(1.0f / 16.0f);
    623 			Float4 T = *Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) * Float4(1.0f / 16.0f);
    624 
    625 			o[pos].x = (o[pos].x - L) / W * rhw;
    626 			o[pos].y = (o[pos].y - T) / H * rhw;
    627 			o[pos].z = o[pos].z * rhw;
    628 			o[pos].w = rhw;
    629 		}
    630 
    631 		if(!halfIntegerCoordinates && !state.preTransformed)
    632 		{
    633 			o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelX)) * o[pos].w;
    634 			o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelY)) * o[pos].w;
    635 		}
    636 
    637 		if(state.superSampling)
    638 		{
    639 			o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,XXXX)) * o[pos].w;
    640 			o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,YYYY)) * o[pos].w;
    641 		}
    642 	}
    643 
    644 	void VertexRoutine::writeCache(Pointer<Byte> &cacheLine)
    645 	{
    646 		Vector4f v;
    647 
    648 		for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
    649 		{
    650 			if(state.output[i].write)
    651 			{
    652 				v.x = o[i].x;
    653 				v.y = o[i].y;
    654 				v.z = o[i].z;
    655 				v.w = o[i].w;
    656 
    657 				if(state.output[i].xClamp)
    658 				{
    659 					v.x = Max(v.x, Float4(0.0f));
    660 					v.x = Min(v.x, Float4(1.0f));
    661 				}
    662 
    663 				if(state.output[i].yClamp)
    664 				{
    665 					v.y = Max(v.y, Float4(0.0f));
    666 					v.y = Min(v.y, Float4(1.0f));
    667 				}
    668 
    669 				if(state.output[i].zClamp)
    670 				{
    671 					v.z = Max(v.z, Float4(0.0f));
    672 					v.z = Min(v.z, Float4(1.0f));
    673 				}
    674 
    675 				if(state.output[i].wClamp)
    676 				{
    677 					v.w = Max(v.w, Float4(0.0f));
    678 					v.w = Min(v.w, Float4(1.0f));
    679 				}
    680 
    681 				if(state.output[i].write == 0x01)
    682 				{
    683 					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0) = v.x.x;
    684 					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1) = v.x.y;
    685 					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2) = v.x.z;
    686 					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3) = v.x.w;
    687 				}
    688 				else
    689 				{
    690 					if(state.output[i].write == 0x03)
    691 					{
    692 						transpose2x4(v.x, v.y, v.z, v.w);
    693 					}
    694 					else
    695 					{
    696 						transpose4x4(v.x, v.y, v.z, v.w);
    697 					}
    698 
    699 					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0, 16) = v.x;
    700 					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1, 16) = v.y;
    701 					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2, 16) = v.z;
    702 					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w;
    703 				}
    704 			}
    705 		}
    706 
    707 		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (clipFlags >> 0)  & 0x0000000FF;
    708 		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (clipFlags >> 8)  & 0x0000000FF;
    709 		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (clipFlags >> 16) & 0x0000000FF;
    710 		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (clipFlags >> 24) & 0x0000000FF;
    711 
    712 		// Viewport transform
    713 		int pos = state.positionRegister;
    714 
    715 		v.x = o[pos].x;
    716 		v.y = o[pos].y;
    717 		v.z = o[pos].z;
    718 		v.w = o[pos].w;
    719 
    720 		if(symmetricNormalizedDepth)
    721 		{
    722 			v.z = (v.z + v.w) * Float4(0.5f);   // [-1, 1] -> [0, 1]
    723 		}
    724 
    725 		Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
    726 		Float4 rhw = Float4(1.0f) / w;
    727 
    728 		v.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Wx16))));
    729 		v.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Hx16))));
    730 		v.z = v.z * rhw;
    731 		v.w = rhw;
    732 
    733 		transpose4x4(v.x, v.y, v.z, v.w);
    734 
    735 		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 0, 16) = v.x;
    736 		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 1, 16) = v.y;
    737 		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 2, 16) = v.z;
    738 		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 3, 16) = v.w;
    739 	}
    740 
    741 	void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cache)
    742 	{
    743 		for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
    744 		{
    745 			if(state.output[i].write)
    746 			{
    747 				*Pointer<Int4>(vertex + OFFSET(Vertex,v[i]), 16) = *Pointer<Int4>(cache + OFFSET(Vertex,v[i]), 16);
    748 			}
    749 		}
    750 
    751 		*Pointer<Int4>(vertex + OFFSET(Vertex,X)) = *Pointer<Int4>(cache + OFFSET(Vertex,X));
    752 		*Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cache + OFFSET(Vertex,clipFlags));
    753 	}
    754 
    755 	void VertexRoutine::transformFeedback(const Pointer<Byte> &vertex, const UInt &primitiveNumber, const UInt &indexInPrimitive)
    756 	{
    757 		If(indexInPrimitive < state.verticesPerPrimitive)
    758 		{
    759 			UInt tOffset = primitiveNumber * state.verticesPerPrimitive + indexInPrimitive;
    760 
    761 			for(int i = 0; i < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; i++)
    762 			{
    763 				if(state.transformFeedbackEnabled & (1ULL << i))
    764 				{
    765 					UInt reg = *Pointer<UInt>(data + OFFSET(DrawData, vs.reg[i]));
    766 					UInt row = *Pointer<UInt>(data + OFFSET(DrawData, vs.row[i]));
    767 					UInt col = *Pointer<UInt>(data + OFFSET(DrawData, vs.col[i]));
    768 					UInt str = *Pointer<UInt>(data + OFFSET(DrawData, vs.str[i]));
    769 
    770 					Pointer<Byte> t = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, vs.t[i])) + (tOffset * str * sizeof(float));
    771 					Pointer<Byte> v = vertex + OFFSET(Vertex, v) + reg * sizeof(float);
    772 
    773 					For(UInt r = 0, r < row, r++)
    774 					{
    775 						UInt rOffsetX = r * col * sizeof(float);
    776 						UInt rOffset4 = r * sizeof(float4);
    777 
    778 						For(UInt c = 0, c < col, c++)
    779 						{
    780 							UInt cOffset = c * sizeof(float);
    781 							*Pointer<Float>(t + rOffsetX + cOffset) = *Pointer<Float>(v + rOffset4 + cOffset);
    782 						}
    783 					}
    784 				}
    785 			}
    786 		}
    787 	}
    788 }
    789