Home | History | Annotate | Download | only in Shader
      1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //    http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #include "ShaderCore.hpp"
     16 
     17 #include "Renderer/Renderer.hpp"
     18 #include "Common/Debug.hpp"
     19 
     20 #include <limits.h>
     21 
     22 namespace sw
     23 {
     24 	extern TranscendentalPrecision logPrecision;
     25 	extern TranscendentalPrecision expPrecision;
     26 	extern TranscendentalPrecision rcpPrecision;
     27 	extern TranscendentalPrecision rsqPrecision;
     28 
     29 	Vector4s::Vector4s()
     30 	{
     31 	}
     32 
     33 	Vector4s::Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
     34 	{
     35 		this->x = Short4(x);
     36 		this->y = Short4(y);
     37 		this->z = Short4(z);
     38 		this->w = Short4(w);
     39 	}
     40 
     41 	Vector4s::Vector4s(const Vector4s &rhs)
     42 	{
     43 		x = rhs.x;
     44 		y = rhs.y;
     45 		z = rhs.z;
     46 		w = rhs.w;
     47 	}
     48 
     49 	Vector4s &Vector4s::operator=(const Vector4s &rhs)
     50 	{
     51 		x = rhs.x;
     52 		y = rhs.y;
     53 		z = rhs.z;
     54 		w = rhs.w;
     55 
     56 		return *this;
     57 	}
     58 
     59 	Short4 &Vector4s::operator[](int i)
     60 	{
     61 		switch(i)
     62 		{
     63 		case 0: return x;
     64 		case 1: return y;
     65 		case 2: return z;
     66 		case 3: return w;
     67 		}
     68 
     69 		return x;
     70 	}
     71 
     72 	Vector4f::Vector4f()
     73 	{
     74 	}
     75 
     76 	Vector4f::Vector4f(float x, float y, float z, float w)
     77 	{
     78 		this->x = Float4(x);
     79 		this->y = Float4(y);
     80 		this->z = Float4(z);
     81 		this->w = Float4(w);
     82 	}
     83 
     84 	Vector4f::Vector4f(const Vector4f &rhs)
     85 	{
     86 		x = rhs.x;
     87 		y = rhs.y;
     88 		z = rhs.z;
     89 		w = rhs.w;
     90 	}
     91 
     92 	Vector4f &Vector4f::operator=(const Vector4f &rhs)
     93 	{
     94 		x = rhs.x;
     95 		y = rhs.y;
     96 		z = rhs.z;
     97 		w = rhs.w;
     98 
     99 		return *this;
    100 	}
    101 
    102 	Float4 &Vector4f::operator[](int i)
    103 	{
    104 		switch(i)
    105 		{
    106 		case 0: return x;
    107 		case 1: return y;
    108 		case 2: return z;
    109 		case 3: return w;
    110 		}
    111 
    112 		return x;
    113 	}
    114 
    115 	Float4 exponential2(RValue<Float4> x, bool pp)
    116 	{
    117 		// This implementation is based on 2^(i + f) = 2^i * 2^f,
    118 		// where i is the integer part of x and f is the fraction.
    119 
    120 		// For 2^i we can put the integer part directly in the exponent of
    121 		// the IEEE-754 floating-point number. Clamp to prevent overflow
    122 		// past the representation of infinity.
    123 		Float4 x0 = x;
    124 		x0 = Min(x0, As<Float4>(Int4(0x43010000)));   // 129.00000e+0f
    125 		x0 = Max(x0, As<Float4>(Int4(0xC2FDFFFF)));   // -126.99999e+0f
    126 
    127 		Int4 i = RoundInt(x0 - Float4(0.5f));
    128 		Float4 ii = As<Float4>((i + Int4(127)) << 23);   // Add single-precision bias, and shift into exponent.
    129 
    130 		// For the fractional part use a polynomial
    131 		// which approximates 2^f in the 0 to 1 range.
    132 		Float4 f = x0 - Float4(i);
    133 		Float4 ff = As<Float4>(Int4(0x3AF61905));     // 1.8775767e-3f
    134 		ff = ff * f + As<Float4>(Int4(0x3C134806));   // 8.9893397e-3f
    135 		ff = ff * f + As<Float4>(Int4(0x3D64AA23));   // 5.5826318e-2f
    136 		ff = ff * f + As<Float4>(Int4(0x3E75EAD4));   // 2.4015361e-1f
    137 		ff = ff * f + As<Float4>(Int4(0x3F31727B));   // 6.9315308e-1f
    138 		ff = ff * f + Float4(1.0f);
    139 
    140 		return ii * ff;
    141 	}
    142 
    143 	Float4 logarithm2(RValue<Float4> x, bool absolute, bool pp)
    144 	{
    145 		Float4 x0;
    146 		Float4 x1;
    147 		Float4 x2;
    148 		Float4 x3;
    149 
    150 		x0 = x;
    151 
    152 		x1 = As<Float4>(As<Int4>(x0) & Int4(0x7F800000));
    153 		x1 = As<Float4>(As<UInt4>(x1) >> 8);
    154 		x1 = As<Float4>(As<Int4>(x1) | As<Int4>(Float4(1.0f)));
    155 		x1 = (x1 - Float4(1.4960938f)) * Float4(256.0f);   // FIXME: (x1 - 1.4960938f) * 256.0f;
    156 		x0 = As<Float4>((As<Int4>(x0) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));
    157 
    158 		x2 = (Float4(9.5428179e-2f) * x0 + Float4(4.7779095e-1f)) * x0 + Float4(1.9782813e-1f);
    159 		x3 = ((Float4(1.6618466e-2f) * x0 + Float4(2.0350508e-1f)) * x0 + Float4(2.7382900e-1f)) * x0 + Float4(4.0496687e-2f);
    160 		x2 /= x3;
    161 
    162 		x1 += (x0 - Float4(1.0f)) * x2;
    163 
    164 		Int4 pos_inf_x = CmpEQ(As<Int4>(x), Int4(0x7F800000));
    165 		return As<Float4>((pos_inf_x & As<Int4>(x)) | (~pos_inf_x & As<Int4>(x1)));
    166 	}
    167 
    168 	Float4 exponential(RValue<Float4> x, bool pp)
    169 	{
    170 		// FIXME: Propagate the constant
    171 		return exponential2(Float4(1.44269504f) * x, pp);   // 1/ln(2)
    172 	}
    173 
    174 	Float4 logarithm(RValue<Float4> x, bool absolute, bool pp)
    175 	{
    176 		// FIXME: Propagate the constant
    177 		return Float4(6.93147181e-1f) * logarithm2(x, absolute, pp);   // ln(2)
    178 	}
    179 
    180 	Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp)
    181 	{
    182 		Float4 log = logarithm2(x, true, pp);
    183 		log *= y;
    184 		return exponential2(log, pp);
    185 	}
    186 
    187 	Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2)
    188 	{
    189 		Float4 rcp;
    190 
    191 		if(!pp && rcpPrecision >= WHQL)
    192 		{
    193 			rcp = Float4(1.0f) / x;
    194 		}
    195 		else
    196 		{
    197 			rcp = Rcp_pp(x, exactAtPow2);
    198 
    199 			if(!pp)
    200 			{
    201 				rcp = (rcp + rcp) - (x * rcp * rcp);
    202 			}
    203 		}
    204 
    205 		if(finite)
    206 		{
    207 			int big = 0x7F7FFFFF;
    208 			rcp = Min(rcp, Float4((float&)big));
    209 		}
    210 
    211 		return rcp;
    212 	}
    213 
    214 	Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)
    215 	{
    216 		Float4 abs = x;
    217 
    218 		if(absolute)
    219 		{
    220 			abs = Abs(abs);
    221 		}
    222 
    223 		Float4 rsq;
    224 
    225 		if(!pp)
    226 		{
    227 			rsq = Float4(1.0f) / Sqrt(abs);
    228 		}
    229 		else
    230 		{
    231 			rsq = RcpSqrt_pp(abs);
    232 
    233 			if(!pp)
    234 			{
    235 				rsq = rsq * (Float4(3.0f) - rsq * rsq * abs) * Float4(0.5f);
    236 			}
    237 
    238 			rsq = As<Float4>(CmpNEQ(As<Int4>(abs), Int4(0x7F800000)) & As<Int4>(rsq));
    239 		}
    240 
    241 		return rsq;
    242 	}
    243 
    244 	Float4 modulo(RValue<Float4> x, RValue<Float4> y)
    245 	{
    246 		return x - y * Floor(x / y);
    247 	}
    248 
    249 	Float4 sine_pi(RValue<Float4> x, bool pp)
    250 	{
    251 		const Float4 A = Float4(-4.05284734e-1f);   // -4/pi^2
    252 		const Float4 B = Float4(1.27323954e+0f);    // 4/pi
    253 		const Float4 C = Float4(7.75160950e-1f);
    254 		const Float4 D = Float4(2.24839049e-1f);
    255 
    256 		// Parabola approximating sine
    257 		Float4 sin = x * (Abs(x) * A + B);
    258 
    259 		// Improve precision from 0.06 to 0.001
    260 		if(true)
    261 		{
    262 			sin = sin * (Abs(sin) * D + C);
    263 		}
    264 
    265 		return sin;
    266 	}
    267 
    268 	Float4 cosine_pi(RValue<Float4> x, bool pp)
    269 	{
    270 		// cos(x) = sin(x + pi/2)
    271 		Float4 y = x + Float4(1.57079632e+0f);
    272 
    273 		// Wrap around
    274 		y -= As<Float4>(CmpNLT(y, Float4(3.14159265e+0f)) & As<Int4>(Float4(6.28318530e+0f)));
    275 
    276 		return sine_pi(y, pp);
    277 	}
    278 
    279 	Float4 sine(RValue<Float4> x, bool pp)
    280 	{
    281 		// Reduce to [-0.5, 0.5] range
    282 		Float4 y = x * Float4(1.59154943e-1f);   // 1/2pi
    283 		y = y - Round(y);
    284 
    285 		if(!pp)
    286 		{
    287 			// From the paper: "A Fast, Vectorizable Algorithm for Producing Single-Precision Sine-Cosine Pairs"
    288 			// This implementation passes OpenGL ES 3.0 precision requirements, at the cost of more operations:
    289 			// !pp : 17 mul, 7 add, 1 sub, 1 reciprocal
    290 			//  pp : 4 mul, 2 add, 2 abs
    291 
    292 			Float4 y2 = y * y;
    293 			Float4 c1 = y2 * (y2 * (y2 * Float4(-0.0204391631f) + Float4(0.2536086171f)) + Float4(-1.2336977925f)) + Float4(1.0f);
    294 			Float4 s1 = y * (y2 * (y2 * (y2 * Float4(-0.0046075748f) + Float4(0.0796819754f)) + Float4(-0.645963615f)) + Float4(1.5707963235f));
    295 			Float4 c2 = (c1 * c1) - (s1 * s1);
    296 			Float4 s2 = Float4(2.0f) * s1 * c1;
    297 			return Float4(2.0f) * s2 * c2 * reciprocal(s2 * s2 + c2 * c2, pp, true);
    298 		}
    299 
    300 		const Float4 A = Float4(-16.0f);
    301 		const Float4 B = Float4(8.0f);
    302 		const Float4 C = Float4(7.75160950e-1f);
    303 		const Float4 D = Float4(2.24839049e-1f);
    304 
    305 		// Parabola approximating sine
    306 		Float4 sin = y * (Abs(y) * A + B);
    307 
    308 		// Improve precision from 0.06 to 0.001
    309 		if(true)
    310 		{
    311 			sin = sin * (Abs(sin) * D + C);
    312 		}
    313 
    314 		return sin;
    315 	}
    316 
    317 	Float4 cosine(RValue<Float4> x, bool pp)
    318 	{
    319 		// cos(x) = sin(x + pi/2)
    320 		Float4 y = x + Float4(1.57079632e+0f);
    321 		return sine(y, pp);
    322 	}
    323 
    324 	Float4 tangent(RValue<Float4> x, bool pp)
    325 	{
    326 		return sine(x, pp) / cosine(x, pp);
    327 	}
    328 
    329 	Float4 arccos(RValue<Float4> x, bool pp)
    330 	{
    331 		// pi/2 - arcsin(x)
    332 		return Float4(1.57079632e+0f) - arcsin(x);
    333 	}
    334 
    335 	Float4 arcsin(RValue<Float4> x, bool pp)
    336 	{
    337 		if(false) // Simpler implementation fails even lowp precision tests
    338 		{
    339 			// x*(pi/2-sqrt(1-x*x)*pi/5)
    340 			return x * (Float4(1.57079632e+0f) - Sqrt(Float4(1.0f) - x*x) * Float4(6.28318531e-1f));
    341 		}
    342 		else
    343 		{
    344 			// From 4.4.45, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
    345 			const Float4 half_pi(1.57079632f);
    346 			const Float4 a0(1.5707288f);
    347 			const Float4 a1(-0.2121144f);
    348 			const Float4 a2(0.0742610f);
    349 			const Float4 a3(-0.0187293f);
    350 			Float4 absx = Abs(x);
    351 			return As<Float4>(As<Int4>(half_pi - Sqrt(Float4(1.0f) - absx) * (a0 + absx * (a1 + absx * (a2 + absx * a3)))) ^
    352 			       (As<Int4>(x) & Int4(0x80000000)));
    353 		}
    354 	}
    355 
    356 	// Approximation of atan in [0..1]
    357 	Float4 arctan_01(Float4 x, bool pp)
    358 	{
    359 		if(pp)
    360 		{
    361 			return x * (Float4(-0.27f) * x + Float4(1.05539816f));
    362 		}
    363 		else
    364 		{
    365 			// From 4.4.49, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
    366 			const Float4 a2(-0.3333314528f);
    367 			const Float4 a4(0.1999355085f);
    368 			const Float4 a6(-0.1420889944f);
    369 			const Float4 a8(0.1065626393f);
    370 			const Float4 a10(-0.0752896400f);
    371 			const Float4 a12(0.0429096138f);
    372 			const Float4 a14(-0.0161657367f);
    373 			const Float4 a16(0.0028662257f);
    374 			Float4 x2 = x * x;
    375 			return (x + x * (x2 * (a2 + x2 * (a4 + x2 * (a6 + x2 * (a8 + x2 * (a10 + x2 * (a12 + x2 * (a14 + x2 * a16)))))))));
    376 		}
    377 	}
    378 
    379 	Float4 arctan(RValue<Float4> x, bool pp)
    380 	{
    381 		Float4 absx = Abs(x);
    382 		Int4 O = CmpNLT(absx, Float4(1.0f));
    383 		Float4 y = As<Float4>((O & As<Int4>(Float4(1.0f) / absx)) | (~O & As<Int4>(absx))); // FIXME: Vector select
    384 
    385 		const Float4 half_pi(1.57079632f);
    386 		Float4 theta = arctan_01(y, pp);
    387 		return As<Float4>(((O & As<Int4>(half_pi - theta)) | (~O & As<Int4>(theta))) ^ // FIXME: Vector select
    388 		       (As<Int4>(x) & Int4(0x80000000)));
    389 	}
    390 
    391 	Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp)
    392 	{
    393 		const Float4 pi(3.14159265f);            // pi
    394 		const Float4 minus_pi(-3.14159265f);     // -pi
    395 		const Float4 half_pi(1.57079632f);       // pi/2
    396 		const Float4 quarter_pi(7.85398163e-1f); // pi/4
    397 
    398 		// Rotate to upper semicircle when in lower semicircle
    399 		Int4 S = CmpLT(y, Float4(0.0f));
    400 		Float4 theta = As<Float4>(S & As<Int4>(minus_pi));
    401 		Float4 x0 = As<Float4>((As<Int4>(y) & Int4(0x80000000)) ^ As<Int4>(x));
    402 		Float4 y0 = Abs(y);
    403 
    404 		// Rotate to right quadrant when in left quadrant
    405 		Int4 non_zero_y = CmpNEQ(y0, Float4(0.0f));
    406 		Int4 Q = CmpLT(x0, Float4(0.0f)) & non_zero_y;
    407 		theta += As<Float4>(Q & As<Int4>(half_pi));
    408 		Float4 x1 = As<Float4>((Q & As<Int4>(y0)) | (~Q & As<Int4>(x0)));  // FIXME: Vector select
    409 		Float4 y1 = As<Float4>((Q & As<Int4>(-x0)) | (~Q & As<Int4>(y0))); // FIXME: Vector select
    410 
    411 		// Mirror to first octant when in second octant
    412 		Int4 O = CmpNLT(y1, x1) & non_zero_y;
    413 		Float4 x2 = As<Float4>((O & As<Int4>(y1)) | (~O & As<Int4>(x1))); // FIXME: Vector select
    414 		Float4 y2 = As<Float4>((O & As<Int4>(x1)) | (~O & As<Int4>(y1))); // FIXME: Vector select
    415 
    416 		// Approximation of atan in [0..1]
    417 		Int4 zero_x = CmpEQ(x2, Float4(0.0f));
    418 		Int4 inf_y = IsInf(y2); // Since x2 >= y2, this means x2 == y2 == inf, so we use 45 degrees or pi/4
    419 		Float4 atan2_theta = arctan_01(y2 / x2, pp);
    420 		theta += As<Float4>((~zero_x & ~inf_y & non_zero_y & ((O & As<Int4>(half_pi - atan2_theta)) | (~O & (As<Int4>(atan2_theta))))) | // FIXME: Vector select
    421 		                    (inf_y & As<Int4>(quarter_pi)));
    422 
    423 		// Recover loss of precision for tiny theta angles
    424 		Int4 precision_loss = S & Q & O & ~inf_y; // This combination results in (-pi + half_pi + half_pi - atan2_theta) which is equivalent to -atan2_theta
    425 		return As<Float4>((precision_loss & As<Int4>(-atan2_theta)) | (~precision_loss & As<Int4>(theta))); // FIXME: Vector select
    426 	}
    427 
    428 	Float4 sineh(RValue<Float4> x, bool pp)
    429 	{
    430 		return (exponential(x, pp) - exponential(-x, pp)) * Float4(0.5f);
    431 	}
    432 
    433 	Float4 cosineh(RValue<Float4> x, bool pp)
    434 	{
    435 		return (exponential(x, pp) + exponential(-x, pp)) * Float4(0.5f);
    436 	}
    437 
    438 	Float4 tangenth(RValue<Float4> x, bool pp)
    439 	{
    440 		Float4 e_x = exponential(x, pp);
    441 		Float4 e_minus_x = exponential(-x, pp);
    442 		return (e_x - e_minus_x) / (e_x + e_minus_x);
    443 	}
    444 
    445 	Float4 arccosh(RValue<Float4> x, bool pp)
    446 	{
    447 		return logarithm(x + Sqrt(x + Float4(1.0f)) * Sqrt(x - Float4(1.0f)), pp);
    448 	}
    449 
    450 	Float4 arcsinh(RValue<Float4> x, bool pp)
    451 	{
    452 		return logarithm(x + Sqrt(x * x + Float4(1.0f)), pp);
    453 	}
    454 
    455 	Float4 arctanh(RValue<Float4> x, bool pp)
    456 	{
    457 		return logarithm((Float4(1.0f) + x) / (Float4(1.0f) - x), pp) * Float4(0.5f);
    458 	}
    459 
    460 	Float4 dot2(const Vector4f &v0, const Vector4f &v1)
    461 	{
    462 		return v0.x * v1.x + v0.y * v1.y;
    463 	}
    464 
    465 	Float4 dot3(const Vector4f &v0, const Vector4f &v1)
    466 	{
    467 		return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z;
    468 	}
    469 
    470 	Float4 dot4(const Vector4f &v0, const Vector4f &v1)
    471 	{
    472 		return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z + v0.w * v1.w;
    473 	}
    474 
    475 	void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
    476 	{
    477 		Int2 tmp0 = UnpackHigh(row0, row1);
    478 		Int2 tmp1 = UnpackHigh(row2, row3);
    479 		Int2 tmp2 = UnpackLow(row0, row1);
    480 		Int2 tmp3 = UnpackLow(row2, row3);
    481 
    482 		row0 = UnpackLow(tmp2, tmp3);
    483 		row1 = UnpackHigh(tmp2, tmp3);
    484 		row2 = UnpackLow(tmp0, tmp1);
    485 		row3 = UnpackHigh(tmp0, tmp1);
    486 	}
    487 
    488 	void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
    489 	{
    490 		Int2 tmp0 = UnpackHigh(row0, row1);
    491 		Int2 tmp1 = UnpackHigh(row2, row3);
    492 		Int2 tmp2 = UnpackLow(row0, row1);
    493 		Int2 tmp3 = UnpackLow(row2, row3);
    494 
    495 		row0 = UnpackLow(tmp2, tmp3);
    496 		row1 = UnpackHigh(tmp2, tmp3);
    497 		row2 = UnpackLow(tmp0, tmp1);
    498 	}
    499 
    500 	void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
    501 	{
    502 		Float4 tmp0 = UnpackLow(row0, row1);
    503 		Float4 tmp1 = UnpackLow(row2, row3);
    504 		Float4 tmp2 = UnpackHigh(row0, row1);
    505 		Float4 tmp3 = UnpackHigh(row2, row3);
    506 
    507 		row0 = Float4(tmp0.xy, tmp1.xy);
    508 		row1 = Float4(tmp0.zw, tmp1.zw);
    509 		row2 = Float4(tmp2.xy, tmp3.xy);
    510 		row3 = Float4(tmp2.zw, tmp3.zw);
    511 	}
    512 
    513 	void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
    514 	{
    515 		Float4 tmp0 = UnpackLow(row0, row1);
    516 		Float4 tmp1 = UnpackLow(row2, row3);
    517 		Float4 tmp2 = UnpackHigh(row0, row1);
    518 		Float4 tmp3 = UnpackHigh(row2, row3);
    519 
    520 		row0 = Float4(tmp0.xy, tmp1.xy);
    521 		row1 = Float4(tmp0.zw, tmp1.zw);
    522 		row2 = Float4(tmp2.xy, tmp3.xy);
    523 	}
    524 
    525 	void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
    526 	{
    527 		Float4 tmp0 = UnpackLow(row0, row1);
    528 		Float4 tmp1 = UnpackLow(row2, row3);
    529 
    530 		row0 = Float4(tmp0.xy, tmp1.xy);
    531 		row1 = Float4(tmp0.zw, tmp1.zw);
    532 	}
    533 
    534 	void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
    535 	{
    536 		Float4 tmp0 = UnpackLow(row0, row1);
    537 		Float4 tmp1 = UnpackLow(row2, row3);
    538 
    539 		row0 = Float4(tmp0.xy, tmp1.xy);
    540 	}
    541 
    542 	void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
    543 	{
    544 		Float4 tmp01 = UnpackLow(row0, row1);
    545 		Float4 tmp23 = UnpackHigh(row0, row1);
    546 
    547 		row0 = tmp01;
    548 		row1 = Float4(tmp01.zw, row1.zw);
    549 		row2 = tmp23;
    550 		row3 = Float4(tmp23.zw, row3.zw);
    551 	}
    552 
    553 	void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N)
    554 	{
    555 		switch(N)
    556 		{
    557 		case 1: transpose4x1(row0, row1, row2, row3); break;
    558 		case 2: transpose4x2(row0, row1, row2, row3); break;
    559 		case 3: transpose4x3(row0, row1, row2, row3); break;
    560 		case 4: transpose4x4(row0, row1, row2, row3); break;
    561 		}
    562 	}
    563 
    564 	void ShaderCore::mov(Vector4f &dst, const Vector4f &src, bool integerDestination)
    565 	{
    566 		if(integerDestination)
    567 		{
    568 			dst.x = As<Float4>(RoundInt(src.x));
    569 			dst.y = As<Float4>(RoundInt(src.y));
    570 			dst.z = As<Float4>(RoundInt(src.z));
    571 			dst.w = As<Float4>(RoundInt(src.w));
    572 		}
    573 		else
    574 		{
    575 			dst = src;
    576 		}
    577 	}
    578 
    579 	void ShaderCore::neg(Vector4f &dst, const Vector4f &src)
    580 	{
    581 		dst.x = -src.x;
    582 		dst.y = -src.y;
    583 		dst.z = -src.z;
    584 		dst.w = -src.w;
    585 	}
    586 
    587 	void ShaderCore::ineg(Vector4f &dst, const Vector4f &src)
    588 	{
    589 		dst.x = As<Float4>(-As<Int4>(src.x));
    590 		dst.y = As<Float4>(-As<Int4>(src.y));
    591 		dst.z = As<Float4>(-As<Int4>(src.z));
    592 		dst.w = As<Float4>(-As<Int4>(src.w));
    593 	}
    594 
    595 	void ShaderCore::f2b(Vector4f &dst, const Vector4f &src)
    596 	{
    597 		dst.x = As<Float4>(CmpNEQ(src.x, Float4(0.0f)));
    598 		dst.y = As<Float4>(CmpNEQ(src.y, Float4(0.0f)));
    599 		dst.z = As<Float4>(CmpNEQ(src.z, Float4(0.0f)));
    600 		dst.w = As<Float4>(CmpNEQ(src.w, Float4(0.0f)));
    601 	}
    602 
    603 	void ShaderCore::b2f(Vector4f &dst, const Vector4f &src)
    604 	{
    605 		dst.x = As<Float4>(As<Int4>(src.x) & As<Int4>(Float4(1.0f)));
    606 		dst.y = As<Float4>(As<Int4>(src.y) & As<Int4>(Float4(1.0f)));
    607 		dst.z = As<Float4>(As<Int4>(src.z) & As<Int4>(Float4(1.0f)));
    608 		dst.w = As<Float4>(As<Int4>(src.w) & As<Int4>(Float4(1.0f)));
    609 	}
    610 
    611 	void ShaderCore::f2i(Vector4f &dst, const Vector4f &src)
    612 	{
    613 		dst.x = As<Float4>(Int4(src.x));
    614 		dst.y = As<Float4>(Int4(src.y));
    615 		dst.z = As<Float4>(Int4(src.z));
    616 		dst.w = As<Float4>(Int4(src.w));
    617 	}
    618 
    619 	void ShaderCore::i2f(Vector4f &dst, const Vector4f &src)
    620 	{
    621 		dst.x = Float4(As<Int4>(src.x));
    622 		dst.y = Float4(As<Int4>(src.y));
    623 		dst.z = Float4(As<Int4>(src.z));
    624 		dst.w = Float4(As<Int4>(src.w));
    625 	}
    626 
    627 	void ShaderCore::f2u(Vector4f &dst, const Vector4f &src)
    628 	{
    629 		dst.x = As<Float4>(UInt4(src.x));
    630 		dst.y = As<Float4>(UInt4(src.y));
    631 		dst.z = As<Float4>(UInt4(src.z));
    632 		dst.w = As<Float4>(UInt4(src.w));
    633 	}
    634 
    635 	void ShaderCore::u2f(Vector4f &dst, const Vector4f &src)
    636 	{
    637 		dst.x = Float4(As<UInt4>(src.x));
    638 		dst.y = Float4(As<UInt4>(src.y));
    639 		dst.z = Float4(As<UInt4>(src.z));
    640 		dst.w = Float4(As<UInt4>(src.w));
    641 	}
    642 
    643 	void ShaderCore::i2b(Vector4f &dst, const Vector4f &src)
    644 	{
    645 		dst.x = As<Float4>(CmpNEQ(As<Int4>(src.x), Int4(0)));
    646 		dst.y = As<Float4>(CmpNEQ(As<Int4>(src.y), Int4(0)));
    647 		dst.z = As<Float4>(CmpNEQ(As<Int4>(src.z), Int4(0)));
    648 		dst.w = As<Float4>(CmpNEQ(As<Int4>(src.w), Int4(0)));
    649 	}
    650 
    651 	void ShaderCore::b2i(Vector4f &dst, const Vector4f &src)
    652 	{
    653 		dst.x = As<Float4>(As<Int4>(src.x) & Int4(1));
    654 		dst.y = As<Float4>(As<Int4>(src.y) & Int4(1));
    655 		dst.z = As<Float4>(As<Int4>(src.z) & Int4(1));
    656 		dst.w = As<Float4>(As<Int4>(src.w) & Int4(1));
    657 	}
    658 
    659 	void ShaderCore::add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    660 	{
    661 		dst.x = src0.x + src1.x;
    662 		dst.y = src0.y + src1.y;
    663 		dst.z = src0.z + src1.z;
    664 		dst.w = src0.w + src1.w;
    665 	}
    666 
    667 	void ShaderCore::iadd(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    668 	{
    669 		dst.x = As<Float4>(As<Int4>(src0.x) + As<Int4>(src1.x));
    670 		dst.y = As<Float4>(As<Int4>(src0.y) + As<Int4>(src1.y));
    671 		dst.z = As<Float4>(As<Int4>(src0.z) + As<Int4>(src1.z));
    672 		dst.w = As<Float4>(As<Int4>(src0.w) + As<Int4>(src1.w));
    673 	}
    674 
    675 	void ShaderCore::sub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    676 	{
    677 		dst.x = src0.x - src1.x;
    678 		dst.y = src0.y - src1.y;
    679 		dst.z = src0.z - src1.z;
    680 		dst.w = src0.w - src1.w;
    681 	}
    682 
    683 	void ShaderCore::isub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    684 	{
    685 		dst.x = As<Float4>(As<Int4>(src0.x) - As<Int4>(src1.x));
    686 		dst.y = As<Float4>(As<Int4>(src0.y) - As<Int4>(src1.y));
    687 		dst.z = As<Float4>(As<Int4>(src0.z) - As<Int4>(src1.z));
    688 		dst.w = As<Float4>(As<Int4>(src0.w) - As<Int4>(src1.w));
    689 	}
    690 
    691 	void ShaderCore::mad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
    692 	{
    693 		dst.x = src0.x * src1.x + src2.x;
    694 		dst.y = src0.y * src1.y + src2.y;
    695 		dst.z = src0.z * src1.z + src2.z;
    696 		dst.w = src0.w * src1.w + src2.w;
    697 	}
    698 
    699 	void ShaderCore::imad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
    700 	{
    701 		dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x) + As<Int4>(src2.x));
    702 		dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y) + As<Int4>(src2.y));
    703 		dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z) + As<Int4>(src2.z));
    704 		dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w) + As<Int4>(src2.w));
    705 	}
    706 
    707 	void ShaderCore::mul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    708 	{
    709 		dst.x = src0.x * src1.x;
    710 		dst.y = src0.y * src1.y;
    711 		dst.z = src0.z * src1.z;
    712 		dst.w = src0.w * src1.w;
    713 	}
    714 
    715 	void ShaderCore::imul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    716 	{
    717 		dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x));
    718 		dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y));
    719 		dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z));
    720 		dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w));
    721 	}
    722 
    723 	void ShaderCore::rcpx(Vector4f &dst, const Vector4f &src, bool pp)
    724 	{
    725 		Float4 rcp = reciprocal(src.x, pp, true, true);
    726 
    727 		dst.x = rcp;
    728 		dst.y = rcp;
    729 		dst.z = rcp;
    730 		dst.w = rcp;
    731 	}
    732 
    733 	void ShaderCore::div(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    734 	{
    735 		dst.x = src0.x / src1.x;
    736 		dst.y = src0.y / src1.y;
    737 		dst.z = src0.z / src1.z;
    738 		dst.w = src0.w / src1.w;
    739 	}
    740 
    741 	void ShaderCore::idiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    742 	{
    743 		Float4 intMax(As<Float4>(Int4(INT_MAX)));
    744 		cmp0i(dst.x, src1.x, intMax, src1.x);
    745 		dst.x = As<Float4>(As<Int4>(src0.x) / As<Int4>(dst.x));
    746 		cmp0i(dst.y, src1.y, intMax, src1.y);
    747 		dst.y = As<Float4>(As<Int4>(src0.y) / As<Int4>(dst.y));
    748 		cmp0i(dst.z, src1.z, intMax, src1.z);
    749 		dst.z = As<Float4>(As<Int4>(src0.z) / As<Int4>(dst.z));
    750 		cmp0i(dst.w, src1.w, intMax, src1.w);
    751 		dst.w = As<Float4>(As<Int4>(src0.w) / As<Int4>(dst.w));
    752 	}
    753 
    754 	void ShaderCore::udiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    755 	{
    756 		Float4 uintMax(As<Float4>(UInt4(UINT_MAX)));
    757 		cmp0i(dst.x, src1.x, uintMax, src1.x);
    758 		dst.x = As<Float4>(As<UInt4>(src0.x) / As<UInt4>(dst.x));
    759 		cmp0i(dst.y, src1.y, uintMax, src1.y);
    760 		dst.y = As<Float4>(As<UInt4>(src0.y) / As<UInt4>(dst.y));
    761 		cmp0i(dst.z, src1.z, uintMax, src1.z);
    762 		dst.z = As<Float4>(As<UInt4>(src0.z) / As<UInt4>(dst.z));
    763 		cmp0i(dst.w, src1.w, uintMax, src1.w);
    764 		dst.w = As<Float4>(As<UInt4>(src0.w) / As<UInt4>(dst.w));
    765 	}
    766 
    767 	void ShaderCore::mod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    768 	{
    769 		dst.x = modulo(src0.x, src1.x);
    770 		dst.y = modulo(src0.y, src1.y);
    771 		dst.z = modulo(src0.z, src1.z);
    772 		dst.w = modulo(src0.w, src1.w);
    773 	}
    774 
    775 	void ShaderCore::imod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    776 	{
    777 		Float4 intMax(As<Float4>(Int4(INT_MAX)));
    778 		cmp0i(dst.x, src1.x, intMax, src1.x);
    779 		dst.x = As<Float4>(As<Int4>(src0.x) % As<Int4>(dst.x));
    780 		cmp0i(dst.y, src1.y, intMax, src1.y);
    781 		dst.y = As<Float4>(As<Int4>(src0.y) % As<Int4>(dst.y));
    782 		cmp0i(dst.z, src1.z, intMax, src1.z);
    783 		dst.z = As<Float4>(As<Int4>(src0.z) % As<Int4>(dst.z));
    784 		cmp0i(dst.w, src1.w, intMax, src1.w);
    785 		dst.w = As<Float4>(As<Int4>(src0.w) % As<Int4>(dst.w));
    786 	}
    787 
    788 	void ShaderCore::umod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    789 	{
    790 		Float4 uintMax(As<Float4>(UInt4(UINT_MAX)));
    791 		cmp0i(dst.x, src1.x, uintMax, src1.x);
    792 		dst.x = As<Float4>(As<UInt4>(src0.x) % As<UInt4>(dst.x));
    793 		cmp0i(dst.y, src1.y, uintMax, src1.y);
    794 		dst.y = As<Float4>(As<UInt4>(src0.y) % As<UInt4>(dst.y));
    795 		cmp0i(dst.z, src1.z, uintMax, src1.z);
    796 		dst.z = As<Float4>(As<UInt4>(src0.z) % As<UInt4>(dst.z));
    797 		cmp0i(dst.w, src1.w, uintMax, src1.w);
    798 		dst.w = As<Float4>(As<UInt4>(src0.w) % As<UInt4>(dst.w));
    799 	}
    800 
    801 	void ShaderCore::shl(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    802 	{
    803 		dst.x = As<Float4>(As<Int4>(src0.x) << As<Int4>(src1.x));
    804 		dst.y = As<Float4>(As<Int4>(src0.y) << As<Int4>(src1.y));
    805 		dst.z = As<Float4>(As<Int4>(src0.z) << As<Int4>(src1.z));
    806 		dst.w = As<Float4>(As<Int4>(src0.w) << As<Int4>(src1.w));
    807 	}
    808 
    809 	void ShaderCore::ishr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    810 	{
    811 		dst.x = As<Float4>(As<Int4>(src0.x) >> As<Int4>(src1.x));
    812 		dst.y = As<Float4>(As<Int4>(src0.y) >> As<Int4>(src1.y));
    813 		dst.z = As<Float4>(As<Int4>(src0.z) >> As<Int4>(src1.z));
    814 		dst.w = As<Float4>(As<Int4>(src0.w) >> As<Int4>(src1.w));
    815 	}
    816 
    817 	void ShaderCore::ushr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    818 	{
    819 		dst.x = As<Float4>(As<UInt4>(src0.x) >> As<UInt4>(src1.x));
    820 		dst.y = As<Float4>(As<UInt4>(src0.y) >> As<UInt4>(src1.y));
    821 		dst.z = As<Float4>(As<UInt4>(src0.z) >> As<UInt4>(src1.z));
    822 		dst.w = As<Float4>(As<UInt4>(src0.w) >> As<UInt4>(src1.w));
    823 	}
    824 
    825 	void ShaderCore::rsqx(Vector4f &dst, const Vector4f &src, bool pp)
    826 	{
    827 		Float4 rsq = reciprocalSquareRoot(src.x, true, pp);
    828 
    829 		dst.x = rsq;
    830 		dst.y = rsq;
    831 		dst.z = rsq;
    832 		dst.w = rsq;
    833 	}
    834 
    835 	void ShaderCore::sqrt(Vector4f &dst, const Vector4f &src, bool pp)
    836 	{
    837 		dst.x = Sqrt(src.x);
    838 		dst.y = Sqrt(src.y);
    839 		dst.z = Sqrt(src.z);
    840 		dst.w = Sqrt(src.w);
    841 	}
    842 
    843 	void ShaderCore::rsq(Vector4f &dst, const Vector4f &src, bool pp)
    844 	{
    845 		dst.x = reciprocalSquareRoot(src.x, false, pp);
    846 		dst.y = reciprocalSquareRoot(src.y, false, pp);
    847 		dst.z = reciprocalSquareRoot(src.z, false, pp);
    848 		dst.w = reciprocalSquareRoot(src.w, false, pp);
    849 	}
    850 
    851 	void ShaderCore::len2(Float4 &dst, const Vector4f &src, bool pp)
    852 	{
    853 		dst = Sqrt(dot2(src, src));
    854 	}
    855 
    856 	void ShaderCore::len3(Float4 &dst, const Vector4f &src, bool pp)
    857 	{
    858 		dst = Sqrt(dot3(src, src));
    859 	}
    860 
    861 	void ShaderCore::len4(Float4 &dst, const Vector4f &src, bool pp)
    862 	{
    863 		dst = Sqrt(dot4(src, src));
    864 	}
    865 
    866 	void ShaderCore::dist1(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
    867 	{
    868 		dst = Abs(src0.x - src1.x);
    869 	}
    870 
    871 	void ShaderCore::dist2(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
    872 	{
    873 		Float4 dx = src0.x - src1.x;
    874 		Float4 dy = src0.y - src1.y;
    875 		Float4 dot2 = dx * dx + dy * dy;
    876 		dst = Sqrt(dot2);
    877 	}
    878 
    879 	void ShaderCore::dist3(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
    880 	{
    881 		Float4 dx = src0.x - src1.x;
    882 		Float4 dy = src0.y - src1.y;
    883 		Float4 dz = src0.z - src1.z;
    884 		Float4 dot3 = dx * dx + dy * dy + dz * dz;
    885 		dst = Sqrt(dot3);
    886 	}
    887 
    888 	void ShaderCore::dist4(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
    889 	{
    890 		Float4 dx = src0.x - src1.x;
    891 		Float4 dy = src0.y - src1.y;
    892 		Float4 dz = src0.z - src1.z;
    893 		Float4 dw = src0.w - src1.w;
    894 		Float4 dot4 = dx * dx + dy * dy + dz * dz + dw * dw;
    895 		dst = Sqrt(dot4);
    896 	}
    897 
    898 	void ShaderCore::dp1(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    899 	{
    900 		Float4 t = src0.x * src1.x;
    901 
    902 		dst.x = t;
    903 		dst.y = t;
    904 		dst.z = t;
    905 		dst.w = t;
    906 	}
    907 
    908 	void ShaderCore::dp2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    909 	{
    910 		Float4 t = dot2(src0, src1);
    911 
    912 		dst.x = t;
    913 		dst.y = t;
    914 		dst.z = t;
    915 		dst.w = t;
    916 	}
    917 
    918 	void ShaderCore::dp2add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
    919 	{
    920 		Float4 t = dot2(src0, src1) + src2.x;
    921 
    922 		dst.x = t;
    923 		dst.y = t;
    924 		dst.z = t;
    925 		dst.w = t;
    926 	}
    927 
    928 	void ShaderCore::dp3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    929 	{
    930 		Float4 dot = dot3(src0, src1);
    931 
    932 		dst.x = dot;
    933 		dst.y = dot;
    934 		dst.z = dot;
    935 		dst.w = dot;
    936 	}
    937 
    938 	void ShaderCore::dp4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    939 	{
    940 		Float4 dot = dot4(src0, src1);
    941 
    942 		dst.x = dot;
    943 		dst.y = dot;
    944 		dst.z = dot;
    945 		dst.w = dot;
    946 	}
    947 
    948 	void ShaderCore::min(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    949 	{
    950 		dst.x = Min(src0.x, src1.x);
    951 		dst.y = Min(src0.y, src1.y);
    952 		dst.z = Min(src0.z, src1.z);
    953 		dst.w = Min(src0.w, src1.w);
    954 	}
    955 
    956 	void ShaderCore::imin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    957 	{
    958 		dst.x = As<Float4>(Min(As<Int4>(src0.x), As<Int4>(src1.x)));
    959 		dst.y = As<Float4>(Min(As<Int4>(src0.y), As<Int4>(src1.y)));
    960 		dst.z = As<Float4>(Min(As<Int4>(src0.z), As<Int4>(src1.z)));
    961 		dst.w = As<Float4>(Min(As<Int4>(src0.w), As<Int4>(src1.w)));
    962 	}
    963 
    964 	void ShaderCore::umin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    965 	{
    966 		dst.x = As<Float4>(Min(As<UInt4>(src0.x), As<UInt4>(src1.x)));
    967 		dst.y = As<Float4>(Min(As<UInt4>(src0.y), As<UInt4>(src1.y)));
    968 		dst.z = As<Float4>(Min(As<UInt4>(src0.z), As<UInt4>(src1.z)));
    969 		dst.w = As<Float4>(Min(As<UInt4>(src0.w), As<UInt4>(src1.w)));
    970 	}
    971 
    972 	void ShaderCore::max(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    973 	{
    974 		dst.x = Max(src0.x, src1.x);
    975 		dst.y = Max(src0.y, src1.y);
    976 		dst.z = Max(src0.z, src1.z);
    977 		dst.w = Max(src0.w, src1.w);
    978 	}
    979 
    980 	void ShaderCore::imax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    981 	{
    982 		dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x)));
    983 		dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y)));
    984 		dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z)));
    985 		dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w)));
    986 	}
    987 
    988 	void ShaderCore::umax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    989 	{
    990 		dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x)));
    991 		dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y)));
    992 		dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z)));
    993 		dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w)));
    994 	}
    995 
    996 	void ShaderCore::slt(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    997 	{
    998 		dst.x = As<Float4>(As<Int4>(CmpLT(src0.x, src1.x)) & As<Int4>(Float4(1.0f)));
    999 		dst.y = As<Float4>(As<Int4>(CmpLT(src0.y, src1.y)) & As<Int4>(Float4(1.0f)));
   1000 		dst.z = As<Float4>(As<Int4>(CmpLT(src0.z, src1.z)) & As<Int4>(Float4(1.0f)));
   1001 		dst.w = As<Float4>(As<Int4>(CmpLT(src0.w, src1.w)) & As<Int4>(Float4(1.0f)));
   1002 	}
   1003 
   1004 	void ShaderCore::step(Vector4f &dst, const Vector4f &edge, const Vector4f &x)
   1005 	{
   1006 		dst.x = As<Float4>(CmpNLT(x.x, edge.x) & As<Int4>(Float4(1.0f)));
   1007 		dst.y = As<Float4>(CmpNLT(x.y, edge.y) & As<Int4>(Float4(1.0f)));
   1008 		dst.z = As<Float4>(CmpNLT(x.z, edge.z) & As<Int4>(Float4(1.0f)));
   1009 		dst.w = As<Float4>(CmpNLT(x.w, edge.w) & As<Int4>(Float4(1.0f)));
   1010 	}
   1011 
   1012 	void ShaderCore::exp2x(Vector4f &dst, const Vector4f &src, bool pp)
   1013 	{
   1014 		Float4 exp = exponential2(src.x, pp);
   1015 
   1016 		dst.x = exp;
   1017 		dst.y = exp;
   1018 		dst.z = exp;
   1019 		dst.w = exp;
   1020 	}
   1021 
	// Component-wise wrapper: applies exponential2() to each of the four
	// components of src, forwarding pp unchanged.
	void ShaderCore::exp2(Vector4f &dst, const Vector4f &src, bool pp)
	{
		dst.x = exponential2(src.x, pp);
		dst.y = exponential2(src.y, pp);
		dst.z = exponential2(src.z, pp);
		dst.w = exponential2(src.w, pp);
	}
   1029 
	// Component-wise wrapper: applies exponential() to each of the four
	// components of src, forwarding pp unchanged.
	void ShaderCore::exp(Vector4f &dst, const Vector4f &src, bool pp)
	{
		dst.x = exponential(src.x, pp);
		dst.y = exponential(src.y, pp);
		dst.z = exponential(src.z, pp);
		dst.w = exponential(src.w, pp);
	}
   1037 
   1038 	void ShaderCore::log2x(Vector4f &dst, const Vector4f &src, bool pp)
   1039 	{
   1040 		Float4 log = logarithm2(src.x, true, pp);
   1041 
   1042 		dst.x = log;
   1043 		dst.y = log;
   1044 		dst.z = log;
   1045 		dst.w = log;
   1046 	}
   1047 
	// Component-wise wrapper: applies logarithm2() to each of the four components
	// of src, with its second argument set to false and pp forwarded unchanged.
	void ShaderCore::log2(Vector4f &dst, const Vector4f &src, bool pp)
	{
		dst.x = logarithm2(src.x, false, pp);
		dst.y = logarithm2(src.y, false, pp);
		dst.z = logarithm2(src.z, false, pp);
		dst.w = logarithm2(src.w, false, pp);
	}
   1055 
	// Component-wise wrapper: applies logarithm() to each of the four components
	// of src, with its second argument set to false and pp forwarded unchanged.
	void ShaderCore::log(Vector4f &dst, const Vector4f &src, bool pp)
	{
		dst.x = logarithm(src.x, false, pp);
		dst.y = logarithm(src.y, false, pp);
		dst.z = logarithm(src.z, false, pp);
		dst.w = logarithm(src.w, false, pp);
	}
   1063 
   1064 	void ShaderCore::lit(Vector4f &dst, const Vector4f &src)
   1065 	{
   1066 		dst.x = Float4(1.0f);
   1067 		dst.y = Max(src.x, Float4(0.0f));
   1068 
   1069 		Float4 pow;
   1070 
   1071 		pow = src.w;
   1072 		pow = Min(pow, Float4(127.9961f));
   1073 		pow = Max(pow, Float4(-127.9961f));
   1074 
   1075 		dst.z = power(src.y, pow);
   1076 		dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.x, Float4(0.0f)));
   1077 		dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.y, Float4(0.0f)));
   1078 
   1079 		dst.w = Float4(1.0f);
   1080 	}
   1081 
   1082 	void ShaderCore::att(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
   1083 	{
   1084 		// Computes attenuation factors (1, d, d^2, 1/d) assuming src0 = d^2, src1 = 1/d
   1085 		dst.x = 1;
   1086 		dst.y = src0.y * src1.y;
   1087 		dst.z = src0.z;
   1088 		dst.w = src1.w;
   1089 	}
   1090 
   1091 	void ShaderCore::lrp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
   1092 	{
   1093 		dst.x = src0.x * (src1.x - src2.x) + src2.x;
   1094 		dst.y = src0.y * (src1.y - src2.y) + src2.y;
   1095 		dst.z = src0.z * (src1.z - src2.z) + src2.z;
   1096 		dst.w = src0.w * (src1.w - src2.w) + src2.w;
   1097 	}
   1098 
	// Component-wise infinity test: stores the result of IsInf() for each
	// component of src, reinterpreted as float bits (no numeric conversion).
	void ShaderCore::isinf(Vector4f &dst, const Vector4f &src)
	{
		dst.x = As<Float4>(IsInf(src.x));
		dst.y = As<Float4>(IsInf(src.y));
		dst.z = As<Float4>(IsInf(src.z));
		dst.w = As<Float4>(IsInf(src.w));
	}
   1106 
	// Component-wise NaN test: stores the result of IsNan() for each component
	// of src, reinterpreted as float bits (no numeric conversion).
	void ShaderCore::isnan(Vector4f &dst, const Vector4f &src)
	{
		dst.x = As<Float4>(IsNan(src.x));
		dst.y = As<Float4>(IsNan(src.y));
		dst.z = As<Float4>(IsNan(src.z));
		dst.w = As<Float4>(IsNan(src.w));
	}
   1114 
   1115 	void ShaderCore::smooth(Vector4f &dst, const Vector4f &edge0, const Vector4f &edge1, const Vector4f &x)
   1116 	{
   1117 		Float4 tx = Min(Max((x.x - edge0.x) / (edge1.x - edge0.x), Float4(0.0f)), Float4(1.0f)); dst.x = tx * tx * (Float4(3.0f) - Float4(2.0f) * tx);
   1118 		Float4 ty = Min(Max((x.y - edge0.y) / (edge1.y - edge0.y), Float4(0.0f)), Float4(1.0f)); dst.y = ty * ty * (Float4(3.0f) - Float4(2.0f) * ty);
   1119 		Float4 tz = Min(Max((x.z - edge0.z) / (edge1.z - edge0.z), Float4(0.0f)), Float4(1.0f)); dst.z = tz * tz * (Float4(3.0f) - Float4(2.0f) * tz);
   1120 		Float4 tw = Min(Max((x.w - edge0.w) / (edge1.w - edge0.w), Float4(0.0f)), Float4(1.0f)); dst.w = tw * tw * (Float4(3.0f) - Float4(2.0f) * tw);
   1121 	}
   1122 
	// Converts the 32-bit float bit patterns in floatBits to 16-bit half-float
	// bit patterns, lane by lane.
	//   dst              - receives the result. When storeInUpperBits is true the
	//                      new bits are OR-ed into the current contents of dst, so
	//                      dst is read as well as written; otherwise dst is
	//                      overwritten.
	//   floatBits        - source float values.
	//   storeInUpperBits - false: half bits land in bits 0..15 (sign at bit 15);
	//                      true:  half bits land in bits 16..31 (sign at bit 31).
	void ShaderCore::floatToHalfBits(Float4& dst, const Float4& floatBits, bool storeInUpperBits)
	{
		static const uint32_t mask_sign = 0x80000000u;         // sign bit of a 32-bit float
		static const uint32_t mask_round = ~0xfffu;            // clears the low 12 mantissa bits
		static const uint32_t c_f32infty = 255 << 23;          // float bit pattern with exponent field 255 and zero mantissa
		static const uint32_t c_magic = 15 << 23;              // float bit pattern with exponent field 15, used as a scale factor
		static const uint32_t c_nanbit = 0x200;                // mantissa bit set in the half result for NaN inputs
		static const uint32_t c_infty_as_fp16 = 0x7c00;        // half bit pattern with all five exponent bits set
		static const uint32_t c_clamp = (31 << 23) - 0x1000;   // upper bound applied to the scaled value

		// Split the input into its sign bit and its magnitude bits.
		UInt4 justsign = UInt4(mask_sign) & As<UInt4>(floatBits);
		UInt4 absf = As<UInt4>(floatBits) ^ justsign;
		// Lane mask from "c_f32infty is not <= absf", i.e. magnitude bits below the infinity pattern.
		UInt4 b_isnormal = CmpNLE(UInt4(c_f32infty), absf);

		// Note: this version doesn't round to the nearest even in case of a tie as defined by IEEE 754-2008, it rounds to +inf
		//       instead of nearest even, since that's fine for GLSL ES 3.0's needs (see section 2.1.1 Floating-Point Computation)
		// First operand of the OR (lanes selected by b_isnormal): multiply the truncated magnitude by the
		// c_magic float, clamp to c_clamp, subtract mask_round (modulo 2^32 this adds 0x1000), then shift
		// right by 13.
		// Second operand (all other lanes): c_infty_as_fp16, plus c_nanbit where absf is above the infinity pattern.
		UInt4 joined = ((((As<UInt4>(Min(As<Float4>(absf & UInt4(mask_round)) * As<Float4>(UInt4(c_magic)),
		                                 As<Float4>(UInt4(c_clamp))))) - UInt4(mask_round)) >> 13) & b_isnormal) |
		               ((b_isnormal ^ UInt4(0xFFFFFFFF)) & ((CmpNLE(absf, UInt4(c_f32infty)) & UInt4(c_nanbit)) |
		               UInt4(c_infty_as_fp16)));

		// Re-attach the sign and place the half value in the requested 16-bit slot.
		dst = As<Float4>(storeInUpperBits ? As<UInt4>(dst) | ((joined << 16) | justsign) : joined | (justsign >> 16));
	}
   1146 
	// Expands half-float bit patterns held in the low 16 bits of each lane of
	// halfBits into 32-bit float bit patterns written to dst.
	void ShaderCore::halfToFloatBits(Float4& dst, const Float4& halfBits)
	{
		static const uint32_t mask_nosign = 0x7FFF;            // low 15 bits: half exponent and mantissa
		static const uint32_t magic = (254 - 15) << 23;        // float bit pattern with exponent field 239, used as a scale factor
		static const uint32_t was_infnan = 0x7BFF;             // half magnitudes above this have all exponent bits set
		static const uint32_t exp_infnan = 255 << 23;          // float exponent field with all bits set

		UInt4 expmant = As<UInt4>(halfBits) & UInt4(mask_nosign);
		// Three parts are OR-ed together:
		//  1. the magnitude bits moved up by 13, reinterpreted as a float and multiplied by the magic float;
		//  2. the input bits outside mask_nosign (halfBits ^ expmant) moved up by 16, which carries bit 15 to bit 31;
		//  3. exp_infnan in lanes whose magnitude exceeds was_infnan.
		dst = As<Float4>(As<UInt4>(As<Float4>(expmant << 13) * As<Float4>(UInt4(magic))) |
		                 ((As<UInt4>(halfBits) ^ UInt4(expmant)) << 16) |
		                 (CmpNLE(As<UInt4>(expmant), UInt4(was_infnan)) & UInt4(exp_infnan)));
	}
   1159 
	// Packs s0.x and s0.y as two half-floats into d.x. Only d.x is written.
	// The call order matters: the first call overwrites d.x with the low half,
	// the second ORs the high half into the value just written.
	void ShaderCore::packHalf2x16(Vector4f &d, const Vector4f &s0)
	{
		// half2 | half1
		floatToHalfBits(d.x, s0.x, false);   // s0.x -> bits 0..15
		floatToHalfBits(d.x, s0.y, true);    // s0.y -> bits 16..31, merged with the above
	}
   1166 
   1167 	void ShaderCore::unpackHalf2x16(Vector4f &dst, const Vector4f &s0)
   1168 	{
   1169 		// half2 | half1
   1170 		halfToFloatBits(dst.x, As<Float4>(As<UInt4>(s0.x) & UInt4(0x0000FFFF)));
   1171 		halfToFloatBits(dst.y, As<Float4>((As<UInt4>(s0.x) & UInt4(0xFFFF0000)) >> 16));
   1172 	}
   1173 
   1174 	void ShaderCore::packSnorm2x16(Vector4f &d, const Vector4f &s0)
   1175 	{
   1176 		// round(clamp(c, -1.0, 1.0) * 32767.0)
   1177 		d.x = As<Float4>((Int4(Round(Min(Max(s0.x, Float4(-1.0f)), Float4(1.0f)) * Float4(32767.0f))) & Int4(0xFFFF)) |
   1178 		                ((Int4(Round(Min(Max(s0.y, Float4(-1.0f)), Float4(1.0f)) * Float4(32767.0f))) & Int4(0xFFFF)) << 16));
   1179 	}
   1180 
   1181 	void ShaderCore::packUnorm2x16(Vector4f &d, const Vector4f &s0)
   1182 	{
   1183 		// round(clamp(c, 0.0, 1.0) * 65535.0)
   1184 		d.x = As<Float4>((Int4(Round(Min(Max(s0.x, Float4(0.0f)), Float4(1.0f)) * Float4(65535.0f))) & Int4(0xFFFF)) |
   1185 		                ((Int4(Round(Min(Max(s0.y, Float4(0.0f)), Float4(1.0f)) * Float4(65535.0f))) & Int4(0xFFFF)) << 16));
   1186 	}
   1187 
   1188 	void ShaderCore::unpackSnorm2x16(Vector4f &dst, const Vector4f &s0)
   1189 	{
   1190 		// clamp(f / 32727.0, -1.0, 1.0)
   1191 		dst.x = Min(Max(Float4(As<Int4>((As<UInt4>(s0.x) & UInt4(0x0000FFFF)) << 16)) * Float4(1.0f / float(0x7FFF0000)), Float4(-1.0f)), Float4(1.0f));
   1192 		dst.y = Min(Max(Float4(As<Int4>(As<UInt4>(s0.x) & UInt4(0xFFFF0000))) * Float4(1.0f / float(0x7FFF0000)), Float4(-1.0f)), Float4(1.0f));
   1193 	}
   1194 
   1195 	void ShaderCore::unpackUnorm2x16(Vector4f &dst, const Vector4f &s0)
   1196 	{
   1197 		// f / 65535.0
   1198 		dst.x = Float4((As<UInt4>(s0.x) & UInt4(0x0000FFFF)) << 16) * Float4(1.0f / float(0xFFFF0000));
   1199 		dst.y = Float4(As<UInt4>(s0.x) & UInt4(0xFFFF0000)) * Float4(1.0f / float(0xFFFF0000));
   1200 	}
   1201 
   1202 	void ShaderCore::det2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
   1203 	{
   1204 		dst.x = src0.x * src1.y - src0.y * src1.x;
   1205 		dst.y = dst.z = dst.w = dst.x;
   1206 	}
   1207 
	// 3x3 determinant computed as a scalar triple product: the cross product of
	// src1 and src2 is dotted with src0, and dp3() replicates the result to all
	// four components of dst.
	// NOTE(review): dst is used as scratch for the cross product before src0 is
	// read, so this presumably relies on dst not being the same object as src0,
	// src1 or src2 - confirm against callers.
	void ShaderCore::det3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
	{
		crs(dst, src1, src2);   // dst.xyz = src1 x src2
		dp3(dst, dst, src0);    // dst.xyzw = dot3(dst, src0)
	}
   1213 
	// 4x4 determinant of the matrix formed from src0..src3, replicated to every
	// component of dst.
	// NOTE(review): dst.x, dst.y and dst.z are used as scratch storage while the
	// sources are still being read, so this presumably relies on dst not being
	// the same object as any source - confirm against callers.
	void ShaderCore::det4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2, const Vector4f &src3)
	{
		// Three 2x2 sub-determinants over the z/w components, kept in dst as temporaries.
		dst.x = src2.z * src3.w - src2.w * src3.z;
		dst.y = src1.w * src3.z - src1.z * src3.w;
		dst.z = src1.z * src2.w - src1.w * src2.z;
		// The src0.x and src0.y terms reuse the temporaries above; the src0.z and
		// src0.w terms spell out their own 2x2 sub-determinants inline.
		dst.x = src0.x * (src1.y * dst.x + src2.y * dst.y + src3.y * dst.z) -
		        src0.y * (src1.x * dst.x + src2.x * dst.y + src3.x * dst.z) +
		        src0.z * (src1.x * (src2.y * src3.w - src2.w * src3.y) +
		                  src2.x * (src1.w * src3.y - src1.y * src3.w) +
		                  src3.x * (src1.y * src2.w - src1.w * src2.y)) +
		        src0.w * (src1.x * (src2.z * src3.y - src2.y * src3.z) +
		                  src2.x * (src1.y * src3.z - src1.z * src3.y) +
		                  src3.x * (src1.z * src2.y - src1.y * src2.z));
		// Broadcast the final value, overwriting the scratch components.
		dst.y = dst.z = dst.w = dst.x;
	}
   1229 
	// Component-wise wrapper: applies Frac() to each of the four components of src.
	void ShaderCore::frc(Vector4f &dst, const Vector4f &src)
	{
		dst.x = Frac(src.x);
		dst.y = Frac(src.y);
		dst.z = Frac(src.z);
		dst.w = Frac(src.w);
	}
   1237 
	// Component-wise wrapper: applies Trunc() to each of the four components of src.
	void ShaderCore::trunc(Vector4f &dst, const Vector4f &src)
	{
		dst.x = Trunc(src.x);
		dst.y = Trunc(src.y);
		dst.z = Trunc(src.z);
		dst.w = Trunc(src.w);
	}
   1245 
	// Component-wise wrapper: applies Floor() to each of the four components of src.
	void ShaderCore::floor(Vector4f &dst, const Vector4f &src)
	{
		dst.x = Floor(src.x);
		dst.y = Floor(src.y);
		dst.z = Floor(src.z);
		dst.w = Floor(src.w);
	}
   1253 
	// Component-wise wrapper: applies Round() to each of the four components of
	// src. How ties are resolved is whatever Round() does; roundEven() is written
	// to work with either tie direction.
	void ShaderCore::round(Vector4f &dst, const Vector4f &src)
	{
		dst.x = Round(src.x);
		dst.y = Round(src.y);
		dst.z = Round(src.z);
		dst.w = Round(src.w);
	}
   1261 
	// Component-wise round-half-to-even. Starts from round() and then nudges the
	// result by +1 or -1 toward src, but only in lanes where the fractional part
	// is exactly 0.5 and the rounded value is odd.
	// NOTE(review): src is re-read after dst has been written by round(), so this
	// presumably relies on dst and src being distinct objects - confirm against
	// callers.
	void ShaderCore::roundEven(Vector4f &dst, const Vector4f &src)
	{
		// dst = round(src) + ((round(src) < src) * 2 - 1) * (fract(src) == 0.5) * isOdd(round(src));
		// ex.: 1.5:  2 + (0 * 2 - 1) * 1 * 0 = 2
		//      2.5:  3 + (0 * 2 - 1) * 1 * 1 = 2
		//     -1.5: -2 + (1 * 2 - 1) * 1 * 0 = -2
		//     -2.5: -3 + (1 * 2 - 1) * 1 * 1 = -2
		// Even if the round implementation rounds the other way:
		//      1.5:  1 + (1 * 2 - 1) * 1 * 1 = 2
		//      2.5:  2 + (1 * 2 - 1) * 1 * 0 = 2
		//     -1.5: -1 + (0 * 2 - 1) * 1 * 1 = -2
		//     -2.5: -2 + (0 * 2 - 1) * 1 * 0 = -2
		round(dst, src);
		// Each comparison mask is ANDed with Int4(1) and converted to float to form the 0/1 factors of the formula above.
		dst.x += ((Float4(CmpLT(dst.x, src.x) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.x), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.x) & Int4(1));
		dst.y += ((Float4(CmpLT(dst.y, src.y) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.y), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.y) & Int4(1));
		dst.z += ((Float4(CmpLT(dst.z, src.z) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.z), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.z) & Int4(1));
		dst.w += ((Float4(CmpLT(dst.w, src.w) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.w), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.w) & Int4(1));
	}
   1280 
	// Component-wise wrapper: applies Ceil() to each of the four components of src.
	void ShaderCore::ceil(Vector4f &dst, const Vector4f &src)
	{
		dst.x = Ceil(src.x);
		dst.y = Ceil(src.y);
		dst.z = Ceil(src.z);
		dst.w = Ceil(src.w);
	}
   1288 
   1289 	void ShaderCore::powx(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
   1290 	{
   1291 		Float4 pow = power(src0.x, src1.x, pp);
   1292 
   1293 		dst.x = pow;
   1294 		dst.y = pow;
   1295 		dst.z = pow;
   1296 		dst.w = pow;
   1297 	}
   1298 
   1299 	void ShaderCore::pow(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
   1300 	{
   1301 		dst.x = power(src0.x, src1.x, pp);
   1302 		dst.y = power(src0.y, src1.y, pp);
   1303 		dst.z = power(src0.z, src1.z, pp);
   1304 		dst.w = power(src0.w, src1.w, pp);
   1305 	}
   1306 
	// Cross product of the xyz parts of src0 and src1, written to dst.x, dst.y
	// and dst.z. dst.w is left untouched.
	// NOTE(review): components are stored one at a time while later components
	// still read the sources, so if dst were the same object as src0 or src1 the
	// y and z results would use already-overwritten values - confirm callers
	// never alias.
	void ShaderCore::crs(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
	{
		dst.x = src0.y * src1.z - src0.z * src1.y;
		dst.y = src0.z * src1.x - src0.x * src1.z;
		dst.z = src0.x * src1.y - src0.y * src1.x;
	}
   1313 
   1314 	void ShaderCore::forward1(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
   1315 	{
   1316 		Int4 flip = CmpNLT(Nref.x * I.x, Float4(0.0f)) & Int4(0x80000000);
   1317 
   1318 		dst.x =  As<Float4>(flip ^ As<Int4>(N.x));
   1319 	}
   1320 
   1321 	void ShaderCore::forward2(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
   1322 	{
   1323 		Int4 flip = CmpNLT(dot2(Nref, I), Float4(0.0f)) & Int4(0x80000000);
   1324 
   1325 		dst.x =  As<Float4>(flip ^ As<Int4>(N.x));
   1326 		dst.y =  As<Float4>(flip ^ As<Int4>(N.y));
   1327 	}
   1328 
   1329 	void ShaderCore::forward3(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
   1330 	{
   1331 		Int4 flip = CmpNLT(dot3(Nref, I), Float4(0.0f)) & Int4(0x80000000);
   1332 
   1333 		dst.x =  As<Float4>(flip ^ As<Int4>(N.x));
   1334 		dst.y =  As<Float4>(flip ^ As<Int4>(N.y));
   1335 		dst.z =  As<Float4>(flip ^ As<Int4>(N.z));
   1336 	}
   1337 
   1338 	void ShaderCore::forward4(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
   1339 	{
   1340 		Int4 flip = CmpNLT(dot4(Nref, I), Float4(0.0f)) & Int4(0x80000000);
   1341 
   1342 		dst.x =  As<Float4>(flip ^ As<Int4>(N.x));
   1343 		dst.y =  As<Float4>(flip ^ As<Int4>(N.y));
   1344 		dst.z =  As<Float4>(flip ^ As<Int4>(N.z));
   1345 		dst.w =  As<Float4>(flip ^ As<Int4>(N.w));
   1346 	}
   1347 
   1348 	void ShaderCore::reflect1(Vector4f &dst, const Vector4f &I, const Vector4f &N)
   1349 	{
   1350 		Float4 d = N.x * I.x;
   1351 
   1352 		dst.x = I.x - Float4(2.0f) * d * N.x;
   1353 	}
   1354 
   1355 	void ShaderCore::reflect2(Vector4f &dst, const Vector4f &I, const Vector4f &N)
   1356 	{
   1357 		Float4 d = dot2(N, I);
   1358 
   1359 		dst.x = I.x - Float4(2.0f) * d * N.x;
   1360 		dst.y = I.y - Float4(2.0f) * d * N.y;
   1361 	}
   1362 
   1363 	void ShaderCore::reflect3(Vector4f &dst, const Vector4f &I, const Vector4f &N)
   1364 	{
   1365 		Float4 d = dot3(N, I);
   1366 
   1367 		dst.x = I.x - Float4(2.0f) * d * N.x;
   1368 		dst.y = I.y - Float4(2.0f) * d * N.y;
   1369 		dst.z = I.z - Float4(2.0f) * d * N.z;
   1370 	}
   1371 
   1372 	void ShaderCore::reflect4(Vector4f &dst, const Vector4f &I, const Vector4f &N)
   1373 	{
   1374 		Float4 d = dot4(N, I);
   1375 
   1376 		dst.x = I.x - Float4(2.0f) * d * N.x;
   1377 		dst.y = I.y - Float4(2.0f) * d * N.y;
   1378 		dst.z = I.z - Float4(2.0f) * d * N.z;
   1379 		dst.w = I.w - Float4(2.0f) * d * N.w;
   1380 	}
   1381 
   1382 	void ShaderCore::refract1(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
   1383 	{
   1384 		Float4 d = N.x * I.x;
   1385 		Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d);
   1386 		Int4 pos = CmpNLT(k, Float4(0.0f));
   1387 		Float4 t = (eta * d + Sqrt(k));
   1388 
   1389 		dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
   1390 	}
   1391 
   1392 	void ShaderCore::refract2(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
   1393 	{
   1394 		Float4 d = dot2(N, I);
   1395 		Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d);
   1396 		Int4 pos = CmpNLT(k, Float4(0.0f));
   1397 		Float4 t = (eta * d + Sqrt(k));
   1398 
   1399 		dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
   1400 		dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y));
   1401 	}
   1402 
   1403 	void ShaderCore::refract3(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
   1404 	{
   1405 		Float4 d = dot3(N, I);
   1406 		Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d);
   1407 		Int4 pos = CmpNLT(k, Float4(0.0f));
   1408 		Float4 t = (eta * d + Sqrt(k));
   1409 
   1410 		dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
   1411 		dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y));
   1412 		dst.z = As<Float4>(pos & As<Int4>(eta * I.z - t * N.z));
   1413 	}
   1414 
   1415 	void ShaderCore::refract4(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
   1416 	{
   1417 		Float4 d = dot4(N, I);
   1418 		Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d);
   1419 		Int4 pos = CmpNLT(k, Float4(0.0f));
   1420 		Float4 t = (eta * d + Sqrt(k));
   1421 
   1422 		dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
   1423 		dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y));
   1424 		dst.z = As<Float4>(pos & As<Int4>(eta * I.z - t * N.z));
   1425 		dst.w = As<Float4>(pos & As<Int4>(eta * I.w - t * N.w));
   1426 	}
   1427 
	// Component-wise floating-point sign: forwards each component of src to the
	// Float4 overload of sgn().
	void ShaderCore::sgn(Vector4f &dst, const Vector4f &src)
	{
		sgn(dst.x, src.x);
		sgn(dst.y, src.y);
		sgn(dst.z, src.z);
		sgn(dst.w, src.w);
	}
   1435 
	// Component-wise integer sign: forwards each component of src to the Float4
	// overload of isgn(), which works on the components' integer bit patterns.
	void ShaderCore::isgn(Vector4f &dst, const Vector4f &src)
	{
		isgn(dst.x, src.x);
		isgn(dst.y, src.y);
		isgn(dst.z, src.z);
		isgn(dst.w, src.w);
	}
   1443 
   1444 	void ShaderCore::abs(Vector4f &dst, const Vector4f &src)
   1445 	{
   1446 		dst.x = Abs(src.x);
   1447 		dst.y = Abs(src.y);
   1448 		dst.z = Abs(src.z);
   1449 		dst.w = Abs(src.w);
   1450 	}
   1451 
   1452 	void ShaderCore::iabs(Vector4f &dst, const Vector4f &src)
   1453 	{
   1454 		dst.x = As<Float4>(Abs(As<Int4>(src.x)));
   1455 		dst.y = As<Float4>(Abs(As<Int4>(src.y)));
   1456 		dst.z = As<Float4>(Abs(As<Int4>(src.z)));
   1457 		dst.w = As<Float4>(Abs(As<Int4>(src.w)));
   1458 	}
   1459 
   1460 	void ShaderCore::nrm2(Vector4f &dst, const Vector4f &src, bool pp)
   1461 	{
   1462 		Float4 dot = dot2(src, src);
   1463 		Float4 rsq = reciprocalSquareRoot(dot, false, pp);
   1464 
   1465 		dst.x = src.x * rsq;
   1466 		dst.y = src.y * rsq;
   1467 		dst.z = src.z * rsq;
   1468 		dst.w = src.w * rsq;
   1469 	}
   1470 
   1471 	void ShaderCore::nrm3(Vector4f &dst, const Vector4f &src, bool pp)
   1472 	{
   1473 		Float4 dot = dot3(src, src);
   1474 		Float4 rsq = reciprocalSquareRoot(dot, false, pp);
   1475 
   1476 		dst.x = src.x * rsq;
   1477 		dst.y = src.y * rsq;
   1478 		dst.z = src.z * rsq;
   1479 		dst.w = src.w * rsq;
   1480 	}
   1481 
   1482 	void ShaderCore::nrm4(Vector4f &dst, const Vector4f &src, bool pp)
   1483 	{
   1484 		Float4 dot = dot4(src, src);
   1485 		Float4 rsq = reciprocalSquareRoot(dot, false, pp);
   1486 
   1487 		dst.x = src.x * rsq;
   1488 		dst.y = src.y * rsq;
   1489 		dst.z = src.z * rsq;
   1490 		dst.w = src.w * rsq;
   1491 	}
   1492 
	// Scalar sine/cosine pair of src.x: dst.x receives cosine_pi(src.x) and dst.y
	// receives sine_pi(src.x). dst.z and dst.w are left untouched.
	// NOTE(review): src.x is read again after dst.x has been written, so this
	// presumably relies on dst and src being distinct objects - confirm against
	// callers. The input range expected by the *_pi helpers is not visible here.
	void ShaderCore::sincos(Vector4f &dst, const Vector4f &src, bool pp)
	{
		dst.x = cosine_pi(src.x, pp);
		dst.y = sine_pi(src.x, pp);
	}
   1498 
	// Component-wise wrapper: applies cosine() to each of the four components of
	// src, forwarding pp unchanged.
	void ShaderCore::cos(Vector4f &dst, const Vector4f &src, bool pp)
	{
		dst.x = cosine(src.x, pp);
		dst.y = cosine(src.y, pp);
		dst.z = cosine(src.z, pp);
		dst.w = cosine(src.w, pp);
	}
   1506 
	// Component-wise wrapper: applies sine() to each of the four components of
	// src, forwarding pp unchanged.
	void ShaderCore::sin(Vector4f &dst, const Vector4f &src, bool pp)
	{
		dst.x = sine(src.x, pp);
		dst.y = sine(src.y, pp);
		dst.z = sine(src.z, pp);
		dst.w = sine(src.w, pp);
	}
   1514 
	// Component-wise wrapper: applies tangent() to each of the four components of
	// src, forwarding pp unchanged.
	void ShaderCore::tan(Vector4f &dst, const Vector4f &src, bool pp)
	{
		dst.x = tangent(src.x, pp);
		dst.y = tangent(src.y, pp);
		dst.z = tangent(src.z, pp);
		dst.w = tangent(src.w, pp);
	}
   1522 
	// Component-wise wrapper: applies arccos() to each of the four components of
	// src, forwarding pp unchanged.
	void ShaderCore::acos(Vector4f &dst, const Vector4f &src, bool pp)
	{
		dst.x = arccos(src.x, pp);
		dst.y = arccos(src.y, pp);
		dst.z = arccos(src.z, pp);
		dst.w = arccos(src.w, pp);
	}
   1530 
	// Component-wise wrapper: applies arcsin() to each of the four components of
	// src, forwarding pp unchanged.
	void ShaderCore::asin(Vector4f &dst, const Vector4f &src, bool pp)
	{
		dst.x = arcsin(src.x, pp);
		dst.y = arcsin(src.y, pp);
		dst.z = arcsin(src.z, pp);
		dst.w = arcsin(src.w, pp);
	}
   1538 
	// Component-wise wrapper: applies the one-operand arctan() to each of the
	// four components of src, forwarding pp unchanged.
	void ShaderCore::atan(Vector4f &dst, const Vector4f &src, bool pp)
	{
		dst.x = arctan(src.x, pp);
		dst.y = arctan(src.y, pp);
		dst.z = arctan(src.z, pp);
		dst.w = arctan(src.w, pp);
	}
   1546 
	// Component-wise wrapper: applies the two-operand arctan() to matching
	// components of src0 (first operand) and src1 (second operand), forwarding pp
	// unchanged.
	void ShaderCore::atan2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
	{
		dst.x = arctan(src0.x, src1.x, pp);
		dst.y = arctan(src0.y, src1.y, pp);
		dst.z = arctan(src0.z, src1.z, pp);
		dst.w = arctan(src0.w, src1.w, pp);
	}
   1554 
	// Component-wise wrapper: applies cosineh() to each of the four components of
	// src, forwarding pp unchanged.
	void ShaderCore::cosh(Vector4f &dst, const Vector4f &src, bool pp)
	{
		dst.x = cosineh(src.x, pp);
		dst.y = cosineh(src.y, pp);
		dst.z = cosineh(src.z, pp);
		dst.w = cosineh(src.w, pp);
	}
   1562 
	// Component-wise wrapper: applies sineh() to each of the four components of
	// src, forwarding pp unchanged.
	void ShaderCore::sinh(Vector4f &dst, const Vector4f &src, bool pp)
	{
		dst.x = sineh(src.x, pp);
		dst.y = sineh(src.y, pp);
		dst.z = sineh(src.z, pp);
		dst.w = sineh(src.w, pp);
	}
   1570 
	// Component-wise wrapper: applies tangenth() to each of the four components
	// of src, forwarding pp unchanged.
	void ShaderCore::tanh(Vector4f &dst, const Vector4f &src, bool pp)
	{
		dst.x = tangenth(src.x, pp);
		dst.y = tangenth(src.y, pp);
		dst.z = tangenth(src.z, pp);
		dst.w = tangenth(src.w, pp);
	}
   1578 
	// Component-wise wrapper: applies arccosh() to each of the four components of
	// src, forwarding pp unchanged.
	void ShaderCore::acosh(Vector4f &dst, const Vector4f &src, bool pp)
	{
		dst.x = arccosh(src.x, pp);
		dst.y = arccosh(src.y, pp);
		dst.z = arccosh(src.z, pp);
		dst.w = arccosh(src.w, pp);
	}
   1586 
	// Component-wise wrapper: applies arcsinh() to each of the four components of
	// src, forwarding pp unchanged.
	void ShaderCore::asinh(Vector4f &dst, const Vector4f &src, bool pp)
	{
		dst.x = arcsinh(src.x, pp);
		dst.y = arcsinh(src.y, pp);
		dst.z = arcsinh(src.z, pp);
		dst.w = arcsinh(src.w, pp);
	}
   1594 
	// Component-wise wrapper: applies arctanh() to each of the four components of
	// src, forwarding pp unchanged.
	void ShaderCore::atanh(Vector4f &dst, const Vector4f &src, bool pp)
	{
		dst.x = arctanh(src.x, pp);
		dst.y = arctanh(src.y, pp);
		dst.z = arctanh(src.z, pp);
		dst.w = arctanh(src.w, pp);
	}
   1602 
   1603 	void ShaderCore::expp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel)
   1604 	{
   1605 		if(shaderModel < 0x0200)
   1606 		{
   1607 			Float4 frc = Frac(src.x);
   1608 			Float4 floor = src.x - frc;
   1609 
   1610 			dst.x = exponential2(floor, true);
   1611 			dst.y = frc;
   1612 			dst.z = exponential2(src.x, true);
   1613 			dst.w = Float4(1.0f);
   1614 		}
   1615 		else   // Version >= 2.0
   1616 		{
   1617 			exp2x(dst, src, true);   // FIXME: 10-bit precision suffices
   1618 		}
   1619 	}
   1620 
   1621 	void ShaderCore::logp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel)
   1622 	{
   1623 		if(shaderModel < 0x0200)
   1624 		{
   1625 			Float4 tmp0;
   1626 			Float4 tmp1;
   1627 			Float4 t;
   1628 			Int4 r;
   1629 
   1630 			tmp0 = Abs(src.x);
   1631 			tmp1 = tmp0;
   1632 
   1633 			// X component
   1634 			r = As<Int4>(As<UInt4>(tmp0) >> 23) - Int4(127);
   1635 			dst.x = Float4(r);
   1636 
   1637 			// Y component
   1638 			dst.y = As<Float4>((As<Int4>(tmp1) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));
   1639 
   1640 			// Z component
   1641 			dst.z = logarithm2(src.x, true, true);
   1642 
   1643 			// W component
   1644 			dst.w = 1.0f;
   1645 		}
   1646 		else
   1647 		{
   1648 			log2x(dst, src, true);
   1649 		}
   1650 	}
   1651 
	// Component-wise conditional select: forwards each component to the Float4
	// overload of cmp0(), which picks src1 where 0 <= src0 and src2 otherwise.
	void ShaderCore::cmp0(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
	{
		cmp0(dst.x, src0.x, src1.x, src2.x);
		cmp0(dst.y, src0.y, src1.y, src2.y);
		cmp0(dst.z, src0.z, src1.z, src2.z);
		cmp0(dst.w, src0.w, src1.w, src2.w);
	}
   1659 
	// Component-wise bitwise select: the bit patterns of src0 are used as the
	// mask passed to the Float4 overload of select(), which takes bits from src1
	// where the mask is set and from src2 where it is clear.
	void ShaderCore::select(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
	{
		select(dst.x, As<Int4>(src0.x), src1.x, src2.x);
		select(dst.y, As<Int4>(src0.y), src1.y, src2.y);
		select(dst.z, As<Int4>(src0.z), src1.z, src2.z);
		select(dst.w, As<Int4>(src0.w), src1.w, src2.w);
	}
   1667 
	// Dynamic component extraction. The bit pattern of src1 is read as an integer
	// index per lane: 1 selects src0.y, 2 selects src0.z, 3 selects src0.w, and
	// any other value (including 0) leaves src0.x from the first select.
	// The three selects are chained through dst, so their order matters.
	void ShaderCore::extract(Float4 &dst, const Vector4f &src0, const Float4 &src1)
	{
		select(dst, CmpEQ(As<Int4>(src1), Int4(1)), src0.y, src0.x);   // default to x, or y when index == 1
		select(dst, CmpEQ(As<Int4>(src1), Int4(2)), src0.z, dst);      // override with z when index == 2
		select(dst, CmpEQ(As<Int4>(src1), Int4(3)), src0.w, dst);      // override with w when index == 3
	}
   1674 
	// Dynamic component insertion. dst becomes a copy of src except that, per
	// lane, the component whose number (0..3) equals the integer bit pattern of
	// index is replaced by element. Lanes whose index matches none of 0..3 get an
	// unmodified copy of src.
	void ShaderCore::insert(Vector4f &dst, const Vector4f &src, const Float4 &element, const Float4 &index)
	{
		select(dst.x, CmpEQ(As<Int4>(index), Int4(0)), element, src.x);
		select(dst.y, CmpEQ(As<Int4>(index), Int4(1)), element, src.y);
		select(dst.z, CmpEQ(As<Int4>(index), Int4(2)), element, src.z);
		select(dst.w, CmpEQ(As<Int4>(index), Int4(3)), element, src.w);
	}
   1682 
   1683 	void ShaderCore::sgn(Float4 &dst, const Float4 &src)
   1684 	{
   1685 		Int4 neg = As<Int4>(CmpLT(src, Float4(-0.0f))) & As<Int4>(Float4(-1.0f));
   1686 		Int4 pos = As<Int4>(CmpNLE(src, Float4(+0.0f))) & As<Int4>(Float4(1.0f));
   1687 		dst = As<Float4>(neg | pos);
   1688 	}
   1689 
   1690 	void ShaderCore::isgn(Float4 &dst, const Float4 &src)
   1691 	{
   1692 		Int4 neg = CmpLT(As<Int4>(src), Int4(0)) & Int4(-1);
   1693 		Int4 pos = CmpNLE(As<Int4>(src), Int4(0)) & Int4(1);
   1694 		dst = As<Float4>(neg | pos);
   1695 	}
   1696 
   1697 	void ShaderCore::cmp0(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2)
   1698 	{
   1699 		Int4 pos = CmpLE(Float4(0.0f), src0);
   1700 		select(dst, pos, src1, src2);
   1701 	}
   1702 
   1703 	void ShaderCore::cmp0i(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2)
   1704 	{
   1705 		Int4 pos = CmpEQ(Int4(0), As<Int4>(src0));
   1706 		select(dst, pos, src1, src2);
   1707 	}
   1708 
   1709 	void ShaderCore::select(Float4 &dst, RValue<Int4> src0, const Float4 &src1, const Float4 &src2)
   1710 	{
   1711 		// FIXME: LLVM vector select
   1712 		dst = As<Float4>((src0 & As<Int4>(src1)) | (~src0 & As<Int4>(src2)));
   1713 	}
   1714 
   1715 	void ShaderCore::cmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
   1716 	{
   1717 		switch(control)
   1718 		{
   1719 		case Shader::CONTROL_GT:
   1720 			dst.x = As<Float4>(CmpNLE(src0.x, src1.x));
   1721 			dst.y = As<Float4>(CmpNLE(src0.y, src1.y));
   1722 			dst.z = As<Float4>(CmpNLE(src0.z, src1.z));
   1723 			dst.w = As<Float4>(CmpNLE(src0.w, src1.w));
   1724 			break;
   1725 		case Shader::CONTROL_EQ:
   1726 			dst.x = As<Float4>(CmpEQ(src0.x, src1.x));
   1727 			dst.y = As<Float4>(CmpEQ(src0.y, src1.y));
   1728 			dst.z = As<Float4>(CmpEQ(src0.z, src1.z));
   1729 			dst.w = As<Float4>(CmpEQ(src0.w, src1.w));
   1730 			break;
   1731 		case Shader::CONTROL_GE:
   1732 			dst.x = As<Float4>(CmpNLT(src0.x, src1.x));
   1733 			dst.y = As<Float4>(CmpNLT(src0.y, src1.y));
   1734 			dst.z = As<Float4>(CmpNLT(src0.z, src1.z));
   1735 			dst.w = As<Float4>(CmpNLT(src0.w, src1.w));
   1736 			break;
   1737 		case Shader::CONTROL_LT:
   1738 			dst.x = As<Float4>(CmpLT(src0.x, src1.x));
   1739 			dst.y = As<Float4>(CmpLT(src0.y, src1.y));
   1740 			dst.z = As<Float4>(CmpLT(src0.z, src1.z));
   1741 			dst.w = As<Float4>(CmpLT(src0.w, src1.w));
   1742 			break;
   1743 		case Shader::CONTROL_NE:
   1744 			dst.x = As<Float4>(CmpNEQ(src0.x, src1.x));
   1745 			dst.y = As<Float4>(CmpNEQ(src0.y, src1.y));
   1746 			dst.z = As<Float4>(CmpNEQ(src0.z, src1.z));
   1747 			dst.w = As<Float4>(CmpNEQ(src0.w, src1.w));
   1748 			break;
   1749 		case Shader::CONTROL_LE:
   1750 			dst.x = As<Float4>(CmpLE(src0.x, src1.x));
   1751 			dst.y = As<Float4>(CmpLE(src0.y, src1.y));
   1752 			dst.z = As<Float4>(CmpLE(src0.z, src1.z));
   1753 			dst.w = As<Float4>(CmpLE(src0.w, src1.w));
   1754 			break;
   1755 		default:
   1756 			ASSERT(false);
   1757 		}
   1758 	}
   1759 
   1760 	void ShaderCore::icmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
   1761 	{
   1762 		switch(control)
   1763 		{
   1764 		case Shader::CONTROL_GT:
   1765 			dst.x = As<Float4>(CmpNLE(As<Int4>(src0.x), As<Int4>(src1.x)));
   1766 			dst.y = As<Float4>(CmpNLE(As<Int4>(src0.y), As<Int4>(src1.y)));
   1767 			dst.z = As<Float4>(CmpNLE(As<Int4>(src0.z), As<Int4>(src1.z)));
   1768 			dst.w = As<Float4>(CmpNLE(As<Int4>(src0.w), As<Int4>(src1.w)));
   1769 			break;
   1770 		case Shader::CONTROL_EQ:
   1771 			dst.x = As<Float4>(CmpEQ(As<Int4>(src0.x), As<Int4>(src1.x)));
   1772 			dst.y = As<Float4>(CmpEQ(As<Int4>(src0.y), As<Int4>(src1.y)));
   1773 			dst.z = As<Float4>(CmpEQ(As<Int4>(src0.z), As<Int4>(src1.z)));
   1774 			dst.w = As<Float4>(CmpEQ(As<Int4>(src0.w), As<Int4>(src1.w)));
   1775 			break;
   1776 		case Shader::CONTROL_GE:
   1777 			dst.x = As<Float4>(CmpNLT(As<Int4>(src0.x), As<Int4>(src1.x)));
   1778 			dst.y = As<Float4>(CmpNLT(As<Int4>(src0.y), As<Int4>(src1.y)));
   1779 			dst.z = As<Float4>(CmpNLT(As<Int4>(src0.z), As<Int4>(src1.z)));
   1780 			dst.w = As<Float4>(CmpNLT(As<Int4>(src0.w), As<Int4>(src1.w)));
   1781 			break;
   1782 		case Shader::CONTROL_LT:
   1783 			dst.x = As<Float4>(CmpLT(As<Int4>(src0.x), As<Int4>(src1.x)));
   1784 			dst.y = As<Float4>(CmpLT(As<Int4>(src0.y), As<Int4>(src1.y)));
   1785 			dst.z = As<Float4>(CmpLT(As<Int4>(src0.z), As<Int4>(src1.z)));
   1786 			dst.w = As<Float4>(CmpLT(As<Int4>(src0.w), As<Int4>(src1.w)));
   1787 			break;
   1788 		case Shader::CONTROL_NE:
   1789 			dst.x = As<Float4>(CmpNEQ(As<Int4>(src0.x), As<Int4>(src1.x)));
   1790 			dst.y = As<Float4>(CmpNEQ(As<Int4>(src0.y), As<Int4>(src1.y)));
   1791 			dst.z = As<Float4>(CmpNEQ(As<Int4>(src0.z), As<Int4>(src1.z)));
   1792 			dst.w = As<Float4>(CmpNEQ(As<Int4>(src0.w), As<Int4>(src1.w)));
   1793 			break;
   1794 		case Shader::CONTROL_LE:
   1795 			dst.x = As<Float4>(CmpLE(As<Int4>(src0.x), As<Int4>(src1.x)));
   1796 			dst.y = As<Float4>(CmpLE(As<Int4>(src0.y), As<Int4>(src1.y)));
   1797 			dst.z = As<Float4>(CmpLE(As<Int4>(src0.z), As<Int4>(src1.z)));
   1798 			dst.w = As<Float4>(CmpLE(As<Int4>(src0.w), As<Int4>(src1.w)));
   1799 			break;
   1800 		default:
   1801 			ASSERT(false);
   1802 		}
   1803 	}
   1804 
   1805 	void ShaderCore::ucmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
   1806 	{
   1807 		switch(control)
   1808 		{
   1809 		case Shader::CONTROL_GT:
   1810 			dst.x = As<Float4>(CmpNLE(As<UInt4>(src0.x), As<UInt4>(src1.x)));
   1811 			dst.y = As<Float4>(CmpNLE(As<UInt4>(src0.y), As<UInt4>(src1.y)));
   1812 			dst.z = As<Float4>(CmpNLE(As<UInt4>(src0.z), As<UInt4>(src1.z)));
   1813 			dst.w = As<Float4>(CmpNLE(As<UInt4>(src0.w), As<UInt4>(src1.w)));
   1814 			break;
   1815 		case Shader::CONTROL_EQ:
   1816 			dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)));
   1817 			dst.y = As<Float4>(CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)));
   1818 			dst.z = As<Float4>(CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)));
   1819 			dst.w = As<Float4>(CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
   1820 			break;
   1821 		case Shader::CONTROL_GE:
   1822 			dst.x = As<Float4>(CmpNLT(As<UInt4>(src0.x), As<UInt4>(src1.x)));
   1823 			dst.y = As<Float4>(CmpNLT(As<UInt4>(src0.y), As<UInt4>(src1.y)));
   1824 			dst.z = As<Float4>(CmpNLT(As<UInt4>(src0.z), As<UInt4>(src1.z)));
   1825 			dst.w = As<Float4>(CmpNLT(As<UInt4>(src0.w), As<UInt4>(src1.w)));
   1826 			break;
   1827 		case Shader::CONTROL_LT:
   1828 			dst.x = As<Float4>(CmpLT(As<UInt4>(src0.x), As<UInt4>(src1.x)));
   1829 			dst.y = As<Float4>(CmpLT(As<UInt4>(src0.y), As<UInt4>(src1.y)));
   1830 			dst.z = As<Float4>(CmpLT(As<UInt4>(src0.z), As<UInt4>(src1.z)));
   1831 			dst.w = As<Float4>(CmpLT(As<UInt4>(src0.w), As<UInt4>(src1.w)));
   1832 			break;
   1833 		case Shader::CONTROL_NE:
   1834 			dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)));
   1835 			dst.y = As<Float4>(CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)));
   1836 			dst.z = As<Float4>(CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)));
   1837 			dst.w = As<Float4>(CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
   1838 			break;
   1839 		case Shader::CONTROL_LE:
   1840 			dst.x = As<Float4>(CmpLE(As<UInt4>(src0.x), As<UInt4>(src1.x)));
   1841 			dst.y = As<Float4>(CmpLE(As<UInt4>(src0.y), As<UInt4>(src1.y)));
   1842 			dst.z = As<Float4>(CmpLE(As<UInt4>(src0.z), As<UInt4>(src1.z)));
   1843 			dst.w = As<Float4>(CmpLE(As<UInt4>(src0.w), As<UInt4>(src1.w)));
   1844 			break;
   1845 		default:
   1846 			ASSERT(false);
   1847 		}
   1848 	}
   1849 
   1850 	void ShaderCore::all(Float4 &dst, const Vector4f &src)
   1851 	{
   1852 		dst = As<Float4>(As<Int4>(src.x) & As<Int4>(src.y) & As<Int4>(src.z) & As<Int4>(src.w));
   1853 	}
   1854 
   1855 	void ShaderCore::any(Float4 &dst, const Vector4f &src)
   1856 	{
   1857 		dst = As<Float4>(As<Int4>(src.x) | As<Int4>(src.y) | As<Int4>(src.z) | As<Int4>(src.w));
   1858 	}
   1859 
   1860 	void ShaderCore::bitwise_not(Vector4f &dst, const Vector4f &src)
   1861 	{
   1862 		dst.x = As<Float4>(As<Int4>(src.x) ^ Int4(0xFFFFFFFF));
   1863 		dst.y = As<Float4>(As<Int4>(src.y) ^ Int4(0xFFFFFFFF));
   1864 		dst.z = As<Float4>(As<Int4>(src.z) ^ Int4(0xFFFFFFFF));
   1865 		dst.w = As<Float4>(As<Int4>(src.w) ^ Int4(0xFFFFFFFF));
   1866 	}
   1867 
   1868 	void ShaderCore::bitwise_or(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
   1869 	{
   1870 		dst.x = As<Float4>(As<Int4>(src0.x) | As<Int4>(src1.x));
   1871 		dst.y = As<Float4>(As<Int4>(src0.y) | As<Int4>(src1.y));
   1872 		dst.z = As<Float4>(As<Int4>(src0.z) | As<Int4>(src1.z));
   1873 		dst.w = As<Float4>(As<Int4>(src0.w) | As<Int4>(src1.w));
   1874 	}
   1875 
   1876 	void ShaderCore::bitwise_xor(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
   1877 	{
   1878 		dst.x = As<Float4>(As<Int4>(src0.x) ^ As<Int4>(src1.x));
   1879 		dst.y = As<Float4>(As<Int4>(src0.y) ^ As<Int4>(src1.y));
   1880 		dst.z = As<Float4>(As<Int4>(src0.z) ^ As<Int4>(src1.z));
   1881 		dst.w = As<Float4>(As<Int4>(src0.w) ^ As<Int4>(src1.w));
   1882 	}
   1883 
   1884 	void ShaderCore::bitwise_and(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
   1885 	{
   1886 		dst.x = As<Float4>(As<Int4>(src0.x) & As<Int4>(src1.x));
   1887 		dst.y = As<Float4>(As<Int4>(src0.y) & As<Int4>(src1.y));
   1888 		dst.z = As<Float4>(As<Int4>(src0.z) & As<Int4>(src1.z));
   1889 		dst.w = As<Float4>(As<Int4>(src0.w) & As<Int4>(src1.w));
   1890 	}
   1891 
   1892 	void ShaderCore::equal(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
   1893 	{
   1894 		dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) &
   1895 		                   CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) &
   1896 		                   CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) &
   1897 		                   CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
   1898 		dst.y = dst.x;
   1899 		dst.z = dst.x;
   1900 		dst.w = dst.x;
   1901 	}
   1902 
   1903 	void ShaderCore::notEqual(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
   1904 	{
   1905 		dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) |
   1906 		                   CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) |
   1907 		                   CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) |
   1908 		                   CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
   1909 		dst.y = dst.x;
   1910 		dst.z = dst.x;
   1911 		dst.w = dst.x;
   1912 	}
   1913 }
   1914