Home | History | Annotate | Download | only in Shader
      1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //    http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #include "ShaderCore.hpp"
     16 
     17 #include "Renderer/Renderer.hpp"
     18 #include "Common/Debug.hpp"
     19 
     20 #include <limits.h>
     21 
     22 namespace sw
     23 {
     24 	extern TranscendentalPrecision logPrecision;
     25 	extern TranscendentalPrecision expPrecision;
     26 	extern TranscendentalPrecision rcpPrecision;
     27 	extern TranscendentalPrecision rsqPrecision;
     28 
     29 	Vector4s::Vector4s()
     30 	{
     31 	}
     32 
     33 	Vector4s::Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
     34 	{
     35 		this->x = Short4(x);
     36 		this->y = Short4(y);
     37 		this->z = Short4(z);
     38 		this->w = Short4(w);
     39 	}
     40 
     41 	Vector4s::Vector4s(const Vector4s &rhs)
     42 	{
     43 		x = rhs.x;
     44 		y = rhs.y;
     45 		z = rhs.z;
     46 		w = rhs.w;
     47 	}
     48 
     49 	Vector4s &Vector4s::operator=(const Vector4s &rhs)
     50 	{
     51 		x = rhs.x;
     52 		y = rhs.y;
     53 		z = rhs.z;
     54 		w = rhs.w;
     55 
     56 		return *this;
     57 	}
     58 
     59 	Short4 &Vector4s::operator[](int i)
     60 	{
     61 		switch(i)
     62 		{
     63 		case 0: return x;
     64 		case 1: return y;
     65 		case 2: return z;
     66 		case 3: return w;
     67 		}
     68 
     69 		return x;
     70 	}
     71 
     72 	Vector4f::Vector4f()
     73 	{
     74 	}
     75 
     76 	Vector4f::Vector4f(float x, float y, float z, float w)
     77 	{
     78 		this->x = Float4(x);
     79 		this->y = Float4(y);
     80 		this->z = Float4(z);
     81 		this->w = Float4(w);
     82 	}
     83 
     84 	Vector4f::Vector4f(const Vector4f &rhs)
     85 	{
     86 		x = rhs.x;
     87 		y = rhs.y;
     88 		z = rhs.z;
     89 		w = rhs.w;
     90 	}
     91 
     92 	Vector4f &Vector4f::operator=(const Vector4f &rhs)
     93 	{
     94 		x = rhs.x;
     95 		y = rhs.y;
     96 		z = rhs.z;
     97 		w = rhs.w;
     98 
     99 		return *this;
    100 	}
    101 
    102 	Float4 &Vector4f::operator[](int i)
    103 	{
    104 		switch(i)
    105 		{
    106 		case 0: return x;
    107 		case 1: return y;
    108 		case 2: return z;
    109 		case 3: return w;
    110 		}
    111 
    112 		return x;
    113 	}
    114 
    115 	Float4 exponential2(RValue<Float4> x, bool pp)
    116 	{
    117 		Float4 x0;
    118 		Float4 x1;
    119 		Int4 x2;
    120 
    121 		x0 = x;
    122 
    123 		x0 = Min(x0, As<Float4>(Int4(0x43010000)));   // 129.00000e+0f
    124 		x0 = Max(x0, As<Float4>(Int4(0xC2FDFFFF)));   // -126.99999e+0f
    125 		x1 = x0;
    126 		x1 -= Float4(0.5f);
    127 		x2 = RoundInt(x1);
    128 		x1 = Float4(x2);
    129 		x2 += Int4(0x0000007F);   // 127
    130 		x2 = x2 << 23;
    131 		x0 -= x1;
    132 		x1 = As<Float4>(Int4(0x3AF61905));   // 1.8775767e-3f
    133 		x1 *= x0;
    134 		x1 += As<Float4>(Int4(0x3C134806));   // 8.9893397e-3f
    135 		x1 *= x0;
    136 		x1 += As<Float4>(Int4(0x3D64AA23));   // 5.5826318e-2f
    137 		x1 *= x0;
    138 		x1 += As<Float4>(Int4(0x3E75EAD4));   // 2.4015361e-1f
    139 		x1 *= x0;
    140 		x1 += As<Float4>(Int4(0x3F31727B));   // 6.9315308e-1f
    141 		x1 *= x0;
    142 		x1 += As<Float4>(Int4(0x3F7FFFFF));   // 9.9999994e-1f
    143 		x1 *= As<Float4>(x2);
    144 
    145 		return x1;
    146 	}
    147 
    148 	Float4 logarithm2(RValue<Float4> x, bool absolute, bool pp)
    149 	{
    150 		Float4 x0;
    151 		Float4 x1;
    152 		Float4 x2;
    153 		Float4 x3;
    154 
    155 		x0 = x;
    156 
    157 		x1 = As<Float4>(As<Int4>(x0) & Int4(0x7F800000));
    158 		x1 = As<Float4>(As<UInt4>(x1) >> 8);
    159 		x1 = As<Float4>(As<Int4>(x1) | As<Int4>(Float4(1.0f)));
    160 		x1 = (x1 - Float4(1.4960938f)) * Float4(256.0f);   // FIXME: (x1 - 1.4960938f) * 256.0f;
    161 		x0 = As<Float4>((As<Int4>(x0) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));
    162 
    163 		x2 = (Float4(9.5428179e-2f) * x0 + Float4(4.7779095e-1f)) * x0 + Float4(1.9782813e-1f);
    164 		x3 = ((Float4(1.6618466e-2f) * x0 + Float4(2.0350508e-1f)) * x0 + Float4(2.7382900e-1f)) * x0 + Float4(4.0496687e-2f);
    165 		x2 /= x3;
    166 
    167 		x1 += (x0 - Float4(1.0f)) * x2;
    168 
    169 		return x1;
    170 	}
    171 
    172 	Float4 exponential(RValue<Float4> x, bool pp)
    173 	{
    174 		// FIXME: Propagate the constant
    175 		return exponential2(Float4(1.44269541f) * x, pp);   // 1/ln(2)
    176 	}
    177 
    178 	Float4 logarithm(RValue<Float4> x, bool absolute, bool pp)
    179 	{
    180 		// FIXME: Propagate the constant
    181 		return Float4(6.93147181e-1f) * logarithm2(x, absolute, pp);   // ln(2)
    182 	}
    183 
    184 	Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp)
    185 	{
    186 		Float4 log = logarithm2(x, true, pp);
    187 		log *= y;
    188 		return exponential2(log, pp);
    189 	}
    190 
    191 	Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2)
    192 	{
    193 		Float4 rcp;
    194 
    195 		if(!pp && rcpPrecision >= WHQL)
    196 		{
    197 			rcp = Float4(1.0f) / x;
    198 		}
    199 		else
    200 		{
    201 			rcp = Rcp_pp(x, exactAtPow2);
    202 
    203 			if(!pp)
    204 			{
    205 				rcp = (rcp + rcp) - (x * rcp * rcp);
    206 			}
    207 		}
    208 
    209 		if(finite)
    210 		{
    211 			int big = 0x7F7FFFFF;
    212 			rcp = Min(rcp, Float4((float&)big));
    213 		}
    214 
    215 		return rcp;
    216 	}
    217 
    218 	Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)
    219 	{
    220 		Float4 abs = x;
    221 
    222 		if(absolute)
    223 		{
    224 			abs = Abs(abs);
    225 		}
    226 
    227 		Float4 rsq;
    228 
    229 		if(!pp && rsqPrecision >= IEEE)
    230 		{
    231 			rsq = Float4(1.0f) / Sqrt(abs);
    232 		}
    233 		else
    234 		{
    235 			rsq = RcpSqrt_pp(abs);
    236 
    237 			if(!pp)
    238 			{
    239 				rsq = rsq * (Float4(3.0f) - rsq * rsq * abs) * Float4(0.5f);
    240 			}
    241 		}
    242 
    243 		int big = 0x7F7FFFFF;
    244 		rsq = Min(rsq, Float4((float&)big));
    245 
    246 		return rsq;
    247 	}
    248 
    249 	Float4 modulo(RValue<Float4> x, RValue<Float4> y)
    250 	{
    251 		return x - y * Floor(x / y);
    252 	}
    253 
    254 	Float4 sine_pi(RValue<Float4> x, bool pp)
    255 	{
    256 		const Float4 A = Float4(-4.05284734e-1f);   // -4/pi^2
    257 		const Float4 B = Float4(1.27323954e+0f);    // 4/pi
    258 		const Float4 C = Float4(7.75160950e-1f);
    259 		const Float4 D = Float4(2.24839049e-1f);
    260 
    261 		// Parabola approximating sine
    262 		Float4 sin = x * (Abs(x) * A + B);
    263 
    264 		// Improve precision from 0.06 to 0.001
    265 		if(true)
    266 		{
    267 			sin = sin * (Abs(sin) * D + C);
    268 		}
    269 
    270 		return sin;
    271 	}
    272 
    273 	Float4 cosine_pi(RValue<Float4> x, bool pp)
    274 	{
    275 		// cos(x) = sin(x + pi/2)
    276 		Float4 y = x + Float4(1.57079632e+0f);
    277 
    278 		// Wrap around
    279 		y -= As<Float4>(CmpNLT(y, Float4(3.14159265e+0f)) & As<Int4>(Float4(6.28318530e+0f)));
    280 
    281 		return sine_pi(y, pp);
    282 	}
    283 
    284 	Float4 sine(RValue<Float4> x, bool pp)
    285 	{
    286 		// Reduce to [-0.5, 0.5] range
    287 		Float4 y = x * Float4(1.59154943e-1f);   // 1/2pi
    288 		y = y - Round(y);
    289 
    290 		const Float4 A = Float4(-16.0f);
    291 		const Float4 B = Float4(8.0f);
    292 		const Float4 C = Float4(7.75160950e-1f);
    293 		const Float4 D = Float4(2.24839049e-1f);
    294 
    295 		// Parabola approximating sine
    296 		Float4 sin = y * (Abs(y) * A + B);
    297 
    298 		// Improve precision from 0.06 to 0.001
    299 		if(true)
    300 		{
    301 			sin = sin * (Abs(sin) * D + C);
    302 		}
    303 
    304 		return sin;
    305 	}
    306 
    307 	Float4 cosine(RValue<Float4> x, bool pp)
    308 	{
    309 		// cos(x) = sin(x + pi/2)
    310 		Float4 y = x + Float4(1.57079632e+0f);
    311 		return sine(y, pp);
    312 	}
    313 
    314 	Float4 tangent(RValue<Float4> x, bool pp)
    315 	{
    316 		return sine(x, pp) / cosine(x, pp);
    317 	}
    318 
    319 	Float4 arccos(RValue<Float4> x, bool pp)
    320 	{
    321 		// pi/2 - arcsin(x)
    322 		return Float4(1.57079632e+0f) - arcsin(x);
    323 	}
    324 
    325 	Float4 arcsin(RValue<Float4> x, bool pp)
    326 	{
    327 		// x*(pi/2-sqrt(1-x*x)*pi/5)
    328 		return x * (Float4(1.57079632e+0f) - Sqrt(Float4(1.0f) - x*x) * Float4(6.28318531e-1f));
    329 	}
    330 
    331 	Float4 arctan(RValue<Float4> x, bool pp)
    332 	{
    333 		Int4 O = CmpNLT(Abs(x), Float4(1.0f));
    334 		Float4 y = As<Float4>((O & As<Int4>(Float4(1.0f) / x)) | (~O & As<Int4>(x)));   // FIXME: Vector select
    335 
    336 		// Approximation of atan in [-1..1]
    337 		Float4 theta = y * (Float4(-0.27f) * Abs(y) + Float4(1.05539816f));
    338 
    339 		// +/-pi/2 depending on sign of x
    340 		Float4 sgnPi_2 = As<Float4>(As<Int4>(Float4(1.57079632e+0f)) ^ (As<Int4>(x) & Int4(0x80000000)));
    341 
    342 		theta = As<Float4>((O & As<Int4>(sgnPi_2 - theta)) | (~O & As<Int4>(theta)));   // FIXME: Vector select
    343 
    344 		return theta;
    345 	}
    346 
    347 	Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp)
    348 	{
    349 		// Rotate to upper semicircle when in lower semicircle
    350 		Int4 S = CmpLT(y, Float4(0.0f));
    351 		Float4 theta = As<Float4>(S & As<Int4>(Float4(-3.14159265e+0f)));   // -pi
    352 		Float4 x0 = As<Float4>((As<Int4>(y) & Int4(0x80000000)) ^ As<Int4>(x));
    353 		Float4 y0 = Abs(y);
    354 
    355 		// Rotate to right quadrant when in left quadrant
    356 		Int4 Q = CmpLT(x0, Float4(0.0f));
    357 		theta += As<Float4>(Q & As<Int4>(Float4(1.57079632e+0f)));   // pi/2
    358 		Float4 x1 = As<Float4>((Q & As<Int4>(y0)) | (~Q & As<Int4>(x0)));    // FIXME: Vector select
    359 		Float4 y1 = As<Float4>((Q & As<Int4>(-x0)) | (~Q & As<Int4>(y0)));   // FIXME: Vector select
    360 
    361 		// Rotate to first octant when in second octant
    362 		Int4 O = CmpNLT(y1, x1);
    363 		theta += As<Float4>(O & As<Int4>(Float4(7.85398163e-1f)));   // pi/4
    364 		Float4 x2 = As<Float4>((O & As<Int4>(Float4(7.07106781e-1f) * x1 + Float4(7.07106781e-1f) * y1)) | (~O & As<Int4>(x1)));   // sqrt(2)/2   // FIXME: Vector select
    365 		Float4 y2 = As<Float4>((O & As<Int4>(Float4(7.07106781e-1f) * y1 - Float4(7.07106781e-1f) * x1)) | (~O & As<Int4>(y1)));   // FIXME: Vector select
    366 
    367 		// Approximation of atan in [0..1]
    368 		Float4 y_x = y2 / x2;
    369 		theta += y_x * (Float4(-0.27f) * y_x + Float4(1.05539816f));
    370 
    371 		return theta;
    372 	}
    373 
    374 	Float4 sineh(RValue<Float4> x, bool pp)
    375 	{
    376 		return (exponential(x, pp) - exponential(-x, pp)) * Float4(0.5f);
    377 	}
    378 
    379 	Float4 cosineh(RValue<Float4> x, bool pp)
    380 	{
    381 		return (exponential(x, pp) + exponential(-x, pp)) * Float4(0.5f);
    382 	}
    383 
    384 	Float4 tangenth(RValue<Float4> x, bool pp)
    385 	{
    386 		Float4 e_x = exponential(x, pp);
    387 		Float4 e_minus_x = exponential(-x, pp);
    388 		return (e_x - e_minus_x) / (e_x + e_minus_x);
    389 	}
    390 
    391 	Float4 arccosh(RValue<Float4> x, bool pp)
    392 	{
    393 		return logarithm(x + Sqrt(x + Float4(1.0f)) * Sqrt(x - Float4(1.0f)), pp);
    394 	}
    395 
    396 	Float4 arcsinh(RValue<Float4> x, bool pp)
    397 	{
    398 		return logarithm(x + Sqrt(x * x + Float4(1.0f)), pp);
    399 	}
    400 
    401 	Float4 arctanh(RValue<Float4> x, bool pp)
    402 	{
    403 		return logarithm((Float4(1.0f) + x) / (Float4(1.0f) - x), pp) * Float4(0.5f);
    404 	}
    405 
    406 	Float4 dot2(const Vector4f &v0, const Vector4f &v1)
    407 	{
    408 		return v0.x * v1.x + v0.y * v1.y;
    409 	}
    410 
    411 	Float4 dot3(const Vector4f &v0, const Vector4f &v1)
    412 	{
    413 		return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z;
    414 	}
    415 
    416 	Float4 dot4(const Vector4f &v0, const Vector4f &v1)
    417 	{
    418 		return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z + v0.w * v1.w;
    419 	}
    420 
    421 	void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
    422 	{
    423 		Int2 tmp0 = UnpackHigh(row0, row1);
    424 		Int2 tmp1 = UnpackHigh(row2, row3);
    425 		Int2 tmp2 = UnpackLow(row0, row1);
    426 		Int2 tmp3 = UnpackLow(row2, row3);
    427 
    428 		row0 = UnpackLow(tmp2, tmp3);
    429 		row1 = UnpackHigh(tmp2, tmp3);
    430 		row2 = UnpackLow(tmp0, tmp1);
    431 		row3 = UnpackHigh(tmp0, tmp1);
    432 	}
    433 
    434 	void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
    435 	{
    436 		Float4 tmp0 = UnpackLow(row0, row1);
    437 		Float4 tmp1 = UnpackLow(row2, row3);
    438 		Float4 tmp2 = UnpackHigh(row0, row1);
    439 		Float4 tmp3 = UnpackHigh(row2, row3);
    440 
    441 		row0 = Float4(tmp0.xy, tmp1.xy);
    442 		row1 = Float4(tmp0.zw, tmp1.zw);
    443 		row2 = Float4(tmp2.xy, tmp3.xy);
    444 		row3 = Float4(tmp2.zw, tmp3.zw);
    445 	}
    446 
    447 	void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
    448 	{
    449 		Float4 tmp0 = UnpackLow(row0, row1);
    450 		Float4 tmp1 = UnpackLow(row2, row3);
    451 		Float4 tmp2 = UnpackHigh(row0, row1);
    452 		Float4 tmp3 = UnpackHigh(row2, row3);
    453 
    454 		row0 = Float4(tmp0.xy, tmp1.xy);
    455 		row1 = Float4(tmp0.zw, tmp1.zw);
    456 		row2 = Float4(tmp2.xy, tmp3.xy);
    457 	}
    458 
    459 	void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
    460 	{
    461 		Float4 tmp0 = UnpackLow(row0, row1);
    462 		Float4 tmp1 = UnpackLow(row2, row3);
    463 
    464 		row0 = Float4(tmp0.xy, tmp1.xy);
    465 		row1 = Float4(tmp0.zw, tmp1.zw);
    466 	}
    467 
    468 	void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
    469 	{
    470 		Float4 tmp0 = UnpackLow(row0, row1);
    471 		Float4 tmp1 = UnpackLow(row2, row3);
    472 
    473 		row0 = Float4(tmp0.xy, tmp1.xy);
    474 	}
    475 
    476 	void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
    477 	{
    478 		Float4 tmp01 = UnpackLow(row0, row1);
    479 		Float4 tmp23 = UnpackHigh(row0, row1);
    480 
    481 		row0 = tmp01;
    482 		row1 = Float4(tmp01.zw, row1.zw);
    483 		row2 = tmp23;
    484 		row3 = Float4(tmp23.zw, row3.zw);
    485 	}
    486 
    487 	void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N)
    488 	{
    489 		switch(N)
    490 		{
    491 		case 1: transpose4x1(row0, row1, row2, row3); break;
    492 		case 2: transpose4x2(row0, row1, row2, row3); break;
    493 		case 3: transpose4x3(row0, row1, row2, row3); break;
    494 		case 4: transpose4x4(row0, row1, row2, row3); break;
    495 		}
    496 	}
    497 
    498 	void ShaderCore::mov(Vector4f &dst, const Vector4f &src, bool integerDestination)
    499 	{
    500 		if(integerDestination)
    501 		{
    502 			dst.x = As<Float4>(RoundInt(src.x));
    503 			dst.y = As<Float4>(RoundInt(src.y));
    504 			dst.z = As<Float4>(RoundInt(src.z));
    505 			dst.w = As<Float4>(RoundInt(src.w));
    506 		}
    507 		else
    508 		{
    509 			dst = src;
    510 		}
    511 	}
    512 
    513 	void ShaderCore::neg(Vector4f &dst, const Vector4f &src)
    514 	{
    515 		dst.x = -src.x;
    516 		dst.y = -src.y;
    517 		dst.z = -src.z;
    518 		dst.w = -src.w;
    519 	}
    520 
    521 	void ShaderCore::ineg(Vector4f &dst, const Vector4f &src)
    522 	{
    523 		dst.x = As<Float4>(-As<Int4>(src.x));
    524 		dst.y = As<Float4>(-As<Int4>(src.y));
    525 		dst.z = As<Float4>(-As<Int4>(src.z));
    526 		dst.w = As<Float4>(-As<Int4>(src.w));
    527 	}
    528 
    529 	void ShaderCore::f2b(Vector4f &dst, const Vector4f &src)
    530 	{
    531 		dst.x = As<Float4>(CmpNEQ(src.x, Float4(0.0f)));
    532 		dst.y = As<Float4>(CmpNEQ(src.y, Float4(0.0f)));
    533 		dst.z = As<Float4>(CmpNEQ(src.z, Float4(0.0f)));
    534 		dst.w = As<Float4>(CmpNEQ(src.w, Float4(0.0f)));
    535 	}
    536 
    537 	void ShaderCore::b2f(Vector4f &dst, const Vector4f &src)
    538 	{
    539 		dst.x = As<Float4>(As<Int4>(src.x) & As<Int4>(Float4(1.0f)));
    540 		dst.y = As<Float4>(As<Int4>(src.y) & As<Int4>(Float4(1.0f)));
    541 		dst.z = As<Float4>(As<Int4>(src.z) & As<Int4>(Float4(1.0f)));
    542 		dst.w = As<Float4>(As<Int4>(src.w) & As<Int4>(Float4(1.0f)));
    543 	}
    544 
    545 	void ShaderCore::f2i(Vector4f &dst, const Vector4f &src)
    546 	{
    547 		dst.x = As<Float4>(Int4(src.x));
    548 		dst.y = As<Float4>(Int4(src.y));
    549 		dst.z = As<Float4>(Int4(src.z));
    550 		dst.w = As<Float4>(Int4(src.w));
    551 	}
    552 
    553 	void ShaderCore::i2f(Vector4f &dst, const Vector4f &src)
    554 	{
    555 		dst.x = Float4(As<Int4>(src.x));
    556 		dst.y = Float4(As<Int4>(src.y));
    557 		dst.z = Float4(As<Int4>(src.z));
    558 		dst.w = Float4(As<Int4>(src.w));
    559 	}
    560 
    561 	void ShaderCore::f2u(Vector4f &dst, const Vector4f &src)
    562 	{
    563 		dst.x = As<Float4>(UInt4(src.x));
    564 		dst.y = As<Float4>(UInt4(src.y));
    565 		dst.z = As<Float4>(UInt4(src.z));
    566 		dst.w = As<Float4>(UInt4(src.w));
    567 	}
    568 
    569 	void ShaderCore::u2f(Vector4f &dst, const Vector4f &src)
    570 	{
    571 		dst.x = Float4(As<UInt4>(src.x));
    572 		dst.y = Float4(As<UInt4>(src.y));
    573 		dst.z = Float4(As<UInt4>(src.z));
    574 		dst.w = Float4(As<UInt4>(src.w));
    575 	}
    576 
    577 	void ShaderCore::i2b(Vector4f &dst, const Vector4f &src)
    578 	{
    579 		dst.x = As<Float4>(CmpNEQ(As<Int4>(src.x), Int4(0)));
    580 		dst.y = As<Float4>(CmpNEQ(As<Int4>(src.y), Int4(0)));
    581 		dst.z = As<Float4>(CmpNEQ(As<Int4>(src.z), Int4(0)));
    582 		dst.w = As<Float4>(CmpNEQ(As<Int4>(src.w), Int4(0)));
    583 	}
    584 
    585 	void ShaderCore::b2i(Vector4f &dst, const Vector4f &src)
    586 	{
    587 		dst.x = As<Float4>(As<Int4>(src.x) & Int4(1));
    588 		dst.y = As<Float4>(As<Int4>(src.y) & Int4(1));
    589 		dst.z = As<Float4>(As<Int4>(src.z) & Int4(1));
    590 		dst.w = As<Float4>(As<Int4>(src.w) & Int4(1));
    591 	}
    592 
    593 	void ShaderCore::add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    594 	{
    595 		dst.x = src0.x + src1.x;
    596 		dst.y = src0.y + src1.y;
    597 		dst.z = src0.z + src1.z;
    598 		dst.w = src0.w + src1.w;
    599 	}
    600 
    601 	void ShaderCore::iadd(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    602 	{
    603 		dst.x = As<Float4>(As<Int4>(src0.x) + As<Int4>(src1.x));
    604 		dst.y = As<Float4>(As<Int4>(src0.y) + As<Int4>(src1.y));
    605 		dst.z = As<Float4>(As<Int4>(src0.z) + As<Int4>(src1.z));
    606 		dst.w = As<Float4>(As<Int4>(src0.w) + As<Int4>(src1.w));
    607 	}
    608 
    609 	void ShaderCore::sub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    610 	{
    611 		dst.x = src0.x - src1.x;
    612 		dst.y = src0.y - src1.y;
    613 		dst.z = src0.z - src1.z;
    614 		dst.w = src0.w - src1.w;
    615 	}
    616 
    617 	void ShaderCore::isub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    618 	{
    619 		dst.x = As<Float4>(As<Int4>(src0.x) - As<Int4>(src1.x));
    620 		dst.y = As<Float4>(As<Int4>(src0.y) - As<Int4>(src1.y));
    621 		dst.z = As<Float4>(As<Int4>(src0.z) - As<Int4>(src1.z));
    622 		dst.w = As<Float4>(As<Int4>(src0.w) - As<Int4>(src1.w));
    623 	}
    624 
    625 	void ShaderCore::mad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
    626 	{
    627 		dst.x = src0.x * src1.x + src2.x;
    628 		dst.y = src0.y * src1.y + src2.y;
    629 		dst.z = src0.z * src1.z + src2.z;
    630 		dst.w = src0.w * src1.w + src2.w;
    631 	}
    632 
    633 	void ShaderCore::imad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
    634 	{
    635 		dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x) + As<Int4>(src2.x));
    636 		dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y) + As<Int4>(src2.y));
    637 		dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z) + As<Int4>(src2.z));
    638 		dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w) + As<Int4>(src2.w));
    639 	}
    640 
    641 	void ShaderCore::mul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    642 	{
    643 		dst.x = src0.x * src1.x;
    644 		dst.y = src0.y * src1.y;
    645 		dst.z = src0.z * src1.z;
    646 		dst.w = src0.w * src1.w;
    647 	}
    648 
    649 	void ShaderCore::imul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    650 	{
    651 		dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x));
    652 		dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y));
    653 		dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z));
    654 		dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w));
    655 	}
    656 
    657 	void ShaderCore::rcpx(Vector4f &dst, const Vector4f &src, bool pp)
    658 	{
    659 		Float4 rcp = reciprocal(src.x, pp, true);
    660 
    661 		dst.x = rcp;
    662 		dst.y = rcp;
    663 		dst.z = rcp;
    664 		dst.w = rcp;
    665 	}
    666 
    667 	void ShaderCore::div(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    668 	{
    669 		dst.x = src0.x / src1.x;
    670 		dst.y = src0.y / src1.y;
    671 		dst.z = src0.z / src1.z;
    672 		dst.w = src0.w / src1.w;
    673 	}
    674 
    675 	void ShaderCore::idiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    676 	{
    677 		Float4 intMax(As<Float4>(Int4(INT_MAX)));
    678 		cmp0i(dst.x, src1.x, intMax, src1.x);
    679 		dst.x = As<Float4>(As<Int4>(src0.x) / As<Int4>(dst.x));
    680 		cmp0i(dst.y, src1.y, intMax, src1.y);
    681 		dst.y = As<Float4>(As<Int4>(src0.y) / As<Int4>(dst.y));
    682 		cmp0i(dst.z, src1.z, intMax, src1.z);
    683 		dst.z = As<Float4>(As<Int4>(src0.z) / As<Int4>(dst.z));
    684 		cmp0i(dst.w, src1.w, intMax, src1.w);
    685 		dst.w = As<Float4>(As<Int4>(src0.w) / As<Int4>(dst.w));
    686 	}
    687 
    688 	void ShaderCore::udiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    689 	{
    690 		Float4 uintMax(As<Float4>(UInt4(UINT_MAX)));
    691 		cmp0i(dst.x, src1.x, uintMax, src1.x);
    692 		dst.x = As<Float4>(As<UInt4>(src0.x) / As<UInt4>(dst.x));
    693 		cmp0i(dst.y, src1.y, uintMax, src1.y);
    694 		dst.y = As<Float4>(As<UInt4>(src0.y) / As<UInt4>(dst.y));
    695 		cmp0i(dst.z, src1.z, uintMax, src1.z);
    696 		dst.z = As<Float4>(As<UInt4>(src0.z) / As<UInt4>(dst.z));
    697 		cmp0i(dst.w, src1.w, uintMax, src1.w);
    698 		dst.w = As<Float4>(As<UInt4>(src0.w) / As<UInt4>(dst.w));
    699 	}
    700 
    701 	void ShaderCore::mod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    702 	{
    703 		dst.x = modulo(src0.x, src1.x);
    704 		dst.y = modulo(src0.y, src1.y);
    705 		dst.z = modulo(src0.z, src1.z);
    706 		dst.w = modulo(src0.w, src1.w);
    707 	}
    708 
    709 	void ShaderCore::imod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    710 	{
    711 		cmp0i(dst.x, src1.x, src0.x, src1.x);
    712 		dst.x = As<Float4>(As<Int4>(src0.x) % As<Int4>(dst.x));
    713 		cmp0i(dst.y, src1.y, src0.y, src1.y);
    714 		dst.y = As<Float4>(As<Int4>(src0.y) % As<Int4>(dst.y));
    715 		cmp0i(dst.z, src1.z, src0.z, src1.z);
    716 		dst.z = As<Float4>(As<Int4>(src0.z) % As<Int4>(dst.z));
    717 		cmp0i(dst.w, src1.w, src0.w, src1.w);
    718 		dst.w = As<Float4>(As<Int4>(src0.w) % As<Int4>(dst.w));
    719 	}
    720 	void ShaderCore::umod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    721 	{
    722 		cmp0i(dst.x, src1.x, src0.x, src1.x);
    723 		dst.x = As<Float4>(As<UInt4>(src0.x) % As<UInt4>(dst.x));
    724 		cmp0i(dst.y, src1.y, src0.y, src1.y);
    725 		dst.y = As<Float4>(As<UInt4>(src0.y) % As<UInt4>(dst.y));
    726 		cmp0i(dst.z, src1.z, src0.z, src1.z);
    727 		dst.z = As<Float4>(As<UInt4>(src0.z) % As<UInt4>(dst.z));
    728 		cmp0i(dst.w, src1.w, src0.w, src1.w);
    729 		dst.w = As<Float4>(As<UInt4>(src0.w) % As<UInt4>(dst.w));
    730 	}
    731 
    732 	void ShaderCore::shl(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    733 	{
    734 		dst.x = As<Float4>(As<Int4>(src0.x) << As<Int4>(src1.x));
    735 		dst.y = As<Float4>(As<Int4>(src0.y) << As<Int4>(src1.y));
    736 		dst.z = As<Float4>(As<Int4>(src0.z) << As<Int4>(src1.z));
    737 		dst.w = As<Float4>(As<Int4>(src0.w) << As<Int4>(src1.w));
    738 	}
    739 
    740 	void ShaderCore::ishr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    741 	{
    742 		dst.x = As<Float4>(As<Int4>(src0.x) >> As<Int4>(src1.x));
    743 		dst.y = As<Float4>(As<Int4>(src0.y) >> As<Int4>(src1.y));
    744 		dst.z = As<Float4>(As<Int4>(src0.z) >> As<Int4>(src1.z));
    745 		dst.w = As<Float4>(As<Int4>(src0.w) >> As<Int4>(src1.w));
    746 	}
    747 
    748 	void ShaderCore::ushr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    749 	{
    750 		dst.x = As<Float4>(As<UInt4>(src0.x) >> As<UInt4>(src1.x));
    751 		dst.y = As<Float4>(As<UInt4>(src0.y) >> As<UInt4>(src1.y));
    752 		dst.z = As<Float4>(As<UInt4>(src0.z) >> As<UInt4>(src1.z));
    753 		dst.w = As<Float4>(As<UInt4>(src0.w) >> As<UInt4>(src1.w));
    754 	}
    755 
    756 	void ShaderCore::rsqx(Vector4f &dst, const Vector4f &src, bool pp)
    757 	{
    758 		Float4 rsq = reciprocalSquareRoot(src.x, true, pp);
    759 
    760 		dst.x = rsq;
    761 		dst.y = rsq;
    762 		dst.z = rsq;
    763 		dst.w = rsq;
    764 	}
    765 
    766 	void ShaderCore::sqrt(Vector4f &dst, const Vector4f &src, bool pp)
    767 	{
    768 		dst.x = Sqrt(src.x);
    769 		dst.y = Sqrt(src.y);
    770 		dst.z = Sqrt(src.z);
    771 		dst.w = Sqrt(src.w);
    772 	}
    773 
    774 	void ShaderCore::rsq(Vector4f &dst, const Vector4f &src, bool pp)
    775 	{
    776 		dst.x = reciprocalSquareRoot(src.x, false, pp);
    777 		dst.y = reciprocalSquareRoot(src.y, false, pp);
    778 		dst.z = reciprocalSquareRoot(src.z, false, pp);
    779 		dst.w = reciprocalSquareRoot(src.w, false, pp);
    780 	}
    781 
    782 	void ShaderCore::len2(Float4 &dst, const Vector4f &src, bool pp)
    783 	{
    784 		dst = Sqrt(dot2(src, src));
    785 	}
    786 
    787 	void ShaderCore::len3(Float4 &dst, const Vector4f &src, bool pp)
    788 	{
    789 		dst = Sqrt(dot3(src, src));
    790 	}
    791 
    792 	void ShaderCore::len4(Float4 &dst, const Vector4f &src, bool pp)
    793 	{
    794 		dst = Sqrt(dot4(src, src));
    795 	}
    796 
    797 	void ShaderCore::dist1(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
    798 	{
    799 		dst = Abs(src0.x - src1.x);
    800 	}
    801 
    802 	void ShaderCore::dist2(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
    803 	{
    804 		Float4 dx = src0.x - src1.x;
    805 		Float4 dy = src0.y - src1.y;
    806 		Float4 dot2 = dx * dx + dy * dy;
    807 		dst = Sqrt(dot2);
    808 	}
    809 
    810 	void ShaderCore::dist3(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
    811 	{
    812 		Float4 dx = src0.x - src1.x;
    813 		Float4 dy = src0.y - src1.y;
    814 		Float4 dz = src0.z - src1.z;
    815 		Float4 dot3 = dx * dx + dy * dy + dz * dz;
    816 		dst = Sqrt(dot3);
    817 	}
    818 
    819 	void ShaderCore::dist4(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
    820 	{
    821 		Float4 dx = src0.x - src1.x;
    822 		Float4 dy = src0.y - src1.y;
    823 		Float4 dz = src0.z - src1.z;
    824 		Float4 dw = src0.w - src1.w;
    825 		Float4 dot4 = dx * dx + dy * dy + dz * dz + dw * dw;
    826 		dst = Sqrt(dot4);
    827 	}
    828 
    829 	void ShaderCore::dp1(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    830 	{
    831 		Float4 t = src0.x * src1.x;
    832 
    833 		dst.x = t;
    834 		dst.y = t;
    835 		dst.z = t;
    836 		dst.w = t;
    837 	}
    838 
    839 	void ShaderCore::dp2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    840 	{
    841 		Float4 t = dot2(src0, src1);
    842 
    843 		dst.x = t;
    844 		dst.y = t;
    845 		dst.z = t;
    846 		dst.w = t;
    847 	}
    848 
    849 	void ShaderCore::dp2add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
    850 	{
    851 		Float4 t = dot2(src0, src1) + src2.x;
    852 
    853 		dst.x = t;
    854 		dst.y = t;
    855 		dst.z = t;
    856 		dst.w = t;
    857 	}
    858 
    859 	void ShaderCore::dp3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    860 	{
    861 		Float4 dot = dot3(src0, src1);
    862 
    863 		dst.x = dot;
    864 		dst.y = dot;
    865 		dst.z = dot;
    866 		dst.w = dot;
    867 	}
    868 
    869 	void ShaderCore::dp4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    870 	{
    871 		Float4 dot = dot4(src0, src1);
    872 
    873 		dst.x = dot;
    874 		dst.y = dot;
    875 		dst.z = dot;
    876 		dst.w = dot;
    877 	}
    878 
    879 	void ShaderCore::min(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    880 	{
    881 		dst.x = Min(src0.x, src1.x);
    882 		dst.y = Min(src0.y, src1.y);
    883 		dst.z = Min(src0.z, src1.z);
    884 		dst.w = Min(src0.w, src1.w);
    885 	}
    886 
    887 	void ShaderCore::imin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    888 	{
    889 		dst.x = As<Float4>(Min(As<Int4>(src0.x), As<Int4>(src1.x)));
    890 		dst.y = As<Float4>(Min(As<Int4>(src0.y), As<Int4>(src1.y)));
    891 		dst.z = As<Float4>(Min(As<Int4>(src0.z), As<Int4>(src1.z)));
    892 		dst.w = As<Float4>(Min(As<Int4>(src0.w), As<Int4>(src1.w)));
    893 	}
    894 
    895 	void ShaderCore::umin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    896 	{
    897 		dst.x = As<Float4>(Min(As<UInt4>(src0.x), As<UInt4>(src1.x)));
    898 		dst.y = As<Float4>(Min(As<UInt4>(src0.y), As<UInt4>(src1.y)));
    899 		dst.z = As<Float4>(Min(As<UInt4>(src0.z), As<UInt4>(src1.z)));
    900 		dst.w = As<Float4>(Min(As<UInt4>(src0.w), As<UInt4>(src1.w)));
    901 	}
    902 
    903 	void ShaderCore::max(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    904 	{
    905 		dst.x = Max(src0.x, src1.x);
    906 		dst.y = Max(src0.y, src1.y);
    907 		dst.z = Max(src0.z, src1.z);
    908 		dst.w = Max(src0.w, src1.w);
    909 	}
    910 
    911 	void ShaderCore::imax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    912 	{
    913 		dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x)));
    914 		dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y)));
    915 		dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z)));
    916 		dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w)));
    917 	}
    918 
    919 	void ShaderCore::umax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    920 	{
    921 		dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x)));
    922 		dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y)));
    923 		dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z)));
    924 		dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w)));
    925 	}
    926 
    927 	void ShaderCore::slt(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
    928 	{
    929 		dst.x = As<Float4>(As<Int4>(CmpLT(src0.x, src1.x)) & As<Int4>(Float4(1.0f)));
    930 		dst.y = As<Float4>(As<Int4>(CmpLT(src0.y, src1.y)) & As<Int4>(Float4(1.0f)));
    931 		dst.z = As<Float4>(As<Int4>(CmpLT(src0.z, src1.z)) & As<Int4>(Float4(1.0f)));
    932 		dst.w = As<Float4>(As<Int4>(CmpLT(src0.w, src1.w)) & As<Int4>(Float4(1.0f)));
    933 	}
    934 
    935 	void ShaderCore::step(Vector4f &dst, const Vector4f &edge, const Vector4f &x)
    936 	{
    937 		dst.x = As<Float4>(CmpNLT(x.x, edge.x) & As<Int4>(Float4(1.0f)));
    938 		dst.y = As<Float4>(CmpNLT(x.y, edge.y) & As<Int4>(Float4(1.0f)));
    939 		dst.z = As<Float4>(CmpNLT(x.z, edge.z) & As<Int4>(Float4(1.0f)));
    940 		dst.w = As<Float4>(CmpNLT(x.w, edge.w) & As<Int4>(Float4(1.0f)));
    941 	}
    942 
    943 	void ShaderCore::exp2x(Vector4f &dst, const Vector4f &src, bool pp)
    944 	{
    945 		Float4 exp = exponential2(src.x, pp);
    946 
    947 		dst.x = exp;
    948 		dst.y = exp;
    949 		dst.z = exp;
    950 		dst.w = exp;
    951 	}
    952 
    953 	void ShaderCore::exp2(Vector4f &dst, const Vector4f &src, bool pp)
    954 	{
    955 		dst.x = exponential2(src.x, pp);
    956 		dst.y = exponential2(src.y, pp);
    957 		dst.z = exponential2(src.z, pp);
    958 		dst.w = exponential2(src.w, pp);
    959 	}
    960 
    961 	void ShaderCore::exp(Vector4f &dst, const Vector4f &src, bool pp)
    962 	{
    963 		dst.x = exponential(src.x, pp);
    964 		dst.y = exponential(src.y, pp);
    965 		dst.z = exponential(src.z, pp);
    966 		dst.w = exponential(src.w, pp);
    967 	}
    968 
    969 	void ShaderCore::log2x(Vector4f &dst, const Vector4f &src, bool pp)
    970 	{
    971 		Float4 log = logarithm2(src.x, true, pp);
    972 
    973 		dst.x = log;
    974 		dst.y = log;
    975 		dst.z = log;
    976 		dst.w = log;
    977 	}
    978 
    979 	void ShaderCore::log2(Vector4f &dst, const Vector4f &src, bool pp)
    980 	{
    981 		dst.x = logarithm2(src.x, false, pp);
    982 		dst.y = logarithm2(src.y, false, pp);
    983 		dst.z = logarithm2(src.z, false, pp);
    984 		dst.w = logarithm2(src.w, false, pp);
    985 	}
    986 
    987 	void ShaderCore::log(Vector4f &dst, const Vector4f &src, bool pp)
    988 	{
    989 		dst.x = logarithm(src.x, false, pp);
    990 		dst.y = logarithm(src.y, false, pp);
    991 		dst.z = logarithm(src.z, false, pp);
    992 		dst.w = logarithm(src.w, false, pp);
    993 	}
    994 
    995 	void ShaderCore::lit(Vector4f &dst, const Vector4f &src)
    996 	{
    997 		dst.x = Float4(1.0f);
    998 		dst.y = Max(src.x, Float4(0.0f));
    999 
   1000 		Float4 pow;
   1001 
   1002 		pow = src.w;
   1003 		pow = Min(pow, Float4(127.9961f));
   1004 		pow = Max(pow, Float4(-127.9961f));
   1005 
   1006 		dst.z = power(src.y, pow);
   1007 		dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.x, Float4(0.0f)));
   1008 		dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.y, Float4(0.0f)));
   1009 
   1010 		dst.w = Float4(1.0f);
   1011 	}
   1012 
   1013 	void ShaderCore::att(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
   1014 	{
   1015 		// Computes attenuation factors (1, d, d^2, 1/d) assuming src0 = d^2, src1 = 1/d
   1016 		dst.x = 1;
   1017 		dst.y = src0.y * src1.y;
   1018 		dst.z = src0.z;
   1019 		dst.w = src1.w;
   1020 	}
   1021 
   1022 	void ShaderCore::lrp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
   1023 	{
   1024 		dst.x = src0.x * (src1.x - src2.x) + src2.x;
   1025 		dst.y = src0.y * (src1.y - src2.y) + src2.y;
   1026 		dst.z = src0.z * (src1.z - src2.z) + src2.z;
   1027 		dst.w = src0.w * (src1.w - src2.w) + src2.w;
   1028 	}
   1029 
   1030 	void ShaderCore::smooth(Vector4f &dst, const Vector4f &edge0, const Vector4f &edge1, const Vector4f &x)
   1031 	{
   1032 		Float4 tx = Min(Max((x.x - edge0.x) / (edge1.x - edge0.x), Float4(0.0f)), Float4(1.0f)); dst.x = tx * tx * (Float4(3.0f) - Float4(2.0f) * tx);
   1033 		Float4 ty = Min(Max((x.y - edge0.y) / (edge1.y - edge0.y), Float4(0.0f)), Float4(1.0f)); dst.y = ty * ty * (Float4(3.0f) - Float4(2.0f) * ty);
   1034 		Float4 tz = Min(Max((x.z - edge0.z) / (edge1.z - edge0.z), Float4(0.0f)), Float4(1.0f)); dst.z = tz * tz * (Float4(3.0f) - Float4(2.0f) * tz);
   1035 		Float4 tw = Min(Max((x.w - edge0.w) / (edge1.w - edge0.w), Float4(0.0f)), Float4(1.0f)); dst.w = tw * tw * (Float4(3.0f) - Float4(2.0f) * tw);
   1036 	}
   1037 
   1038 	void ShaderCore::floatToHalfBits(Float4& dst, const Float4& floatBits, bool storeInUpperBits)
   1039 	{
   1040 		static const uint32_t mask_sign = 0x80000000u;
   1041 		static const uint32_t mask_round = ~0xfffu;
   1042 		static const uint32_t c_f32infty = 255 << 23;
   1043 		static const uint32_t c_magic = 15 << 23;
   1044 		static const uint32_t c_nanbit = 0x200;
   1045 		static const uint32_t c_infty_as_fp16 = 0x7c00;
   1046 		static const uint32_t c_clamp = (31 << 23) - 0x1000;
   1047 
   1048 		UInt4 justsign = UInt4(mask_sign) & As<UInt4>(floatBits);
   1049 		UInt4 absf = As<UInt4>(floatBits) ^ justsign;
   1050 		UInt4 b_isnormal = CmpNLE(UInt4(c_f32infty), absf);
   1051 
   1052 		// Note: this version doesn't round to the nearest even in case of a tie as defined by IEEE 754-2008, it rounds to +inf
   1053 		//       instead of nearest even, since that's fine for GLSL ES 3.0's needs (see section 2.1.1 Floating-Point Computation)
   1054 		UInt4 joined = ((((As<UInt4>(Min(As<Float4>(absf & UInt4(mask_round)) * As<Float4>(UInt4(c_magic)),
   1055 		                                 As<Float4>(UInt4(c_clamp))))) - UInt4(mask_round)) >> 13) & b_isnormal) |
   1056 		               ((b_isnormal ^ UInt4(0xFFFFFFFF)) & ((CmpNLE(absf, UInt4(c_f32infty)) & UInt4(c_nanbit)) |
   1057 		               UInt4(c_infty_as_fp16)));
   1058 
   1059 		dst = As<Float4>(storeInUpperBits ? As<UInt4>(dst) | ((joined << 16) | justsign) : joined | (justsign >> 16));
   1060 	}
   1061 
   1062 	void ShaderCore::halfToFloatBits(Float4& dst, const Float4& halfBits)
   1063 	{
   1064 		static const uint32_t mask_nosign = 0x7FFF;
   1065 		static const uint32_t magic = (254 - 15) << 23;
   1066 		static const uint32_t was_infnan = 0x7BFF;
   1067 		static const uint32_t exp_infnan = 255 << 23;
   1068 
   1069 		UInt4 expmant = As<UInt4>(halfBits) & UInt4(mask_nosign);
   1070 		dst = As<Float4>(As<UInt4>(As<Float4>(expmant << 13) * As<Float4>(UInt4(magic))) |
   1071 		                 ((As<UInt4>(halfBits) ^ UInt4(expmant)) << 16) |
   1072 		                 (CmpNLE(As<UInt4>(expmant), UInt4(was_infnan)) & UInt4(exp_infnan)));
   1073 	}
   1074 
   1075 	void ShaderCore::packHalf2x16(Vector4f &d, const Vector4f &s0)
   1076 	{
   1077 		// half2 | half1
   1078 		floatToHalfBits(d.x, s0.x, false);
   1079 		floatToHalfBits(d.x, s0.y, true);
   1080 	}
   1081 
   1082 	void ShaderCore::unpackHalf2x16(Vector4f &dst, const Vector4f &s0)
   1083 	{
   1084 		// half2 | half1
   1085 		halfToFloatBits(dst.x, As<Float4>(As<UInt4>(s0.x) & UInt4(0x0000FFFF)));
   1086 		halfToFloatBits(dst.y, As<Float4>((As<UInt4>(s0.x) & UInt4(0xFFFF0000)) >> 16));
   1087 	}
   1088 
   1089 	void ShaderCore::packSnorm2x16(Vector4f &d, const Vector4f &s0)
   1090 	{
   1091 		// round(clamp(c, -1.0, 1.0) * 32767.0)
   1092 		d.x = As<Float4>((Int4(Round(Min(Max(s0.x, Float4(-1.0f)), Float4(1.0f)) * Float4(32767.0f))) & Int4(0xFFFF)) |
   1093 		                ((Int4(Round(Min(Max(s0.y, Float4(-1.0f)), Float4(1.0f)) * Float4(32767.0f))) & Int4(0xFFFF)) << 16));
   1094 	}
   1095 
   1096 	void ShaderCore::packUnorm2x16(Vector4f &d, const Vector4f &s0)
   1097 	{
   1098 		// round(clamp(c, 0.0, 1.0) * 65535.0)
   1099 		d.x = As<Float4>((Int4(Round(Min(Max(s0.x, Float4(0.0f)), Float4(1.0f)) * Float4(65535.0f))) & Int4(0xFFFF)) |
   1100 		                ((Int4(Round(Min(Max(s0.y, Float4(0.0f)), Float4(1.0f)) * Float4(65535.0f))) & Int4(0xFFFF)) << 16));
   1101 	}
   1102 
   1103 	void ShaderCore::unpackSnorm2x16(Vector4f &dst, const Vector4f &s0)
   1104 	{
   1105 		// clamp(f / 32727.0, -1.0, 1.0)
   1106 		dst.x = Min(Max(Float4(As<Int4>((As<UInt4>(s0.x) & UInt4(0x0000FFFF)) << 16)) * Float4(1.0f / float(0x7FFF0000)), Float4(-1.0f)), Float4(1.0f));
   1107 		dst.y = Min(Max(Float4(As<Int4>(As<UInt4>(s0.x) & UInt4(0xFFFF0000))) * Float4(1.0f / float(0x7FFF0000)), Float4(-1.0f)), Float4(1.0f));
   1108 	}
   1109 
   1110 	void ShaderCore::unpackUnorm2x16(Vector4f &dst, const Vector4f &s0)
   1111 	{
   1112 		// f / 65535.0
   1113 		dst.x = Float4((As<UInt4>(s0.x) & UInt4(0x0000FFFF)) << 16) * Float4(1.0f / float(0xFFFF0000));
   1114 		dst.y = Float4(As<UInt4>(s0.x) & UInt4(0xFFFF0000)) * Float4(1.0f / float(0xFFFF0000));
   1115 	}
   1116 
   1117 	void ShaderCore::det2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
   1118 	{
   1119 		dst.x = src0.x * src1.y - src0.y * src1.x;
   1120 		dst.y = dst.z = dst.w = dst.x;
   1121 	}
   1122 
   1123 	void ShaderCore::det3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
   1124 	{
   1125 		crs(dst, src1, src2);
   1126 		dp3(dst, dst, src0);
   1127 	}
   1128 
   1129 	void ShaderCore::det4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2, const Vector4f &src3)
   1130 	{
   1131 		dst.x = src2.z * src3.w - src2.w * src3.z;
   1132 		dst.y = src1.w * src3.z - src1.z * src3.w;
   1133 		dst.z = src1.z * src2.w - src1.w * src2.z;
   1134 		dst.x = src0.x * (src1.y * dst.x + src2.y * dst.y + src3.y * dst.z) -
   1135 		        src0.y * (src1.x * dst.x + src2.x * dst.y + src3.x * dst.z) +
   1136 		        src0.z * (src1.x * (src2.y * src3.w - src2.w * src3.y) +
   1137 		                  src2.x * (src1.w * src3.y - src1.y * src3.w) +
   1138 		                  src3.x * (src1.y * src2.w - src1.w * src2.y)) +
   1139 		        src0.w * (src1.x * (src2.z * src3.y - src2.y * src3.z) +
   1140 		                  src2.x * (src1.y * src3.z - src1.z * src3.y) +
   1141 		                  src3.x * (src1.z * src2.y - src1.y * src2.z));
   1142 		dst.y = dst.z = dst.w = dst.x;
   1143 	}
   1144 
   1145 	void ShaderCore::frc(Vector4f &dst, const Vector4f &src)
   1146 	{
   1147 		dst.x = Frac(src.x);
   1148 		dst.y = Frac(src.y);
   1149 		dst.z = Frac(src.z);
   1150 		dst.w = Frac(src.w);
   1151 	}
   1152 
   1153 	void ShaderCore::trunc(Vector4f &dst, const Vector4f &src)
   1154 	{
   1155 		dst.x = Trunc(src.x);
   1156 		dst.y = Trunc(src.y);
   1157 		dst.z = Trunc(src.z);
   1158 		dst.w = Trunc(src.w);
   1159 	}
   1160 
   1161 	void ShaderCore::floor(Vector4f &dst, const Vector4f &src)
   1162 	{
   1163 		dst.x = Floor(src.x);
   1164 		dst.y = Floor(src.y);
   1165 		dst.z = Floor(src.z);
   1166 		dst.w = Floor(src.w);
   1167 	}
   1168 
   1169 	void ShaderCore::round(Vector4f &dst, const Vector4f &src)
   1170 	{
   1171 		dst.x = Round(src.x);
   1172 		dst.y = Round(src.y);
   1173 		dst.z = Round(src.z);
   1174 		dst.w = Round(src.w);
   1175 	}
   1176 
   1177 	void ShaderCore::roundEven(Vector4f &dst, const Vector4f &src)
   1178 	{
   1179 		// dst = round(src) + ((round(src) < src) * 2 - 1) * (fract(src) == 0.5) * isOdd(round(src));
   1180 		// ex.: 1.5:  2 + (0 * 2 - 1) * 1 * 0 = 2
   1181 		//      2.5:  3 + (0 * 2 - 1) * 1 * 1 = 2
   1182 		//     -1.5: -2 + (1 * 2 - 1) * 1 * 0 = -2
   1183 		//     -2.5: -3 + (1 * 2 - 1) * 1 * 1 = -2
   1184 		// Even if the round implementation rounds the other way:
   1185 		//      1.5:  1 + (1 * 2 - 1) * 1 * 1 = 2
   1186 		//      2.5:  2 + (1 * 2 - 1) * 1 * 0 = 2
   1187 		//     -1.5: -1 + (0 * 2 - 1) * 1 * 1 = -2
   1188 		//     -2.5: -2 + (0 * 2 - 1) * 1 * 0 = -2
   1189 		round(dst, src);
   1190 		dst.x += ((Float4(CmpLT(dst.x, src.x) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.x), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.x) & Int4(1));
   1191 		dst.y += ((Float4(CmpLT(dst.y, src.y) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.y), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.y) & Int4(1));
   1192 		dst.z += ((Float4(CmpLT(dst.z, src.z) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.z), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.z) & Int4(1));
   1193 		dst.w += ((Float4(CmpLT(dst.w, src.w) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.w), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.w) & Int4(1));
   1194 	}
   1195 
   1196 	void ShaderCore::ceil(Vector4f &dst, const Vector4f &src)
   1197 	{
   1198 		dst.x = Ceil(src.x);
   1199 		dst.y = Ceil(src.y);
   1200 		dst.z = Ceil(src.z);
   1201 		dst.w = Ceil(src.w);
   1202 	}
   1203 
   1204 	void ShaderCore::powx(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
   1205 	{
   1206 		Float4 pow = power(src0.x, src1.x, pp);
   1207 
   1208 		dst.x = pow;
   1209 		dst.y = pow;
   1210 		dst.z = pow;
   1211 		dst.w = pow;
   1212 	}
   1213 
   1214 	void ShaderCore::pow(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
   1215 	{
   1216 		dst.x = power(src0.x, src1.x, pp);
   1217 		dst.y = power(src0.y, src1.y, pp);
   1218 		dst.z = power(src0.z, src1.z, pp);
   1219 		dst.w = power(src0.w, src1.w, pp);
   1220 	}
   1221 
   1222 	void ShaderCore::crs(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
   1223 	{
   1224 		dst.x = src0.y * src1.z - src0.z * src1.y;
   1225 		dst.y = src0.z * src1.x - src0.x * src1.z;
   1226 		dst.z = src0.x * src1.y - src0.y * src1.x;
   1227 	}
   1228 
   1229 	void ShaderCore::forward1(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
   1230 	{
   1231 		Int4 flip = CmpNLT(Nref.x * I.x, Float4(0.0f)) & Int4(0x80000000);
   1232 
   1233 		dst.x =  As<Float4>(flip ^ As<Int4>(N.x));
   1234 	}
   1235 
   1236 	void ShaderCore::forward2(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
   1237 	{
   1238 		Int4 flip = CmpNLT(dot2(Nref, I), Float4(0.0f)) & Int4(0x80000000);
   1239 
   1240 		dst.x =  As<Float4>(flip ^ As<Int4>(N.x));
   1241 		dst.y =  As<Float4>(flip ^ As<Int4>(N.y));
   1242 	}
   1243 
   1244 	void ShaderCore::forward3(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
   1245 	{
   1246 		Int4 flip = CmpNLT(dot3(Nref, I), Float4(0.0f)) & Int4(0x80000000);
   1247 
   1248 		dst.x =  As<Float4>(flip ^ As<Int4>(N.x));
   1249 		dst.y =  As<Float4>(flip ^ As<Int4>(N.y));
   1250 		dst.z =  As<Float4>(flip ^ As<Int4>(N.z));
   1251 	}
   1252 
   1253 	void ShaderCore::forward4(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
   1254 	{
   1255 		Int4 flip = CmpNLT(dot4(Nref, I), Float4(0.0f)) & Int4(0x80000000);
   1256 
   1257 		dst.x =  As<Float4>(flip ^ As<Int4>(N.x));
   1258 		dst.y =  As<Float4>(flip ^ As<Int4>(N.y));
   1259 		dst.z =  As<Float4>(flip ^ As<Int4>(N.z));
   1260 		dst.w =  As<Float4>(flip ^ As<Int4>(N.w));
   1261 	}
   1262 
   1263 	void ShaderCore::reflect1(Vector4f &dst, const Vector4f &I, const Vector4f &N)
   1264 	{
   1265 		Float4 d = N.x * I.x;
   1266 
   1267 		dst.x = I.x - Float4(2.0f) * d * N.x;
   1268 	}
   1269 
   1270 	void ShaderCore::reflect2(Vector4f &dst, const Vector4f &I, const Vector4f &N)
   1271 	{
   1272 		Float4 d = dot2(N, I);
   1273 
   1274 		dst.x = I.x - Float4(2.0f) * d * N.x;
   1275 		dst.y = I.y - Float4(2.0f) * d * N.y;
   1276 	}
   1277 
   1278 	void ShaderCore::reflect3(Vector4f &dst, const Vector4f &I, const Vector4f &N)
   1279 	{
   1280 		Float4 d = dot3(N, I);
   1281 
   1282 		dst.x = I.x - Float4(2.0f) * d * N.x;
   1283 		dst.y = I.y - Float4(2.0f) * d * N.y;
   1284 		dst.z = I.z - Float4(2.0f) * d * N.z;
   1285 	}
   1286 
   1287 	void ShaderCore::reflect4(Vector4f &dst, const Vector4f &I, const Vector4f &N)
   1288 	{
   1289 		Float4 d = dot4(N, I);
   1290 
   1291 		dst.x = I.x - Float4(2.0f) * d * N.x;
   1292 		dst.y = I.y - Float4(2.0f) * d * N.y;
   1293 		dst.z = I.z - Float4(2.0f) * d * N.z;
   1294 		dst.w = I.w - Float4(2.0f) * d * N.w;
   1295 	}
   1296 
   1297 	void ShaderCore::refract1(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
   1298 	{
   1299 		Float4 d = N.x * I.x;
   1300 		Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d);
   1301 		Int4 pos = CmpNLT(k, Float4(0.0f));
   1302 		Float4 t = (eta * d + Sqrt(k));
   1303 
   1304 		dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
   1305 	}
   1306 
   1307 	void ShaderCore::refract2(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
   1308 	{
   1309 		Float4 d = dot2(N, I);
   1310 		Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d);
   1311 		Int4 pos = CmpNLT(k, Float4(0.0f));
   1312 		Float4 t = (eta * d + Sqrt(k));
   1313 
   1314 		dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
   1315 		dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y));
   1316 	}
   1317 
   1318 	void ShaderCore::refract3(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
   1319 	{
   1320 		Float4 d = dot3(N, I);
   1321 		Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d);
   1322 		Int4 pos = CmpNLT(k, Float4(0.0f));
   1323 		Float4 t = (eta * d + Sqrt(k));
   1324 
   1325 		dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
   1326 		dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y));
   1327 		dst.z = As<Float4>(pos & As<Int4>(eta * I.z - t * N.z));
   1328 	}
   1329 
   1330 	void ShaderCore::refract4(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
   1331 	{
   1332 		Float4 d = dot4(N, I);
   1333 		Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d);
   1334 		Int4 pos = CmpNLT(k, Float4(0.0f));
   1335 		Float4 t = (eta * d + Sqrt(k));
   1336 
   1337 		dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
   1338 		dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y));
   1339 		dst.z = As<Float4>(pos & As<Int4>(eta * I.z - t * N.z));
   1340 		dst.w = As<Float4>(pos & As<Int4>(eta * I.w - t * N.w));
   1341 	}
   1342 
   1343 	void ShaderCore::sgn(Vector4f &dst, const Vector4f &src)
   1344 	{
   1345 		sgn(dst.x, src.x);
   1346 		sgn(dst.y, src.y);
   1347 		sgn(dst.z, src.z);
   1348 		sgn(dst.w, src.w);
   1349 	}
   1350 
   1351 	void ShaderCore::isgn(Vector4f &dst, const Vector4f &src)
   1352 	{
   1353 		isgn(dst.x, src.x);
   1354 		isgn(dst.y, src.y);
   1355 		isgn(dst.z, src.z);
   1356 		isgn(dst.w, src.w);
   1357 	}
   1358 
   1359 	void ShaderCore::abs(Vector4f &dst, const Vector4f &src)
   1360 	{
   1361 		dst.x = Abs(src.x);
   1362 		dst.y = Abs(src.y);
   1363 		dst.z = Abs(src.z);
   1364 		dst.w = Abs(src.w);
   1365 	}
   1366 
   1367 	void ShaderCore::iabs(Vector4f &dst, const Vector4f &src)
   1368 	{
   1369 		dst.x = As<Float4>(Abs(As<Int4>(src.x)));
   1370 		dst.y = As<Float4>(Abs(As<Int4>(src.y)));
   1371 		dst.z = As<Float4>(Abs(As<Int4>(src.z)));
   1372 		dst.w = As<Float4>(Abs(As<Int4>(src.w)));
   1373 	}
   1374 
   1375 	void ShaderCore::nrm2(Vector4f &dst, const Vector4f &src, bool pp)
   1376 	{
   1377 		Float4 dot = dot2(src, src);
   1378 		Float4 rsq = reciprocalSquareRoot(dot, false, pp);
   1379 
   1380 		dst.x = src.x * rsq;
   1381 		dst.y = src.y * rsq;
   1382 		dst.z = src.z * rsq;
   1383 		dst.w = src.w * rsq;
   1384 	}
   1385 
   1386 	void ShaderCore::nrm3(Vector4f &dst, const Vector4f &src, bool pp)
   1387 	{
   1388 		Float4 dot = dot3(src, src);
   1389 		Float4 rsq = reciprocalSquareRoot(dot, false, pp);
   1390 
   1391 		dst.x = src.x * rsq;
   1392 		dst.y = src.y * rsq;
   1393 		dst.z = src.z * rsq;
   1394 		dst.w = src.w * rsq;
   1395 	}
   1396 
   1397 	void ShaderCore::nrm4(Vector4f &dst, const Vector4f &src, bool pp)
   1398 	{
   1399 		Float4 dot = dot4(src, src);
   1400 		Float4 rsq = reciprocalSquareRoot(dot, false, pp);
   1401 
   1402 		dst.x = src.x * rsq;
   1403 		dst.y = src.y * rsq;
   1404 		dst.z = src.z * rsq;
   1405 		dst.w = src.w * rsq;
   1406 	}
   1407 
   1408 	void ShaderCore::sincos(Vector4f &dst, const Vector4f &src, bool pp)
   1409 	{
   1410 		dst.x = cosine_pi(src.x, pp);
   1411 		dst.y = sine_pi(src.x, pp);
   1412 	}
   1413 
   1414 	void ShaderCore::cos(Vector4f &dst, const Vector4f &src, bool pp)
   1415 	{
   1416 		dst.x = cosine(src.x, pp);
   1417 		dst.y = cosine(src.y, pp);
   1418 		dst.z = cosine(src.z, pp);
   1419 		dst.w = cosine(src.w, pp);
   1420 	}
   1421 
   1422 	void ShaderCore::sin(Vector4f &dst, const Vector4f &src, bool pp)
   1423 	{
   1424 		dst.x = sine(src.x, pp);
   1425 		dst.y = sine(src.y, pp);
   1426 		dst.z = sine(src.z, pp);
   1427 		dst.w = sine(src.w, pp);
   1428 	}
   1429 
   1430 	void ShaderCore::tan(Vector4f &dst, const Vector4f &src, bool pp)
   1431 	{
   1432 		dst.x = tangent(src.x, pp);
   1433 		dst.y = tangent(src.y, pp);
   1434 		dst.z = tangent(src.z, pp);
   1435 		dst.w = tangent(src.w, pp);
   1436 	}
   1437 
   1438 	void ShaderCore::acos(Vector4f &dst, const Vector4f &src, bool pp)
   1439 	{
   1440 		dst.x = arccos(src.x, pp);
   1441 		dst.y = arccos(src.y, pp);
   1442 		dst.z = arccos(src.z, pp);
   1443 		dst.w = arccos(src.w, pp);
   1444 	}
   1445 
   1446 	void ShaderCore::asin(Vector4f &dst, const Vector4f &src, bool pp)
   1447 	{
   1448 		dst.x = arcsin(src.x, pp);
   1449 		dst.y = arcsin(src.y, pp);
   1450 		dst.z = arcsin(src.z, pp);
   1451 		dst.w = arcsin(src.w, pp);
   1452 	}
   1453 
   1454 	void ShaderCore::atan(Vector4f &dst, const Vector4f &src, bool pp)
   1455 	{
   1456 		dst.x = arctan(src.x, pp);
   1457 		dst.y = arctan(src.y, pp);
   1458 		dst.z = arctan(src.z, pp);
   1459 		dst.w = arctan(src.w, pp);
   1460 	}
   1461 
   1462 	void ShaderCore::atan2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
   1463 	{
   1464 		dst.x = arctan(src0.x, src1.x, pp);
   1465 		dst.y = arctan(src0.y, src1.y, pp);
   1466 		dst.z = arctan(src0.z, src1.z, pp);
   1467 		dst.w = arctan(src0.w, src1.w, pp);
   1468 	}
   1469 
   1470 	void ShaderCore::cosh(Vector4f &dst, const Vector4f &src, bool pp)
   1471 	{
   1472 		dst.x = cosineh(src.x, pp);
   1473 		dst.y = cosineh(src.y, pp);
   1474 		dst.z = cosineh(src.z, pp);
   1475 		dst.w = cosineh(src.w, pp);
   1476 	}
   1477 
   1478 	void ShaderCore::sinh(Vector4f &dst, const Vector4f &src, bool pp)
   1479 	{
   1480 		dst.x = sineh(src.x, pp);
   1481 		dst.y = sineh(src.y, pp);
   1482 		dst.z = sineh(src.z, pp);
   1483 		dst.w = sineh(src.w, pp);
   1484 	}
   1485 
   1486 	void ShaderCore::tanh(Vector4f &dst, const Vector4f &src, bool pp)
   1487 	{
   1488 		dst.x = tangenth(src.x, pp);
   1489 		dst.y = tangenth(src.y, pp);
   1490 		dst.z = tangenth(src.z, pp);
   1491 		dst.w = tangenth(src.w, pp);
   1492 	}
   1493 
   1494 	void ShaderCore::acosh(Vector4f &dst, const Vector4f &src, bool pp)
   1495 	{
   1496 		dst.x = arccosh(src.x, pp);
   1497 		dst.y = arccosh(src.y, pp);
   1498 		dst.z = arccosh(src.z, pp);
   1499 		dst.w = arccosh(src.w, pp);
   1500 	}
   1501 
   1502 	void ShaderCore::asinh(Vector4f &dst, const Vector4f &src, bool pp)
   1503 	{
   1504 		dst.x = arcsinh(src.x, pp);
   1505 		dst.y = arcsinh(src.y, pp);
   1506 		dst.z = arcsinh(src.z, pp);
   1507 		dst.w = arcsinh(src.w, pp);
   1508 	}
   1509 
   1510 	void ShaderCore::atanh(Vector4f &dst, const Vector4f &src, bool pp)
   1511 	{
   1512 		dst.x = arctanh(src.x, pp);
   1513 		dst.y = arctanh(src.y, pp);
   1514 		dst.z = arctanh(src.z, pp);
   1515 		dst.w = arctanh(src.w, pp);
   1516 	}
   1517 
   1518 	void ShaderCore::expp(Vector4f &dst, const Vector4f &src, unsigned short version)
   1519 	{
   1520 		if(version < 0x0200)
   1521 		{
   1522 			Float4 frc = Frac(src.x);
   1523 			Float4 floor = src.x - frc;
   1524 
   1525 			dst.x = exponential2(floor, true);
   1526 			dst.y = frc;
   1527 			dst.z = exponential2(src.x, true);
   1528 			dst.w = Float4(1.0f);
   1529 		}
   1530 		else   // Version >= 2.0
   1531 		{
   1532 			exp2x(dst, src, true);   // FIXME: 10-bit precision suffices
   1533 		}
   1534 	}
   1535 
   1536 	void ShaderCore::logp(Vector4f &dst, const Vector4f &src, unsigned short version)
   1537 	{
   1538 		if(version < 0x0200)
   1539 		{
   1540 			Float4 tmp0;
   1541 			Float4 tmp1;
   1542 			Float4 t;
   1543 			Int4 r;
   1544 
   1545 			tmp0 = Abs(src.x);
   1546 			tmp1 = tmp0;
   1547 
   1548 			// X component
   1549 			r = As<Int4>(As<UInt4>(tmp0) >> 23) - Int4(127);
   1550 			dst.x = Float4(r);
   1551 
   1552 			// Y component
   1553 			dst.y = As<Float4>((As<Int4>(tmp1) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));
   1554 
   1555 			// Z component
   1556 			dst.z = logarithm2(src.x, true, true);
   1557 
   1558 			// W component
   1559 			dst.w = 1.0f;
   1560 		}
   1561 		else
   1562 		{
   1563 			log2x(dst, src, true);
   1564 		}
   1565 	}
   1566 
   1567 	void ShaderCore::cmp0(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
   1568 	{
   1569 		cmp0(dst.x, src0.x, src1.x, src2.x);
   1570 		cmp0(dst.y, src0.y, src1.y, src2.y);
   1571 		cmp0(dst.z, src0.z, src1.z, src2.z);
   1572 		cmp0(dst.w, src0.w, src1.w, src2.w);
   1573 	}
   1574 
   1575 	void ShaderCore::select(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
   1576 	{
   1577 		select(dst.x, As<Int4>(src0.x), src1.x, src2.x);
   1578 		select(dst.y, As<Int4>(src0.y), src1.y, src2.y);
   1579 		select(dst.z, As<Int4>(src0.z), src1.z, src2.z);
   1580 		select(dst.w, As<Int4>(src0.w), src1.w, src2.w);
   1581 	}
   1582 
   1583 	void ShaderCore::extract(Float4 &dst, const Vector4f &src0, const Float4 &src1)
   1584 	{
   1585 		select(dst, CmpEQ(As<Int4>(src1), Int4(1)), src0.y, src0.x);
   1586 		select(dst, CmpEQ(As<Int4>(src1), Int4(2)), src0.z, dst);
   1587 		select(dst, CmpEQ(As<Int4>(src1), Int4(3)), src0.w, dst);
   1588 	}
   1589 
   1590 	void ShaderCore::insert(Vector4f &dst, const Vector4f &src, const Float4 &element, const Float4 &index)
   1591 	{
   1592 		select(dst.x, CmpEQ(As<Int4>(index), Int4(0)), element, src.x);
   1593 		select(dst.y, CmpEQ(As<Int4>(index), Int4(1)), element, src.y);
   1594 		select(dst.z, CmpEQ(As<Int4>(index), Int4(2)), element, src.z);
   1595 		select(dst.w, CmpEQ(As<Int4>(index), Int4(3)), element, src.w);
   1596 	}
   1597 
   1598 	void ShaderCore::sgn(Float4 &dst, const Float4 &src)
   1599 	{
   1600 		Int4 neg = As<Int4>(CmpLT(src, Float4(-0.0f))) & As<Int4>(Float4(-1.0f));
   1601 		Int4 pos = As<Int4>(CmpNLE(src, Float4(+0.0f))) & As<Int4>(Float4(1.0f));
   1602 		dst = As<Float4>(neg | pos);
   1603 	}
   1604 
   1605 	void ShaderCore::isgn(Float4 &dst, const Float4 &src)
   1606 	{
   1607 		Int4 neg = CmpLT(As<Int4>(src), Int4(0)) & Int4(-1);
   1608 		Int4 pos = CmpNLE(As<Int4>(src), Int4(0)) & Int4(1);
   1609 		dst = As<Float4>(neg | pos);
   1610 	}
   1611 
   1612 	void ShaderCore::cmp0(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2)
   1613 	{
   1614 		Int4 pos = CmpLE(Float4(0.0f), src0);
   1615 		select(dst, pos, src1, src2);
   1616 	}
   1617 
   1618 	void ShaderCore::cmp0i(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2)
   1619 	{
   1620 		Int4 pos = CmpEQ(Int4(0), As<Int4>(src0));
   1621 		select(dst, pos, src1, src2);
   1622 	}
   1623 
   1624 	void ShaderCore::select(Float4 &dst, RValue<Int4> src0, const Float4 &src1, const Float4 &src2)
   1625 	{
   1626 		// FIXME: LLVM vector select
   1627 		dst = As<Float4>((src0 & As<Int4>(src1)) | (~src0 & As<Int4>(src2)));
   1628 	}
   1629 
   1630 	void ShaderCore::cmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
   1631 	{
   1632 		switch(control)
   1633 		{
   1634 		case Shader::CONTROL_GT:
   1635 			dst.x = As<Float4>(CmpNLE(src0.x, src1.x));
   1636 			dst.y = As<Float4>(CmpNLE(src0.y, src1.y));
   1637 			dst.z = As<Float4>(CmpNLE(src0.z, src1.z));
   1638 			dst.w = As<Float4>(CmpNLE(src0.w, src1.w));
   1639 			break;
   1640 		case Shader::CONTROL_EQ:
   1641 			dst.x = As<Float4>(CmpEQ(src0.x, src1.x));
   1642 			dst.y = As<Float4>(CmpEQ(src0.y, src1.y));
   1643 			dst.z = As<Float4>(CmpEQ(src0.z, src1.z));
   1644 			dst.w = As<Float4>(CmpEQ(src0.w, src1.w));
   1645 			break;
   1646 		case Shader::CONTROL_GE:
   1647 			dst.x = As<Float4>(CmpNLT(src0.x, src1.x));
   1648 			dst.y = As<Float4>(CmpNLT(src0.y, src1.y));
   1649 			dst.z = As<Float4>(CmpNLT(src0.z, src1.z));
   1650 			dst.w = As<Float4>(CmpNLT(src0.w, src1.w));
   1651 			break;
   1652 		case Shader::CONTROL_LT:
   1653 			dst.x = As<Float4>(CmpLT(src0.x, src1.x));
   1654 			dst.y = As<Float4>(CmpLT(src0.y, src1.y));
   1655 			dst.z = As<Float4>(CmpLT(src0.z, src1.z));
   1656 			dst.w = As<Float4>(CmpLT(src0.w, src1.w));
   1657 			break;
   1658 		case Shader::CONTROL_NE:
   1659 			dst.x = As<Float4>(CmpNEQ(src0.x, src1.x));
   1660 			dst.y = As<Float4>(CmpNEQ(src0.y, src1.y));
   1661 			dst.z = As<Float4>(CmpNEQ(src0.z, src1.z));
   1662 			dst.w = As<Float4>(CmpNEQ(src0.w, src1.w));
   1663 			break;
   1664 		case Shader::CONTROL_LE:
   1665 			dst.x = As<Float4>(CmpLE(src0.x, src1.x));
   1666 			dst.y = As<Float4>(CmpLE(src0.y, src1.y));
   1667 			dst.z = As<Float4>(CmpLE(src0.z, src1.z));
   1668 			dst.w = As<Float4>(CmpLE(src0.w, src1.w));
   1669 			break;
   1670 		default:
   1671 			ASSERT(false);
   1672 		}
   1673 	}
   1674 
   1675 	void ShaderCore::icmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
   1676 	{
   1677 		switch(control)
   1678 		{
   1679 		case Shader::CONTROL_GT:
   1680 			dst.x = As<Float4>(CmpNLE(As<Int4>(src0.x), As<Int4>(src1.x)));
   1681 			dst.y = As<Float4>(CmpNLE(As<Int4>(src0.y), As<Int4>(src1.y)));
   1682 			dst.z = As<Float4>(CmpNLE(As<Int4>(src0.z), As<Int4>(src1.z)));
   1683 			dst.w = As<Float4>(CmpNLE(As<Int4>(src0.w), As<Int4>(src1.w)));
   1684 			break;
   1685 		case Shader::CONTROL_EQ:
   1686 			dst.x = As<Float4>(CmpEQ(As<Int4>(src0.x), As<Int4>(src1.x)));
   1687 			dst.y = As<Float4>(CmpEQ(As<Int4>(src0.y), As<Int4>(src1.y)));
   1688 			dst.z = As<Float4>(CmpEQ(As<Int4>(src0.z), As<Int4>(src1.z)));
   1689 			dst.w = As<Float4>(CmpEQ(As<Int4>(src0.w), As<Int4>(src1.w)));
   1690 			break;
   1691 		case Shader::CONTROL_GE:
   1692 			dst.x = As<Float4>(CmpNLT(As<Int4>(src0.x), As<Int4>(src1.x)));
   1693 			dst.y = As<Float4>(CmpNLT(As<Int4>(src0.y), As<Int4>(src1.y)));
   1694 			dst.z = As<Float4>(CmpNLT(As<Int4>(src0.z), As<Int4>(src1.z)));
   1695 			dst.w = As<Float4>(CmpNLT(As<Int4>(src0.w), As<Int4>(src1.w)));
   1696 			break;
   1697 		case Shader::CONTROL_LT:
   1698 			dst.x = As<Float4>(CmpLT(As<Int4>(src0.x), As<Int4>(src1.x)));
   1699 			dst.y = As<Float4>(CmpLT(As<Int4>(src0.y), As<Int4>(src1.y)));
   1700 			dst.z = As<Float4>(CmpLT(As<Int4>(src0.z), As<Int4>(src1.z)));
   1701 			dst.w = As<Float4>(CmpLT(As<Int4>(src0.w), As<Int4>(src1.w)));
   1702 			break;
   1703 		case Shader::CONTROL_NE:
   1704 			dst.x = As<Float4>(CmpNEQ(As<Int4>(src0.x), As<Int4>(src1.x)));
   1705 			dst.y = As<Float4>(CmpNEQ(As<Int4>(src0.y), As<Int4>(src1.y)));
   1706 			dst.z = As<Float4>(CmpNEQ(As<Int4>(src0.z), As<Int4>(src1.z)));
   1707 			dst.w = As<Float4>(CmpNEQ(As<Int4>(src0.w), As<Int4>(src1.w)));
   1708 			break;
   1709 		case Shader::CONTROL_LE:
   1710 			dst.x = As<Float4>(CmpLE(As<Int4>(src0.x), As<Int4>(src1.x)));
   1711 			dst.y = As<Float4>(CmpLE(As<Int4>(src0.y), As<Int4>(src1.y)));
   1712 			dst.z = As<Float4>(CmpLE(As<Int4>(src0.z), As<Int4>(src1.z)));
   1713 			dst.w = As<Float4>(CmpLE(As<Int4>(src0.w), As<Int4>(src1.w)));
   1714 			break;
   1715 		default:
   1716 			ASSERT(false);
   1717 		}
   1718 	}
   1719 
   1720 	void ShaderCore::ucmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
   1721 	{
   1722 		switch(control)
   1723 		{
   1724 		case Shader::CONTROL_GT:
   1725 			dst.x = As<Float4>(CmpNLE(As<UInt4>(src0.x), As<UInt4>(src1.x)));
   1726 			dst.y = As<Float4>(CmpNLE(As<UInt4>(src0.y), As<UInt4>(src1.y)));
   1727 			dst.z = As<Float4>(CmpNLE(As<UInt4>(src0.z), As<UInt4>(src1.z)));
   1728 			dst.w = As<Float4>(CmpNLE(As<UInt4>(src0.w), As<UInt4>(src1.w)));
   1729 			break;
   1730 		case Shader::CONTROL_EQ:
   1731 			dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)));
   1732 			dst.y = As<Float4>(CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)));
   1733 			dst.z = As<Float4>(CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)));
   1734 			dst.w = As<Float4>(CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
   1735 			break;
   1736 		case Shader::CONTROL_GE:
   1737 			dst.x = As<Float4>(CmpNLT(As<UInt4>(src0.x), As<UInt4>(src1.x)));
   1738 			dst.y = As<Float4>(CmpNLT(As<UInt4>(src0.y), As<UInt4>(src1.y)));
   1739 			dst.z = As<Float4>(CmpNLT(As<UInt4>(src0.z), As<UInt4>(src1.z)));
   1740 			dst.w = As<Float4>(CmpNLT(As<UInt4>(src0.w), As<UInt4>(src1.w)));
   1741 			break;
   1742 		case Shader::CONTROL_LT:
   1743 			dst.x = As<Float4>(CmpLT(As<UInt4>(src0.x), As<UInt4>(src1.x)));
   1744 			dst.y = As<Float4>(CmpLT(As<UInt4>(src0.y), As<UInt4>(src1.y)));
   1745 			dst.z = As<Float4>(CmpLT(As<UInt4>(src0.z), As<UInt4>(src1.z)));
   1746 			dst.w = As<Float4>(CmpLT(As<UInt4>(src0.w), As<UInt4>(src1.w)));
   1747 			break;
   1748 		case Shader::CONTROL_NE:
   1749 			dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)));
   1750 			dst.y = As<Float4>(CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)));
   1751 			dst.z = As<Float4>(CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)));
   1752 			dst.w = As<Float4>(CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
   1753 			break;
   1754 		case Shader::CONTROL_LE:
   1755 			dst.x = As<Float4>(CmpLE(As<UInt4>(src0.x), As<UInt4>(src1.x)));
   1756 			dst.y = As<Float4>(CmpLE(As<UInt4>(src0.y), As<UInt4>(src1.y)));
   1757 			dst.z = As<Float4>(CmpLE(As<UInt4>(src0.z), As<UInt4>(src1.z)));
   1758 			dst.w = As<Float4>(CmpLE(As<UInt4>(src0.w), As<UInt4>(src1.w)));
   1759 			break;
   1760 		default:
   1761 			ASSERT(false);
   1762 		}
   1763 	}
   1764 
   1765 	void ShaderCore::all(Float4 &dst, const Vector4f &src)
   1766 	{
   1767 		dst = As<Float4>(As<Int4>(src.x) & As<Int4>(src.y) & As<Int4>(src.z) & As<Int4>(src.w));
   1768 	}
   1769 
   1770 	void ShaderCore::any(Float4 &dst, const Vector4f &src)
   1771 	{
   1772 		dst = As<Float4>(As<Int4>(src.x) | As<Int4>(src.y) | As<Int4>(src.z) | As<Int4>(src.w));
   1773 	}
   1774 
   1775 	void ShaderCore::bitwise_not(Vector4f &dst, const Vector4f &src)
   1776 	{
   1777 		dst.x = As<Float4>(As<Int4>(src.x) ^ Int4(0xFFFFFFFF));
   1778 		dst.y = As<Float4>(As<Int4>(src.y) ^ Int4(0xFFFFFFFF));
   1779 		dst.z = As<Float4>(As<Int4>(src.z) ^ Int4(0xFFFFFFFF));
   1780 		dst.w = As<Float4>(As<Int4>(src.w) ^ Int4(0xFFFFFFFF));
   1781 	}
   1782 
   1783 	void ShaderCore::bitwise_or(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
   1784 	{
   1785 		dst.x = As<Float4>(As<Int4>(src0.x) | As<Int4>(src1.x));
   1786 		dst.y = As<Float4>(As<Int4>(src0.y) | As<Int4>(src1.y));
   1787 		dst.z = As<Float4>(As<Int4>(src0.z) | As<Int4>(src1.z));
   1788 		dst.w = As<Float4>(As<Int4>(src0.w) | As<Int4>(src1.w));
   1789 	}
   1790 
   1791 	void ShaderCore::bitwise_xor(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
   1792 	{
   1793 		dst.x = As<Float4>(As<Int4>(src0.x) ^ As<Int4>(src1.x));
   1794 		dst.y = As<Float4>(As<Int4>(src0.y) ^ As<Int4>(src1.y));
   1795 		dst.z = As<Float4>(As<Int4>(src0.z) ^ As<Int4>(src1.z));
   1796 		dst.w = As<Float4>(As<Int4>(src0.w) ^ As<Int4>(src1.w));
   1797 	}
   1798 
   1799 	void ShaderCore::bitwise_and(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
   1800 	{
   1801 		dst.x = As<Float4>(As<Int4>(src0.x) & As<Int4>(src1.x));
   1802 		dst.y = As<Float4>(As<Int4>(src0.y) & As<Int4>(src1.y));
   1803 		dst.z = As<Float4>(As<Int4>(src0.z) & As<Int4>(src1.z));
   1804 		dst.w = As<Float4>(As<Int4>(src0.w) & As<Int4>(src1.w));
   1805 	}
   1806 
   1807 	void ShaderCore::equal(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
   1808 	{
   1809 		dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) &
   1810 		                   CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) &
   1811 		                   CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) &
   1812 		                   CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
   1813 		dst.y = dst.x;
   1814 		dst.z = dst.x;
   1815 		dst.w = dst.x;
   1816 	}
   1817 
   1818 	void ShaderCore::notEqual(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
   1819 	{
   1820 		dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) |
   1821 		                   CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) |
   1822 		                   CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) |
   1823 		                   CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
   1824 		dst.y = dst.x;
   1825 		dst.z = dst.x;
   1826 		dst.w = dst.x;
   1827 	}
   1828 }
   1829