      1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //    http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #include "Surface.hpp"
     16 
     17 #include "Color.hpp"
     18 #include "Context.hpp"
     19 #include "ETC_Decoder.hpp"
     20 #include "Renderer.hpp"
     21 #include "Common/Half.hpp"
     22 #include "Common/Memory.hpp"
     23 #include "Common/CPUID.hpp"
     24 #include "Common/Resource.hpp"
     25 #include "Common/Debug.hpp"
     26 #include "Reactor/Reactor.hpp"
     27 
     28 #if defined(__i386__) || defined(__x86_64__)
     29 	#include <xmmintrin.h>
     30 	#include <emmintrin.h>
     31 #endif
     32 
     33 #undef min
     34 #undef max
     35 
     36 namespace sw
     37 {
     38 	extern bool quadLayoutEnabled;
     39 	extern bool complementaryDepthBuffer;
     40 	extern TranscendentalPrecision logPrecision;
     41 
     42 	unsigned int *Surface::palette = 0;
     43 	unsigned int Surface::paletteID = 0;
     44 
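	// Clamp the rectangle's corners to the given bounds.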
     45 	void Rect::clip(int minX, int minY, int maxX, int maxY)
     46 	{
     47 		x0 = clamp(x0, minX, maxX);
     48 		y0 = clamp(y0, minY, maxY);
     49 		x1 = clamp(x1, minX, maxX);
     50 		y1 = clamp(y1, minY, maxY);
     51 	}
     52 
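	// Write a color to the texel at (x, y, z); pitchB and sliceB are the row and slice strides in bytes.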
     53 	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
     54 	{
     55 		void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
     56 
     57 		write(element, color);
     58 	}
     59 
     60 	void Surface::Buffer::write(int x, int y, const Color<float> &color)
     61 	{
     62 		void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
     63 
     64 		write(element, color);
     65 	}
     66 
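	// Encode a normalized floating-point color into this buffer's format at the given element address.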
     67 	inline void Surface::Buffer::write(void *element, const Color<float> &color)
     68 	{
     69 		switch(format)
     70 		{
     71 		case FORMAT_A8:
     72 			*(unsigned char*)element = unorm<8>(color.a);
     73 			break;
     74 		case FORMAT_R8I_SNORM:
     75 			*(char*)element = snorm<8>(color.r);
     76 			break;
     77 		case FORMAT_R8:
     78 			*(unsigned char*)element = unorm<8>(color.r);
     79 			break;
     80 		case FORMAT_R8I:
     81 			*(char*)element = scast<8>(color.r);
     82 			break;
     83 		case FORMAT_R8UI:
     84 			*(unsigned char*)element = ucast<8>(color.r);
     85 			break;
     86 		case FORMAT_R16I:
     87 			*(short*)element = scast<16>(color.r);
     88 			break;
     89 		case FORMAT_R16UI:
     90 			*(unsigned short*)element = ucast<16>(color.r);
     91 			break;
     92 		case FORMAT_R32I:
     93 			*(int*)element = static_cast<int>(color.r);
     94 			break;
     95 		case FORMAT_R32UI:
     96 			*(unsigned int*)element = static_cast<unsigned int>(color.r);
     97 			break;
     98 		case FORMAT_R3G3B2:
     99 			*(unsigned char*)element = (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
    100 			break;
    101 		case FORMAT_A8R3G3B2:
    102 			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
    103 			break;
    104 		case FORMAT_X4R4G4B4:
    105 			*(unsigned short*)element = 0xF000 | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
    106 			break;
    107 		case FORMAT_A4R4G4B4:
    108 			*(unsigned short*)element = (unorm<4>(color.a) << 12) | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
    109 			break;
    110 		case FORMAT_R4G4B4A4:
    111 			*(unsigned short*)element = (unorm<4>(color.r) << 12) | (unorm<4>(color.g) << 8) | (unorm<4>(color.b) << 4) | (unorm<4>(color.a) << 0);
    112 			break;
    113 		case FORMAT_R5G6B5:
    114 			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<6>(color.g) << 5) | (unorm<5>(color.b) << 0);
    115 			break;
    116 		case FORMAT_A1R5G5B5:
    117 			*(unsigned short*)element = (unorm<1>(color.a) << 15) | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
    118 			break;
    119 		case FORMAT_R5G5B5A1:
     120 			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<5>(color.g) << 6) | (unorm<5>(color.b) << 1) | (unorm<1>(color.a) << 0);
    121 			break;
    122 		case FORMAT_X1R5G5B5:
    123 			*(unsigned short*)element = 0x8000 | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
    124 			break;
    125 		case FORMAT_A8R8G8B8:
    126 			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
    127 			break;
    128 		case FORMAT_X8R8G8B8:
    129 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
    130 			break;
    131 		case FORMAT_A8B8G8R8I_SNORM:
    132 			*(unsigned int*)element = (static_cast<unsigned int>(snorm<8>(color.a)) << 24) |
    133 			                          (static_cast<unsigned int>(snorm<8>(color.b)) << 16) |
    134 			                          (static_cast<unsigned int>(snorm<8>(color.g)) << 8) |
    135 			                          (static_cast<unsigned int>(snorm<8>(color.r)) << 0);
    136 			break;
    137 		case FORMAT_A8B8G8R8:
    138 		case FORMAT_SRGB8_A8:
    139 			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
    140 			break;
    141 		case FORMAT_A8B8G8R8I:
    142 			*(unsigned int*)element = (static_cast<unsigned int>(scast<8>(color.a)) << 24) |
    143 			                          (static_cast<unsigned int>(scast<8>(color.b)) << 16) |
    144 			                          (static_cast<unsigned int>(scast<8>(color.g)) << 8) |
    145 			                          (static_cast<unsigned int>(scast<8>(color.r)) << 0);
    146 			break;
    147 		case FORMAT_A8B8G8R8UI:
    148 			*(unsigned int*)element = (ucast<8>(color.a) << 24) | (ucast<8>(color.b) << 16) | (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
    149 			break;
    150 		case FORMAT_X8B8G8R8I_SNORM:
    151 			*(unsigned int*)element = 0x7F000000 |
    152 			                          (static_cast<unsigned int>(snorm<8>(color.b)) << 16) |
    153 			                          (static_cast<unsigned int>(snorm<8>(color.g)) << 8) |
    154 			                          (static_cast<unsigned int>(snorm<8>(color.r)) << 0);
    155 			break;
    156 		case FORMAT_X8B8G8R8:
    157 		case FORMAT_SRGB8_X8:
    158 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
    159 			break;
    160 		case FORMAT_X8B8G8R8I:
    161 			*(unsigned int*)element = 0x7F000000 |
    162 			                          (static_cast<unsigned int>(scast<8>(color.b)) << 16) |
    163 			                          (static_cast<unsigned int>(scast<8>(color.g)) << 8) |
     164 			                          (static_cast<unsigned int>(scast<8>(color.r)) << 0);
			break;
    165 		case FORMAT_X8B8G8R8UI:
    166 			*(unsigned int*)element = 0xFF000000 | (ucast<8>(color.b) << 16) | (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
    167 			break;
    168 		case FORMAT_A2R10G10B10:
    169 			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.r) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.b) << 0);
    170 			break;
    171 		case FORMAT_A2B10G10R10:
    172 			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.b) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.r) << 0);
    173 			break;
    174 		case FORMAT_G8R8I_SNORM:
    175 			*(unsigned short*)element = (static_cast<unsigned short>(snorm<8>(color.g)) << 8) |
    176 			                            (static_cast<unsigned short>(snorm<8>(color.r)) << 0);
    177 			break;
    178 		case FORMAT_G8R8:
    179 			*(unsigned short*)element = (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
    180 			break;
    181 		case FORMAT_G8R8I:
    182 			*(unsigned short*)element = (static_cast<unsigned short>(scast<8>(color.g)) << 8) |
    183 			                            (static_cast<unsigned short>(scast<8>(color.r)) << 0);
    184 			break;
    185 		case FORMAT_G8R8UI:
    186 			*(unsigned short*)element = (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
    187 			break;
    188 		case FORMAT_G16R16:
    189 			*(unsigned int*)element = (unorm<16>(color.g) << 16) | (unorm<16>(color.r) << 0);
    190 			break;
    191 		case FORMAT_G16R16I:
    192 			*(unsigned int*)element = (static_cast<unsigned int>(scast<16>(color.g)) << 16) |
    193 			                          (static_cast<unsigned int>(scast<16>(color.r)) << 0);
    194 			break;
    195 		case FORMAT_G16R16UI:
    196 			*(unsigned int*)element = (ucast<16>(color.g) << 16) | (ucast<16>(color.r) << 0);
    197 			break;
    198 		case FORMAT_G32R32I:
    199 		case FORMAT_G32R32UI:
    200 			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
    201 			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
    202 			break;
    203 		case FORMAT_A16B16G16R16:
    204 			((unsigned short*)element)[0] = unorm<16>(color.r);
    205 			((unsigned short*)element)[1] = unorm<16>(color.g);
    206 			((unsigned short*)element)[2] = unorm<16>(color.b);
    207 			((unsigned short*)element)[3] = unorm<16>(color.a);
    208 			break;
    209 		case FORMAT_A16B16G16R16I:
    210 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(color.r));
    211 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(color.g));
    212 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(color.b));
    213 			((unsigned short*)element)[3] = static_cast<unsigned short>(scast<16>(color.a));
    214 			break;
    215 		case FORMAT_A16B16G16R16UI:
    216 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(color.r));
    217 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(color.g));
    218 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(color.b));
    219 			((unsigned short*)element)[3] = static_cast<unsigned short>(ucast<16>(color.a));
    220 			break;
    221 		case FORMAT_X16B16G16R16I:
    222 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(color.r));
    223 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(color.g));
    224 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(color.b));
    225 			break;
    226 		case FORMAT_X16B16G16R16UI:
    227 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(color.r));
    228 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(color.g));
    229 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(color.b));
    230 			break;
    231 		case FORMAT_A32B32G32R32I:
    232 		case FORMAT_A32B32G32R32UI:
    233 			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
    234 			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
    235 			((unsigned int*)element)[2] = static_cast<unsigned int>(color.b);
    236 			((unsigned int*)element)[3] = static_cast<unsigned int>(color.a);
    237 			break;
    238 		case FORMAT_X32B32G32R32I:
    239 		case FORMAT_X32B32G32R32UI:
    240 			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
    241 			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
    242 			((unsigned int*)element)[2] = static_cast<unsigned int>(color.b);
    243 			break;
    244 		case FORMAT_V8U8:
    245 			*(unsigned short*)element = (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
    246 			break;
    247 		case FORMAT_L6V5U5:
    248 			*(unsigned short*)element = (unorm<6>(color.b) << 10) | (snorm<5>(color.g) << 5) | (snorm<5>(color.r) << 0);
    249 			break;
    250 		case FORMAT_Q8W8V8U8:
    251 			*(unsigned int*)element = (snorm<8>(color.a) << 24) | (snorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
    252 			break;
    253 		case FORMAT_X8L8V8U8:
    254 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
    255 			break;
    256 		case FORMAT_V16U16:
    257 			*(unsigned int*)element = (snorm<16>(color.g) << 16) | (snorm<16>(color.r) << 0);
    258 			break;
    259 		case FORMAT_A2W10V10U10:
    260 			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (snorm<10>(color.b) << 20) | (snorm<10>(color.g) << 10) | (snorm<10>(color.r) << 0);
    261 			break;
    262 		case FORMAT_A16W16V16U16:
    263 			((unsigned short*)element)[0] = snorm<16>(color.r);
    264 			((unsigned short*)element)[1] = snorm<16>(color.g);
    265 			((unsigned short*)element)[2] = snorm<16>(color.b);
    266 			((unsigned short*)element)[3] = unorm<16>(color.a);
    267 			break;
    268 		case FORMAT_Q16W16V16U16:
    269 			((unsigned short*)element)[0] = snorm<16>(color.r);
    270 			((unsigned short*)element)[1] = snorm<16>(color.g);
    271 			((unsigned short*)element)[2] = snorm<16>(color.b);
    272 			((unsigned short*)element)[3] = snorm<16>(color.a);
    273 			break;
    274 		case FORMAT_R8G8B8:
    275 			((unsigned char*)element)[0] = unorm<8>(color.b);
    276 			((unsigned char*)element)[1] = unorm<8>(color.g);
    277 			((unsigned char*)element)[2] = unorm<8>(color.r);
    278 			break;
    279 		case FORMAT_B8G8R8:
    280 			((unsigned char*)element)[0] = unorm<8>(color.r);
    281 			((unsigned char*)element)[1] = unorm<8>(color.g);
    282 			((unsigned char*)element)[2] = unorm<8>(color.b);
    283 			break;
    284 		case FORMAT_R16F:
    285 			*(half*)element = (half)color.r;
    286 			break;
    287 		case FORMAT_A16F:
    288 			*(half*)element = (half)color.a;
    289 			break;
    290 		case FORMAT_G16R16F:
    291 			((half*)element)[0] = (half)color.r;
    292 			((half*)element)[1] = (half)color.g;
    293 			break;
    294 		case FORMAT_B16G16R16F:
    295 			((half*)element)[0] = (half)color.r;
    296 			((half*)element)[1] = (half)color.g;
    297 			((half*)element)[2] = (half)color.b;
    298 			break;
    299 		case FORMAT_A16B16G16R16F:
    300 			((half*)element)[0] = (half)color.r;
    301 			((half*)element)[1] = (half)color.g;
    302 			((half*)element)[2] = (half)color.b;
    303 			((half*)element)[3] = (half)color.a;
    304 			break;
    305 		case FORMAT_A32F:
    306 			*(float*)element = color.a;
    307 			break;
    308 		case FORMAT_R32F:
    309 			*(float*)element = color.r;
    310 			break;
    311 		case FORMAT_G32R32F:
    312 			((float*)element)[0] = color.r;
    313 			((float*)element)[1] = color.g;
    314 			break;
    315 		case FORMAT_X32B32G32R32F:
    316 			((float*)element)[3] = 1.0f;
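			// Fall through to write the RGB components.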
    317 		case FORMAT_B32G32R32F:
    318 			((float*)element)[0] = color.r;
    319 			((float*)element)[1] = color.g;
    320 			((float*)element)[2] = color.b;
    321 			break;
    322 		case FORMAT_A32B32G32R32F:
    323 			((float*)element)[0] = color.r;
    324 			((float*)element)[1] = color.g;
    325 			((float*)element)[2] = color.b;
    326 			((float*)element)[3] = color.a;
    327 			break;
    328 		case FORMAT_D32F:
    329 		case FORMAT_D32F_LOCKABLE:
    330 		case FORMAT_D32FS8_TEXTURE:
    331 		case FORMAT_D32FS8_SHADOW:
    332 			*((float*)element) = color.r;
    333 			break;
    334 		case FORMAT_D32F_COMPLEMENTARY:
    335 			*((float*)element) = 1 - color.r;
    336 			break;
    337 		case FORMAT_S8:
    338 			*((unsigned char*)element) = unorm<8>(color.r);
    339 			break;
    340 		case FORMAT_L8:
    341 			*(unsigned char*)element = unorm<8>(color.r);
    342 			break;
    343 		case FORMAT_A4L4:
    344 			*(unsigned char*)element = (unorm<4>(color.a) << 4) | (unorm<4>(color.r) << 0);
    345 			break;
    346 		case FORMAT_L16:
    347 			*(unsigned short*)element = unorm<16>(color.r);
    348 			break;
    349 		case FORMAT_A8L8:
    350 			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<8>(color.r) << 0);
    351 			break;
    352 		case FORMAT_L16F:
    353 			*(half*)element = (half)color.r;
    354 			break;
    355 		case FORMAT_A16L16F:
    356 			((half*)element)[0] = (half)color.r;
    357 			((half*)element)[1] = (half)color.a;
    358 			break;
    359 		case FORMAT_L32F:
    360 			*(float*)element = color.r;
    361 			break;
    362 		case FORMAT_A32L32F:
    363 			((float*)element)[0] = color.r;
    364 			((float*)element)[1] = color.a;
    365 			break;
    366 		default:
    367 			ASSERT(false);
    368 		}
    369 	}
    370 
    371 	Color<float> Surface::Buffer::read(int x, int y, int z) const
    372 	{
    373 		void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
    374 
    375 		return read(element);
    376 	}
    377 
    378 	Color<float> Surface::Buffer::read(int x, int y) const
    379 	{
    380 		void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
    381 
    382 		return read(element);
    383 	}
    384 
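	// Decode the texel at the given element address into a floating-point color. Normalized formats
	// are scaled to [0, 1] (or [-1, 1] for signed), while pure integer formats are returned unscaled.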
    385 	inline Color<float> Surface::Buffer::read(void *element) const
    386 	{
    387 		float r = 0.0f;
    388 		float g = 0.0f;
    389 		float b = 0.0f;
    390 		float a = 1.0f;
    391 
    392 		switch(format)
    393 		{
    394 		case FORMAT_P8:
    395 			{
    396 				ASSERT(palette);
    397 
    398 				unsigned int abgr = palette[*(unsigned char*)element];
    399 
    400 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
    401 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
    402 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
    403 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
    404 			}
    405 			break;
    406 		case FORMAT_A8P8:
    407 			{
    408 				ASSERT(palette);
    409 
    410 				unsigned int bgr = palette[((unsigned char*)element)[0]];
    411 
    412 				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
    413 				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
    414 				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
    415 				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
    416 			}
    417 			break;
    418 		case FORMAT_A8:
    419 			r = 0;
    420 			g = 0;
    421 			b = 0;
    422 			a = *(unsigned char*)element * (1.0f / 0xFF);
    423 			break;
    424 		case FORMAT_R8I_SNORM:
    425 			r = max((*(signed char*)element) * (1.0f / 0x7F), -1.0f);
    426 			break;
    427 		case FORMAT_R8:
    428 			r = *(unsigned char*)element * (1.0f / 0xFF);
    429 			break;
    430 		case FORMAT_R8I:
    431 			r = *(signed char*)element;
    432 			break;
    433 		case FORMAT_R8UI:
    434 			r = *(unsigned char*)element;
    435 			break;
    436 		case FORMAT_R3G3B2:
    437 			{
    438 				unsigned char rgb = *(unsigned char*)element;
    439 
    440 				r = (rgb & 0xE0) * (1.0f / 0xE0);
    441 				g = (rgb & 0x1C) * (1.0f / 0x1C);
    442 				b = (rgb & 0x03) * (1.0f / 0x03);
    443 			}
    444 			break;
    445 		case FORMAT_A8R3G3B2:
    446 			{
    447 				unsigned short argb = *(unsigned short*)element;
    448 
    449 				a = (argb & 0xFF00) * (1.0f / 0xFF00);
    450 				r = (argb & 0x00E0) * (1.0f / 0x00E0);
    451 				g = (argb & 0x001C) * (1.0f / 0x001C);
    452 				b = (argb & 0x0003) * (1.0f / 0x0003);
    453 			}
    454 			break;
    455 		case FORMAT_X4R4G4B4:
    456 			{
    457 				unsigned short rgb = *(unsigned short*)element;
    458 
    459 				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
    460 				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
    461 				b = (rgb & 0x000F) * (1.0f / 0x000F);
    462 			}
    463 			break;
    464 		case FORMAT_A4R4G4B4:
    465 			{
    466 				unsigned short argb = *(unsigned short*)element;
    467 
    468 				a = (argb & 0xF000) * (1.0f / 0xF000);
    469 				r = (argb & 0x0F00) * (1.0f / 0x0F00);
    470 				g = (argb & 0x00F0) * (1.0f / 0x00F0);
    471 				b = (argb & 0x000F) * (1.0f / 0x000F);
    472 			}
    473 			break;
    474 		case FORMAT_R4G4B4A4:
    475 			{
    476 				unsigned short rgba = *(unsigned short*)element;
    477 
    478 				r = (rgba & 0xF000) * (1.0f / 0xF000);
    479 				g = (rgba & 0x0F00) * (1.0f / 0x0F00);
    480 				b = (rgba & 0x00F0) * (1.0f / 0x00F0);
    481 				a = (rgba & 0x000F) * (1.0f / 0x000F);
    482 			}
    483 			break;
    484 		case FORMAT_R5G6B5:
    485 			{
    486 				unsigned short rgb = *(unsigned short*)element;
    487 
    488 				r = (rgb & 0xF800) * (1.0f / 0xF800);
    489 				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
    490 				b = (rgb & 0x001F) * (1.0f / 0x001F);
    491 			}
    492 			break;
    493 		case FORMAT_A1R5G5B5:
    494 			{
    495 				unsigned short argb = *(unsigned short*)element;
    496 
    497 				a = (argb & 0x8000) * (1.0f / 0x8000);
    498 				r = (argb & 0x7C00) * (1.0f / 0x7C00);
    499 				g = (argb & 0x03E0) * (1.0f / 0x03E0);
    500 				b = (argb & 0x001F) * (1.0f / 0x001F);
    501 			}
    502 			break;
    503 		case FORMAT_R5G5B5A1:
    504 			{
    505 				unsigned short rgba = *(unsigned short*)element;
    506 
    507 				r = (rgba & 0xF800) * (1.0f / 0xF800);
    508 				g = (rgba & 0x07C0) * (1.0f / 0x07C0);
    509 				b = (rgba & 0x003E) * (1.0f / 0x003E);
    510 				a = (rgba & 0x0001) * (1.0f / 0x0001);
    511 			}
    512 			break;
    513 		case FORMAT_X1R5G5B5:
    514 			{
    515 				unsigned short xrgb = *(unsigned short*)element;
    516 
    517 				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
    518 				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
    519 				b = (xrgb & 0x001F) * (1.0f / 0x001F);
    520 			}
    521 			break;
    522 		case FORMAT_A8R8G8B8:
    523 			{
    524 				unsigned int argb = *(unsigned int*)element;
    525 
    526 				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
    527 				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
    528 				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
    529 				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
    530 			}
    531 			break;
    532 		case FORMAT_X8R8G8B8:
    533 			{
    534 				unsigned int xrgb = *(unsigned int*)element;
    535 
    536 				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
    537 				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
    538 				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
    539 			}
    540 			break;
    541 		case FORMAT_A8B8G8R8I_SNORM:
    542 			{
    543 				signed char* abgr = (signed char*)element;
    544 
    545 				r = max(abgr[0] * (1.0f / 0x7F), -1.0f);
    546 				g = max(abgr[1] * (1.0f / 0x7F), -1.0f);
    547 				b = max(abgr[2] * (1.0f / 0x7F), -1.0f);
    548 				a = max(abgr[3] * (1.0f / 0x7F), -1.0f);
    549 			}
    550 			break;
    551 		case FORMAT_A8B8G8R8:
    552 		case FORMAT_SRGB8_A8:
    553 			{
    554 				unsigned int abgr = *(unsigned int*)element;
    555 
    556 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
    557 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
    558 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
    559 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
    560 			}
    561 			break;
    562 		case FORMAT_A8B8G8R8I:
    563 			{
    564 				signed char* abgr = (signed char*)element;
    565 
    566 				r = abgr[0];
    567 				g = abgr[1];
    568 				b = abgr[2];
    569 				a = abgr[3];
    570 			}
    571 			break;
    572 		case FORMAT_A8B8G8R8UI:
    573 			{
    574 				unsigned char* abgr = (unsigned char*)element;
    575 
    576 				r = abgr[0];
    577 				g = abgr[1];
    578 				b = abgr[2];
    579 				a = abgr[3];
    580 			}
    581 			break;
    582 		case FORMAT_X8B8G8R8I_SNORM:
    583 			{
    584 				signed char* bgr = (signed char*)element;
    585 
    586 				r = max(bgr[0] * (1.0f / 0x7F), -1.0f);
    587 				g = max(bgr[1] * (1.0f / 0x7F), -1.0f);
    588 				b = max(bgr[2] * (1.0f / 0x7F), -1.0f);
    589 			}
    590 			break;
    591 		case FORMAT_X8B8G8R8:
    592 		case FORMAT_SRGB8_X8:
    593 			{
    594 				unsigned int xbgr = *(unsigned int*)element;
    595 
    596 				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
    597 				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
    598 				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
    599 			}
    600 			break;
    601 		case FORMAT_X8B8G8R8I:
    602 			{
    603 				signed char* bgr = (signed char*)element;
    604 
    605 				r = bgr[0];
    606 				g = bgr[1];
    607 				b = bgr[2];
    608 			}
    609 			break;
    610 		case FORMAT_X8B8G8R8UI:
    611 			{
    612 				unsigned char* bgr = (unsigned char*)element;
    613 
    614 				r = bgr[0];
    615 				g = bgr[1];
    616 				b = bgr[2];
    617 			}
    618 			break;
    619 		case FORMAT_G8R8I_SNORM:
    620 			{
    621 				signed char* gr = (signed char*)element;
    622 
     623 				r = max(gr[0] * (1.0f / 0x7F), -1.0f);
     624 				g = max(gr[1] * (1.0f / 0x7F), -1.0f);
    625 			}
    626 			break;
    627 		case FORMAT_G8R8:
    628 			{
    629 				unsigned short gr = *(unsigned short*)element;
    630 
    631 				g = (gr & 0xFF00) * (1.0f / 0xFF00);
    632 				r = (gr & 0x00FF) * (1.0f / 0x00FF);
    633 			}
    634 			break;
    635 		case FORMAT_G8R8I:
    636 			{
    637 				signed char* gr = (signed char*)element;
    638 
    639 				r = gr[0];
    640 				g = gr[1];
    641 			}
    642 			break;
    643 		case FORMAT_G8R8UI:
    644 			{
    645 				unsigned char* gr = (unsigned char*)element;
    646 
    647 				r = gr[0];
    648 				g = gr[1];
    649 			}
    650 			break;
    651 		case FORMAT_R16I:
    652 			r = *((short*)element);
    653 			break;
    654 		case FORMAT_R16UI:
    655 			r = *((unsigned short*)element);
    656 			break;
    657 		case FORMAT_G16R16I:
    658 			{
    659 				short* gr = (short*)element;
    660 
    661 				r = gr[0];
    662 				g = gr[1];
    663 			}
    664 			break;
    665 		case FORMAT_G16R16:
    666 			{
    667 				unsigned int gr = *(unsigned int*)element;
    668 
    669 				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
    670 				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
    671 			}
    672 			break;
    673 		case FORMAT_G16R16UI:
    674 			{
    675 				unsigned short* gr = (unsigned short*)element;
    676 
    677 				r = gr[0];
    678 				g = gr[1];
    679 			}
    680 			break;
    681 		case FORMAT_A2R10G10B10:
    682 			{
    683 				unsigned int argb = *(unsigned int*)element;
    684 
    685 				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
    686 				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
    687 				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
    688 				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
    689 			}
    690 			break;
    691 		case FORMAT_A2B10G10R10:
    692 			{
    693 				unsigned int abgr = *(unsigned int*)element;
    694 
    695 				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
    696 				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
    697 				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
    698 				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
    699 			}
    700 			break;
    701 		case FORMAT_A16B16G16R16I:
    702 			{
    703 				short* abgr = (short*)element;
    704 
    705 				r = abgr[0];
    706 				g = abgr[1];
    707 				b = abgr[2];
    708 				a = abgr[3];
    709 			}
    710 			break;
    711 		case FORMAT_A16B16G16R16:
    712 			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
    713 			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
    714 			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
    715 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
    716 			break;
    717 		case FORMAT_A16B16G16R16UI:
    718 			{
    719 				unsigned short* abgr = (unsigned short*)element;
    720 
    721 				r = abgr[0];
    722 				g = abgr[1];
    723 				b = abgr[2];
    724 				a = abgr[3];
    725 			}
    726 			break;
    727 		case FORMAT_X16B16G16R16I:
    728 			{
    729 				short* bgr = (short*)element;
    730 
    731 				r = bgr[0];
    732 				g = bgr[1];
    733 				b = bgr[2];
    734 			}
    735 			break;
    736 		case FORMAT_X16B16G16R16UI:
    737 			{
    738 				unsigned short* bgr = (unsigned short*)element;
    739 
    740 				r = bgr[0];
    741 				g = bgr[1];
    742 				b = bgr[2];
    743 			}
    744 			break;
    745 		case FORMAT_A32B32G32R32I:
    746 			{
    747 				int* abgr = (int*)element;
    748 
    749 				r = static_cast<float>(abgr[0]);
    750 				g = static_cast<float>(abgr[1]);
    751 				b = static_cast<float>(abgr[2]);
    752 				a = static_cast<float>(abgr[3]);
    753 			}
    754 			break;
    755 		case FORMAT_A32B32G32R32UI:
    756 			{
    757 				unsigned int* abgr = (unsigned int*)element;
    758 
    759 				r = static_cast<float>(abgr[0]);
    760 				g = static_cast<float>(abgr[1]);
    761 				b = static_cast<float>(abgr[2]);
    762 				a = static_cast<float>(abgr[3]);
    763 			}
    764 			break;
    765 		case FORMAT_X32B32G32R32I:
    766 			{
    767 				int* bgr = (int*)element;
    768 
    769 				r = static_cast<float>(bgr[0]);
    770 				g = static_cast<float>(bgr[1]);
    771 				b = static_cast<float>(bgr[2]);
    772 			}
    773 			break;
    774 		case FORMAT_X32B32G32R32UI:
    775 			{
    776 				unsigned int* bgr = (unsigned int*)element;
    777 
    778 				r = static_cast<float>(bgr[0]);
    779 				g = static_cast<float>(bgr[1]);
    780 				b = static_cast<float>(bgr[2]);
    781 			}
    782 			break;
    783 		case FORMAT_G32R32I:
    784 			{
    785 				int* gr = (int*)element;
    786 
    787 				r = static_cast<float>(gr[0]);
    788 				g = static_cast<float>(gr[1]);
    789 			}
    790 			break;
    791 		case FORMAT_G32R32UI:
    792 			{
    793 				unsigned int* gr = (unsigned int*)element;
    794 
    795 				r = static_cast<float>(gr[0]);
    796 				g = static_cast<float>(gr[1]);
    797 			}
    798 			break;
    799 		case FORMAT_R32I:
    800 			r = static_cast<float>(*((int*)element));
    801 			break;
    802 		case FORMAT_R32UI:
    803 			r = static_cast<float>(*((unsigned int*)element));
    804 			break;
    805 		case FORMAT_V8U8:
    806 			{
    807 				unsigned short vu = *(unsigned short*)element;
    808 
    809 				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
    810 				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
    811 			}
    812 			break;
    813 		case FORMAT_L6V5U5:
    814 			{
    815 				unsigned short lvu = *(unsigned short*)element;
    816 
    817 				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
    818 				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
    819 				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
    820 			}
    821 			break;
    822 		case FORMAT_Q8W8V8U8:
    823 			{
    824 				unsigned int qwvu = *(unsigned int*)element;
    825 
    826 				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
    827 				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
    828 				b = ((int)(qwvu & 0x00FF0000) << 8)  * (1.0f / 0x7F000000);
    829 				a = ((int)(qwvu & 0xFF000000) << 0)  * (1.0f / 0x7F000000);
    830 			}
    831 			break;
    832 		case FORMAT_X8L8V8U8:
    833 			{
    834 				unsigned int xlvu = *(unsigned int*)element;
    835 
    836 				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
    837 				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
    838 				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
    839 			}
    840 			break;
    841 		case FORMAT_R8G8B8:
    842 			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
    843 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
    844 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
    845 			break;
    846 		case FORMAT_B8G8R8:
    847 			r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
    848 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
    849 			b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
    850 			break;
    851 		case FORMAT_V16U16:
    852 			{
    853 				unsigned int vu = *(unsigned int*)element;
    854 
    855 				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
    856 				g = ((int)(vu & 0xFFFF0000) << 0)  * (1.0f / 0x7FFF0000);
    857 			}
    858 			break;
    859 		case FORMAT_A2W10V10U10:
    860 			{
    861 				unsigned int awvu = *(unsigned int*)element;
    862 
    863 				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
    864 				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
    865 				b = ((int)(awvu & 0x3FF00000) << 2)  * (1.0f / 0x7FC00000);
    866 				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
    867 			}
    868 			break;
    869 		case FORMAT_A16W16V16U16:
    870 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
    871 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
    872 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
    873 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
    874 			break;
    875 		case FORMAT_Q16W16V16U16:
    876 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
    877 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
    878 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
    879 			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
    880 			break;
    881 		case FORMAT_L8:
    882 			r =
    883 			g =
    884 			b = *(unsigned char*)element * (1.0f / 0xFF);
    885 			break;
    886 		case FORMAT_A4L4:
    887 			{
    888 				unsigned char al = *(unsigned char*)element;
    889 
    890 				r =
    891 				g =
    892 				b = (al & 0x0F) * (1.0f / 0x0F);
    893 				a = (al & 0xF0) * (1.0f / 0xF0);
    894 			}
    895 			break;
    896 		case FORMAT_L16:
    897 			r =
    898 			g =
    899 			b = *(unsigned short*)element * (1.0f / 0xFFFF);
    900 			break;
    901 		case FORMAT_A8L8:
    902 			r =
    903 			g =
    904 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
    905 			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
    906 			break;
    907 		case FORMAT_L16F:
    908 			r =
    909 			g =
    910 			b = *(half*)element;
    911 			break;
    912 		case FORMAT_A16L16F:
    913 			r =
    914 			g =
    915 			b = ((half*)element)[0];
    916 			a = ((half*)element)[1];
    917 			break;
    918 		case FORMAT_L32F:
    919 			r =
    920 			g =
    921 			b = *(float*)element;
    922 			break;
    923 		case FORMAT_A32L32F:
    924 			r =
    925 			g =
    926 			b = ((float*)element)[0];
    927 			a = ((float*)element)[1];
    928 			break;
    929 		case FORMAT_A16F:
    930 			a = *(half*)element;
    931 			break;
    932 		case FORMAT_R16F:
    933 			r = *(half*)element;
    934 			break;
    935 		case FORMAT_G16R16F:
    936 			r = ((half*)element)[0];
    937 			g = ((half*)element)[1];
    938 			break;
    939 		case FORMAT_B16G16R16F:
    940 			r = ((half*)element)[0];
    941 			g = ((half*)element)[1];
    942 			b = ((half*)element)[2];
    943 			break;
    944 		case FORMAT_A16B16G16R16F:
    945 			r = ((half*)element)[0];
    946 			g = ((half*)element)[1];
    947 			b = ((half*)element)[2];
    948 			a = ((half*)element)[3];
    949 			break;
    950 		case FORMAT_A32F:
    951 			a = *(float*)element;
    952 			break;
    953 		case FORMAT_R32F:
    954 			r = *(float*)element;
    955 			break;
    956 		case FORMAT_G32R32F:
    957 			r = ((float*)element)[0];
    958 			g = ((float*)element)[1];
    959 			break;
    960 		case FORMAT_X32B32G32R32F:
    961 		case FORMAT_B32G32R32F:
    962 			r = ((float*)element)[0];
    963 			g = ((float*)element)[1];
    964 			b = ((float*)element)[2];
    965 			break;
    966 		case FORMAT_A32B32G32R32F:
    967 			r = ((float*)element)[0];
    968 			g = ((float*)element)[1];
    969 			b = ((float*)element)[2];
    970 			a = ((float*)element)[3];
    971 			break;
    972 		case FORMAT_D32F:
    973 		case FORMAT_D32F_LOCKABLE:
    974 		case FORMAT_D32FS8_TEXTURE:
    975 		case FORMAT_D32FS8_SHADOW:
    976 			r = *(float*)element;
    977 			g = r;
    978 			b = r;
    979 			a = r;
    980 			break;
    981 		case FORMAT_D32F_COMPLEMENTARY:
    982 			r = 1.0f - *(float*)element;
    983 			g = r;
    984 			b = r;
    985 			a = r;
    986 			break;
    987 		case FORMAT_S8:
    988 			r = *(unsigned char*)element * (1.0f / 0xFF);
    989 			break;
    990 		default:
    991 			ASSERT(false);
    992 		}
    993 
    994 	//	if(sRGB)
    995 	//	{
    996 	//		r = sRGBtoLinear(r);
    997 	//		g = sRGBtoLinear(g);
    998 	//		b = sRGBtoLinear(b);
    999 	//	}
   1000 
   1001 		return Color<float>(r, g, b, a);
   1002 	}
   1003 
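	// Offset by half a texel so the coordinates refer to texel centers, then blend the
	// eight nearest texels (trilinear interpolation).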
   1004 	Color<float> Surface::Buffer::sample(float x, float y, float z) const
   1005 	{
   1006 		x -= 0.5f;
   1007 		y -= 0.5f;
   1008 		z -= 0.5f;
   1009 
   1010 		int x0 = clamp((int)x, 0, width - 1);
   1011 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
   1012 
   1013 		int y0 = clamp((int)y, 0, height - 1);
   1014 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
   1015 
   1016 		int z0 = clamp((int)z, 0, depth - 1);
   1017 		int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
   1018 
   1019 		Color<float> c000 = read(x0, y0, z0);
   1020 		Color<float> c100 = read(x1, y0, z0);
   1021 		Color<float> c010 = read(x0, y1, z0);
   1022 		Color<float> c110 = read(x1, y1, z0);
   1023 		Color<float> c001 = read(x0, y0, z1);
   1024 		Color<float> c101 = read(x1, y0, z1);
   1025 		Color<float> c011 = read(x0, y1, z1);
   1026 		Color<float> c111 = read(x1, y1, z1);
   1027 
   1028 		float fx = x - x0;
   1029 		float fy = y - y0;
   1030 		float fz = z - z0;
   1031 
   1032 		c000 *= (1 - fx) * (1 - fy) * (1 - fz);
   1033 		c100 *= fx * (1 - fy) * (1 - fz);
   1034 		c010 *= (1 - fx) * fy * (1 - fz);
   1035 		c110 *= fx * fy * (1 - fz);
   1036 		c001 *= (1 - fx) * (1 - fy) * fz;
   1037 		c101 *= fx * (1 - fy) * fz;
   1038 		c011 *= (1 - fx) * fy * fz;
   1039 		c111 *= fx * fy * fz;
   1040 
   1041 		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
   1042 	}
   1043 
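	// Bilinear interpolation over the four nearest texels.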
   1044 	Color<float> Surface::Buffer::sample(float x, float y) const
   1045 	{
   1046 		x -= 0.5f;
   1047 		y -= 0.5f;
   1048 
   1049 		int x0 = clamp((int)x, 0, width - 1);
   1050 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
   1051 
   1052 		int y0 = clamp((int)y, 0, height - 1);
   1053 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
   1054 
   1055 		Color<float> c00 = read(x0, y0);
   1056 		Color<float> c10 = read(x1, y0);
   1057 		Color<float> c01 = read(x0, y1);
   1058 		Color<float> c11 = read(x1, y1);
   1059 
   1060 		float fx = x - x0;
   1061 		float fy = y - y0;
   1062 
   1063 		c00 *= (1 - fx) * (1 - fy);
   1064 		c10 *= fx * (1 - fy);
   1065 		c01 *= (1 - fx) * fy;
   1066 		c11 *= fx * fy;
   1067 
   1068 		return c00 + c10 + c01 + c11;
   1069 	}
   1070 
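	// Return a pointer to the addressed texel, or to the containing block for block-compressed
	// formats. Write locks mark the buffer dirty.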
   1071 	void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
   1072 	{
   1073 		this->lock = lock;
   1074 
   1075 		switch(lock)
   1076 		{
   1077 		case LOCK_UNLOCKED:
   1078 		case LOCK_READONLY:
   1079 			break;
   1080 		case LOCK_WRITEONLY:
   1081 		case LOCK_READWRITE:
   1082 		case LOCK_DISCARD:
   1083 			dirty = true;
   1084 			break;
   1085 		default:
   1086 			ASSERT(false);
   1087 		}
   1088 
   1089 		if(buffer)
   1090 		{
   1091 			switch(format)
   1092 			{
   1093 			#if S3TC_SUPPORT
   1094 			case FORMAT_DXT1:
   1095 			#endif
   1096 			case FORMAT_ATI1:
   1097 			case FORMAT_ETC1:
   1098 			case FORMAT_R11_EAC:
   1099 			case FORMAT_SIGNED_R11_EAC:
   1100 			case FORMAT_RGB8_ETC2:
   1101 			case FORMAT_SRGB8_ETC2:
   1102 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1103 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1104 				return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
   1105 			case FORMAT_RG11_EAC:
   1106 			case FORMAT_SIGNED_RG11_EAC:
   1107 			case FORMAT_RGBA8_ETC2_EAC:
   1108 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   1109 			case FORMAT_RGBA_ASTC_4x4_KHR:
   1110 			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   1111 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
   1112 			case FORMAT_RGBA_ASTC_5x4_KHR:
   1113 			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   1114 				return (unsigned char*)buffer + 16 * (x / 5) + (y / 4) * pitchB + z * sliceB;
   1115 			case FORMAT_RGBA_ASTC_5x5_KHR:
   1116 			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   1117 				return (unsigned char*)buffer + 16 * (x / 5) + (y / 5) * pitchB + z * sliceB;
   1118 			case FORMAT_RGBA_ASTC_6x5_KHR:
   1119 			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   1120 				return (unsigned char*)buffer + 16 * (x / 6) + (y / 5) * pitchB + z * sliceB;
   1121 			case FORMAT_RGBA_ASTC_6x6_KHR:
   1122 			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   1123 				return (unsigned char*)buffer + 16 * (x / 6) + (y / 6) * pitchB + z * sliceB;
   1124 			case FORMAT_RGBA_ASTC_8x5_KHR:
   1125 			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   1126 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 5) * pitchB + z * sliceB;
   1127 			case FORMAT_RGBA_ASTC_8x6_KHR:
   1128 			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   1129 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 6) * pitchB + z * sliceB;
   1130 			case FORMAT_RGBA_ASTC_8x8_KHR:
   1131 			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   1132 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 8) * pitchB + z * sliceB;
   1133 			case FORMAT_RGBA_ASTC_10x5_KHR:
   1134 			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   1135 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 5) * pitchB + z * sliceB;
   1136 			case FORMAT_RGBA_ASTC_10x6_KHR:
   1137 			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   1138 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 6) * pitchB + z * sliceB;
   1139 			case FORMAT_RGBA_ASTC_10x8_KHR:
   1140 			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   1141 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 8) * pitchB + z * sliceB;
   1142 			case FORMAT_RGBA_ASTC_10x10_KHR:
   1143 			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   1144 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 10) * pitchB + z * sliceB;
   1145 			case FORMAT_RGBA_ASTC_12x10_KHR:
   1146 			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   1147 				return (unsigned char*)buffer + 16 * (x / 12) + (y / 10) * pitchB + z * sliceB;
   1148 			case FORMAT_RGBA_ASTC_12x12_KHR:
   1149 			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   1150 				return (unsigned char*)buffer + 16 * (x / 12) + (y / 12) * pitchB + z * sliceB;
   1151 			#if S3TC_SUPPORT
   1152 			case FORMAT_DXT3:
   1153 			case FORMAT_DXT5:
   1154 			#endif
   1155 			case FORMAT_ATI2:
   1156 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
   1157 			default:
   1158 				return (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
   1159 			}
   1160 		}
   1161 
   1162 		return 0;
   1163 	}
   1164 
   1165 	void Surface::Buffer::unlockRect()
   1166 	{
   1167 		lock = LOCK_UNLOCKED;
   1168 	}
   1169 
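	// Concrete Surface subclass; the overrides simply forward to the base class implementations.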
   1170 	class SurfaceImplementation : public Surface
   1171 	{
   1172 	public:
   1173 		SurfaceImplementation(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
   1174 			: Surface(width, height, depth, format, pixels, pitch, slice) {}
   1175 		SurfaceImplementation(Resource *texture, int width, int height, int depth, Format format, bool lockable, bool renderTarget, int pitchP = 0)
   1176 			: Surface(texture, width, height, depth, format, lockable, renderTarget, pitchP) {}
    1177 		~SurfaceImplementation() override {}
   1178 
   1179 		void *lockInternal(int x, int y, int z, Lock lock, Accessor client) override
   1180 		{
   1181 			return Surface::lockInternal(x, y, z, lock, client);
   1182 		}
   1183 
   1184 		void unlockInternal() override
   1185 		{
   1186 			Surface::unlockInternal();
   1187 		}
   1188 	};
   1189 
   1190 	Surface *Surface::create(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
   1191 	{
   1192 		return new SurfaceImplementation(width, height, depth, format, pixels, pitch, slice);
   1193 	}
   1194 
   1195 	Surface *Surface::create(Resource *texture, int width, int height, int depth, Format format, bool lockable, bool renderTarget, int pitchPprovided)
   1196 	{
   1197 		return new SurfaceImplementation(texture, width, height, depth, format, lockable, renderTarget, pitchPprovided);
   1198 	}
   1199 
   1200 	Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
   1201 	{
   1202 		resource = new Resource(0);
   1203 		hasParent = false;
   1204 		ownExternal = false;
   1205 		depth = max(1, depth);
   1206 
   1207 		external.buffer = pixels;
   1208 		external.width = width;
   1209 		external.height = height;
   1210 		external.depth = depth;
   1211 		external.format = format;
   1212 		external.bytes = bytes(external.format);
   1213 		external.pitchB = pitch;
   1214 		external.pitchP = external.bytes ? pitch / external.bytes : 0;
   1215 		external.sliceB = slice;
   1216 		external.sliceP = external.bytes ? slice / external.bytes : 0;
   1217 		external.lock = LOCK_UNLOCKED;
   1218 		external.dirty = true;
   1219 
   1220 		internal.buffer = 0;
   1221 		internal.width = width;
   1222 		internal.height = height;
   1223 		internal.depth = depth;
   1224 		internal.format = selectInternalFormat(format);
   1225 		internal.bytes = bytes(internal.format);
   1226 		internal.pitchB = pitchB(internal.width, internal.format, false);
   1227 		internal.pitchP = pitchP(internal.width, internal.format, false);
   1228 		internal.sliceB = sliceB(internal.width, internal.height, internal.format, false);
   1229 		internal.sliceP = sliceP(internal.width, internal.height, internal.format, false);
   1230 		internal.lock = LOCK_UNLOCKED;
   1231 		internal.dirty = false;
   1232 
   1233 		stencil.buffer = 0;
   1234 		stencil.width = width;
   1235 		stencil.height = height;
   1236 		stencil.depth = depth;
   1237 		stencil.format = FORMAT_S8;
   1238 		stencil.bytes = bytes(stencil.format);
   1239 		stencil.pitchB = pitchB(stencil.width, stencil.format, false);
   1240 		stencil.pitchP = pitchP(stencil.width, stencil.format, false);
   1241 		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, false);
   1242 		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, false);
   1243 		stencil.lock = LOCK_UNLOCKED;
   1244 		stencil.dirty = false;
   1245 
   1246 		dirtyMipmaps = true;
   1247 		paletteUsed = 0;
   1248 	}
   1249 
   1250 	Surface::Surface(Resource *texture, int width, int height, int depth, Format format, bool lockable, bool renderTarget, int pitchPprovided) : lockable(lockable), renderTarget(renderTarget)
   1251 	{
   1252 		resource = texture ? texture : new Resource(0);
   1253 		hasParent = texture != 0;
   1254 		ownExternal = true;
   1255 		depth = max(1, depth);
   1256 
   1257 		external.buffer = 0;
   1258 		external.width = width;
   1259 		external.height = height;
   1260 		external.depth = depth;
   1261 		external.format = format;
   1262 		external.bytes = bytes(external.format);
   1263 		external.pitchB = pitchB(external.width, external.format, renderTarget && !texture);
   1264 		external.pitchP = pitchP(external.width, external.format, renderTarget && !texture);
   1265 		external.sliceB = sliceB(external.width, external.height, external.format, renderTarget && !texture);
   1266 		external.sliceP = sliceP(external.width, external.height, external.format, renderTarget && !texture);
   1267 		external.lock = LOCK_UNLOCKED;
   1268 		external.dirty = false;
   1269 
   1270 		internal.buffer = 0;
   1271 		internal.width = width;
   1272 		internal.height = height;
   1273 		internal.depth = depth;
   1274 		internal.format = selectInternalFormat(format);
   1275 		internal.bytes = bytes(internal.format);
   1276 		internal.pitchB = !pitchPprovided ? pitchB(internal.width, internal.format, renderTarget) : pitchPprovided * internal.bytes;
   1277 		internal.pitchP = !pitchPprovided ? pitchP(internal.width, internal.format, renderTarget) : pitchPprovided;
   1278 		internal.sliceB = sliceB(internal.width, internal.height, internal.format, renderTarget);
   1279 		internal.sliceP = sliceP(internal.width, internal.height, internal.format, renderTarget);
   1280 		internal.lock = LOCK_UNLOCKED;
   1281 		internal.dirty = false;
   1282 
   1283 		stencil.buffer = 0;
   1284 		stencil.width = width;
   1285 		stencil.height = height;
   1286 		stencil.depth = depth;
   1287 		stencil.format = FORMAT_S8;
   1288 		stencil.bytes = bytes(stencil.format);
   1289 		stencil.pitchB = pitchB(stencil.width, stencil.format, renderTarget);
   1290 		stencil.pitchP = pitchP(stencil.width, stencil.format, renderTarget);
   1291 		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, renderTarget);
   1292 		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, renderTarget);
   1293 		stencil.lock = LOCK_UNLOCKED;
   1294 		stencil.dirty = false;
   1295 
   1296 		dirtyMipmaps = true;
   1297 		paletteUsed = 0;
   1298 	}
   1299 
   1300 	Surface::~Surface()
   1301 	{
   1302 		// sync() must be called before this destructor to ensure all locks have been released.
   1303 		// We can't call it here because the parent resource may already have been destroyed.
   1304 		ASSERT(isUnlocked());
   1305 
   1306 		if(!hasParent)
   1307 		{
   1308 			resource->destruct();
   1309 		}
   1310 
   1311 		if(ownExternal)
   1312 		{
   1313 			deallocate(external.buffer);
   1314 		}
   1315 
   1316 		if(internal.buffer != external.buffer)
   1317 		{
   1318 			deallocate(internal.buffer);
   1319 		}
   1320 
   1321 		deallocate(stencil.buffer);
   1322 
   1323 		external.buffer = 0;
   1324 		internal.buffer = 0;
   1325 		stencil.buffer = 0;
   1326 	}
   1327 
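	// Lock the external (client-format) buffer: allocate it on first use (or alias the internal
	// buffer when the formats match), and refresh it from the internal buffer if that one is newer.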
   1328 	void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
   1329 	{
   1330 		resource->lock(client);
   1331 
   1332 		if(!external.buffer)
   1333 		{
   1334 			if(internal.buffer && identicalFormats())
   1335 			{
   1336 				external.buffer = internal.buffer;
   1337 			}
   1338 			else
   1339 			{
   1340 				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.format);
   1341 			}
   1342 		}
   1343 
   1344 		if(internal.dirty)
   1345 		{
   1346 			if(lock != LOCK_DISCARD)
   1347 			{
   1348 				update(external, internal);
   1349 			}
   1350 
   1351 			internal.dirty = false;
   1352 		}
   1353 
   1354 		switch(lock)
   1355 		{
   1356 		case LOCK_READONLY:
   1357 			break;
   1358 		case LOCK_WRITEONLY:
   1359 		case LOCK_READWRITE:
   1360 		case LOCK_DISCARD:
   1361 			dirtyMipmaps = true;
   1362 			break;
   1363 		default:
   1364 			ASSERT(false);
   1365 		}
   1366 
   1367 		return external.lockRect(x, y, z, lock);
   1368 	}
   1369 
   1370 	void Surface::unlockExternal()
   1371 	{
   1372 		external.unlockRect();
   1373 
   1374 		resource->unlock();
   1375 	}
   1376 
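	// Lock the internal (render-format) buffer: allocate or alias it on demand, and convert from the
	// external buffer when it is out of date or the palette has changed. Public read-only locks
	// trigger a resolve() first.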
   1377 	void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
   1378 	{
   1379 		if(lock != LOCK_UNLOCKED)
   1380 		{
   1381 			resource->lock(client);
   1382 		}
   1383 
   1384 		if(!internal.buffer)
   1385 		{
   1386 			if(external.buffer && identicalFormats())
   1387 			{
   1388 				internal.buffer = external.buffer;
   1389 			}
   1390 			else
   1391 			{
   1392 				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.format);
   1393 			}
   1394 		}
   1395 
   1396 		// FIXME: WHQL requires conversion to lower external precision and back
   1397 		if(logPrecision >= WHQL)
   1398 		{
   1399 			if(internal.dirty && renderTarget && internal.format != external.format)
   1400 			{
   1401 				if(lock != LOCK_DISCARD)
   1402 				{
   1403 					switch(external.format)
   1404 					{
   1405 					case FORMAT_R3G3B2:
   1406 					case FORMAT_A8R3G3B2:
   1407 					case FORMAT_A1R5G5B5:
   1408 					case FORMAT_A2R10G10B10:
   1409 					case FORMAT_A2B10G10R10:
   1410 						lockExternal(0, 0, 0, LOCK_READWRITE, client);
   1411 						unlockExternal();
   1412 						break;
   1413 					default:
   1414 						// Difference passes WHQL
   1415 						break;
   1416 					}
   1417 				}
   1418 			}
   1419 		}
   1420 
   1421 		if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
   1422 		{
   1423 			if(lock != LOCK_DISCARD)
   1424 			{
   1425 				update(internal, external);
   1426 			}
   1427 
   1428 			external.dirty = false;
   1429 			paletteUsed = Surface::paletteID;
   1430 		}
   1431 
   1432 		switch(lock)
   1433 		{
   1434 		case LOCK_UNLOCKED:
   1435 		case LOCK_READONLY:
   1436 			break;
   1437 		case LOCK_WRITEONLY:
   1438 		case LOCK_READWRITE:
   1439 		case LOCK_DISCARD:
   1440 			dirtyMipmaps = true;
   1441 			break;
   1442 		default:
   1443 			ASSERT(false);
   1444 		}
   1445 
   1446 		if(lock == LOCK_READONLY && client == PUBLIC)
   1447 		{
   1448 			resolve();
   1449 		}
   1450 
   1451 		return internal.lockRect(x, y, z, lock);
   1452 	}
   1453 
   1454 	void Surface::unlockInternal()
   1455 	{
   1456 		internal.unlockRect();
   1457 
   1458 		resource->unlock();
   1459 	}
   1460 
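	// Lock the separate stencil plane (stored as S8), allocating it on first use.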
   1461 	void *Surface::lockStencil(int x, int y, int front, Accessor client)
   1462 	{
   1463 		resource->lock(client);
   1464 
   1465 		if(!stencil.buffer)
   1466 		{
   1467 			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.format);
   1468 		}
   1469 
   1470 		return stencil.lockRect(x, y, front, LOCK_READWRITE);   // FIXME
   1471 	}
   1472 
   1473 	void Surface::unlockStencil()
   1474 	{
   1475 		stencil.unlockRect();
   1476 
   1477 		resource->unlock();
   1478 	}
   1479 
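	// Size in bytes of one pixel for uncompressed formats. For block-compressed formats the value is
	// per column of four pixels (see the per-case comments); ASTC block sizes are still unhandled.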
   1480 	int Surface::bytes(Format format)
   1481 	{
   1482 		switch(format)
   1483 		{
   1484 		case FORMAT_NULL:				return 0;
   1485 		case FORMAT_P8:					return 1;
   1486 		case FORMAT_A8P8:				return 2;
   1487 		case FORMAT_A8:					return 1;
   1488 		case FORMAT_R8I:				return 1;
   1489 		case FORMAT_R8:					return 1;
   1490 		case FORMAT_R3G3B2:				return 1;
   1491 		case FORMAT_R16I:				return 2;
   1492 		case FORMAT_R16UI:				return 2;
   1493 		case FORMAT_A8R3G3B2:			return 2;
   1494 		case FORMAT_R5G6B5:				return 2;
   1495 		case FORMAT_A1R5G5B5:			return 2;
   1496 		case FORMAT_X1R5G5B5:			return 2;
   1497 		case FORMAT_R5G5B5A1:           return 2;
   1498 		case FORMAT_X4R4G4B4:			return 2;
   1499 		case FORMAT_A4R4G4B4:			return 2;
   1500 		case FORMAT_R4G4B4A4:           return 2;
   1501 		case FORMAT_R8G8B8:				return 3;
   1502 		case FORMAT_B8G8R8:             return 3;
   1503 		case FORMAT_R32I:				return 4;
   1504 		case FORMAT_R32UI:				return 4;
   1505 		case FORMAT_X8R8G8B8:			return 4;
   1506 	//	case FORMAT_X8G8R8B8Q:			return 4;
   1507 		case FORMAT_A8R8G8B8:			return 4;
   1508 	//	case FORMAT_A8G8R8B8Q:			return 4;
   1509 		case FORMAT_X8B8G8R8I:			return 4;
   1510 		case FORMAT_X8B8G8R8:			return 4;
   1511 		case FORMAT_SRGB8_X8:			return 4;
   1512 		case FORMAT_SRGB8_A8:			return 4;
   1513 		case FORMAT_A8B8G8R8I:			return 4;
   1514 		case FORMAT_R8UI:				return 1;
   1515 		case FORMAT_G8R8UI:				return 2;
   1516 		case FORMAT_X8B8G8R8UI:			return 4;
   1517 		case FORMAT_A8B8G8R8UI:			return 4;
   1518 		case FORMAT_A8B8G8R8:			return 4;
   1519 		case FORMAT_R8I_SNORM:			return 1;
   1520 		case FORMAT_G8R8I_SNORM:		return 2;
   1521 		case FORMAT_X8B8G8R8I_SNORM:	return 4;
   1522 		case FORMAT_A8B8G8R8I_SNORM:	return 4;
   1523 		case FORMAT_A2R10G10B10:		return 4;
   1524 		case FORMAT_A2B10G10R10:		return 4;
   1525 		case FORMAT_G8R8I:				return 2;
   1526 		case FORMAT_G8R8:				return 2;
   1527 		case FORMAT_G16R16I:			return 4;
   1528 		case FORMAT_G16R16UI:			return 4;
   1529 		case FORMAT_G16R16:				return 4;
   1530 		case FORMAT_G32R32I:			return 8;
   1531 		case FORMAT_G32R32UI:			return 8;
   1532 		case FORMAT_X16B16G16R16I:		return 8;
   1533 		case FORMAT_X16B16G16R16UI:		return 8;
   1534 		case FORMAT_A16B16G16R16I:		return 8;
   1535 		case FORMAT_A16B16G16R16UI:		return 8;
   1536 		case FORMAT_A16B16G16R16:		return 8;
   1537 		case FORMAT_X32B32G32R32I:		return 16;
   1538 		case FORMAT_X32B32G32R32UI:		return 16;
   1539 		case FORMAT_A32B32G32R32I:		return 16;
   1540 		case FORMAT_A32B32G32R32UI:		return 16;
   1541 		// Compressed formats
   1542 		#if S3TC_SUPPORT
   1543 		case FORMAT_DXT1:				return 2;   // Column of four pixels
   1544 		case FORMAT_DXT3:				return 4;   // Column of four pixels
   1545 		case FORMAT_DXT5:				return 4;   // Column of four pixels
   1546 		#endif
   1547 		case FORMAT_ATI1:				return 2;   // Column of four pixels
   1548 		case FORMAT_ATI2:				return 4;   // Column of four pixels
   1549 		case FORMAT_ETC1:				return 2;   // Column of four pixels
   1550 		case FORMAT_R11_EAC:			return 2;
   1551 		case FORMAT_SIGNED_R11_EAC:		return 2;
   1552 		case FORMAT_RG11_EAC:			return 4;
   1553 		case FORMAT_SIGNED_RG11_EAC:	return 4;
   1554 		case FORMAT_RGB8_ETC2:			return 2;
   1555 		case FORMAT_SRGB8_ETC2:			return 2;
   1556 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
   1557 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
   1558 		case FORMAT_RGBA8_ETC2_EAC:			return 4;
   1559 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:	return 4;
   1560 		case FORMAT_RGBA_ASTC_4x4_KHR:
   1561 		case FORMAT_RGBA_ASTC_5x4_KHR:
   1562 		case FORMAT_RGBA_ASTC_5x5_KHR:
   1563 		case FORMAT_RGBA_ASTC_6x5_KHR:
   1564 		case FORMAT_RGBA_ASTC_6x6_KHR:
   1565 		case FORMAT_RGBA_ASTC_8x5_KHR:
   1566 		case FORMAT_RGBA_ASTC_8x6_KHR:
   1567 		case FORMAT_RGBA_ASTC_8x8_KHR:
   1568 		case FORMAT_RGBA_ASTC_10x5_KHR:
   1569 		case FORMAT_RGBA_ASTC_10x6_KHR:
   1570 		case FORMAT_RGBA_ASTC_10x8_KHR:
   1571 		case FORMAT_RGBA_ASTC_10x10_KHR:
   1572 		case FORMAT_RGBA_ASTC_12x10_KHR:
   1573 		case FORMAT_RGBA_ASTC_12x12_KHR:
   1574 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   1575 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   1576 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   1577 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   1578 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   1579 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   1580 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   1581 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   1582 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   1583 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   1584 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   1585 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   1586 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   1587 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: return 0; // FIXME
   1588 		// Bumpmap formats
   1589 		case FORMAT_V8U8:				return 2;
   1590 		case FORMAT_L6V5U5:				return 2;
   1591 		case FORMAT_Q8W8V8U8:			return 4;
   1592 		case FORMAT_X8L8V8U8:			return 4;
   1593 		case FORMAT_A2W10V10U10:		return 4;
   1594 		case FORMAT_V16U16:				return 4;
   1595 		case FORMAT_A16W16V16U16:		return 8;
   1596 		case FORMAT_Q16W16V16U16:		return 8;
   1597 		// Luminance formats
   1598 		case FORMAT_L8:					return 1;
   1599 		case FORMAT_A4L4:				return 1;
   1600 		case FORMAT_L16:				return 2;
   1601 		case FORMAT_A8L8:				return 2;
   1602 		case FORMAT_L16F:               return 2;
   1603 		case FORMAT_A16L16F:            return 4;
   1604 		case FORMAT_L32F:               return 4;
   1605 		case FORMAT_A32L32F:            return 8;
   1606 		// Floating-point formats
   1607 		case FORMAT_A16F:				return 2;
   1608 		case FORMAT_R16F:				return 2;
   1609 		case FORMAT_G16R16F:			return 4;
   1610 		case FORMAT_B16G16R16F:			return 6;
   1611 		case FORMAT_A16B16G16R16F:		return 8;
   1612 		case FORMAT_A32F:				return 4;
   1613 		case FORMAT_R32F:				return 4;
   1614 		case FORMAT_G32R32F:			return 8;
   1615 		case FORMAT_B32G32R32F:			return 12;
   1616 		case FORMAT_X32B32G32R32F:		return 16;
   1617 		case FORMAT_A32B32G32R32F:		return 16;
   1618 		// Depth/stencil formats
   1619 		case FORMAT_D16:				return 2;
   1620 		case FORMAT_D32:				return 4;
   1621 		case FORMAT_D24X8:				return 4;
   1622 		case FORMAT_D24S8:				return 4;
   1623 		case FORMAT_D24FS8:				return 4;
   1624 		case FORMAT_D32F:				return 4;
   1625 		case FORMAT_D32F_COMPLEMENTARY:	return 4;
   1626 		case FORMAT_D32F_LOCKABLE:		return 4;
   1627 		case FORMAT_D32FS8_TEXTURE:		return 4;
   1628 		case FORMAT_D32FS8_SHADOW:		return 4;
   1629 		case FORMAT_DF24S8:				return 4;
   1630 		case FORMAT_DF16S8:				return 2;
   1631 		case FORMAT_INTZ:				return 4;
   1632 		case FORMAT_S8:					return 1;
   1633 		case FORMAT_YV12_BT601:         return 1;   // Y plane only
   1634 		case FORMAT_YV12_BT709:         return 1;   // Y plane only
   1635 		case FORMAT_YV12_JFIF:          return 1;   // Y plane only
   1636 		default:
   1637 			ASSERT(false);
   1638 		}
   1639 
   1640 		return 0;
   1641 	}
   1642 
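         	// Row pitch in bytes. Render targets and depth/stencil buffers are padded to an even width.
         	// For most block-compressed formats the pitch covers a full row of blocks (see the per-case comments).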
   1643 	int Surface::pitchB(int width, Format format, bool target)
   1644 	{
   1645 		if(target || isDepth(format) || isStencil(format))
   1646 		{
   1647 			width = align(width, 2);
   1648 		}
   1649 
   1650 		switch(format)
   1651 		{
   1652 		#if S3TC_SUPPORT
   1653 		case FORMAT_DXT1:
   1654 		#endif
   1655 		case FORMAT_ETC1:
   1656 		case FORMAT_R11_EAC:
   1657 		case FORMAT_SIGNED_R11_EAC:
   1658 		case FORMAT_RGB8_ETC2:
   1659 		case FORMAT_SRGB8_ETC2:
   1660 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1661 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
    1662 			return 8 * ((width + 3) / 4);    // 64 bits per 4x4 block, computed per 4 rows
   1663 		case FORMAT_RG11_EAC:
   1664 		case FORMAT_SIGNED_RG11_EAC:
   1665 		case FORMAT_RGBA8_ETC2_EAC:
   1666 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   1667 		case FORMAT_RGBA_ASTC_4x4_KHR:
   1668 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
    1669 			return 16 * ((width + 3) / 4);    // 128 bits per 4x4 block, computed per 4 rows
   1670 		case FORMAT_RGBA_ASTC_5x4_KHR:
   1671 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   1672 		case FORMAT_RGBA_ASTC_5x5_KHR:
   1673 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   1674 			return 16 * ((width + 4) / 5);
   1675 		case FORMAT_RGBA_ASTC_6x5_KHR:
   1676 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   1677 		case FORMAT_RGBA_ASTC_6x6_KHR:
   1678 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   1679 			return 16 * ((width + 5) / 6);
   1680 		case FORMAT_RGBA_ASTC_8x5_KHR:
   1681 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   1682 		case FORMAT_RGBA_ASTC_8x6_KHR:
   1683 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   1684 		case FORMAT_RGBA_ASTC_8x8_KHR:
   1685 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   1686 			return 16 * ((width + 7) / 8);
   1687 		case FORMAT_RGBA_ASTC_10x5_KHR:
   1688 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   1689 		case FORMAT_RGBA_ASTC_10x6_KHR:
   1690 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   1691 		case FORMAT_RGBA_ASTC_10x8_KHR:
   1692 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   1693 		case FORMAT_RGBA_ASTC_10x10_KHR:
   1694 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   1695 			return 16 * ((width + 9) / 10);
   1696 		case FORMAT_RGBA_ASTC_12x10_KHR:
   1697 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   1698 		case FORMAT_RGBA_ASTC_12x12_KHR:
   1699 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   1700 			return 16 * ((width + 11) / 12);
   1701 		#if S3TC_SUPPORT
   1702 		case FORMAT_DXT3:
   1703 		case FORMAT_DXT5:
    1704 			return 16 * ((width + 3) / 4);   // 128 bits per 4x4 block, computed per 4 rows
   1705 		#endif
   1706 		case FORMAT_ATI1:
    1707 			return 2 * ((width + 3) / 4);    // 64 bits per 4x4 block, computed per row
   1708 		case FORMAT_ATI2:
    1709 			return 4 * ((width + 3) / 4);    // 128 bits per 4x4 block, computed per row
   1710 		case FORMAT_YV12_BT601:
   1711 		case FORMAT_YV12_BT709:
   1712 		case FORMAT_YV12_JFIF:
   1713 			return align(width, 16);
   1714 		default:
   1715 			return bytes(format) * width;
   1716 		}
   1717 	}
   1718 
   1719 	int Surface::pitchP(int width, Format format, bool target)
   1720 	{
   1721 		int B = bytes(format);
   1722 
   1723 		return B > 0 ? pitchB(width, format, target) / B : 0;
   1724 	}
   1725 
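         	// Slice pitch in bytes (one full 2D layer). The height is padded to an even value for
         	// render targets and depth/stencil buffers, and divided by the block height for formats
         	// whose row pitch spans a whole block row.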
   1726 	int Surface::sliceB(int width, int height, Format format, bool target)
   1727 	{
   1728 		if(target || isDepth(format) || isStencil(format))
   1729 		{
   1730 			height = ((height + 1) & ~1);
   1731 		}
   1732 
   1733 		switch(format)
   1734 		{
   1735 		#if S3TC_SUPPORT
   1736 		case FORMAT_DXT1:
   1737 		case FORMAT_DXT3:
   1738 		case FORMAT_DXT5:
   1739 		#endif
   1740 		case FORMAT_ETC1:
   1741 		case FORMAT_R11_EAC:
   1742 		case FORMAT_SIGNED_R11_EAC:
   1743 		case FORMAT_RG11_EAC:
   1744 		case FORMAT_SIGNED_RG11_EAC:
   1745 		case FORMAT_RGB8_ETC2:
   1746 		case FORMAT_SRGB8_ETC2:
   1747 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1748 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1749 		case FORMAT_RGBA8_ETC2_EAC:
   1750 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   1751 		case FORMAT_RGBA_ASTC_4x4_KHR:
   1752 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   1753 		case FORMAT_RGBA_ASTC_5x4_KHR:
   1754 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   1755 			return pitchB(width, format, target) * ((height + 3) / 4);   // Pitch computed per 4 rows
   1756 		case FORMAT_RGBA_ASTC_5x5_KHR:
   1757 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   1758 		case FORMAT_RGBA_ASTC_6x5_KHR:
   1759 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   1760 		case FORMAT_RGBA_ASTC_8x5_KHR:
   1761 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   1762 		case FORMAT_RGBA_ASTC_10x5_KHR:
   1763 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   1764 			return pitchB(width, format, target) * ((height + 4) / 5);   // Pitch computed per 5 rows
   1765 		case FORMAT_RGBA_ASTC_6x6_KHR:
   1766 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   1767 		case FORMAT_RGBA_ASTC_8x6_KHR:
   1768 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   1769 		case FORMAT_RGBA_ASTC_10x6_KHR:
   1770 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   1771 			return pitchB(width, format, target) * ((height + 5) / 6);   // Pitch computed per 6 rows
   1772 		case FORMAT_RGBA_ASTC_8x8_KHR:
   1773 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   1774 		case FORMAT_RGBA_ASTC_10x8_KHR:
   1775 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   1776 			return pitchB(width, format, target) * ((height + 7) / 8);   // Pitch computed per 8 rows
   1777 		case FORMAT_RGBA_ASTC_10x10_KHR:
   1778 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   1779 		case FORMAT_RGBA_ASTC_12x10_KHR:
   1780 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   1781 			return pitchB(width, format, target) * ((height + 9) / 10);   // Pitch computed per 10 rows
   1782 		case FORMAT_RGBA_ASTC_12x12_KHR:
   1783 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   1784 			return pitchB(width, format, target) * ((height + 11) / 12);   // Pitch computed per 12 rows
   1785 		case FORMAT_ATI1:
   1786 		case FORMAT_ATI2:
   1787 		default:
   1788 			return pitchB(width, format, target) * height;   // Pitch computed per row
   1789 		}
   1790 	}
   1791 
   1792 	int Surface::sliceP(int width, int height, Format format, bool target)
   1793 	{
   1794 		int B = bytes(format);
   1795 
   1796 		return B > 0 ? sliceB(width, height, format, target) / B : 0;
   1797 	}
   1798 
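         	// Copies the source buffer into the destination buffer, decoding source formats that
         	// need conversion; all remaining formats go through genericUpdate().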
   1799 	void Surface::update(Buffer &destination, Buffer &source)
   1800 	{
   1801 	//	ASSERT(source.lock != LOCK_UNLOCKED);
   1802 	//	ASSERT(destination.lock != LOCK_UNLOCKED);
   1803 
   1804 		if(destination.buffer != source.buffer)
   1805 		{
   1806 			ASSERT(source.dirty && !destination.dirty);
   1807 
   1808 			switch(source.format)
   1809 			{
   1810 			case FORMAT_R8G8B8:		decodeR8G8B8(destination, source);		break;   // FIXME: Check destination format
   1811 			case FORMAT_X1R5G5B5:	decodeX1R5G5B5(destination, source);	break;   // FIXME: Check destination format
   1812 			case FORMAT_A1R5G5B5:	decodeA1R5G5B5(destination, source);	break;   // FIXME: Check destination format
   1813 			case FORMAT_X4R4G4B4:	decodeX4R4G4B4(destination, source);	break;   // FIXME: Check destination format
   1814 			case FORMAT_A4R4G4B4:	decodeA4R4G4B4(destination, source);	break;   // FIXME: Check destination format
   1815 			case FORMAT_P8:			decodeP8(destination, source);			break;   // FIXME: Check destination format
   1816 			#if S3TC_SUPPORT
   1817 			case FORMAT_DXT1:		decodeDXT1(destination, source);		break;   // FIXME: Check destination format
   1818 			case FORMAT_DXT3:		decodeDXT3(destination, source);		break;   // FIXME: Check destination format
   1819 			case FORMAT_DXT5:		decodeDXT5(destination, source);		break;   // FIXME: Check destination format
   1820 			#endif
   1821 			case FORMAT_ATI1:		decodeATI1(destination, source);		break;   // FIXME: Check destination format
   1822 			case FORMAT_ATI2:		decodeATI2(destination, source);		break;   // FIXME: Check destination format
   1823 			case FORMAT_R11_EAC:         decodeEAC(destination, source, 1, false); break; // FIXME: Check destination format
   1824 			case FORMAT_SIGNED_R11_EAC:  decodeEAC(destination, source, 1, true);  break; // FIXME: Check destination format
   1825 			case FORMAT_RG11_EAC:        decodeEAC(destination, source, 2, false); break; // FIXME: Check destination format
   1826 			case FORMAT_SIGNED_RG11_EAC: decodeEAC(destination, source, 2, true);  break; // FIXME: Check destination format
   1827 			case FORMAT_ETC1:
   1828 			case FORMAT_RGB8_ETC2:                      decodeETC2(destination, source, 0, false); break; // FIXME: Check destination format
   1829 			case FORMAT_SRGB8_ETC2:                     decodeETC2(destination, source, 0, true);  break; // FIXME: Check destination format
   1830 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:  decodeETC2(destination, source, 1, false); break; // FIXME: Check destination format
   1831 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: decodeETC2(destination, source, 1, true);  break; // FIXME: Check destination format
   1832 			case FORMAT_RGBA8_ETC2_EAC:                 decodeETC2(destination, source, 8, false); break; // FIXME: Check destination format
   1833 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:          decodeETC2(destination, source, 8, true);  break; // FIXME: Check destination format
   1834 			case FORMAT_RGBA_ASTC_4x4_KHR:           decodeASTC(destination, source, 4,  4,  1, false); break; // FIXME: Check destination format
   1835 			case FORMAT_RGBA_ASTC_5x4_KHR:           decodeASTC(destination, source, 5,  4,  1, false); break; // FIXME: Check destination format
   1836 			case FORMAT_RGBA_ASTC_5x5_KHR:           decodeASTC(destination, source, 5,  5,  1, false); break; // FIXME: Check destination format
   1837 			case FORMAT_RGBA_ASTC_6x5_KHR:           decodeASTC(destination, source, 6,  5,  1, false); break; // FIXME: Check destination format
   1838 			case FORMAT_RGBA_ASTC_6x6_KHR:           decodeASTC(destination, source, 6,  6,  1, false); break; // FIXME: Check destination format
   1839 			case FORMAT_RGBA_ASTC_8x5_KHR:           decodeASTC(destination, source, 8,  5,  1, false); break; // FIXME: Check destination format
   1840 			case FORMAT_RGBA_ASTC_8x6_KHR:           decodeASTC(destination, source, 8,  6,  1, false); break; // FIXME: Check destination format
   1841 			case FORMAT_RGBA_ASTC_8x8_KHR:           decodeASTC(destination, source, 8,  8,  1, false); break; // FIXME: Check destination format
   1842 			case FORMAT_RGBA_ASTC_10x5_KHR:          decodeASTC(destination, source, 10, 5,  1, false); break; // FIXME: Check destination format
   1843 			case FORMAT_RGBA_ASTC_10x6_KHR:          decodeASTC(destination, source, 10, 6,  1, false); break; // FIXME: Check destination format
   1844 			case FORMAT_RGBA_ASTC_10x8_KHR:          decodeASTC(destination, source, 10, 8,  1, false); break; // FIXME: Check destination format
   1845 			case FORMAT_RGBA_ASTC_10x10_KHR:         decodeASTC(destination, source, 10, 10, 1, false); break; // FIXME: Check destination format
   1846 			case FORMAT_RGBA_ASTC_12x10_KHR:         decodeASTC(destination, source, 12, 10, 1, false); break; // FIXME: Check destination format
   1847 			case FORMAT_RGBA_ASTC_12x12_KHR:         decodeASTC(destination, source, 12, 12, 1, false); break; // FIXME: Check destination format
   1848 			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:   decodeASTC(destination, source, 4,  4,  1, true);  break; // FIXME: Check destination format
   1849 			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:   decodeASTC(destination, source, 5,  4,  1, true);  break; // FIXME: Check destination format
   1850 			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:   decodeASTC(destination, source, 5,  5,  1, true);  break; // FIXME: Check destination format
   1851 			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:   decodeASTC(destination, source, 6,  5,  1, true);  break; // FIXME: Check destination format
   1852 			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:   decodeASTC(destination, source, 6,  6,  1, true);  break; // FIXME: Check destination format
   1853 			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:   decodeASTC(destination, source, 8,  5,  1, true);  break; // FIXME: Check destination format
   1854 			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:   decodeASTC(destination, source, 8,  6,  1, true);  break; // FIXME: Check destination format
   1855 			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:   decodeASTC(destination, source, 8,  8,  1, true);  break; // FIXME: Check destination format
   1856 			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:  decodeASTC(destination, source, 10, 5,  1, true);  break; // FIXME: Check destination format
   1857 			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:  decodeASTC(destination, source, 10, 6,  1, true);  break; // FIXME: Check destination format
   1858 			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:  decodeASTC(destination, source, 10, 8,  1, true);  break; // FIXME: Check destination format
   1859 			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, true);  break; // FIXME: Check destination format
   1860 			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, true);  break; // FIXME: Check destination format
   1861 			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, true);  break; // FIXME: Check destination format
   1862 			default:				genericUpdate(destination, source);		break;
   1863 			}
   1864 		}
   1865 	}
   1866 
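         	// Straight copy between buffers: whole rows are memcpy'd when the formats match,
         	// otherwise each pixel is converted through a read/write round trip.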
   1867 	void Surface::genericUpdate(Buffer &destination, Buffer &source)
   1868 	{
   1869 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
   1870 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
   1871 
   1872 		int depth = min(destination.depth, source.depth);
   1873 		int height = min(destination.height, source.height);
   1874 		int width = min(destination.width, source.width);
   1875 		int rowBytes = width * source.bytes;
   1876 
   1877 		for(int z = 0; z < depth; z++)
   1878 		{
   1879 			unsigned char *sourceRow = sourceSlice;
   1880 			unsigned char *destinationRow = destinationSlice;
   1881 
   1882 			for(int y = 0; y < height; y++)
   1883 			{
   1884 				if(source.format == destination.format)
   1885 				{
   1886 					memcpy(destinationRow, sourceRow, rowBytes);
   1887 				}
   1888 				else
   1889 				{
   1890 					unsigned char *sourceElement = sourceRow;
   1891 					unsigned char *destinationElement = destinationRow;
   1892 
   1893 					for(int x = 0; x < width; x++)
   1894 					{
   1895 						Color<float> color = source.read(sourceElement);
   1896 						destination.write(destinationElement, color);
   1897 
   1898 						sourceElement += source.bytes;
   1899 						destinationElement += destination.bytes;
   1900 					}
   1901 				}
   1902 
   1903 				sourceRow += source.pitchB;
   1904 				destinationRow += destination.pitchB;
   1905 			}
   1906 
   1907 			sourceSlice += source.sliceB;
   1908 			destinationSlice += destination.sliceB;
   1909 		}
   1910 	}
   1911 
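         	// Expands 24-bit B/G/R source texels to 32-bit texels with opaque alpha.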
   1912 	void Surface::decodeR8G8B8(Buffer &destination, const Buffer &source)
   1913 	{
   1914 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
   1915 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
   1916 
   1917 		for(int z = 0; z < destination.depth && z < source.depth; z++)
   1918 		{
   1919 			unsigned char *sourceRow = sourceSlice;
   1920 			unsigned char *destinationRow = destinationSlice;
   1921 
   1922 			for(int y = 0; y < destination.height && y < source.height; y++)
   1923 			{
   1924 				unsigned char *sourceElement = sourceRow;
   1925 				unsigned char *destinationElement = destinationRow;
   1926 
   1927 				for(int x = 0; x < destination.width && x < source.width; x++)
   1928 				{
   1929 					unsigned int b = sourceElement[0];
   1930 					unsigned int g = sourceElement[1];
   1931 					unsigned int r = sourceElement[2];
   1932 
   1933 					*(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
   1934 
   1935 					sourceElement += source.bytes;
   1936 					destinationElement += destination.bytes;
   1937 				}
   1938 
   1939 				sourceRow += source.pitchB;
   1940 				destinationRow += destination.pitchB;
   1941 			}
   1942 
   1943 			sourceSlice += source.sliceB;
   1944 			destinationSlice += destination.sliceB;
   1945 		}
   1946 	}
   1947 
   1948 	void Surface::decodeX1R5G5B5(Buffer &destination, const Buffer &source)
   1949 	{
   1950 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
   1951 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
   1952 
   1953 		for(int z = 0; z < destination.depth && z < source.depth; z++)
   1954 		{
   1955 			unsigned char *sourceRow = sourceSlice;
   1956 			unsigned char *destinationRow = destinationSlice;
   1957 
   1958 			for(int y = 0; y < destination.height && y < source.height; y++)
   1959 			{
   1960 				unsigned char *sourceElement = sourceRow;
   1961 				unsigned char *destinationElement = destinationRow;
   1962 
   1963 				for(int x = 0; x < destination.width && x < source.width; x++)
   1964 				{
   1965 					unsigned int xrgb = *(unsigned short*)sourceElement;
   1966 
   1967 					unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
   1968 					unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
   1969 					unsigned int b = (((xrgb & 0x001F) * 2106  + 0x80) >> 8);
   1970 
   1971 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
   1972 
   1973 					sourceElement += source.bytes;
   1974 					destinationElement += destination.bytes;
   1975 				}
   1976 
   1977 				sourceRow += source.pitchB;
   1978 				destinationRow += destination.pitchB;
   1979 			}
   1980 
   1981 			sourceSlice += source.sliceB;
   1982 			destinationSlice += destination.sliceB;
   1983 		}
   1984 	}
   1985 
   1986 	void Surface::decodeA1R5G5B5(Buffer &destination, const Buffer &source)
   1987 	{
   1988 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
   1989 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
   1990 
   1991 		for(int z = 0; z < destination.depth && z < source.depth; z++)
   1992 		{
   1993 			unsigned char *sourceRow = sourceSlice;
   1994 			unsigned char *destinationRow = destinationSlice;
   1995 
   1996 			for(int y = 0; y < destination.height && y < source.height; y++)
   1997 			{
   1998 				unsigned char *sourceElement = sourceRow;
   1999 				unsigned char *destinationElement = destinationRow;
   2000 
   2001 				for(int x = 0; x < destination.width && x < source.width; x++)
   2002 				{
   2003 					unsigned int argb = *(unsigned short*)sourceElement;
   2004 
   2005 					unsigned int a =   (argb & 0x8000) * 130560;
   2006 					unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
   2007 					unsigned int g = (((argb & 0x03E0) * 16846  + 0x8000) >> 8) & 0x0000FF00;
   2008 					unsigned int b = (((argb & 0x001F) * 2106   + 0x80) >> 8);
   2009 
   2010 					*(unsigned int*)destinationElement = a | r | g | b;
   2011 
   2012 					sourceElement += source.bytes;
   2013 					destinationElement += destination.bytes;
   2014 				}
   2015 
   2016 				sourceRow += source.pitchB;
   2017 				destinationRow += destination.pitchB;
   2018 			}
   2019 
   2020 			sourceSlice += source.sliceB;
   2021 			destinationSlice += destination.sliceB;
   2022 		}
   2023 	}
   2024 
   2025 	void Surface::decodeX4R4G4B4(Buffer &destination, const Buffer &source)
   2026 	{
   2027 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
   2028 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
   2029 
   2030 		for(int z = 0; z < destination.depth && z < source.depth; z++)
   2031 		{
   2032 			unsigned char *sourceRow = sourceSlice;
   2033 			unsigned char *destinationRow = destinationSlice;
   2034 
   2035 			for(int y = 0; y < destination.height && y < source.height; y++)
   2036 			{
   2037 				unsigned char *sourceElement = sourceRow;
   2038 				unsigned char *destinationElement = destinationRow;
   2039 
   2040 				for(int x = 0; x < destination.width && x < source.width; x++)
   2041 				{
   2042 					unsigned int xrgb = *(unsigned short*)sourceElement;
   2043 
   2044 					unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000;
   2045 					unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00;
   2046 					unsigned int b =  (xrgb & 0x000F) * 0x00000011;
   2047 
   2048 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
   2049 
   2050 					sourceElement += source.bytes;
   2051 					destinationElement += destination.bytes;
   2052 				}
   2053 
   2054 				sourceRow += source.pitchB;
   2055 				destinationRow += destination.pitchB;
   2056 			}
   2057 
   2058 			sourceSlice += source.sliceB;
   2059 			destinationSlice += destination.sliceB;
   2060 		}
   2061 	}
   2062 
   2063 	void Surface::decodeA4R4G4B4(Buffer &destination, const Buffer &source)
   2064 	{
   2065 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
   2066 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
   2067 
   2068 		for(int z = 0; z < destination.depth && z < source.depth; z++)
   2069 		{
   2070 			unsigned char *sourceRow = sourceSlice;
   2071 			unsigned char *destinationRow = destinationSlice;
   2072 
   2073 			for(int y = 0; y < destination.height && y < source.height; y++)
   2074 			{
   2075 				unsigned char *sourceElement = sourceRow;
   2076 				unsigned char *destinationElement = destinationRow;
   2077 
   2078 				for(int x = 0; x < destination.width && x < source.width; x++)
   2079 				{
   2080 					unsigned int argb = *(unsigned short*)sourceElement;
   2081 
   2082 					unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000;
   2083 					unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000;
   2084 					unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00;
   2085 					unsigned int b =  (argb & 0x000F) * 0x00000011;
   2086 
   2087 					*(unsigned int*)destinationElement = a | r | g | b;
   2088 
   2089 					sourceElement += source.bytes;
   2090 					destinationElement += destination.bytes;
   2091 				}
   2092 
   2093 				sourceRow += source.pitchB;
   2094 				destinationRow += destination.pitchB;
   2095 			}
   2096 
   2097 			sourceSlice += source.sliceB;
   2098 			destinationSlice += destination.sliceB;
   2099 		}
   2100 	}
   2101 
   2102 	void Surface::decodeP8(Buffer &destination, const Buffer &source)
   2103 	{
   2104 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
   2105 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
   2106 
   2107 		for(int z = 0; z < destination.depth && z < source.depth; z++)
   2108 		{
   2109 			unsigned char *sourceRow = sourceSlice;
   2110 			unsigned char *destinationRow = destinationSlice;
   2111 
   2112 			for(int y = 0; y < destination.height && y < source.height; y++)
   2113 			{
   2114 				unsigned char *sourceElement = sourceRow;
   2115 				unsigned char *destinationElement = destinationRow;
   2116 
   2117 				for(int x = 0; x < destination.width && x < source.width; x++)
   2118 				{
   2119 					unsigned int abgr = palette[*(unsigned char*)sourceElement];
   2120 
   2121 					unsigned int r = (abgr & 0x000000FF) << 16;
   2122 					unsigned int g = (abgr & 0x0000FF00) << 0;
   2123 					unsigned int b = (abgr & 0x00FF0000) >> 16;
   2124 					unsigned int a = (abgr & 0xFF000000) >> 0;
   2125 
   2126 					*(unsigned int*)destinationElement = a | r | g | b;
   2127 
   2128 					sourceElement += source.bytes;
   2129 					destinationElement += destination.bytes;
   2130 				}
   2131 
   2132 				sourceRow += source.pitchB;
   2133 				destinationRow += destination.pitchB;
   2134 			}
   2135 
   2136 			sourceSlice += source.sliceB;
   2137 			destinationSlice += destination.sliceB;
   2138 		}
   2139 	}
   2140 
   2141 #if S3TC_SUPPORT
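         	// Decodes DXT1 (BC1) 4x4 blocks: two 16-bit endpoint colors and a 2-bit index per texel.
         	// When c0 <= c1 the block uses the 3-color mode with a transparent fourth entry.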
   2142 	void Surface::decodeDXT1(Buffer &internal, const Buffer &external)
   2143 	{
   2144 		unsigned int *destSlice = (unsigned int*)internal.buffer;
   2145 		const DXT1 *source = (const DXT1*)external.buffer;
   2146 
   2147 		for(int z = 0; z < external.depth; z++)
   2148 		{
   2149 			unsigned int *dest = destSlice;
   2150 
   2151 			for(int y = 0; y < external.height; y += 4)
   2152 			{
   2153 				for(int x = 0; x < external.width; x += 4)
   2154 				{
   2155 					Color<byte> c[4];
   2156 
   2157 					c[0] = source->c0;
   2158 					c[1] = source->c1;
   2159 
   2160 					if(source->c0 > source->c1)   // No transparency
   2161 					{
   2162 						// c2 = 2 / 3 * c0 + 1 / 3 * c1
   2163 						c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
   2164 						c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
   2165 						c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
   2166 						c[2].a = 0xFF;
   2167 
   2168 						// c3 = 1 / 3 * c0 + 2 / 3 * c1
   2169 						c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
   2170 						c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
   2171 						c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
   2172 						c[3].a = 0xFF;
   2173 					}
   2174 					else   // c3 transparent
   2175 					{
   2176 						// c2 = 1 / 2 * c0 + 1 / 2 * c1
   2177 						c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
   2178 						c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
   2179 						c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
   2180 						c[2].a = 0xFF;
   2181 
   2182 						c[3].r = 0;
   2183 						c[3].g = 0;
   2184 						c[3].b = 0;
   2185 						c[3].a = 0;
   2186 					}
   2187 
   2188 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2189 					{
   2190 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2191 						{
   2192 							dest[(x + i) + (y + j) * internal.width] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
   2193 						}
   2194 					}
   2195 
   2196 					source++;
   2197 				}
   2198 			}
   2199 
   2200 			(byte*&)destSlice += internal.sliceB;
   2201 		}
   2202 	}
   2203 
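         	// Decodes DXT3 (BC2) 4x4 blocks: DXT1-style color endpoints plus 4 bits of explicit alpha per texel.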
   2204 	void Surface::decodeDXT3(Buffer &internal, const Buffer &external)
   2205 	{
   2206 		unsigned int *destSlice = (unsigned int*)internal.buffer;
   2207 		const DXT3 *source = (const DXT3*)external.buffer;
   2208 
   2209 		for(int z = 0; z < external.depth; z++)
   2210 		{
   2211 			unsigned int *dest = destSlice;
   2212 
   2213 			for(int y = 0; y < external.height; y += 4)
   2214 			{
   2215 				for(int x = 0; x < external.width; x += 4)
   2216 				{
   2217 					Color<byte> c[4];
   2218 
   2219 					c[0] = source->c0;
   2220 					c[1] = source->c1;
   2221 
   2222 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
   2223 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
   2224 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
   2225 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
   2226 
   2227 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
   2228 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
   2229 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
   2230 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
   2231 
   2232 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2233 					{
   2234 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2235 						{
   2236 							unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
   2237 							unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));
   2238 
   2239 							dest[(x + i) + (y + j) * internal.width] = color;
   2240 						}
   2241 					}
   2242 
   2243 					source++;
   2244 				}
   2245 			}
   2246 
   2247 			(byte*&)destSlice += internal.sliceB;
   2248 		}
   2249 	}
   2250 
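         	// Decodes DXT5 (BC3) 4x4 blocks: DXT1-style color endpoints plus two alpha endpoints
         	// interpolated through a 3-bit index per texel.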
   2251 	void Surface::decodeDXT5(Buffer &internal, const Buffer &external)
   2252 	{
   2253 		unsigned int *destSlice = (unsigned int*)internal.buffer;
   2254 		const DXT5 *source = (const DXT5*)external.buffer;
   2255 
   2256 		for(int z = 0; z < external.depth; z++)
   2257 		{
   2258 			unsigned int *dest = destSlice;
   2259 
   2260 			for(int y = 0; y < external.height; y += 4)
   2261 			{
   2262 				for(int x = 0; x < external.width; x += 4)
   2263 				{
   2264 					Color<byte> c[4];
   2265 
   2266 					c[0] = source->c0;
   2267 					c[1] = source->c1;
   2268 
   2269 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
   2270 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
   2271 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
   2272 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
   2273 
   2274 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
   2275 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
   2276 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
   2277 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
   2278 
   2279 					byte a[8];
   2280 
   2281 					a[0] = source->a0;
   2282 					a[1] = source->a1;
   2283 
   2284 					if(a[0] > a[1])
   2285 					{
   2286 						a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
   2287 						a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
   2288 						a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
   2289 						a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
   2290 						a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
   2291 						a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
   2292 					}
   2293 					else
   2294 					{
   2295 						a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
   2296 						a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
   2297 						a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
   2298 						a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
   2299 						a[6] = 0;
   2300 						a[7] = 0xFF;
   2301 					}
   2302 
   2303 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2304 					{
   2305 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2306 						{
   2307 							unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
   2308 							unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;
   2309 
   2310 							dest[(x + i) + (y + j) * internal.width] = color;
   2311 						}
   2312 					}
   2313 
   2314 					source++;
   2315 				}
   2316 			}
   2317 
   2318 			(byte*&)destSlice += internal.sliceB;
   2319 		}
   2320 	}
   2321 #endif
   2322 
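         	// Decodes ATI1 (BC4) 4x4 blocks: a single channel reconstructed from two 8-bit
         	// endpoints and a 3-bit index per texel.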
   2323 	void Surface::decodeATI1(Buffer &internal, const Buffer &external)
   2324 	{
   2325 		byte *destSlice = (byte*)internal.buffer;
   2326 		const ATI1 *source = (const ATI1*)external.buffer;
   2327 
   2328 		for(int z = 0; z < external.depth; z++)
   2329 		{
   2330 			byte *dest = destSlice;
   2331 
   2332 			for(int y = 0; y < external.height; y += 4)
   2333 			{
   2334 				for(int x = 0; x < external.width; x += 4)
   2335 				{
   2336 					byte r[8];
   2337 
   2338 					r[0] = source->r0;
   2339 					r[1] = source->r1;
   2340 
   2341 					if(r[0] > r[1])
   2342 					{
   2343 						r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
   2344 						r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
   2345 						r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
   2346 						r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
   2347 						r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
   2348 						r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
   2349 					}
   2350 					else
   2351 					{
   2352 						r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
   2353 						r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
   2354 						r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
   2355 						r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
   2356 						r[6] = 0;
   2357 						r[7] = 0xFF;
   2358 					}
   2359 
   2360 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2361 					{
   2362 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2363 						{
   2364 							dest[(x + i) + (y + j) * internal.width] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
   2365 						}
   2366 					}
   2367 
   2368 					source++;
   2369 				}
   2370 			}
   2371 
   2372 			destSlice += internal.sliceB;
   2373 		}
   2374 	}
   2375 
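         	// Decodes ATI2 (BC5) 4x4 blocks: two BC4-style channels, written as 16-bit texels with
         	// the second channel in the high byte.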
   2376 	void Surface::decodeATI2(Buffer &internal, const Buffer &external)
   2377 	{
   2378 		word *destSlice = (word*)internal.buffer;
   2379 		const ATI2 *source = (const ATI2*)external.buffer;
   2380 
   2381 		for(int z = 0; z < external.depth; z++)
   2382 		{
   2383 			word *dest = destSlice;
   2384 
   2385 			for(int y = 0; y < external.height; y += 4)
   2386 			{
   2387 				for(int x = 0; x < external.width; x += 4)
   2388 				{
   2389 					byte X[8];
   2390 
   2391 					X[0] = source->x0;
   2392 					X[1] = source->x1;
   2393 
   2394 					if(X[0] > X[1])
   2395 					{
   2396 						X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
   2397 						X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
   2398 						X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
   2399 						X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
   2400 						X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
   2401 						X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
   2402 					}
   2403 					else
   2404 					{
   2405 						X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
   2406 						X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
   2407 						X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
   2408 						X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
   2409 						X[6] = 0;
   2410 						X[7] = 0xFF;
   2411 					}
   2412 
   2413 					byte Y[8];
   2414 
   2415 					Y[0] = source->y0;
   2416 					Y[1] = source->y1;
   2417 
   2418 					if(Y[0] > Y[1])
   2419 					{
   2420 						Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
   2421 						Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
   2422 						Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
   2423 						Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
   2424 						Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
   2425 						Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
   2426 					}
   2427 					else
   2428 					{
   2429 						Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
   2430 						Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
   2431 						Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
   2432 						Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
   2433 						Y[6] = 0;
   2434 						Y[7] = 0xFF;
   2435 					}
   2436 
   2437 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2438 					{
   2439 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2440 						{
   2441 							word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
   2442 							word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];
   2443 
   2444 							dest[(x + i) + (y + j) * internal.width] = (g << 8) + r;
   2445 						}
   2446 					}
   2447 
   2448 					source++;
   2449 				}
   2450 			}
   2451 
   2452 			(byte*&)destSlice += internal.sliceB;
   2453 		}
   2454 	}
   2455 
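         	// Decodes ETC1/ETC2 blocks through ETC_Decoder. For sRGB variants the decoded bytes are
         	// converted to linear in place using a lazily built 256-entry lookup table.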
   2456 	void Surface::decodeETC2(Buffer &internal, const Buffer &external, int nbAlphaBits, bool isSRGB)
   2457 	{
   2458 		ETC_Decoder::Decode((const byte*)external.buffer, (byte*)internal.buffer, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
   2459 		                    (nbAlphaBits == 8) ? ETC_Decoder::ETC_RGBA : ((nbAlphaBits == 1) ? ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA : ETC_Decoder::ETC_RGB));
   2460 
   2461 		if(isSRGB)
   2462 		{
   2463 			static byte sRGBtoLinearTable[256];
   2464 			static bool sRGBtoLinearTableDirty = true;
   2465 			if(sRGBtoLinearTableDirty)
   2466 			{
   2467 				for(int i = 0; i < 256; i++)
   2468 				{
   2469 					sRGBtoLinearTable[i] = static_cast<byte>(sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
   2470 				}
   2471 				sRGBtoLinearTableDirty = false;
   2472 			}
   2473 
   2474 			// Perform sRGB conversion in place after decoding
   2475 			byte* src = (byte*)internal.buffer;
   2476 			for(int y = 0; y < internal.height; y++)
   2477 			{
   2478 				byte* srcRow = src + y * internal.pitchB;
   2479 				for(int x = 0; x <  internal.width; x++)
    2480 				for(int x = 0; x < internal.width; x++)
   2481 					byte* srcPix = srcRow + x * internal.bytes;
   2482 					for(int i = 0; i < 3; i++)
   2483 					{
   2484 						srcPix[i] = sRGBtoLinearTable[srcPix[i]];
   2485 					}
   2486 				}
   2487 			}
   2488 		}
   2489 	}
   2490 
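         	// Decodes EAC R11/RG11 blocks through ETC_Decoder. Signed data is then expanded in place
         	// to normalized floats (see the FIXME below).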
   2491 	void Surface::decodeEAC(Buffer &internal, const Buffer &external, int nbChannels, bool isSigned)
   2492 	{
   2493 		ASSERT(nbChannels == 1 || nbChannels == 2);
   2494 
   2495 		ETC_Decoder::Decode((const byte*)external.buffer, (byte*)internal.buffer, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
   2496 		                    (nbChannels == 1) ? (isSigned ? ETC_Decoder::ETC_R_SIGNED : ETC_Decoder::ETC_R_UNSIGNED) : (isSigned ? ETC_Decoder::ETC_RG_SIGNED : ETC_Decoder::ETC_RG_UNSIGNED));
   2497 
   2498 		// FIXME: We convert signed data to float, until signed integer internal formats are supported
   2499 		//        This code can be removed if signed ETC2 images are decoded to internal 8 bit signed R/RG formats
   2500 		if(isSigned)
   2501 		{
   2502 			sbyte* src = (sbyte*)internal.buffer;
   2503 
   2504 			for(int y = 0; y < internal.height; y++)
   2505 			{
   2506 				sbyte* srcRow = src + y * internal.pitchB;
   2507 				for(int x = internal.width - 1; x >= 0; x--)
   2508 				{
   2509 					int dx = x & 0xFFFFFFFC;
   2510 					int mx = x - dx;
   2511 					sbyte* srcPix = srcRow + dx * internal.bytes + mx * nbChannels;
   2512 					float* dstPix = (float*)(srcRow + x * internal.bytes);
   2513 					for(int c = nbChannels - 1; c >= 0; c--)
   2514 					{
   2515 						static const float normalization = 1.0f / 127.875f;
   2516 						dstPix[c] = clamp(static_cast<float>(srcPix[c]) * normalization, -1.0f, 1.0f);
   2517 					}
   2518 				}
   2519 			}
   2520 		}
   2521 	}
   2522 
   2523 	void Surface::decodeASTC(Buffer &internal, const Buffer &external, int xBlockSize, int yBlockSize, int zBlockSize, bool isSRGB)
   2524 	{
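         		// ASTC decoding is not implemented; the internal buffer is left untouched.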
   2525 	}
   2526 
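         	// Total storage size in bytes of a width x height x depth image of the given format,
         	// including block padding for compressed formats and the chroma planes for YV12.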
   2527 	unsigned int Surface::size(int width, int height, int depth, Format format)
   2528 	{
   2529 		// Dimensions rounded up to multiples of 4, used for compressed formats
   2530 		int width4 = align(width, 4);
   2531 		int height4 = align(height, 4);
   2532 
   2533 		switch(format)
   2534 		{
   2535 		#if S3TC_SUPPORT
   2536 		case FORMAT_DXT1:
   2537 		#endif
   2538 		case FORMAT_ATI1:
   2539 		case FORMAT_ETC1:
   2540 		case FORMAT_R11_EAC:
   2541 		case FORMAT_SIGNED_R11_EAC:
   2542 		case FORMAT_RGB8_ETC2:
   2543 		case FORMAT_SRGB8_ETC2:
   2544 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   2545 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   2546 			return width4 * height4 * depth / 2;
   2547 		#if S3TC_SUPPORT
   2548 		case FORMAT_DXT3:
   2549 		case FORMAT_DXT5:
   2550 		#endif
   2551 		case FORMAT_ATI2:
   2552 		case FORMAT_RG11_EAC:
   2553 		case FORMAT_SIGNED_RG11_EAC:
   2554 		case FORMAT_RGBA8_ETC2_EAC:
   2555 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   2556 		case FORMAT_RGBA_ASTC_4x4_KHR:
   2557 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   2558 			return width4 * height4 * depth;
   2559 		case FORMAT_RGBA_ASTC_5x4_KHR:
   2560 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   2561 			return align(width, 5) * height4 * depth;
   2562 		case FORMAT_RGBA_ASTC_5x5_KHR:
   2563 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   2564 			return align(width, 5) * align(height, 5) * depth;
   2565 		case FORMAT_RGBA_ASTC_6x5_KHR:
   2566 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   2567 			return align(width, 6) * align(height, 5) * depth;
   2568 		case FORMAT_RGBA_ASTC_6x6_KHR:
   2569 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   2570 			return align(width, 6) * align(height, 6) * depth;
   2571 		case FORMAT_RGBA_ASTC_8x5_KHR:
   2572 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   2573 			return align(width, 8) * align(height, 5) * depth;
   2574 		case FORMAT_RGBA_ASTC_8x6_KHR:
   2575 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   2576 			return align(width, 8) * align(height, 6) * depth;
   2577 		case FORMAT_RGBA_ASTC_8x8_KHR:
   2578 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   2579 			return align(width, 8) * align(height, 8) * depth;
   2580 		case FORMAT_RGBA_ASTC_10x5_KHR:
   2581 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   2582 			return align(width, 10) * align(height, 5) * depth;
   2583 		case FORMAT_RGBA_ASTC_10x6_KHR:
   2584 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   2585 			return align(width, 10) * align(height, 6) * depth;
   2586 		case FORMAT_RGBA_ASTC_10x8_KHR:
   2587 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   2588 			return align(width, 10) * align(height, 8) * depth;
   2589 		case FORMAT_RGBA_ASTC_10x10_KHR:
   2590 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   2591 			return align(width, 10) * align(height, 10) * depth;
   2592 		case FORMAT_RGBA_ASTC_12x10_KHR:
   2593 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   2594 			return align(width, 12) * align(height, 10) * depth;
   2595 		case FORMAT_RGBA_ASTC_12x12_KHR:
   2596 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   2597 			return align(width, 12) * align(height, 12) * depth;
   2598 		case FORMAT_YV12_BT601:
   2599 		case FORMAT_YV12_BT709:
   2600 		case FORMAT_YV12_JFIF:
   2601 			{
   2602 				unsigned int YStride = align(width, 16);
   2603 				unsigned int YSize = YStride * height;
   2604 				unsigned int CStride = align(YStride / 2, 16);
   2605 				unsigned int CSize = CStride * height / 2;
   2606 
   2607 				return YSize + 2 * CSize;
   2608 			}
   2609 		default:
   2610 			return bytes(format) * width * height * depth;
   2611 		}
   2612 	}
   2613 
   2614 	bool Surface::isStencil(Format format)
   2615 	{
   2616 		switch(format)
   2617 		{
   2618 		case FORMAT_D32:
   2619 		case FORMAT_D16:
   2620 		case FORMAT_D24X8:
   2621 		case FORMAT_D32F:
   2622 		case FORMAT_D32F_COMPLEMENTARY:
   2623 		case FORMAT_D32F_LOCKABLE:
   2624 			return false;
   2625 		case FORMAT_D24S8:
   2626 		case FORMAT_D24FS8:
   2627 		case FORMAT_S8:
   2628 		case FORMAT_DF24S8:
   2629 		case FORMAT_DF16S8:
   2630 		case FORMAT_D32FS8_TEXTURE:
   2631 		case FORMAT_D32FS8_SHADOW:
   2632 		case FORMAT_INTZ:
   2633 			return true;
   2634 		default:
   2635 			return false;
   2636 		}
   2637 	}
   2638 
   2639 	bool Surface::isDepth(Format format)
   2640 	{
   2641 		switch(format)
   2642 		{
   2643 		case FORMAT_D32:
   2644 		case FORMAT_D16:
   2645 		case FORMAT_D24X8:
   2646 		case FORMAT_D24S8:
   2647 		case FORMAT_D24FS8:
   2648 		case FORMAT_D32F:
   2649 		case FORMAT_D32F_COMPLEMENTARY:
   2650 		case FORMAT_D32F_LOCKABLE:
   2651 		case FORMAT_DF24S8:
   2652 		case FORMAT_DF16S8:
   2653 		case FORMAT_D32FS8_TEXTURE:
   2654 		case FORMAT_D32FS8_SHADOW:
   2655 		case FORMAT_INTZ:
   2656 			return true;
   2657 		case FORMAT_S8:
   2658 			return false;
   2659 		default:
   2660 			return false;
   2661 		}
   2662 	}
   2663 
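         	// Formats stored in the 2x2 quad layout: most depth/stencil formats and the
         	// quad-swizzled (Q) color formats.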
   2664 	bool Surface::hasQuadLayout(Format format)
   2665 	{
   2666 		switch(format)
   2667 		{
   2668 		case FORMAT_D32:
   2669 		case FORMAT_D16:
   2670 		case FORMAT_D24X8:
   2671 		case FORMAT_D24S8:
   2672 		case FORMAT_D24FS8:
   2673 		case FORMAT_D32F:
   2674 		case FORMAT_D32F_COMPLEMENTARY:
   2675 		case FORMAT_DF24S8:
   2676 		case FORMAT_DF16S8:
   2677 		case FORMAT_INTZ:
   2678 		case FORMAT_S8:
   2679 		case FORMAT_A8G8R8B8Q:
   2680 		case FORMAT_X8G8R8B8Q:
   2681 			return true;
   2682 		case FORMAT_D32F_LOCKABLE:
   2683 		case FORMAT_D32FS8_TEXTURE:
   2684 		case FORMAT_D32FS8_SHADOW:
   2685 		default:
   2686 			break;
   2687 		}
   2688 
   2689 		return false;
   2690 	}
   2691 
   2692 	bool Surface::isPalette(Format format)
   2693 	{
   2694 		switch(format)
   2695 		{
   2696 		case FORMAT_P8:
   2697 		case FORMAT_A8P8:
   2698 			return true;
   2699 		default:
   2700 			return false;
   2701 		}
   2702 	}
   2703 
   2704 	bool Surface::isFloatFormat(Format format)
   2705 	{
   2706 		switch(format)
   2707 		{
   2708 		case FORMAT_R5G6B5:
   2709 		case FORMAT_R8G8B8:
   2710 		case FORMAT_B8G8R8:
   2711 		case FORMAT_X8R8G8B8:
   2712 		case FORMAT_X8B8G8R8I:
   2713 		case FORMAT_X8B8G8R8:
   2714 		case FORMAT_A8R8G8B8:
   2715 		case FORMAT_SRGB8_X8:
   2716 		case FORMAT_SRGB8_A8:
   2717 		case FORMAT_A8B8G8R8I:
   2718 		case FORMAT_R8UI:
   2719 		case FORMAT_G8R8UI:
   2720 		case FORMAT_X8B8G8R8UI:
   2721 		case FORMAT_A8B8G8R8UI:
   2722 		case FORMAT_A8B8G8R8:
   2723 		case FORMAT_G8R8I:
   2724 		case FORMAT_G8R8:
   2725 		case FORMAT_A2B10G10R10:
   2726 		case FORMAT_R8I_SNORM:
   2727 		case FORMAT_G8R8I_SNORM:
   2728 		case FORMAT_X8B8G8R8I_SNORM:
   2729 		case FORMAT_A8B8G8R8I_SNORM:
   2730 		case FORMAT_R16I:
   2731 		case FORMAT_R16UI:
   2732 		case FORMAT_G16R16I:
   2733 		case FORMAT_G16R16UI:
   2734 		case FORMAT_G16R16:
   2735 		case FORMAT_X16B16G16R16I:
   2736 		case FORMAT_X16B16G16R16UI:
   2737 		case FORMAT_A16B16G16R16I:
   2738 		case FORMAT_A16B16G16R16UI:
   2739 		case FORMAT_A16B16G16R16:
   2740 		case FORMAT_V8U8:
   2741 		case FORMAT_Q8W8V8U8:
   2742 		case FORMAT_X8L8V8U8:
   2743 		case FORMAT_V16U16:
   2744 		case FORMAT_A16W16V16U16:
   2745 		case FORMAT_Q16W16V16U16:
   2746 		case FORMAT_A8:
   2747 		case FORMAT_R8I:
   2748 		case FORMAT_R8:
   2749 		case FORMAT_S8:
   2750 		case FORMAT_L8:
   2751 		case FORMAT_L16:
   2752 		case FORMAT_A8L8:
   2753 		case FORMAT_YV12_BT601:
   2754 		case FORMAT_YV12_BT709:
   2755 		case FORMAT_YV12_JFIF:
   2756 		case FORMAT_R32I:
   2757 		case FORMAT_R32UI:
   2758 		case FORMAT_G32R32I:
   2759 		case FORMAT_G32R32UI:
   2760 		case FORMAT_X32B32G32R32I:
   2761 		case FORMAT_X32B32G32R32UI:
   2762 		case FORMAT_A32B32G32R32I:
   2763 		case FORMAT_A32B32G32R32UI:
   2764 			return false;
   2765 		case FORMAT_R16F:
   2766 		case FORMAT_G16R16F:
   2767 		case FORMAT_B16G16R16F:
   2768 		case FORMAT_A16B16G16R16F:
   2769 		case FORMAT_R32F:
   2770 		case FORMAT_G32R32F:
   2771 		case FORMAT_B32G32R32F:
   2772 		case FORMAT_X32B32G32R32F:
   2773 		case FORMAT_A32B32G32R32F:
   2774 		case FORMAT_D32F:
   2775 		case FORMAT_D32F_COMPLEMENTARY:
   2776 		case FORMAT_D32F_LOCKABLE:
   2777 		case FORMAT_D32FS8_TEXTURE:
   2778 		case FORMAT_D32FS8_SHADOW:
   2779 		case FORMAT_L16F:
   2780 		case FORMAT_A16L16F:
   2781 		case FORMAT_L32F:
   2782 		case FORMAT_A32L32F:
   2783 			return true;
   2784 		default:
   2785 			ASSERT(false);
   2786 		}
   2787 
   2788 		return false;
   2789 	}
   2790 
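         	// Whether the given component of the format is unsigned; components the format does not
         	// store are treated as unsigned.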
   2791 	bool Surface::isUnsignedComponent(Format format, int component)
   2792 	{
   2793 		switch(format)
   2794 		{
   2795 		case FORMAT_NULL:
   2796 		case FORMAT_R5G6B5:
   2797 		case FORMAT_R8G8B8:
   2798 		case FORMAT_B8G8R8:
   2799 		case FORMAT_X8R8G8B8:
   2800 		case FORMAT_X8B8G8R8:
   2801 		case FORMAT_A8R8G8B8:
   2802 		case FORMAT_A8B8G8R8:
   2803 		case FORMAT_SRGB8_X8:
   2804 		case FORMAT_SRGB8_A8:
   2805 		case FORMAT_G8R8:
   2806 		case FORMAT_A2B10G10R10:
   2807 		case FORMAT_R16UI:
   2808 		case FORMAT_G16R16:
   2809 		case FORMAT_G16R16UI:
   2810 		case FORMAT_X16B16G16R16UI:
   2811 		case FORMAT_A16B16G16R16:
   2812 		case FORMAT_A16B16G16R16UI:
   2813 		case FORMAT_R32UI:
   2814 		case FORMAT_G32R32UI:
   2815 		case FORMAT_X32B32G32R32UI:
   2816 		case FORMAT_A32B32G32R32UI:
   2817 		case FORMAT_R8UI:
   2818 		case FORMAT_G8R8UI:
   2819 		case FORMAT_X8B8G8R8UI:
   2820 		case FORMAT_A8B8G8R8UI:
   2821 		case FORMAT_D32F:
   2822 		case FORMAT_D32F_COMPLEMENTARY:
   2823 		case FORMAT_D32F_LOCKABLE:
   2824 		case FORMAT_D32FS8_TEXTURE:
   2825 		case FORMAT_D32FS8_SHADOW:
   2826 		case FORMAT_A8:
   2827 		case FORMAT_R8:
   2828 		case FORMAT_L8:
   2829 		case FORMAT_L16:
   2830 		case FORMAT_A8L8:
   2831 		case FORMAT_YV12_BT601:
   2832 		case FORMAT_YV12_BT709:
   2833 		case FORMAT_YV12_JFIF:
   2834 			return true;
   2835 		case FORMAT_A8B8G8R8I:
   2836 		case FORMAT_A16B16G16R16I:
   2837 		case FORMAT_A32B32G32R32I:
   2838 		case FORMAT_A8B8G8R8I_SNORM:
   2839 		case FORMAT_Q8W8V8U8:
   2840 		case FORMAT_Q16W16V16U16:
   2841 		case FORMAT_A32B32G32R32F:
   2842 			return false;
   2843 		case FORMAT_R32F:
   2844 		case FORMAT_R8I:
   2845 		case FORMAT_R16I:
   2846 		case FORMAT_R32I:
   2847 		case FORMAT_R8I_SNORM:
   2848 			return component >= 1;
   2849 		case FORMAT_V8U8:
   2850 		case FORMAT_X8L8V8U8:
   2851 		case FORMAT_V16U16:
   2852 		case FORMAT_G32R32F:
   2853 		case FORMAT_G8R8I:
   2854 		case FORMAT_G16R16I:
   2855 		case FORMAT_G32R32I:
   2856 		case FORMAT_G8R8I_SNORM:
   2857 			return component >= 2;
   2858 		case FORMAT_A16W16V16U16:
   2859 		case FORMAT_B32G32R32F:
   2860 		case FORMAT_X32B32G32R32F:
   2861 		case FORMAT_X8B8G8R8I:
   2862 		case FORMAT_X16B16G16R16I:
   2863 		case FORMAT_X32B32G32R32I:
   2864 		case FORMAT_X8B8G8R8I_SNORM:
   2865 			return component >= 3;
   2866 		default:
   2867 			ASSERT(false);
   2868 		}
   2869 
   2870 		return false;
   2871 	}
   2872 
   2873 	bool Surface::isSRGBreadable(Format format)
   2874 	{
   2875 		// Keep in sync with Capabilities::isSRGBreadable
   2876 		switch(format)
   2877 		{
   2878 		case FORMAT_L8:
   2879 		case FORMAT_A8L8:
   2880 		case FORMAT_R8G8B8:
   2881 		case FORMAT_A8R8G8B8:
   2882 		case FORMAT_X8R8G8B8:
   2883 		case FORMAT_A8B8G8R8:
   2884 		case FORMAT_X8B8G8R8:
   2885 		case FORMAT_SRGB8_X8:
   2886 		case FORMAT_SRGB8_A8:
   2887 		case FORMAT_R5G6B5:
   2888 		case FORMAT_X1R5G5B5:
   2889 		case FORMAT_A1R5G5B5:
   2890 		case FORMAT_A4R4G4B4:
   2891 		#if S3TC_SUPPORT
   2892 		case FORMAT_DXT1:
   2893 		case FORMAT_DXT3:
   2894 		case FORMAT_DXT5:
   2895 		#endif
   2896 		case FORMAT_ATI1:
   2897 		case FORMAT_ATI2:
   2898 			return true;
   2899 		default:
   2900 			return false;
   2901 		}
   2902 	}
   2903 
   2904 	bool Surface::isSRGBwritable(Format format)
   2905 	{
   2906 		// Keep in sync with Capabilities::isSRGBwritable
   2907 		switch(format)
   2908 		{
   2909 		case FORMAT_NULL:
   2910 		case FORMAT_A8R8G8B8:
   2911 		case FORMAT_X8R8G8B8:
   2912 		case FORMAT_A8B8G8R8:
   2913 		case FORMAT_X8B8G8R8:
   2914 		case FORMAT_SRGB8_X8:
   2915 		case FORMAT_SRGB8_A8:
   2916 		case FORMAT_R5G6B5:
   2917 			return true;
   2918 		default:
   2919 			return false;
   2920 		}
   2921 	}
   2922 
   2923 	bool Surface::isCompressed(Format format)
   2924 	{
   2925 		switch(format)
   2926 		{
   2927 		#if S3TC_SUPPORT
   2928 		case FORMAT_DXT1:
   2929 		case FORMAT_DXT3:
   2930 		case FORMAT_DXT5:
   2931 		#endif
   2932 		case FORMAT_ATI1:
   2933 		case FORMAT_ATI2:
   2934 		case FORMAT_ETC1:
   2935 		case FORMAT_R11_EAC:
   2936 		case FORMAT_SIGNED_R11_EAC:
   2937 		case FORMAT_RG11_EAC:
   2938 		case FORMAT_SIGNED_RG11_EAC:
   2939 		case FORMAT_RGB8_ETC2:
   2940 		case FORMAT_SRGB8_ETC2:
   2941 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   2942 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   2943 		case FORMAT_RGBA8_ETC2_EAC:
   2944 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   2945 		case FORMAT_RGBA_ASTC_4x4_KHR:
   2946 		case FORMAT_RGBA_ASTC_5x4_KHR:
   2947 		case FORMAT_RGBA_ASTC_5x5_KHR:
   2948 		case FORMAT_RGBA_ASTC_6x5_KHR:
   2949 		case FORMAT_RGBA_ASTC_6x6_KHR:
   2950 		case FORMAT_RGBA_ASTC_8x5_KHR:
   2951 		case FORMAT_RGBA_ASTC_8x6_KHR:
   2952 		case FORMAT_RGBA_ASTC_8x8_KHR:
   2953 		case FORMAT_RGBA_ASTC_10x5_KHR:
   2954 		case FORMAT_RGBA_ASTC_10x6_KHR:
   2955 		case FORMAT_RGBA_ASTC_10x8_KHR:
   2956 		case FORMAT_RGBA_ASTC_10x10_KHR:
   2957 		case FORMAT_RGBA_ASTC_12x10_KHR:
   2958 		case FORMAT_RGBA_ASTC_12x12_KHR:
   2959 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   2960 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   2961 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   2962 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   2963 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   2964 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   2965 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   2966 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   2967 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   2968 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   2969 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   2970 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   2971 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   2972 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   2973 			return true;
   2974 		default:
   2975 			return false;
   2976 		}
   2977 	}
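	// Note on the formats above: the DXT/ATI/ETC/EAC families all encode fixed 4x4 texel
	// blocks of 8 or 16 bytes, while the ASTC formats carry their block dimensions in the
	// format name (e.g. 10x6 texels per 16-byte block).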
   2978 
   2979 	bool Surface::isSignedNonNormalizedInteger(Format format)
   2980 	{
   2981 		switch(format)
   2982 		{
   2983 		case FORMAT_A8B8G8R8I:
   2984 		case FORMAT_X8B8G8R8I:
   2985 		case FORMAT_G8R8I:
   2986 		case FORMAT_R8I:
   2987 		case FORMAT_A16B16G16R16I:
   2988 		case FORMAT_X16B16G16R16I:
   2989 		case FORMAT_G16R16I:
   2990 		case FORMAT_R16I:
   2991 		case FORMAT_A32B32G32R32I:
   2992 		case FORMAT_X32B32G32R32I:
   2993 		case FORMAT_G32R32I:
   2994 		case FORMAT_R32I:
   2995 			return true;
   2996 		default:
   2997 			return false;
   2998 		}
   2999 	}
   3000 
   3001 	bool Surface::isUnsignedNonNormalizedInteger(Format format)
   3002 	{
   3003 		switch(format)
   3004 		{
   3005 		case FORMAT_A8B8G8R8UI:
   3006 		case FORMAT_X8B8G8R8UI:
   3007 		case FORMAT_G8R8UI:
   3008 		case FORMAT_R8UI:
   3009 		case FORMAT_A16B16G16R16UI:
   3010 		case FORMAT_X16B16G16R16UI:
   3011 		case FORMAT_G16R16UI:
   3012 		case FORMAT_R16UI:
   3013 		case FORMAT_A32B32G32R32UI:
   3014 		case FORMAT_X32B32G32R32UI:
   3015 		case FORMAT_G32R32UI:
   3016 		case FORMAT_R32UI:
   3017 			return true;
   3018 		default:
   3019 			return false;
   3020 		}
   3021 	}
   3022 
   3023 	bool Surface::isNonNormalizedInteger(Format format)
   3024 	{
   3025 		return isSignedNonNormalizedInteger(format) ||
   3026 		       isUnsignedNonNormalizedInteger(format);
   3027 	}
   3028 
   3029 	bool Surface::isNormalizedInteger(Format format)
   3030 	{
   3031 		return !isFloatFormat(format) &&
   3032 		       !isNonNormalizedInteger(format) &&
   3033 		       !isCompressed(format) &&
   3034 		       !isDepth(format) &&
   3035 		       !isStencil(format);
   3036 	}
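	// Terminology used here: "normalized integer" formats store integers that sampling maps
	// to [0, 1] (unsigned) or [-1, 1] (signed), whereas the non-normalized I/UI formats
	// hand the raw integer values through to the shader.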
   3037 
   3038 	int Surface::componentCount(Format format)
   3039 	{
   3040 		switch(format)
   3041 		{
   3042 		case FORMAT_R5G6B5:         return 3;
   3043 		case FORMAT_X8R8G8B8:       return 3;
   3044 		case FORMAT_X8B8G8R8I:      return 3;
   3045 		case FORMAT_X8B8G8R8:       return 3;
   3046 		case FORMAT_A8R8G8B8:       return 4;
   3047 		case FORMAT_SRGB8_X8:       return 3;
   3048 		case FORMAT_SRGB8_A8:       return 4;
   3049 		case FORMAT_A8B8G8R8I:      return 4;
   3050 		case FORMAT_A8B8G8R8:       return 4;
   3051 		case FORMAT_G8R8I:          return 2;
   3052 		case FORMAT_G8R8:           return 2;
   3053 		case FORMAT_R8I_SNORM:      return 1;
   3054 		case FORMAT_G8R8I_SNORM:    return 2;
   3055 		case FORMAT_X8B8G8R8I_SNORM:return 3;
   3056 		case FORMAT_A8B8G8R8I_SNORM:return 4;
   3057 		case FORMAT_R8UI:           return 1;
   3058 		case FORMAT_G8R8UI:         return 2;
   3059 		case FORMAT_X8B8G8R8UI:     return 3;
   3060 		case FORMAT_A8B8G8R8UI:     return 4;
   3061 		case FORMAT_A2B10G10R10:    return 4;
   3062 		case FORMAT_G16R16I:        return 2;
   3063 		case FORMAT_G16R16UI:       return 2;
   3064 		case FORMAT_G16R16:         return 2;
   3065 		case FORMAT_G32R32I:        return 2;
   3066 		case FORMAT_G32R32UI:       return 2;
   3067 		case FORMAT_X16B16G16R16I:  return 3;
   3068 		case FORMAT_X16B16G16R16UI: return 3;
   3069 		case FORMAT_A16B16G16R16I:  return 4;
   3070 		case FORMAT_A16B16G16R16UI: return 4;
   3071 		case FORMAT_A16B16G16R16:   return 4;
   3072 		case FORMAT_X32B32G32R32I:  return 3;
   3073 		case FORMAT_X32B32G32R32UI: return 3;
   3074 		case FORMAT_A32B32G32R32I:  return 4;
   3075 		case FORMAT_A32B32G32R32UI: return 4;
   3076 		case FORMAT_V8U8:           return 2;
   3077 		case FORMAT_Q8W8V8U8:       return 4;
   3078 		case FORMAT_X8L8V8U8:       return 3;
   3079 		case FORMAT_V16U16:         return 2;
   3080 		case FORMAT_A16W16V16U16:   return 4;
   3081 		case FORMAT_Q16W16V16U16:   return 4;
   3082 		case FORMAT_R32F:           return 1;
   3083 		case FORMAT_G32R32F:        return 2;
   3084 		case FORMAT_X32B32G32R32F:  return 3;
   3085 		case FORMAT_A32B32G32R32F:  return 4;
   3086 		case FORMAT_D32F:           return 1;
   3087 		case FORMAT_D32F_LOCKABLE:  return 1;
   3088 		case FORMAT_D32FS8_TEXTURE: return 1;
   3089 		case FORMAT_D32FS8_SHADOW:  return 1;
   3090 		case FORMAT_A8:             return 1;
   3091 		case FORMAT_R8I:            return 1;
   3092 		case FORMAT_R8:             return 1;
   3093 		case FORMAT_R16I:           return 1;
   3094 		case FORMAT_R16UI:          return 1;
   3095 		case FORMAT_R32I:           return 1;
   3096 		case FORMAT_R32UI:          return 1;
   3097 		case FORMAT_L8:             return 1;
   3098 		case FORMAT_L16:            return 1;
   3099 		case FORMAT_A8L8:           return 2;
   3100 		case FORMAT_YV12_BT601:     return 3;
   3101 		case FORMAT_YV12_BT709:     return 3;
   3102 		case FORMAT_YV12_JFIF:      return 3;
   3103 		default:
   3104 			ASSERT(false);
   3105 		}
   3106 
   3107 		return 1;
   3108 	}
   3109 
   3110 	void *Surface::allocateBuffer(int width, int height, int depth, Format format)
   3111 	{
   3112 		// Render targets require 2x2 quads
   3113 		int width2 = (width + 1) & ~1;
   3114 		int height2 = (height + 1) & ~1;
   3115 
   3116 		// FIXME: Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
   3117 		// and stencil operations also read 8 bytes per four 8-bit stencil values,
   3118 		// so we have to allocate 4 extra bytes to avoid buffer overruns.
   3119 		return allocate(size(width2, height2, depth, format) + 4);
   3120 	}
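	// A concrete reading of the padding above: a 3x5 surface is rounded up to 4x6 before
	// size() is computed, so 2x2 quad access never steps past the last row or column, and
	// the 4 extra bytes cover the wider reads described in the FIXME.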
   3121 
   3122 	void Surface::memfill4(void *buffer, int pattern, int bytes)
   3123 	{
   3124 		while((size_t)buffer & 0x1 && bytes >= 1)
   3125 		{
   3126 			*(char*)buffer = (char)pattern;
   3127 			(char*&)buffer += 1;
   3128 			bytes -= 1;
   3129 		}
   3130 
   3131 		while((size_t)buffer & 0x3 && bytes >= 2)
   3132 		{
   3133 			*(short*)buffer = (short)pattern;
   3134 			(short*&)buffer += 1;
   3135 			bytes -= 2;
   3136 		}
   3137 
   3138 		#if defined(__i386__) || defined(__x86_64__)
   3139 			if(CPUID::supportsSSE())
   3140 			{
   3141 				while((size_t)buffer & 0xF && bytes >= 4)
   3142 				{
   3143 					*(int*)buffer = pattern;
   3144 					(int*&)buffer += 1;
   3145 					bytes -= 4;
   3146 				}
   3147 
   3148 				__m128 quad = _mm_set_ps1((float&)pattern);
   3149 
   3150 				float *pointer = (float*)buffer;
   3151 				int qxwords = bytes / 64;
   3152 				bytes -= qxwords * 64;
   3153 
   3154 				while(qxwords--)
   3155 				{
   3156 					_mm_stream_ps(pointer + 0, quad);
   3157 					_mm_stream_ps(pointer + 4, quad);
   3158 					_mm_stream_ps(pointer + 8, quad);
   3159 					_mm_stream_ps(pointer + 12, quad);
   3160 
   3161 					pointer += 16;
   3162 				}
   3163 
   3164 				buffer = pointer;
   3165 			}
   3166 		#endif
   3167 
   3168 		while(bytes >= 4)
   3169 		{
   3170 			*(int*)buffer = (int)pattern;
   3171 			(int*&)buffer += 1;
   3172 			bytes -= 4;
   3173 		}
   3174 
   3175 		while(bytes >= 2)
   3176 		{
   3177 			*(short*)buffer = (short)pattern;
   3178 			(short*&)buffer += 1;
   3179 			bytes -= 2;
   3180 		}
   3181 
   3182 		while(bytes >= 1)
   3183 		{
   3184 			*(char*)buffer = (char)pattern;
   3185 			(char*&)buffer += 1;
   3186 			bytes -= 1;
   3187 		}
   3188 	}
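	// Usage sketch for memfill4 (values hypothetical): clearing a row of 32-bit pixels.
	//
	//   unsigned int clearColor = 0xFF0000FF;
	//   memfill4(row, (int)clearColor, width * 4);
	//
	// The unaligned head and tail are written with the low byte/word of the pattern, so a
	// destination that is not 4-byte aligned is only filled correctly when the pattern's
	// bytes repeat; callers avoid this by passing byte-replicated patterns or naturally
	// aligned destinations.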
   3189 
   3190 	void Surface::sync()
   3191 	{
   3192 		resource->lock(EXCLUSIVE);
   3193 		resource->unlock();
   3194 	}
   3195 
   3196 	bool Surface::isEntire(const Rect& rect) const
   3197 	{
   3198 		return (rect.x0 == 0 && rect.y0 == 0 && rect.x1 == internal.width && rect.y1 == internal.height && internal.depth == 1);
   3199 	}
   3200 
   3201 	Rect Surface::getRect() const
   3202 	{
   3203 		return Rect(0, 0, internal.width, internal.height);
   3204 	}
   3205 
   3206 	void Surface::clearDepth(float depth, int x0, int y0, int width, int height)
   3207 	{
   3208 		if(width == 0 || height == 0) return;
   3209 
   3210 		// Not overlapping
   3211 		if(x0 > internal.width) return;
   3212 		if(y0 > internal.height) return;
   3213 		if(x0 + width < 0) return;
   3214 		if(y0 + height < 0) return;
   3215 
   3216 		// Clip against dimensions
   3217 		if(x0 < 0) {width += x0; x0 = 0;}
   3218 		if(x0 + width > internal.width) width = internal.width - x0;
   3219 		if(y0 < 0) {height += y0; y0 = 0;}
   3220 		if(y0 + height > internal.height) height = internal.height - y0;
   3221 
   3222 		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
   3223 		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
   3224 
   3225 		int width2 = (internal.width + 1) & ~1;
   3226 
   3227 		int x1 = x0 + width;
   3228 		int y1 = y0 + height;
   3229 
   3230 		if(internal.format == FORMAT_D32F_LOCKABLE ||
   3231 		   internal.format == FORMAT_D32FS8_TEXTURE ||
   3232 		   internal.format == FORMAT_D32FS8_SHADOW)
   3233 		{
   3234 			float *target = (float*)lockInternal(0, 0, 0, lock, PUBLIC) + x0 + width2 * y0;
   3235 
   3236 			for(int z = 0; z < internal.depth; z++)
   3237 			{
   3238 				for(int y = y0; y < y1; y++)
   3239 				{
   3240 					memfill4(target, (int&)depth, 4 * width);
   3241 					target += width2;
   3242 				}
   3243 			}
   3244 
   3245 			unlockInternal();
   3246 		}
   3247 		else   // Quad layout
   3248 		{
   3249 			if(complementaryDepthBuffer)
   3250 			{
   3251 				depth = 1 - depth;
   3252 			}
   3253 
   3254 			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
   3255 
   3256 			int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
   3257 			int oddX1 = (x1 & ~1) * 2;
   3258 			int evenX0 = ((x0 + 1) & ~1) * 2;
   3259 			int evenBytes = (oddX1 - evenX0) * sizeof(float);
   3260 
   3261 			for(int z = 0; z < internal.depth; z++)
   3262 			{
   3263 				for(int y = y0; y < y1; y++)
   3264 				{
   3265 					float *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
   3266 
   3267 					if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
   3268 					{
   3269 						if((x0 & 1) != 0)
   3270 						{
   3271 							target[oddX0 + 0] = depth;
   3272 							target[oddX0 + 2] = depth;
   3273 						}
   3274 
   3275 					//	for(int x2 = evenX0; x2 < x1 * 2; x2 += 4)
   3276 					//	{
   3277 					//		target[x2 + 0] = depth;
   3278 					//		target[x2 + 1] = depth;
   3279 					//		target[x2 + 2] = depth;
   3280 					//		target[x2 + 3] = depth;
   3281 					//	}
   3282 
   3283 					//	__asm
   3284 					//	{
   3285 					//		movss xmm0, depth
   3286 					//		shufps xmm0, xmm0, 0x00
   3287 					//
   3288 					//		mov eax, x0
   3289 					//		add eax, 1
   3290 					//		and eax, 0xFFFFFFFE
   3291 					//		cmp eax, x1
   3292 					//		jge qEnd
   3293 					//
   3294 					//		mov edi, target
   3295 					//
   3296 					//	qLoop:
   3297 					//		movntps [edi+8*eax], xmm0
   3298 					//
   3299 					//		add eax, 2
   3300 					//		cmp eax, x1
   3301 					//		jl qLoop
   3302 					//	qEnd:
   3303 					//	}
   3304 
   3305 						memfill4(&target[evenX0], (int&)depth, evenBytes);
   3306 
   3307 						if((x1 & 1) != 0)
   3308 						{
   3309 							target[oddX1 + 0] = depth;
   3310 							target[oddX1 + 2] = depth;
   3311 						}
   3312 
   3313 						y++;
   3314 					}
   3315 					else
   3316 					{
   3317 						for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
   3318 						{
   3319 							target[i] = depth;
   3320 						}
   3321 					}
   3322 				}
   3323 
   3324 				buffer += internal.sliceP;
   3325 			}
   3326 
   3327 			unlockInternal();
   3328 		}
   3329 	}
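	// Quad-layout addressing recap (a reading of the index math above, not an independent
	// spec): with padded width width2, the element for pixel (x, y) lives at
	//
	//   index(x, y) = (y & ~1) * width2 + (y & 1) * 2 + (x & ~1) * 2 + (x & 1)
	//
	// so each 2x2 quad occupies four consecutive elements, which is what allows a whole
	// quad row to be filled with a single memfill4 call on even scanlines.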
   3330 
   3331 	void Surface::clearStencil(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
   3332 	{
   3333 		if(mask == 0 || width == 0 || height == 0) return;
   3334 
   3335 		// Not overlapping
   3336 		if(x0 > internal.width) return;
   3337 		if(y0 > internal.height) return;
   3338 		if(x0 + width < 0) return;
   3339 		if(y0 + height < 0) return;
   3340 
   3341 		// Clip against dimensions
   3342 		if(x0 < 0) {width += x0; x0 = 0;}
   3343 		if(x0 + width > internal.width) width = internal.width - x0;
   3344 		if(y0 < 0) {height += y0; y0 = 0;}
   3345 		if(y0 + height > internal.height) height = internal.height - y0;
   3346 
   3347 		int width2 = (internal.width + 1) & ~1;
   3348 
   3349 		int x1 = x0 + width;
   3350 		int y1 = y0 + height;
   3351 
   3352 		int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
   3353 		int oddX1 = (x1 & ~1) * 2;
   3354 		int evenX0 = ((x0 + 1) & ~1) * 2;
   3355 		int evenBytes = oddX1 - evenX0;
   3356 
   3357 		unsigned char maskedS = s & mask;
   3358 		unsigned char invMask = ~mask;
   3359 		unsigned int fill = maskedS;
   3360 		fill = fill | (fill << 8) | (fill << 16) | (fill << 24);
   3361 
   3362 		char *buffer = (char*)lockStencil(0, 0, 0, PUBLIC);
   3363 
   3364 		// Stencil buffers are assumed to use quad layout
   3365 		for(int z = 0; z < stencil.depth; z++)
   3366 		{
   3367 			for(int y = y0; y < y1; y++)
   3368 			{
   3369 				char *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
   3370 
   3371 				if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF)   // Fill quad line at once
   3372 				{
   3373 					if((x0 & 1) != 0)
   3374 					{
   3375 						target[oddX0 + 0] = fill;
   3376 						target[oddX0 + 2] = fill;
   3377 					}
   3378 
   3379 					memfill4(&target[evenX0], fill, evenBytes);
   3380 
   3381 					if((x1 & 1) != 0)
   3382 					{
   3383 						target[oddX1 + 0] = fill;
   3384 						target[oddX1 + 2] = fill;
   3385 					}
   3386 
   3387 					y++;
   3388 				}
   3389 				else
   3390 				{
   3391 					for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
   3392 					{
   3393 						target[i] = maskedS | (target[i] & invMask);
   3394 					}
   3395 				}
   3396 			}
   3397 
   3398 			buffer += stencil.sliceP;
   3399 		}
   3400 
   3401 		unlockStencil();
   3402 	}
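	// The quad-line fast path above is only taken when mask == 0xFF; a partial stencil mask
	// requires a read-modify-write per pixel, which the fallback loop performs.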
   3403 
   3404 	void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
   3405 	{
   3406 		unsigned char *row;
   3407 		Buffer *buffer;
   3408 
   3409 		if(internal.dirty)
   3410 		{
   3411 			row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
   3412 			buffer = &internal;
   3413 		}
   3414 		else
   3415 		{
   3416 			row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
   3417 			buffer = &external;
   3418 		}
   3419 
   3420 		if(buffer->bytes <= 4)
   3421 		{
   3422 			int c;
   3423 			buffer->write(&c, color);
   3424 
   3425 			if(buffer->bytes <= 1) c = (c << 8)  | c;
   3426 			if(buffer->bytes <= 2) c = (c << 16) | c;
   3427 
   3428 			for(int y = 0; y < height; y++)
   3429 			{
   3430 				memfill4(row, c, width * buffer->bytes);
   3431 
   3432 				row += buffer->pitchB;
   3433 			}
   3434 		}
   3435 		else   // Generic
   3436 		{
   3437 			for(int y = 0; y < height; y++)
   3438 			{
   3439 				unsigned char *element = row;
   3440 
   3441 				for(int x = 0; x < width; x++)
   3442 				{
   3443 					buffer->write(element, color);
   3444 
   3445 					element += buffer->bytes;
   3446 				}
   3447 
   3448 				row += buffer->pitchB;
   3449 			}
   3450 		}
   3451 
   3452 		if(buffer == &internal)
   3453 		{
   3454 			unlockInternal();
   3455 		}
   3456 		else
   3457 		{
   3458 			unlockExternal();
   3459 		}
   3460 	}
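	// The "bytes <= 4" fast path above replicates the encoded pixel into a full 32-bit word
	// (once for 16-bit formats, twice for 8-bit formats) so memfill4 can fill whole rows;
	// wider formats fall back to writing every pixel through Buffer::write.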
   3461 
   3462 	void Surface::copyInternal(const Surface* source, int x, int y, float srcX, float srcY, bool filter)
   3463 	{
   3464 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
   3465 
   3466 		sw::Color<float> color;
   3467 
   3468 		if(!filter)
   3469 		{
   3470 			color = source->internal.read((int)srcX, (int)srcY);
   3471 		}
   3472 		else   // Bilinear filtering
   3473 		{
   3474 			color = source->internal.sample(srcX, srcY);
   3475 		}
   3476 
   3477 		internal.write(x, y, color);
   3478 	}
   3479 
   3480 	void Surface::copyInternal(const Surface* source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
   3481 	{
   3482 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
   3483 
   3484 		sw::Color<float> color;
   3485 
   3486 		if(!filter)
   3487 		{
   3488 			color = source->internal.read((int)srcX, (int)srcY, int(srcZ));
   3489 		}
   3490 		else   // Bilinear filtering
   3491 		{
   3492 			color = source->internal.sample(srcX, srcY, srcZ);
   3493 		}
   3494 
   3495 		internal.write(x, y, z, color);
   3496 	}
   3497 
   3498 	bool Surface::hasStencil() const
   3499 	{
   3500 		return isStencil(external.format);
   3501 	}
   3502 
   3503 	bool Surface::hasDepth() const
   3504 	{
   3505 		return isDepth(external.format);
   3506 	}
   3507 
   3508 	bool Surface::hasPalette() const
   3509 	{
   3510 		return isPalette(external.format);
   3511 	}
   3512 
   3513 	bool Surface::isRenderTarget() const
   3514 	{
   3515 		return renderTarget;
   3516 	}
   3517 
   3518 	bool Surface::hasDirtyMipmaps() const
   3519 	{
   3520 		return dirtyMipmaps;
   3521 	}
   3522 
   3523 	void Surface::cleanMipmaps()
   3524 	{
   3525 		dirtyMipmaps = false;
   3526 	}
   3527 
   3528 	Resource *Surface::getResource()
   3529 	{
   3530 		return resource;
   3531 	}
   3532 
   3533 	bool Surface::identicalFormats() const
   3534 	{
   3535 		return external.format == internal.format &&
   3536 		       external.width  == internal.width &&
   3537 		       external.height == internal.height &&
   3538 		       external.depth  == internal.depth &&
   3539 		       external.pitchB == internal.pitchB &&
   3540 		       external.sliceB == internal.sliceB;
   3541 	}
   3542 
   3543 	Format Surface::selectInternalFormat(Format format) const
   3544 	{
   3545 		switch(format)
   3546 		{
   3547 		case FORMAT_NULL:
   3548 			return FORMAT_NULL;
   3549 		case FORMAT_P8:
   3550 		case FORMAT_A8P8:
   3551 		case FORMAT_A4R4G4B4:
   3552 		case FORMAT_A1R5G5B5:
   3553 		case FORMAT_A8R3G3B2:
   3554 			return FORMAT_A8R8G8B8;
   3555 		case FORMAT_A8:
   3556 			return FORMAT_A8;
   3557 		case FORMAT_R8I:
   3558 			return FORMAT_R8I;
   3559 		case FORMAT_R8UI:
   3560 			return FORMAT_R8UI;
   3561 		case FORMAT_R8I_SNORM:
   3562 			return FORMAT_R8I_SNORM;
   3563 		case FORMAT_R8:
   3564 			return FORMAT_R8;
   3565 		case FORMAT_R16I:
   3566 			return FORMAT_R16I;
   3567 		case FORMAT_R16UI:
   3568 			return FORMAT_R16UI;
   3569 		case FORMAT_R32I:
   3570 			return FORMAT_R32I;
   3571 		case FORMAT_R32UI:
   3572 			return FORMAT_R32UI;
   3573 		case FORMAT_X16B16G16R16I:
   3574 		case FORMAT_A16B16G16R16I:
   3575 			return FORMAT_A16B16G16R16I;
   3576 		case FORMAT_X16B16G16R16UI:
   3577 		case FORMAT_A16B16G16R16UI:
   3578 			return FORMAT_A16B16G16R16UI;
   3579 		case FORMAT_A2R10G10B10:
   3580 		case FORMAT_A2B10G10R10:
   3581 		case FORMAT_A16B16G16R16:
   3582 			return FORMAT_A16B16G16R16;
   3583 		case FORMAT_X32B32G32R32I:
   3584 		case FORMAT_A32B32G32R32I:
   3585 			return FORMAT_A32B32G32R32I;
   3586 		case FORMAT_X32B32G32R32UI:
   3587 		case FORMAT_A32B32G32R32UI:
   3588 			return FORMAT_A32B32G32R32UI;
   3589 		case FORMAT_G8R8I:
   3590 			return FORMAT_G8R8I;
   3591 		case FORMAT_G8R8UI:
   3592 			return FORMAT_G8R8UI;
   3593 		case FORMAT_G8R8I_SNORM:
   3594 			return FORMAT_G8R8I_SNORM;
   3595 		case FORMAT_G8R8:
   3596 			return FORMAT_G8R8;
   3597 		case FORMAT_G16R16I:
   3598 			return FORMAT_G16R16I;
   3599 		case FORMAT_G16R16UI:
   3600 			return FORMAT_G16R16UI;
   3601 		case FORMAT_G16R16:
   3602 			return FORMAT_G16R16;
   3603 		case FORMAT_G32R32I:
   3604 			return FORMAT_G32R32I;
   3605 		case FORMAT_G32R32UI:
   3606 			return FORMAT_G32R32UI;
   3607 		case FORMAT_A8R8G8B8:
   3608 			if(lockable || !quadLayoutEnabled)
   3609 			{
   3610 				return FORMAT_A8R8G8B8;
   3611 			}
   3612 			else
   3613 			{
   3614 				return FORMAT_A8G8R8B8Q;
   3615 			}
   3616 		case FORMAT_A8B8G8R8I:
   3617 			return FORMAT_A8B8G8R8I;
   3618 		case FORMAT_A8B8G8R8UI:
   3619 			return FORMAT_A8B8G8R8UI;
   3620 		case FORMAT_A8B8G8R8I_SNORM:
   3621 			return FORMAT_A8B8G8R8I_SNORM;
   3622 		case FORMAT_R5G5B5A1:
   3623 		case FORMAT_R4G4B4A4:
   3624 		case FORMAT_A8B8G8R8:
   3625 			return FORMAT_A8B8G8R8;
   3626 		case FORMAT_R5G6B5:
   3627 			return FORMAT_R5G6B5;
   3628 		case FORMAT_R3G3B2:
   3629 		case FORMAT_R8G8B8:
   3630 		case FORMAT_X4R4G4B4:
   3631 		case FORMAT_X1R5G5B5:
   3632 		case FORMAT_X8R8G8B8:
   3633 			if(lockable || !quadLayoutEnabled)
   3634 			{
   3635 				return FORMAT_X8R8G8B8;
   3636 			}
   3637 			else
   3638 			{
   3639 				return FORMAT_X8G8R8B8Q;
   3640 			}
   3641 		case FORMAT_X8B8G8R8I:
   3642 			return FORMAT_X8B8G8R8I;
   3643 		case FORMAT_X8B8G8R8UI:
   3644 			return FORMAT_X8B8G8R8UI;
   3645 		case FORMAT_X8B8G8R8I_SNORM:
   3646 			return FORMAT_X8B8G8R8I_SNORM;
   3647 		case FORMAT_B8G8R8:
   3648 		case FORMAT_X8B8G8R8:
   3649 			return FORMAT_X8B8G8R8;
   3650 		case FORMAT_SRGB8_X8:
   3651 			return FORMAT_SRGB8_X8;
   3652 		case FORMAT_SRGB8_A8:
   3653 			return FORMAT_SRGB8_A8;
   3654 		// Compressed formats
   3655 		#if S3TC_SUPPORT
   3656 		case FORMAT_DXT1:
   3657 		case FORMAT_DXT3:
   3658 		case FORMAT_DXT5:
   3659 		#endif
   3660 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   3661 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   3662 		case FORMAT_RGBA8_ETC2_EAC:
   3663 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   3664 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   3665 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   3666 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   3667 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   3668 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   3669 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   3670 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   3671 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   3672 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   3673 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   3674 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   3675 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   3676 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   3677 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   3678 			return FORMAT_A8R8G8B8;
   3679 		case FORMAT_RGBA_ASTC_4x4_KHR:
   3680 		case FORMAT_RGBA_ASTC_5x4_KHR:
   3681 		case FORMAT_RGBA_ASTC_5x5_KHR:
   3682 		case FORMAT_RGBA_ASTC_6x5_KHR:
   3683 		case FORMAT_RGBA_ASTC_6x6_KHR:
   3684 		case FORMAT_RGBA_ASTC_8x5_KHR:
   3685 		case FORMAT_RGBA_ASTC_8x6_KHR:
   3686 		case FORMAT_RGBA_ASTC_8x8_KHR:
   3687 		case FORMAT_RGBA_ASTC_10x5_KHR:
   3688 		case FORMAT_RGBA_ASTC_10x6_KHR:
   3689 		case FORMAT_RGBA_ASTC_10x8_KHR:
   3690 		case FORMAT_RGBA_ASTC_10x10_KHR:
   3691 		case FORMAT_RGBA_ASTC_12x10_KHR:
   3692 		case FORMAT_RGBA_ASTC_12x12_KHR:
   3693 			// ASTC supports HDR, so a floating point format is required to represent it properly
    3694 			return FORMAT_A32B32G32R32F; // FIXME: FP16 is probably sufficient, but it's currently unsupported
   3695 		case FORMAT_ATI1:
   3696 		case FORMAT_R11_EAC:
   3697 			return FORMAT_R8;
   3698 		case FORMAT_SIGNED_R11_EAC:
    3699 			return FORMAT_R32F; // FIXME: Signed 8-bit format would be sufficient
   3700 		case FORMAT_ATI2:
   3701 		case FORMAT_RG11_EAC:
   3702 			return FORMAT_G8R8;
   3703 		case FORMAT_SIGNED_RG11_EAC:
    3704 			return FORMAT_G32R32F; // FIXME: Signed 8-bit format would be sufficient
   3705 		case FORMAT_ETC1:
   3706 		case FORMAT_RGB8_ETC2:
   3707 		case FORMAT_SRGB8_ETC2:
   3708 			return FORMAT_X8R8G8B8;
   3709 		// Bumpmap formats
   3710 		case FORMAT_V8U8:			return FORMAT_V8U8;
   3711 		case FORMAT_L6V5U5:			return FORMAT_X8L8V8U8;
   3712 		case FORMAT_Q8W8V8U8:		return FORMAT_Q8W8V8U8;
   3713 		case FORMAT_X8L8V8U8:		return FORMAT_X8L8V8U8;
   3714 		case FORMAT_V16U16:			return FORMAT_V16U16;
   3715 		case FORMAT_A2W10V10U10:	return FORMAT_A16W16V16U16;
   3716 		case FORMAT_Q16W16V16U16:	return FORMAT_Q16W16V16U16;
   3717 		// Floating-point formats
   3718 		case FORMAT_A16F:			return FORMAT_A32B32G32R32F;
   3719 		case FORMAT_R16F:			return FORMAT_R32F;
   3720 		case FORMAT_G16R16F:		return FORMAT_G32R32F;
   3721 		case FORMAT_B16G16R16F:     return FORMAT_X32B32G32R32F;
   3722 		case FORMAT_A16B16G16R16F:	return FORMAT_A32B32G32R32F;
   3723 		case FORMAT_A32F:			return FORMAT_A32B32G32R32F;
   3724 		case FORMAT_R32F:			return FORMAT_R32F;
   3725 		case FORMAT_G32R32F:		return FORMAT_G32R32F;
   3726 		case FORMAT_B32G32R32F:     return FORMAT_X32B32G32R32F;
   3727 		case FORMAT_X32B32G32R32F:  return FORMAT_X32B32G32R32F;
   3728 		case FORMAT_A32B32G32R32F:	return FORMAT_A32B32G32R32F;
   3729 		// Luminance formats
   3730 		case FORMAT_L8:				return FORMAT_L8;
   3731 		case FORMAT_A4L4:			return FORMAT_A8L8;
   3732 		case FORMAT_L16:			return FORMAT_L16;
   3733 		case FORMAT_A8L8:			return FORMAT_A8L8;
   3734 		case FORMAT_L16F:           return FORMAT_X32B32G32R32F;
   3735 		case FORMAT_A16L16F:        return FORMAT_A32B32G32R32F;
   3736 		case FORMAT_L32F:           return FORMAT_X32B32G32R32F;
   3737 		case FORMAT_A32L32F:        return FORMAT_A32B32G32R32F;
   3738 		// Depth/stencil formats
   3739 		case FORMAT_D16:
   3740 		case FORMAT_D32:
   3741 		case FORMAT_D24X8:
   3742 		case FORMAT_D24S8:
   3743 		case FORMAT_D24FS8:
   3744 			if(hasParent)   // Texture
   3745 			{
   3746 				return FORMAT_D32FS8_SHADOW;
   3747 			}
   3748 			else if(complementaryDepthBuffer)
   3749 			{
   3750 				return FORMAT_D32F_COMPLEMENTARY;
   3751 			}
   3752 			else
   3753 			{
   3754 				return FORMAT_D32F;
   3755 			}
   3756 		case FORMAT_D32F:           return FORMAT_D32F;
   3757 		case FORMAT_D32F_LOCKABLE:  return FORMAT_D32F_LOCKABLE;
   3758 		case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
   3759 		case FORMAT_INTZ:           return FORMAT_D32FS8_TEXTURE;
   3760 		case FORMAT_DF24S8:         return FORMAT_D32FS8_SHADOW;
   3761 		case FORMAT_DF16S8:         return FORMAT_D32FS8_SHADOW;
   3762 		case FORMAT_YV12_BT601:     return FORMAT_YV12_BT601;
   3763 		case FORMAT_YV12_BT709:     return FORMAT_YV12_BT709;
   3764 		case FORMAT_YV12_JFIF:      return FORMAT_YV12_JFIF;
   3765 		default:
   3766 			ASSERT(false);
   3767 		}
   3768 
   3769 		return FORMAT_NULL;
   3770 	}
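	// In short, selectInternalFormat() collapses the many external formats onto the smaller
	// set the renderer actually operates on: palettized and low-bit-depth color promotes to
	// 8888, half-float promotes to full float, compressed formats decode to an uncompressed
	// equivalent, and fixed-point depth promotes to 32-bit float depth (complementary or
	// paired with stencil for shadow lookups, depending on usage).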
   3771 
   3772 	void Surface::setTexturePalette(unsigned int *palette)
   3773 	{
   3774 		Surface::palette = palette;
   3775 		Surface::paletteID++;
   3776 	}
   3777 
   3778 	void Surface::resolve()
   3779 	{
   3780 		if(internal.depth <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
   3781 		{
   3782 			return;
   3783 		}
   3784 
   3785 		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
   3786 
   3787 		int width = internal.width;
   3788 		int height = internal.height;
   3789 		int pitch = internal.pitchB;
   3790 		int slice = internal.sliceB;
   3791 
   3792 		unsigned char *source0 = (unsigned char*)source;
   3793 		unsigned char *source1 = source0 + slice;
   3794 		unsigned char *source2 = source1 + slice;
   3795 		unsigned char *source3 = source2 + slice;
   3796 		unsigned char *source4 = source3 + slice;
   3797 		unsigned char *source5 = source4 + slice;
   3798 		unsigned char *source6 = source5 + slice;
   3799 		unsigned char *source7 = source6 + slice;
   3800 		unsigned char *source8 = source7 + slice;
   3801 		unsigned char *source9 = source8 + slice;
   3802 		unsigned char *sourceA = source9 + slice;
   3803 		unsigned char *sourceB = sourceA + slice;
   3804 		unsigned char *sourceC = sourceB + slice;
   3805 		unsigned char *sourceD = sourceC + slice;
   3806 		unsigned char *sourceE = sourceD + slice;
   3807 		unsigned char *sourceF = sourceE + slice;
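		// source0..sourceF point at the up-to-16 sample slices of the multisampled render
		// target; the loops below average them pairwise into slice 0 for 2, 4, 8 or 16
		// samples per pixel.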
   3808 
   3809 		if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 ||
   3810 		   internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8 ||
   3811 		   internal.format == FORMAT_SRGB8_X8 || internal.format == FORMAT_SRGB8_A8)
   3812 		{
   3813 			#if defined(__i386__) || defined(__x86_64__)
   3814 				if(CPUID::supportsSSE2() && (width % 4) == 0)
   3815 				{
   3816 					if(internal.depth == 2)
   3817 					{
   3818 						for(int y = 0; y < height; y++)
   3819 						{
   3820 							for(int x = 0; x < width; x += 4)
   3821 							{
   3822 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   3823 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   3824 
   3825 								c0 = _mm_avg_epu8(c0, c1);
   3826 
   3827 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   3828 							}
   3829 
   3830 							source0 += pitch;
   3831 							source1 += pitch;
   3832 						}
   3833 					}
   3834 					else if(internal.depth == 4)
   3835 					{
   3836 						for(int y = 0; y < height; y++)
   3837 						{
   3838 							for(int x = 0; x < width; x += 4)
   3839 							{
   3840 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   3841 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   3842 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   3843 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   3844 
   3845 								c0 = _mm_avg_epu8(c0, c1);
   3846 								c2 = _mm_avg_epu8(c2, c3);
   3847 								c0 = _mm_avg_epu8(c0, c2);
   3848 
   3849 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   3850 							}
   3851 
   3852 							source0 += pitch;
   3853 							source1 += pitch;
   3854 							source2 += pitch;
   3855 							source3 += pitch;
   3856 						}
   3857 					}
   3858 					else if(internal.depth == 8)
   3859 					{
   3860 						for(int y = 0; y < height; y++)
   3861 						{
   3862 							for(int x = 0; x < width; x += 4)
   3863 							{
   3864 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   3865 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   3866 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   3867 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   3868 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
   3869 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
   3870 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
   3871 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
   3872 
   3873 								c0 = _mm_avg_epu8(c0, c1);
   3874 								c2 = _mm_avg_epu8(c2, c3);
   3875 								c4 = _mm_avg_epu8(c4, c5);
   3876 								c6 = _mm_avg_epu8(c6, c7);
   3877 								c0 = _mm_avg_epu8(c0, c2);
   3878 								c4 = _mm_avg_epu8(c4, c6);
   3879 								c0 = _mm_avg_epu8(c0, c4);
   3880 
   3881 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   3882 							}
   3883 
   3884 							source0 += pitch;
   3885 							source1 += pitch;
   3886 							source2 += pitch;
   3887 							source3 += pitch;
   3888 							source4 += pitch;
   3889 							source5 += pitch;
   3890 							source6 += pitch;
   3891 							source7 += pitch;
   3892 						}
   3893 					}
   3894 					else if(internal.depth == 16)
   3895 					{
   3896 						for(int y = 0; y < height; y++)
   3897 						{
   3898 							for(int x = 0; x < width; x += 4)
   3899 							{
   3900 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   3901 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   3902 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   3903 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   3904 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
   3905 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
   3906 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
   3907 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
   3908 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
   3909 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
   3910 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
   3911 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
   3912 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
   3913 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
   3914 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
   3915 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
   3916 
   3917 								c0 = _mm_avg_epu8(c0, c1);
   3918 								c2 = _mm_avg_epu8(c2, c3);
   3919 								c4 = _mm_avg_epu8(c4, c5);
   3920 								c6 = _mm_avg_epu8(c6, c7);
   3921 								c8 = _mm_avg_epu8(c8, c9);
   3922 								cA = _mm_avg_epu8(cA, cB);
   3923 								cC = _mm_avg_epu8(cC, cD);
   3924 								cE = _mm_avg_epu8(cE, cF);
   3925 								c0 = _mm_avg_epu8(c0, c2);
   3926 								c4 = _mm_avg_epu8(c4, c6);
   3927 								c8 = _mm_avg_epu8(c8, cA);
   3928 								cC = _mm_avg_epu8(cC, cE);
   3929 								c0 = _mm_avg_epu8(c0, c4);
   3930 								c8 = _mm_avg_epu8(c8, cC);
   3931 								c0 = _mm_avg_epu8(c0, c8);
   3932 
   3933 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   3934 							}
   3935 
   3936 							source0 += pitch;
   3937 							source1 += pitch;
   3938 							source2 += pitch;
   3939 							source3 += pitch;
   3940 							source4 += pitch;
   3941 							source5 += pitch;
   3942 							source6 += pitch;
   3943 							source7 += pitch;
   3944 							source8 += pitch;
   3945 							source9 += pitch;
   3946 							sourceA += pitch;
   3947 							sourceB += pitch;
   3948 							sourceC += pitch;
   3949 							sourceD += pitch;
   3950 							sourceE += pitch;
   3951 							sourceF += pitch;
   3952 						}
   3953 					}
   3954 					else ASSERT(false);
   3955 				}
   3956 				else
   3957 			#endif
   3958 			{
   3959 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
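				// AVERAGE computes a rounded per-byte average of two packed 8-bit pixels
				// without SSE: (x & y) + (((x ^ y) >> 1) & 0x7F7F7F7F) is the per-byte floor
				// average (the mask keeps shifted bits from leaking between byte lanes), and
				// adding ((x ^ y) & 0x01010101) rounds halves up, matching _mm_avg_epu8 on
				// the SSE2 path.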
   3960 
   3961 				if(internal.depth == 2)
   3962 				{
   3963 					for(int y = 0; y < height; y++)
   3964 					{
   3965 						for(int x = 0; x < width; x++)
   3966 						{
   3967 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   3968 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   3969 
   3970 							c0 = AVERAGE(c0, c1);
   3971 
   3972 							*(unsigned int*)(source0 + 4 * x) = c0;
   3973 						}
   3974 
   3975 						source0 += pitch;
   3976 						source1 += pitch;
   3977 					}
   3978 				}
   3979 				else if(internal.depth == 4)
   3980 				{
   3981 					for(int y = 0; y < height; y++)
   3982 					{
   3983 						for(int x = 0; x < width; x++)
   3984 						{
   3985 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   3986 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   3987 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   3988 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   3989 
   3990 							c0 = AVERAGE(c0, c1);
   3991 							c2 = AVERAGE(c2, c3);
   3992 							c0 = AVERAGE(c0, c2);
   3993 
   3994 							*(unsigned int*)(source0 + 4 * x) = c0;
   3995 						}
   3996 
   3997 						source0 += pitch;
   3998 						source1 += pitch;
   3999 						source2 += pitch;
   4000 						source3 += pitch;
   4001 					}
   4002 				}
   4003 				else if(internal.depth == 8)
   4004 				{
   4005 					for(int y = 0; y < height; y++)
   4006 					{
   4007 						for(int x = 0; x < width; x++)
   4008 						{
   4009 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4010 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4011 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4012 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4013 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4014 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4015 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4016 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4017 
   4018 							c0 = AVERAGE(c0, c1);
   4019 							c2 = AVERAGE(c2, c3);
   4020 							c4 = AVERAGE(c4, c5);
   4021 							c6 = AVERAGE(c6, c7);
   4022 							c0 = AVERAGE(c0, c2);
   4023 							c4 = AVERAGE(c4, c6);
   4024 							c0 = AVERAGE(c0, c4);
   4025 
   4026 							*(unsigned int*)(source0 + 4 * x) = c0;
   4027 						}
   4028 
   4029 						source0 += pitch;
   4030 						source1 += pitch;
   4031 						source2 += pitch;
   4032 						source3 += pitch;
   4033 						source4 += pitch;
   4034 						source5 += pitch;
   4035 						source6 += pitch;
   4036 						source7 += pitch;
   4037 					}
   4038 				}
   4039 				else if(internal.depth == 16)
   4040 				{
   4041 					for(int y = 0; y < height; y++)
   4042 					{
   4043 						for(int x = 0; x < width; x++)
   4044 						{
   4045 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4046 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4047 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4048 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4049 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4050 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4051 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4052 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4053 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
   4054 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
   4055 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
   4056 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
   4057 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
   4058 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
   4059 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
   4060 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
   4061 
   4062 							c0 = AVERAGE(c0, c1);
   4063 							c2 = AVERAGE(c2, c3);
   4064 							c4 = AVERAGE(c4, c5);
   4065 							c6 = AVERAGE(c6, c7);
   4066 							c8 = AVERAGE(c8, c9);
   4067 							cA = AVERAGE(cA, cB);
   4068 							cC = AVERAGE(cC, cD);
   4069 							cE = AVERAGE(cE, cF);
   4070 							c0 = AVERAGE(c0, c2);
   4071 							c4 = AVERAGE(c4, c6);
   4072 							c8 = AVERAGE(c8, cA);
   4073 							cC = AVERAGE(cC, cE);
   4074 							c0 = AVERAGE(c0, c4);
   4075 							c8 = AVERAGE(c8, cC);
   4076 							c0 = AVERAGE(c0, c8);
   4077 
   4078 							*(unsigned int*)(source0 + 4 * x) = c0;
   4079 						}
   4080 
   4081 						source0 += pitch;
   4082 						source1 += pitch;
   4083 						source2 += pitch;
   4084 						source3 += pitch;
   4085 						source4 += pitch;
   4086 						source5 += pitch;
   4087 						source6 += pitch;
   4088 						source7 += pitch;
   4089 						source8 += pitch;
   4090 						source9 += pitch;
   4091 						sourceA += pitch;
   4092 						sourceB += pitch;
   4093 						sourceC += pitch;
   4094 						sourceD += pitch;
   4095 						sourceE += pitch;
   4096 						sourceF += pitch;
   4097 					}
   4098 				}
   4099 				else ASSERT(false);
   4100 
   4101 				#undef AVERAGE
   4102 			}
   4103 		}
   4104 		else if(internal.format == FORMAT_G16R16)
   4105 		{
   4106 
   4107 			#if defined(__i386__) || defined(__x86_64__)
   4108 				if(CPUID::supportsSSE2() && (width % 4) == 0)
   4109 				{
   4110 					if(internal.depth == 2)
   4111 					{
   4112 						for(int y = 0; y < height; y++)
   4113 						{
   4114 							for(int x = 0; x < width; x += 4)
   4115 							{
   4116 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4117 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4118 
   4119 								c0 = _mm_avg_epu16(c0, c1);
   4120 
   4121 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4122 							}
   4123 
   4124 							source0 += pitch;
   4125 							source1 += pitch;
   4126 						}
   4127 					}
   4128 					else if(internal.depth == 4)
   4129 					{
   4130 						for(int y = 0; y < height; y++)
   4131 						{
   4132 							for(int x = 0; x < width; x += 4)
   4133 							{
   4134 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4135 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4136 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   4137 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   4138 
   4139 								c0 = _mm_avg_epu16(c0, c1);
   4140 								c2 = _mm_avg_epu16(c2, c3);
   4141 								c0 = _mm_avg_epu16(c0, c2);
   4142 
   4143 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4144 							}
   4145 
   4146 							source0 += pitch;
   4147 							source1 += pitch;
   4148 							source2 += pitch;
   4149 							source3 += pitch;
   4150 						}
   4151 					}
   4152 					else if(internal.depth == 8)
   4153 					{
   4154 						for(int y = 0; y < height; y++)
   4155 						{
   4156 							for(int x = 0; x < width; x += 4)
   4157 							{
   4158 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4159 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4160 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   4161 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   4162 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
   4163 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
   4164 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
   4165 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
   4166 
   4167 								c0 = _mm_avg_epu16(c0, c1);
   4168 								c2 = _mm_avg_epu16(c2, c3);
   4169 								c4 = _mm_avg_epu16(c4, c5);
   4170 								c6 = _mm_avg_epu16(c6, c7);
   4171 								c0 = _mm_avg_epu16(c0, c2);
   4172 								c4 = _mm_avg_epu16(c4, c6);
   4173 								c0 = _mm_avg_epu16(c0, c4);
   4174 
   4175 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4176 							}
   4177 
   4178 							source0 += pitch;
   4179 							source1 += pitch;
   4180 							source2 += pitch;
   4181 							source3 += pitch;
   4182 							source4 += pitch;
   4183 							source5 += pitch;
   4184 							source6 += pitch;
   4185 							source7 += pitch;
   4186 						}
   4187 					}
   4188 					else if(internal.depth == 16)
   4189 					{
   4190 						for(int y = 0; y < height; y++)
   4191 						{
   4192 							for(int x = 0; x < width; x += 4)
   4193 							{
   4194 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4195 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4196 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   4197 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   4198 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
   4199 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
   4200 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
   4201 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
   4202 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
   4203 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
   4204 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
   4205 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
   4206 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
   4207 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
   4208 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
   4209 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
   4210 
   4211 								c0 = _mm_avg_epu16(c0, c1);
   4212 								c2 = _mm_avg_epu16(c2, c3);
   4213 								c4 = _mm_avg_epu16(c4, c5);
   4214 								c6 = _mm_avg_epu16(c6, c7);
   4215 								c8 = _mm_avg_epu16(c8, c9);
   4216 								cA = _mm_avg_epu16(cA, cB);
   4217 								cC = _mm_avg_epu16(cC, cD);
   4218 								cE = _mm_avg_epu16(cE, cF);
   4219 								c0 = _mm_avg_epu16(c0, c2);
   4220 								c4 = _mm_avg_epu16(c4, c6);
   4221 								c8 = _mm_avg_epu16(c8, cA);
   4222 								cC = _mm_avg_epu16(cC, cE);
   4223 								c0 = _mm_avg_epu16(c0, c4);
   4224 								c8 = _mm_avg_epu16(c8, cC);
   4225 								c0 = _mm_avg_epu16(c0, c8);
   4226 
   4227 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4228 							}
   4229 
   4230 							source0 += pitch;
   4231 							source1 += pitch;
   4232 							source2 += pitch;
   4233 							source3 += pitch;
   4234 							source4 += pitch;
   4235 							source5 += pitch;
   4236 							source6 += pitch;
   4237 							source7 += pitch;
   4238 							source8 += pitch;
   4239 							source9 += pitch;
   4240 							sourceA += pitch;
   4241 							sourceB += pitch;
   4242 							sourceC += pitch;
   4243 							sourceD += pitch;
   4244 							sourceE += pitch;
   4245 							sourceF += pitch;
   4246 						}
   4247 					}
   4248 					else ASSERT(false);
   4249 				}
   4250 				else
   4251 			#endif
   4252 			{
   4253 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
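				// Same rounded-average trick as for the 8-bit case above, with 16-bit lane
				// masks so it mirrors _mm_avg_epu16.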
   4254 
   4255 				if(internal.depth == 2)
   4256 				{
   4257 					for(int y = 0; y < height; y++)
   4258 					{
   4259 						for(int x = 0; x < width; x++)
   4260 						{
   4261 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4262 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4263 
   4264 							c0 = AVERAGE(c0, c1);
   4265 
   4266 							*(unsigned int*)(source0 + 4 * x) = c0;
   4267 						}
   4268 
   4269 						source0 += pitch;
   4270 						source1 += pitch;
   4271 					}
   4272 				}
   4273 				else if(internal.depth == 4)
   4274 				{
   4275 					for(int y = 0; y < height; y++)
   4276 					{
   4277 						for(int x = 0; x < width; x++)
   4278 						{
   4279 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4280 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4281 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4282 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4283 
   4284 							c0 = AVERAGE(c0, c1);
   4285 							c2 = AVERAGE(c2, c3);
   4286 							c0 = AVERAGE(c0, c2);
   4287 
   4288 							*(unsigned int*)(source0 + 4 * x) = c0;
   4289 						}
   4290 
   4291 						source0 += pitch;
   4292 						source1 += pitch;
   4293 						source2 += pitch;
   4294 						source3 += pitch;
   4295 					}
   4296 				}
   4297 				else if(internal.depth == 8)
   4298 				{
   4299 					for(int y = 0; y < height; y++)
   4300 					{
   4301 						for(int x = 0; x < width; x++)
   4302 						{
   4303 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4304 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4305 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4306 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4307 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4308 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4309 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4310 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4311 
   4312 							c0 = AVERAGE(c0, c1);
   4313 							c2 = AVERAGE(c2, c3);
   4314 							c4 = AVERAGE(c4, c5);
   4315 							c6 = AVERAGE(c6, c7);
   4316 							c0 = AVERAGE(c0, c2);
   4317 							c4 = AVERAGE(c4, c6);
   4318 							c0 = AVERAGE(c0, c4);
   4319 
   4320 							*(unsigned int*)(source0 + 4 * x) = c0;
   4321 						}
   4322 
   4323 						source0 += pitch;
   4324 						source1 += pitch;
   4325 						source2 += pitch;
   4326 						source3 += pitch;
   4327 						source4 += pitch;
   4328 						source5 += pitch;
   4329 						source6 += pitch;
   4330 						source7 += pitch;
   4331 					}
   4332 				}
   4333 				else if(internal.depth == 16)
   4334 				{
   4335 					for(int y = 0; y < height; y++)
   4336 					{
   4337 						for(int x = 0; x < width; x++)
   4338 						{
   4339 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4340 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4341 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4342 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4343 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4344 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4345 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4346 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4347 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
   4348 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
   4349 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
   4350 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
   4351 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
   4352 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
   4353 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
   4354 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
   4355 
   4356 							c0 = AVERAGE(c0, c1);
   4357 							c2 = AVERAGE(c2, c3);
   4358 							c4 = AVERAGE(c4, c5);
   4359 							c6 = AVERAGE(c6, c7);
   4360 							c8 = AVERAGE(c8, c9);
   4361 							cA = AVERAGE(cA, cB);
   4362 							cC = AVERAGE(cC, cD);
   4363 							cE = AVERAGE(cE, cF);
   4364 							c0 = AVERAGE(c0, c2);
   4365 							c4 = AVERAGE(c4, c6);
   4366 							c8 = AVERAGE(c8, cA);
   4367 							cC = AVERAGE(cC, cE);
   4368 							c0 = AVERAGE(c0, c4);
   4369 							c8 = AVERAGE(c8, cC);
   4370 							c0 = AVERAGE(c0, c8);
   4371 
   4372 							*(unsigned int*)(source0 + 4 * x) = c0;
   4373 						}
   4374 
   4375 						source0 += pitch;
   4376 						source1 += pitch;
   4377 						source2 += pitch;
   4378 						source3 += pitch;
   4379 						source4 += pitch;
   4380 						source5 += pitch;
   4381 						source6 += pitch;
   4382 						source7 += pitch;
   4383 						source8 += pitch;
   4384 						source9 += pitch;
   4385 						sourceA += pitch;
   4386 						sourceB += pitch;
   4387 						sourceC += pitch;
   4388 						sourceD += pitch;
   4389 						sourceE += pitch;
   4390 						sourceF += pitch;
   4391 					}
   4392 				}
   4393 				else ASSERT(false);
   4394 
   4395 				#undef AVERAGE
   4396 			}
   4397 		}
   4398 		else if(internal.format == FORMAT_A16B16G16R16)
   4399 		{
   4400 			#if defined(__i386__) || defined(__x86_64__)
   4401 				if(CPUID::supportsSSE2() && (width % 2) == 0)
   4402 				{
   4403 					if(internal.depth == 2)
   4404 					{
   4405 						for(int y = 0; y < height; y++)
   4406 						{
   4407 							for(int x = 0; x < width; x += 2)
   4408 							{
   4409 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
   4410 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
   4411 
   4412 								c0 = _mm_avg_epu16(c0, c1);
   4413 
   4414 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
   4415 							}
   4416 
   4417 							source0 += pitch;
   4418 							source1 += pitch;
   4419 						}
   4420 					}
   4421 					else if(internal.depth == 4)
   4422 					{
   4423 						for(int y = 0; y < height; y++)
   4424 						{
   4425 							for(int x = 0; x < width; x += 2)
   4426 							{
   4427 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
   4428 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
   4429 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
   4430 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
   4431 
   4432 								c0 = _mm_avg_epu16(c0, c1);
   4433 								c2 = _mm_avg_epu16(c2, c3);
   4434 								c0 = _mm_avg_epu16(c0, c2);
   4435 
   4436 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
   4437 							}
   4438 
   4439 							source0 += pitch;
   4440 							source1 += pitch;
   4441 							source2 += pitch;
   4442 							source3 += pitch;
   4443 						}
   4444 					}
   4445 					else if(internal.depth == 8)
   4446 					{
   4447 						for(int y = 0; y < height; y++)
   4448 						{
   4449 							for(int x = 0; x < width; x += 2)
   4450 							{
   4451 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
   4452 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
   4453 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
   4454 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
   4455 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
   4456 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
   4457 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
   4458 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
   4459 
   4460 								c0 = _mm_avg_epu16(c0, c1);
   4461 								c2 = _mm_avg_epu16(c2, c3);
   4462 								c4 = _mm_avg_epu16(c4, c5);
   4463 								c6 = _mm_avg_epu16(c6, c7);
   4464 								c0 = _mm_avg_epu16(c0, c2);
   4465 								c4 = _mm_avg_epu16(c4, c6);
   4466 								c0 = _mm_avg_epu16(c0, c4);
   4467 
   4468 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
   4469 							}
   4470 
   4471 							source0 += pitch;
   4472 							source1 += pitch;
   4473 							source2 += pitch;
   4474 							source3 += pitch;
   4475 							source4 += pitch;
   4476 							source5 += pitch;
   4477 							source6 += pitch;
   4478 							source7 += pitch;
   4479 						}
   4480 					}
   4481 					else if(internal.depth == 16)
   4482 					{
   4483 						for(int y = 0; y < height; y++)
   4484 						{
   4485 							for(int x = 0; x < width; x += 2)
   4486 							{
   4487 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
   4488 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
   4489 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
   4490 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
   4491 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
   4492 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
   4493 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
   4494 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
   4495 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
   4496 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
   4497 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
   4498 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
   4499 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
   4500 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
   4501 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
   4502 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
   4503 
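								// Combine the sixteen samples as a balanced tree of pairwise averages so each sample
								// carries equal weight; a sequential chain of averages would bias toward the later samples.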
   4504 								c0 = _mm_avg_epu16(c0, c1);
   4505 								c2 = _mm_avg_epu16(c2, c3);
   4506 								c4 = _mm_avg_epu16(c4, c5);
   4507 								c6 = _mm_avg_epu16(c6, c7);
   4508 								c8 = _mm_avg_epu16(c8, c9);
   4509 								cA = _mm_avg_epu16(cA, cB);
   4510 								cC = _mm_avg_epu16(cC, cD);
   4511 								cE = _mm_avg_epu16(cE, cF);
   4512 								c0 = _mm_avg_epu16(c0, c2);
   4513 								c4 = _mm_avg_epu16(c4, c6);
   4514 								c8 = _mm_avg_epu16(c8, cA);
   4515 								cC = _mm_avg_epu16(cC, cE);
   4516 								c0 = _mm_avg_epu16(c0, c4);
   4517 								c8 = _mm_avg_epu16(c8, cC);
   4518 								c0 = _mm_avg_epu16(c0, c8);
   4519 
   4520 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
   4521 							}
   4522 
   4523 							source0 += pitch;
   4524 							source1 += pitch;
   4525 							source2 += pitch;
   4526 							source3 += pitch;
   4527 							source4 += pitch;
   4528 							source5 += pitch;
   4529 							source6 += pitch;
   4530 							source7 += pitch;
   4531 							source8 += pitch;
   4532 							source9 += pitch;
   4533 							sourceA += pitch;
   4534 							sourceB += pitch;
   4535 							sourceC += pitch;
   4536 							sourceD += pitch;
   4537 							sourceE += pitch;
   4538 							sourceF += pitch;
   4539 						}
   4540 					}
   4541 					else ASSERT(false);
   4542 				}
   4543 				else
   4544 			#endif
   4545 			{
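				// Rounding average of two pixels holding a pair of 16-bit channels packed in one 32-bit word:
				// (x & y) + ((x ^ y) >> 1) is the overflow-free floor average, the 0x7FFF7FFF mask keeps the
				// shifted bits from crossing the 16-bit lane boundary, and the 0x00010001 term adds the
				// rounding bit per lane, matching _mm_avg_epu16.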
   4546 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
   4547 
   4548 				if(internal.depth == 2)
   4549 				{
   4550 					for(int y = 0; y < height; y++)
   4551 					{
   4552 						for(int x = 0; x < 2 * width; x++)
   4553 						{
   4554 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4555 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4556 
   4557 							c0 = AVERAGE(c0, c1);
   4558 
   4559 							*(unsigned int*)(source0 + 4 * x) = c0;
   4560 						}
   4561 
   4562 						source0 += pitch;
   4563 						source1 += pitch;
   4564 					}
   4565 				}
   4566 				else if(internal.depth == 4)
   4567 				{
   4568 					for(int y = 0; y < height; y++)
   4569 					{
   4570 						for(int x = 0; x < 2 * width; x++)
   4571 						{
   4572 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4573 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4574 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4575 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4576 
   4577 							c0 = AVERAGE(c0, c1);
   4578 							c2 = AVERAGE(c2, c3);
   4579 							c0 = AVERAGE(c0, c2);
   4580 
   4581 							*(unsigned int*)(source0 + 4 * x) = c0;
   4582 						}
   4583 
   4584 						source0 += pitch;
   4585 						source1 += pitch;
   4586 						source2 += pitch;
   4587 						source3 += pitch;
   4588 					}
   4589 				}
   4590 				else if(internal.depth == 8)
   4591 				{
   4592 					for(int y = 0; y < height; y++)
   4593 					{
   4594 						for(int x = 0; x < 2 * width; x++)
   4595 						{
   4596 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4597 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4598 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4599 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4600 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4601 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4602 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4603 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4604 
   4605 							c0 = AVERAGE(c0, c1);
   4606 							c2 = AVERAGE(c2, c3);
   4607 							c4 = AVERAGE(c4, c5);
   4608 							c6 = AVERAGE(c6, c7);
   4609 							c0 = AVERAGE(c0, c2);
   4610 							c4 = AVERAGE(c4, c6);
   4611 							c0 = AVERAGE(c0, c4);
   4612 
   4613 							*(unsigned int*)(source0 + 4 * x) = c0;
   4614 						}
   4615 
   4616 						source0 += pitch;
   4617 						source1 += pitch;
   4618 						source2 += pitch;
   4619 						source3 += pitch;
   4620 						source4 += pitch;
   4621 						source5 += pitch;
   4622 						source6 += pitch;
   4623 						source7 += pitch;
   4624 					}
   4625 				}
   4626 				else if(internal.depth == 16)
   4627 				{
   4628 					for(int y = 0; y < height; y++)
   4629 					{
   4630 						for(int x = 0; x < 2 * width; x++)
   4631 						{
   4632 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4633 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4634 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4635 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4636 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4637 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4638 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4639 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4640 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
   4641 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
   4642 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
   4643 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
   4644 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
   4645 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
   4646 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
   4647 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
   4648 
   4649 							c0 = AVERAGE(c0, c1);
   4650 							c2 = AVERAGE(c2, c3);
   4651 							c4 = AVERAGE(c4, c5);
   4652 							c6 = AVERAGE(c6, c7);
   4653 							c8 = AVERAGE(c8, c9);
   4654 							cA = AVERAGE(cA, cB);
   4655 							cC = AVERAGE(cC, cD);
   4656 							cE = AVERAGE(cE, cF);
   4657 							c0 = AVERAGE(c0, c2);
   4658 							c4 = AVERAGE(c4, c6);
   4659 							c8 = AVERAGE(c8, cA);
   4660 							cC = AVERAGE(cC, cE);
   4661 							c0 = AVERAGE(c0, c4);
   4662 							c8 = AVERAGE(c8, cC);
   4663 							c0 = AVERAGE(c0, c8);
   4664 
   4665 							*(unsigned int*)(source0 + 4 * x) = c0;
   4666 						}
   4667 
   4668 						source0 += pitch;
   4669 						source1 += pitch;
   4670 						source2 += pitch;
   4671 						source3 += pitch;
   4672 						source4 += pitch;
   4673 						source5 += pitch;
   4674 						source6 += pitch;
   4675 						source7 += pitch;
   4676 						source8 += pitch;
   4677 						source9 += pitch;
   4678 						sourceA += pitch;
   4679 						sourceB += pitch;
   4680 						sourceC += pitch;
   4681 						sourceD += pitch;
   4682 						sourceE += pitch;
   4683 						sourceF += pitch;
   4684 					}
   4685 				}
   4686 				else ASSERT(false);
   4687 
   4688 				#undef AVERAGE
   4689 			}
   4690 		}
   4691 		else if(internal.format == FORMAT_R32F)
   4692 		{
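			// Single-channel float resolve: sum the sample planes pairwise and scale by 1/depth.
			// The SSE path processes four R32F pixels per 128-bit register, hence the width % 4 requirement;
			// the scalar fallback below handles one float component at a time.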
   4693 			#if defined(__i386__) || defined(__x86_64__)
   4694 				if(CPUID::supportsSSE() && (width % 4) == 0)
   4695 				{
   4696 					if(internal.depth == 2)
   4697 					{
   4698 						for(int y = 0; y < height; y++)
   4699 						{
   4700 							for(int x = 0; x < width; x += 4)
   4701 							{
   4702 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
   4703 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
   4704 
   4705 								c0 = _mm_add_ps(c0, c1);
   4706 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
   4707 
   4708 								_mm_store_ps((float*)(source0 + 4 * x), c0);
   4709 							}
   4710 
   4711 							source0 += pitch;
   4712 							source1 += pitch;
   4713 						}
   4714 					}
   4715 					else if(internal.depth == 4)
   4716 					{
   4717 						for(int y = 0; y < height; y++)
   4718 						{
   4719 							for(int x = 0; x < width; x += 4)
   4720 							{
   4721 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
   4722 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
   4723 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
   4724 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
   4725 
   4726 								c0 = _mm_add_ps(c0, c1);
   4727 								c2 = _mm_add_ps(c2, c3);
   4728 								c0 = _mm_add_ps(c0, c2);
   4729 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
   4730 
   4731 								_mm_store_ps((float*)(source0 + 4 * x), c0);
   4732 							}
   4733 
   4734 							source0 += pitch;
   4735 							source1 += pitch;
   4736 							source2 += pitch;
   4737 							source3 += pitch;
   4738 						}
   4739 					}
   4740 					else if(internal.depth == 8)
   4741 					{
   4742 						for(int y = 0; y < height; y++)
   4743 						{
   4744 							for(int x = 0; x < width; x += 4)
   4745 							{
   4746 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
   4747 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
   4748 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
   4749 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
   4750 								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
   4751 								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
   4752 								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
   4753 								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
   4754 
   4755 								c0 = _mm_add_ps(c0, c1);
   4756 								c2 = _mm_add_ps(c2, c3);
   4757 								c4 = _mm_add_ps(c4, c5);
   4758 								c6 = _mm_add_ps(c6, c7);
   4759 								c0 = _mm_add_ps(c0, c2);
   4760 								c4 = _mm_add_ps(c4, c6);
   4761 								c0 = _mm_add_ps(c0, c4);
   4762 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
   4763 
   4764 								_mm_store_ps((float*)(source0 + 4 * x), c0);
   4765 							}
   4766 
   4767 							source0 += pitch;
   4768 							source1 += pitch;
   4769 							source2 += pitch;
   4770 							source3 += pitch;
   4771 							source4 += pitch;
   4772 							source5 += pitch;
   4773 							source6 += pitch;
   4774 							source7 += pitch;
   4775 						}
   4776 					}
   4777 					else if(internal.depth == 16)
   4778 					{
   4779 						for(int y = 0; y < height; y++)
   4780 						{
   4781 							for(int x = 0; x < width; x += 4)
   4782 							{
   4783 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
   4784 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
   4785 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
   4786 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
   4787 								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
   4788 								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
   4789 								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
   4790 								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
   4791 								__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
   4792 								__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
   4793 								__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
   4794 								__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
   4795 								__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
   4796 								__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
   4797 								__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
   4798 								__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
   4799 
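								// Pairwise (tree) summation of the sixteen samples, followed by a single scale by 1/16.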
   4800 								c0 = _mm_add_ps(c0, c1);
   4801 								c2 = _mm_add_ps(c2, c3);
   4802 								c4 = _mm_add_ps(c4, c5);
   4803 								c6 = _mm_add_ps(c6, c7);
   4804 								c8 = _mm_add_ps(c8, c9);
   4805 								cA = _mm_add_ps(cA, cB);
   4806 								cC = _mm_add_ps(cC, cD);
   4807 								cE = _mm_add_ps(cE, cF);
   4808 								c0 = _mm_add_ps(c0, c2);
   4809 								c4 = _mm_add_ps(c4, c6);
   4810 								c8 = _mm_add_ps(c8, cA);
   4811 								cC = _mm_add_ps(cC, cE);
   4812 								c0 = _mm_add_ps(c0, c4);
   4813 								c8 = _mm_add_ps(c8, cC);
   4814 								c0 = _mm_add_ps(c0, c8);
   4815 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
   4816 
   4817 								_mm_store_ps((float*)(source0 + 4 * x), c0);
   4818 							}
   4819 
   4820 							source0 += pitch;
   4821 							source1 += pitch;
   4822 							source2 += pitch;
   4823 							source3 += pitch;
   4824 							source4 += pitch;
   4825 							source5 += pitch;
   4826 							source6 += pitch;
   4827 							source7 += pitch;
   4828 							source8 += pitch;
   4829 							source9 += pitch;
   4830 							sourceA += pitch;
   4831 							sourceB += pitch;
   4832 							sourceC += pitch;
   4833 							sourceD += pitch;
   4834 							sourceE += pitch;
   4835 							sourceF += pitch;
   4836 						}
   4837 					}
   4838 					else ASSERT(false);
   4839 				}
   4840 				else
   4841 			#endif
   4842 			{
   4843 				if(internal.depth == 2)
   4844 				{
   4845 					for(int y = 0; y < height; y++)
   4846 					{
   4847 						for(int x = 0; x < width; x++)
   4848 						{
   4849 							float c0 = *(float*)(source0 + 4 * x);
   4850 							float c1 = *(float*)(source1 + 4 * x);
   4851 
   4852 							c0 = c0 + c1;
   4853 							c0 *= 1.0f / 2.0f;
   4854 
   4855 							*(float*)(source0 + 4 * x) = c0;
   4856 						}
   4857 
   4858 						source0 += pitch;
   4859 						source1 += pitch;
   4860 					}
   4861 				}
   4862 				else if(internal.depth == 4)
   4863 				{
   4864 					for(int y = 0; y < height; y++)
   4865 					{
   4866 						for(int x = 0; x < width; x++)
   4867 						{
   4868 							float c0 = *(float*)(source0 + 4 * x);
   4869 							float c1 = *(float*)(source1 + 4 * x);
   4870 							float c2 = *(float*)(source2 + 4 * x);
   4871 							float c3 = *(float*)(source3 + 4 * x);
   4872 
   4873 							c0 = c0 + c1;
   4874 							c2 = c2 + c3;
   4875 							c0 = c0 + c2;
   4876 							c0 *= 1.0f / 4.0f;
   4877 
   4878 							*(float*)(source0 + 4 * x) = c0;
   4879 						}
   4880 
   4881 						source0 += pitch;
   4882 						source1 += pitch;
   4883 						source2 += pitch;
   4884 						source3 += pitch;
   4885 					}
   4886 				}
   4887 				else if(internal.depth == 8)
   4888 				{
   4889 					for(int y = 0; y < height; y++)
   4890 					{
   4891 						for(int x = 0; x < width; x++)
   4892 						{
   4893 							float c0 = *(float*)(source0 + 4 * x);
   4894 							float c1 = *(float*)(source1 + 4 * x);
   4895 							float c2 = *(float*)(source2 + 4 * x);
   4896 							float c3 = *(float*)(source3 + 4 * x);
   4897 							float c4 = *(float*)(source4 + 4 * x);
   4898 							float c5 = *(float*)(source5 + 4 * x);
   4899 							float c6 = *(float*)(source6 + 4 * x);
   4900 							float c7 = *(float*)(source7 + 4 * x);
   4901 
   4902 							c0 = c0 + c1;
   4903 							c2 = c2 + c3;
   4904 							c4 = c4 + c5;
   4905 							c6 = c6 + c7;
   4906 							c0 = c0 + c2;
   4907 							c4 = c4 + c6;
   4908 							c0 = c0 + c4;
   4909 							c0 *= 1.0f / 8.0f;
   4910 
   4911 							*(float*)(source0 + 4 * x) = c0;
   4912 						}
   4913 
   4914 						source0 += pitch;
   4915 						source1 += pitch;
   4916 						source2 += pitch;
   4917 						source3 += pitch;
   4918 						source4 += pitch;
   4919 						source5 += pitch;
   4920 						source6 += pitch;
   4921 						source7 += pitch;
   4922 					}
   4923 				}
   4924 				else if(internal.depth == 16)
   4925 				{
   4926 					for(int y = 0; y < height; y++)
   4927 					{
   4928 						for(int x = 0; x < width; x++)
   4929 						{
   4930 							float c0 = *(float*)(source0 + 4 * x);
   4931 							float c1 = *(float*)(source1 + 4 * x);
   4932 							float c2 = *(float*)(source2 + 4 * x);
   4933 							float c3 = *(float*)(source3 + 4 * x);
   4934 							float c4 = *(float*)(source4 + 4 * x);
   4935 							float c5 = *(float*)(source5 + 4 * x);
   4936 							float c6 = *(float*)(source6 + 4 * x);
   4937 							float c7 = *(float*)(source7 + 4 * x);
   4938 							float c8 = *(float*)(source8 + 4 * x);
   4939 							float c9 = *(float*)(source9 + 4 * x);
   4940 							float cA = *(float*)(sourceA + 4 * x);
   4941 							float cB = *(float*)(sourceB + 4 * x);
   4942 							float cC = *(float*)(sourceC + 4 * x);
   4943 							float cD = *(float*)(sourceD + 4 * x);
   4944 							float cE = *(float*)(sourceE + 4 * x);
   4945 							float cF = *(float*)(sourceF + 4 * x);
   4946 
   4947 							c0 = c0 + c1;
   4948 							c2 = c2 + c3;
   4949 							c4 = c4 + c5;
   4950 							c6 = c6 + c7;
   4951 							c8 = c8 + c9;
   4952 							cA = cA + cB;
   4953 							cC = cC + cD;
   4954 							cE = cE + cF;
   4955 							c0 = c0 + c2;
   4956 							c4 = c4 + c6;
   4957 							c8 = c8 + cA;
   4958 							cC = cC + cE;
   4959 							c0 = c0 + c4;
   4960 							c8 = c8 + cC;
   4961 							c0 = c0 + c8;
   4962 							c0 *= 1.0f / 16.0f;
   4963 
   4964 							*(float*)(source0 + 4 * x) = c0;
   4965 						}
   4966 
   4967 						source0 += pitch;
   4968 						source1 += pitch;
   4969 						source2 += pitch;
   4970 						source3 += pitch;
   4971 						source4 += pitch;
   4972 						source5 += pitch;
   4973 						source6 += pitch;
   4974 						source7 += pitch;
   4975 						source8 += pitch;
   4976 						source9 += pitch;
   4977 						sourceA += pitch;
   4978 						sourceB += pitch;
   4979 						sourceC += pitch;
   4980 						sourceD += pitch;
   4981 						sourceE += pitch;
   4982 						sourceF += pitch;
   4983 					}
   4984 				}
   4985 				else ASSERT(false);
   4986 			}
   4987 		}
   4988 		else if(internal.format == FORMAT_G32R32F)
   4989 		{
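			// Two-channel float resolve: each pixel holds two floats (8 bytes), so one 128-bit load covers
			// two pixels and the SSE path requires an even width. The scalar fallback iterates over
			// 2 * width float components per row.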
   4990 			#if defined(__i386__) || defined(__x86_64__)
   4991 				if(CPUID::supportsSSE() && (width % 2) == 0)
   4992 				{
   4993 					if(internal.depth == 2)
   4994 					{
   4995 						for(int y = 0; y < height; y++)
   4996 						{
   4997 							for(int x = 0; x < width; x += 2)
   4998 							{
   4999 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
   5000 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
   5001 
   5002 								c0 = _mm_add_ps(c0, c1);
   5003 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
   5004 
   5005 								_mm_store_ps((float*)(source0 + 8 * x), c0);
   5006 							}
   5007 
   5008 							source0 += pitch;
   5009 							source1 += pitch;
   5010 						}
   5011 					}
   5012 					else if(internal.depth == 4)
   5013 					{
   5014 						for(int y = 0; y < height; y++)
   5015 						{
   5016 							for(int x = 0; x < width; x += 2)
   5017 							{
   5018 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
   5019 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
   5020 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
   5021 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
   5022 
   5023 								c0 = _mm_add_ps(c0, c1);
   5024 								c2 = _mm_add_ps(c2, c3);
   5025 								c0 = _mm_add_ps(c0, c2);
   5026 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
   5027 
   5028 								_mm_store_ps((float*)(source0 + 8 * x), c0);
   5029 							}
   5030 
   5031 							source0 += pitch;
   5032 							source1 += pitch;
   5033 							source2 += pitch;
   5034 							source3 += pitch;
   5035 						}
   5036 					}
   5037 					else if(internal.depth == 8)
   5038 					{
   5039 						for(int y = 0; y < height; y++)
   5040 						{
   5041 							for(int x = 0; x < width; x += 2)
   5042 							{
   5043 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
   5044 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
   5045 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
   5046 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
   5047 								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
   5048 								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
   5049 								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
   5050 								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
   5051 
   5052 								c0 = _mm_add_ps(c0, c1);
   5053 								c2 = _mm_add_ps(c2, c3);
   5054 								c4 = _mm_add_ps(c4, c5);
   5055 								c6 = _mm_add_ps(c6, c7);
   5056 								c0 = _mm_add_ps(c0, c2);
   5057 								c4 = _mm_add_ps(c4, c6);
   5058 								c0 = _mm_add_ps(c0, c4);
   5059 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
   5060 
   5061 								_mm_store_ps((float*)(source0 + 8 * x), c0);
   5062 							}
   5063 
   5064 							source0 += pitch;
   5065 							source1 += pitch;
   5066 							source2 += pitch;
   5067 							source3 += pitch;
   5068 							source4 += pitch;
   5069 							source5 += pitch;
   5070 							source6 += pitch;
   5071 							source7 += pitch;
   5072 						}
   5073 					}
   5074 					else if(internal.depth == 16)
   5075 					{
   5076 						for(int y = 0; y < height; y++)
   5077 						{
   5078 							for(int x = 0; x < width; x += 2)
   5079 							{
   5080 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
   5081 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
   5082 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
   5083 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
   5084 								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
   5085 								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
   5086 								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
   5087 								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
   5088 								__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
   5089 								__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
   5090 								__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
   5091 								__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
   5092 								__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
   5093 								__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
   5094 								__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
   5095 								__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
   5096 
   5097 								c0 = _mm_add_ps(c0, c1);
   5098 								c2 = _mm_add_ps(c2, c3);
   5099 								c4 = _mm_add_ps(c4, c5);
   5100 								c6 = _mm_add_ps(c6, c7);
   5101 								c8 = _mm_add_ps(c8, c9);
   5102 								cA = _mm_add_ps(cA, cB);
   5103 								cC = _mm_add_ps(cC, cD);
   5104 								cE = _mm_add_ps(cE, cF);
   5105 								c0 = _mm_add_ps(c0, c2);
   5106 								c4 = _mm_add_ps(c4, c6);
   5107 								c8 = _mm_add_ps(c8, cA);
   5108 								cC = _mm_add_ps(cC, cE);
   5109 								c0 = _mm_add_ps(c0, c4);
   5110 								c8 = _mm_add_ps(c8, cC);
   5111 								c0 = _mm_add_ps(c0, c8);
   5112 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
   5113 
   5114 								_mm_store_ps((float*)(source0 + 8 * x), c0);
   5115 							}
   5116 
   5117 							source0 += pitch;
   5118 							source1 += pitch;
   5119 							source2 += pitch;
   5120 							source3 += pitch;
   5121 							source4 += pitch;
   5122 							source5 += pitch;
   5123 							source6 += pitch;
   5124 							source7 += pitch;
   5125 							source8 += pitch;
   5126 							source9 += pitch;
   5127 							sourceA += pitch;
   5128 							sourceB += pitch;
   5129 							sourceC += pitch;
   5130 							sourceD += pitch;
   5131 							sourceE += pitch;
   5132 							sourceF += pitch;
   5133 						}
   5134 					}
   5135 					else ASSERT(false);
   5136 				}
   5137 				else
   5138 			#endif
   5139 			{
   5140 				if(internal.depth == 2)
   5141 				{
   5142 					for(int y = 0; y < height; y++)
   5143 					{
   5144 						for(int x = 0; x < 2 * width; x++)
   5145 						{
   5146 							float c0 = *(float*)(source0 + 4 * x);
   5147 							float c1 = *(float*)(source1 + 4 * x);
   5148 
   5149 							c0 = c0 + c1;
   5150 							c0 *= 1.0f / 2.0f;
   5151 
   5152 							*(float*)(source0 + 4 * x) = c0;
   5153 						}
   5154 
   5155 						source0 += pitch;
   5156 						source1 += pitch;
   5157 					}
   5158 				}
   5159 				else if(internal.depth == 4)
   5160 				{
   5161 					for(int y = 0; y < height; y++)
   5162 					{
   5163 						for(int x = 0; x < 2 * width; x++)
   5164 						{
   5165 							float c0 = *(float*)(source0 + 4 * x);
   5166 							float c1 = *(float*)(source1 + 4 * x);
   5167 							float c2 = *(float*)(source2 + 4 * x);
   5168 							float c3 = *(float*)(source3 + 4 * x);
   5169 
   5170 							c0 = c0 + c1;
   5171 							c2 = c2 + c3;
   5172 							c0 = c0 + c2;
   5173 							c0 *= 1.0f / 4.0f;
   5174 
   5175 							*(float*)(source0 + 4 * x) = c0;
   5176 						}
   5177 
   5178 						source0 += pitch;
   5179 						source1 += pitch;
   5180 						source2 += pitch;
   5181 						source3 += pitch;
   5182 					}
   5183 				}
   5184 				else if(internal.depth == 8)
   5185 				{
   5186 					for(int y = 0; y < height; y++)
   5187 					{
   5188 						for(int x = 0; x < 2 * width; x++)
   5189 						{
   5190 							float c0 = *(float*)(source0 + 4 * x);
   5191 							float c1 = *(float*)(source1 + 4 * x);
   5192 							float c2 = *(float*)(source2 + 4 * x);
   5193 							float c3 = *(float*)(source3 + 4 * x);
   5194 							float c4 = *(float*)(source4 + 4 * x);
   5195 							float c5 = *(float*)(source5 + 4 * x);
   5196 							float c6 = *(float*)(source6 + 4 * x);
   5197 							float c7 = *(float*)(source7 + 4 * x);
   5198 
   5199 							c0 = c0 + c1;
   5200 							c2 = c2 + c3;
   5201 							c4 = c4 + c5;
   5202 							c6 = c6 + c7;
   5203 							c0 = c0 + c2;
   5204 							c4 = c4 + c6;
   5205 							c0 = c0 + c4;
   5206 							c0 *= 1.0f / 8.0f;
   5207 
   5208 							*(float*)(source0 + 4 * x) = c0;
   5209 						}
   5210 
   5211 						source0 += pitch;
   5212 						source1 += pitch;
   5213 						source2 += pitch;
   5214 						source3 += pitch;
   5215 						source4 += pitch;
   5216 						source5 += pitch;
   5217 						source6 += pitch;
   5218 						source7 += pitch;
   5219 					}
   5220 				}
   5221 				else if(internal.depth == 16)
   5222 				{
   5223 					for(int y = 0; y < height; y++)
   5224 					{
   5225 						for(int x = 0; x < 2 * width; x++)
   5226 						{
   5227 							float c0 = *(float*)(source0 + 4 * x);
   5228 							float c1 = *(float*)(source1 + 4 * x);
   5229 							float c2 = *(float*)(source2 + 4 * x);
   5230 							float c3 = *(float*)(source3 + 4 * x);
   5231 							float c4 = *(float*)(source4 + 4 * x);
   5232 							float c5 = *(float*)(source5 + 4 * x);
   5233 							float c6 = *(float*)(source6 + 4 * x);
   5234 							float c7 = *(float*)(source7 + 4 * x);
   5235 							float c8 = *(float*)(source8 + 4 * x);
   5236 							float c9 = *(float*)(source9 + 4 * x);
   5237 							float cA = *(float*)(sourceA + 4 * x);
   5238 							float cB = *(float*)(sourceB + 4 * x);
   5239 							float cC = *(float*)(sourceC + 4 * x);
   5240 							float cD = *(float*)(sourceD + 4 * x);
   5241 							float cE = *(float*)(sourceE + 4 * x);
   5242 							float cF = *(float*)(sourceF + 4 * x);
   5243 
   5244 							c0 = c0 + c1;
   5245 							c2 = c2 + c3;
   5246 							c4 = c4 + c5;
   5247 							c6 = c6 + c7;
   5248 							c8 = c8 + c9;
   5249 							cA = cA + cB;
   5250 							cC = cC + cD;
   5251 							cE = cE + cF;
   5252 							c0 = c0 + c2;
   5253 							c4 = c4 + c6;
   5254 							c8 = c8 + cA;
   5255 							cC = cC + cE;
   5256 							c0 = c0 + c4;
   5257 							c8 = c8 + cC;
   5258 							c0 = c0 + c8;
   5259 							c0 *= 1.0f / 16.0f;
   5260 
   5261 							*(float*)(source0 + 4 * x) = c0;
   5262 						}
   5263 
   5264 						source0 += pitch;
   5265 						source1 += pitch;
   5266 						source2 += pitch;
   5267 						source3 += pitch;
   5268 						source4 += pitch;
   5269 						source5 += pitch;
   5270 						source6 += pitch;
   5271 						source7 += pitch;
   5272 						source8 += pitch;
   5273 						source9 += pitch;
   5274 						sourceA += pitch;
   5275 						sourceB += pitch;
   5276 						sourceC += pitch;
   5277 						sourceD += pitch;
   5278 						sourceE += pitch;
   5279 						sourceF += pitch;
   5280 					}
   5281 				}
   5282 				else ASSERT(false);
   5283 			}
   5284 		}
   5285 		else if(internal.format == FORMAT_A32B32G32R32F || internal.format == FORMAT_X32B32G32R32F)
   5286 		{
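			// Four-channel float resolve: each pixel is exactly one 128-bit register (16 bytes), so the SSE
			// path needs no width restriction. The scalar fallback iterates over 4 * width float components per row.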
   5287 			#if defined(__i386__) || defined(__x86_64__)
   5288 				if(CPUID::supportsSSE())
   5289 				{
   5290 					if(internal.depth == 2)
   5291 					{
   5292 						for(int y = 0; y < height; y++)
   5293 						{
   5294 							for(int x = 0; x < width; x++)
   5295 							{
   5296 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
   5297 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
   5298 
   5299 								c0 = _mm_add_ps(c0, c1);
   5300 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
   5301 
   5302 								_mm_store_ps((float*)(source0 + 16 * x), c0);
   5303 							}
   5304 
   5305 							source0 += pitch;
   5306 							source1 += pitch;
   5307 						}
   5308 					}
   5309 					else if(internal.depth == 4)
   5310 					{
   5311 						for(int y = 0; y < height; y++)
   5312 						{
   5313 							for(int x = 0; x < width; x++)
   5314 							{
   5315 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
   5316 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
   5317 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
   5318 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
   5319 
   5320 								c0 = _mm_add_ps(c0, c1);
   5321 								c2 = _mm_add_ps(c2, c3);
   5322 								c0 = _mm_add_ps(c0, c2);
   5323 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
   5324 
   5325 								_mm_store_ps((float*)(source0 + 16 * x), c0);
   5326 							}
   5327 
   5328 							source0 += pitch;
   5329 							source1 += pitch;
   5330 							source2 += pitch;
   5331 							source3 += pitch;
   5332 						}
   5333 					}
   5334 					else if(internal.depth == 8)
   5335 					{
   5336 						for(int y = 0; y < height; y++)
   5337 						{
   5338 							for(int x = 0; x < width; x++)
   5339 							{
   5340 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
   5341 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
   5342 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
   5343 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
   5344 								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
   5345 								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
   5346 								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
   5347 								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
   5348 
   5349 								c0 = _mm_add_ps(c0, c1);
   5350 								c2 = _mm_add_ps(c2, c3);
   5351 								c4 = _mm_add_ps(c4, c5);
   5352 								c6 = _mm_add_ps(c6, c7);
   5353 								c0 = _mm_add_ps(c0, c2);
   5354 								c4 = _mm_add_ps(c4, c6);
   5355 								c0 = _mm_add_ps(c0, c4);
   5356 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
   5357 
   5358 								_mm_store_ps((float*)(source0 + 16 * x), c0);
   5359 							}
   5360 
   5361 							source0 += pitch;
   5362 							source1 += pitch;
   5363 							source2 += pitch;
   5364 							source3 += pitch;
   5365 							source4 += pitch;
   5366 							source5 += pitch;
   5367 							source6 += pitch;
   5368 							source7 += pitch;
   5369 						}
   5370 					}
   5371 					else if(internal.depth == 16)
   5372 					{
   5373 						for(int y = 0; y < height; y++)
   5374 						{
   5375 							for(int x = 0; x < width; x++)
   5376 							{
   5377 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
   5378 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
   5379 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
   5380 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
   5381 								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
   5382 								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
   5383 								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
   5384 								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
   5385 								__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
   5386 								__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
   5387 								__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
   5388 								__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
   5389 								__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
   5390 								__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
   5391 								__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
   5392 								__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
   5393 
   5394 								c0 = _mm_add_ps(c0, c1);
   5395 								c2 = _mm_add_ps(c2, c3);
   5396 								c4 = _mm_add_ps(c4, c5);
   5397 								c6 = _mm_add_ps(c6, c7);
   5398 								c8 = _mm_add_ps(c8, c9);
   5399 								cA = _mm_add_ps(cA, cB);
   5400 								cC = _mm_add_ps(cC, cD);
   5401 								cE = _mm_add_ps(cE, cF);
   5402 								c0 = _mm_add_ps(c0, c2);
   5403 								c4 = _mm_add_ps(c4, c6);
   5404 								c8 = _mm_add_ps(c8, cA);
   5405 								cC = _mm_add_ps(cC, cE);
   5406 								c0 = _mm_add_ps(c0, c4);
   5407 								c8 = _mm_add_ps(c8, cC);
   5408 								c0 = _mm_add_ps(c0, c8);
   5409 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
   5410 
   5411 								_mm_store_ps((float*)(source0 + 16 * x), c0);
   5412 							}
   5413 
   5414 							source0 += pitch;
   5415 							source1 += pitch;
   5416 							source2 += pitch;
   5417 							source3 += pitch;
   5418 							source4 += pitch;
   5419 							source5 += pitch;
   5420 							source6 += pitch;
   5421 							source7 += pitch;
   5422 							source8 += pitch;
   5423 							source9 += pitch;
   5424 							sourceA += pitch;
   5425 							sourceB += pitch;
   5426 							sourceC += pitch;
   5427 							sourceD += pitch;
   5428 							sourceE += pitch;
   5429 							sourceF += pitch;
   5430 						}
   5431 					}
   5432 					else ASSERT(false);
   5433 				}
   5434 				else
   5435 			#endif
   5436 			{
   5437 				if(internal.depth == 2)
   5438 				{
   5439 					for(int y = 0; y < height; y++)
   5440 					{
   5441 						for(int x = 0; x < 4 * width; x++)
   5442 						{
   5443 							float c0 = *(float*)(source0 + 4 * x);
   5444 							float c1 = *(float*)(source1 + 4 * x);
   5445 
   5446 							c0 = c0 + c1;
   5447 							c0 *= 1.0f / 2.0f;
   5448 
   5449 							*(float*)(source0 + 4 * x) = c0;
   5450 						}
   5451 
   5452 						source0 += pitch;
   5453 						source1 += pitch;
   5454 					}
   5455 				}
   5456 				else if(internal.depth == 4)
   5457 				{
   5458 					for(int y = 0; y < height; y++)
   5459 					{
   5460 						for(int x = 0; x < 4 * width; x++)
   5461 						{
   5462 							float c0 = *(float*)(source0 + 4 * x);
   5463 							float c1 = *(float*)(source1 + 4 * x);
   5464 							float c2 = *(float*)(source2 + 4 * x);
   5465 							float c3 = *(float*)(source3 + 4 * x);
   5466 
   5467 							c0 = c0 + c1;
   5468 							c2 = c2 + c3;
   5469 							c0 = c0 + c2;
   5470 							c0 *= 1.0f / 4.0f;
   5471 
   5472 							*(float*)(source0 + 4 * x) = c0;
   5473 						}
   5474 
   5475 						source0 += pitch;
   5476 						source1 += pitch;
   5477 						source2 += pitch;
   5478 						source3 += pitch;
   5479 					}
   5480 				}
   5481 				else if(internal.depth == 8)
   5482 				{
   5483 					for(int y = 0; y < height; y++)
   5484 					{
   5485 						for(int x = 0; x < 4 * width; x++)
   5486 						{
   5487 							float c0 = *(float*)(source0 + 4 * x);
   5488 							float c1 = *(float*)(source1 + 4 * x);
   5489 							float c2 = *(float*)(source2 + 4 * x);
   5490 							float c3 = *(float*)(source3 + 4 * x);
   5491 							float c4 = *(float*)(source4 + 4 * x);
   5492 							float c5 = *(float*)(source5 + 4 * x);
   5493 							float c6 = *(float*)(source6 + 4 * x);
   5494 							float c7 = *(float*)(source7 + 4 * x);
   5495 
   5496 							c0 = c0 + c1;
   5497 							c2 = c2 + c3;
   5498 							c4 = c4 + c5;
   5499 							c6 = c6 + c7;
   5500 							c0 = c0 + c2;
   5501 							c4 = c4 + c6;
   5502 							c0 = c0 + c4;
   5503 							c0 *= 1.0f / 8.0f;
   5504 
   5505 							*(float*)(source0 + 4 * x) = c0;
   5506 						}
   5507 
   5508 						source0 += pitch;
   5509 						source1 += pitch;
   5510 						source2 += pitch;
   5511 						source3 += pitch;
   5512 						source4 += pitch;
   5513 						source5 += pitch;
   5514 						source6 += pitch;
   5515 						source7 += pitch;
   5516 					}
   5517 				}
   5518 				else if(internal.depth == 16)
   5519 				{
   5520 					for(int y = 0; y < height; y++)
   5521 					{
   5522 						for(int x = 0; x < 4 * width; x++)
   5523 						{
   5524 							float c0 = *(float*)(source0 + 4 * x);
   5525 							float c1 = *(float*)(source1 + 4 * x);
   5526 							float c2 = *(float*)(source2 + 4 * x);
   5527 							float c3 = *(float*)(source3 + 4 * x);
   5528 							float c4 = *(float*)(source4 + 4 * x);
   5529 							float c5 = *(float*)(source5 + 4 * x);
   5530 							float c6 = *(float*)(source6 + 4 * x);
   5531 							float c7 = *(float*)(source7 + 4 * x);
   5532 							float c8 = *(float*)(source8 + 4 * x);
   5533 							float c9 = *(float*)(source9 + 4 * x);
   5534 							float cA = *(float*)(sourceA + 4 * x);
   5535 							float cB = *(float*)(sourceB + 4 * x);
   5536 							float cC = *(float*)(sourceC + 4 * x);
   5537 							float cD = *(float*)(sourceD + 4 * x);
   5538 							float cE = *(float*)(sourceE + 4 * x);
   5539 							float cF = *(float*)(sourceF + 4 * x);
   5540 
   5541 							c0 = c0 + c1;
   5542 							c2 = c2 + c3;
   5543 							c4 = c4 + c5;
   5544 							c6 = c6 + c7;
   5545 							c8 = c8 + c9;
   5546 							cA = cA + cB;
   5547 							cC = cC + cD;
   5548 							cE = cE + cF;
   5549 							c0 = c0 + c2;
   5550 							c4 = c4 + c6;
   5551 							c8 = c8 + cA;
   5552 							cC = cC + cE;
   5553 							c0 = c0 + c4;
   5554 							c8 = c8 + cC;
   5555 							c0 = c0 + c8;
   5556 							c0 *= 1.0f / 16.0f;
   5557 
   5558 							*(float*)(source0 + 4 * x) = c0;
   5559 						}
   5560 
   5561 						source0 += pitch;
   5562 						source1 += pitch;
   5563 						source2 += pitch;
   5564 						source3 += pitch;
   5565 						source4 += pitch;
   5566 						source5 += pitch;
   5567 						source6 += pitch;
   5568 						source7 += pitch;
   5569 						source8 += pitch;
   5570 						source9 += pitch;
   5571 						sourceA += pitch;
   5572 						sourceB += pitch;
   5573 						sourceC += pitch;
   5574 						sourceD += pitch;
   5575 						sourceE += pitch;
   5576 						sourceF += pitch;
   5577 					}
   5578 				}
   5579 				else ASSERT(false);
   5580 			}
   5581 		}
   5582 		else if(internal.format == FORMAT_R5G6B5)
   5583 		{
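			// R5G6B5 resolve: the packed averaging instructions work on byte or word lanes, so the red/blue
			// fields (mask 0xF81F) and the green field (mask 0x07E0) are averaged separately and recombined
			// with OR, which keeps one field from bleeding into its neighbour.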
   5584 			#if defined(__i386__) || defined(__x86_64__)
   5585 				if(CPUID::supportsSSE2() && (width % 8) == 0)
   5586 				{
   5587 					if(internal.depth == 2)
   5588 					{
   5589 						for(int y = 0; y < height; y++)
   5590 						{
   5591 							for(int x = 0; x < width; x += 8)
   5592 							{
   5593 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
   5594 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
   5595 
   5596 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
   5597 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
   5598 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5599 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
   5600 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
   5601 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5602 
   5603 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
   5604 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5605 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
   5606 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5607 								c0 = _mm_or_si128(c0, c1);
   5608 
   5609 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
   5610 							}
   5611 
   5612 							source0 += pitch;
   5613 							source1 += pitch;
   5614 						}
   5615 					}
   5616 					else if(internal.depth == 4)
   5617 					{
   5618 						for(int y = 0; y < height; y++)
   5619 						{
   5620 							for(int x = 0; x < width; x += 8)
   5621 							{
   5622 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
   5623 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
   5624 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
   5625 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
   5626 
   5627 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
   5628 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
   5629 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5630 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
   5631 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
   5632 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5633 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
   5634 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
   5635 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
   5636 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
   5637 
   5638 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
   5639 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
   5640 								c0 = _mm_avg_epu8(c0, c2);
   5641 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5642 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
   5643 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
   5644 								c1 = _mm_avg_epu16(c1, c3);
   5645 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5646 								c0 = _mm_or_si128(c0, c1);
   5647 
   5648 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
   5649 							}
   5650 
   5651 							source0 += pitch;
   5652 							source1 += pitch;
   5653 							source2 += pitch;
   5654 							source3 += pitch;
   5655 						}
   5656 					}
   5657 					else if(internal.depth == 8)
   5658 					{
   5659 						for(int y = 0; y < height; y++)
   5660 						{
   5661 							for(int x = 0; x < width; x += 8)
   5662 							{
   5663 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
   5664 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
   5665 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
   5666 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
   5667 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
   5668 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
   5669 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
   5670 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
   5671 
   5672 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
   5673 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
   5674 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5675 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
   5676 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
   5677 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5678 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
   5679 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
   5680 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
   5681 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
   5682 								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
   5683 								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
   5684 								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
   5685 								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
   5686 								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
   5687 								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
   5688 								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
   5689 								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
   5690 
   5691 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
   5692 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
   5693 								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
   5694 								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
   5695 								c0 = _mm_avg_epu8(c0, c2);
   5696 								c4 = _mm_avg_epu8(c4, c6);
   5697 								c0 = _mm_avg_epu8(c0, c4);
   5698 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5699 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
   5700 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
   5701 								c5 = _mm_avg_epu16(c4__g_, c5__g_);
   5702 								c7 = _mm_avg_epu16(c6__g_, c7__g_);
   5703 								c1 = _mm_avg_epu16(c1, c3);
   5704 								c5 = _mm_avg_epu16(c5, c7);
   5705 								c1 = _mm_avg_epu16(c1, c5);
   5706 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5707 								c0 = _mm_or_si128(c0, c1);
   5708 
   5709 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
   5710 							}
   5711 
   5712 							source0 += pitch;
   5713 							source1 += pitch;
   5714 							source2 += pitch;
   5715 							source3 += pitch;
   5716 							source4 += pitch;
   5717 							source5 += pitch;
   5718 							source6 += pitch;
   5719 							source7 += pitch;
   5720 						}
   5721 					}
   5722 					else if(internal.depth == 16)
   5723 					{
   5724 						for(int y = 0; y < height; y++)
   5725 						{
   5726 							for(int x = 0; x < width; x += 8)
   5727 							{
   5728 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
   5729 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
   5730 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
   5731 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
   5732 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
   5733 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
   5734 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
   5735 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
   5736 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
   5737 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
   5738 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
   5739 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
   5740 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
   5741 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
   5742 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
   5743 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
   5744 
   5745 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
   5746 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
   5747 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5748 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
   5749 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
   5750 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5751 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
   5752 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
   5753 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
   5754 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
   5755 								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
   5756 								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
   5757 								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
   5758 								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
   5759 								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
   5760 								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
   5761 								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
   5762 								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
   5763 								__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
   5764 								__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
   5765 								__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
   5766 								__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
   5767 								__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
   5768 								__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
   5769 								__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
   5770 								__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
   5771 								__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
   5772 								__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
   5773 								__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
   5774 								__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
   5775 								__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
   5776 								__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
   5777 								__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
   5778 								__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
   5779 
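								// The 5-bit red and blue fields each fit within a single byte, so _mm_avg_epu8 is safe
								// for them; the 6-bit green field straddles a byte boundary and must be averaged with
								// _mm_avg_epu16 to preserve the carry between bytes.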
   5780 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
   5781 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
   5782 								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
   5783 								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
   5784 								c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
   5785 								cA = _mm_avg_epu8(cA_r_b, cB_r_b);
   5786 								cC = _mm_avg_epu8(cC_r_b, cD_r_b);
   5787 								cE = _mm_avg_epu8(cE_r_b, cF_r_b);
   5788 								c0 = _mm_avg_epu8(c0, c2);
   5789 								c4 = _mm_avg_epu8(c4, c6);
   5790 								c8 = _mm_avg_epu8(c8, cA);
   5791 								cC = _mm_avg_epu8(cC, cE);
   5792 								c0 = _mm_avg_epu8(c0, c4);
   5793 								c8 = _mm_avg_epu8(c8, cC);
   5794 								c0 = _mm_avg_epu8(c0, c8);
   5795 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5796 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
   5797 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
   5798 								c5 = _mm_avg_epu16(c4__g_, c5__g_);
   5799 								c7 = _mm_avg_epu16(c6__g_, c7__g_);
   5800 								c9 = _mm_avg_epu16(c8__g_, c9__g_);
   5801 								cB = _mm_avg_epu16(cA__g_, cB__g_);
   5802 								cD = _mm_avg_epu16(cC__g_, cD__g_);
   5803 								cF = _mm_avg_epu16(cE__g_, cF__g_);
    5804 								c1 = _mm_avg_epu16(c1, c3);
    5805 								c5 = _mm_avg_epu16(c5, c7);
    5806 								c9 = _mm_avg_epu16(c9, cB);
    5807 								cD = _mm_avg_epu16(cD, cF);
    5808 								c1 = _mm_avg_epu16(c1, c5);
    5809 								c9 = _mm_avg_epu16(c9, cD);
    5810 								c1 = _mm_avg_epu16(c1, c9);
   5811 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5812 								c0 = _mm_or_si128(c0, c1);
   5813 
   5814 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
   5815 							}
   5816 
   5817 							source0 += pitch;
   5818 							source1 += pitch;
   5819 							source2 += pitch;
   5820 							source3 += pitch;
   5821 							source4 += pitch;
   5822 							source5 += pitch;
   5823 							source6 += pitch;
   5824 							source7 += pitch;
   5825 							source8 += pitch;
   5826 							source9 += pitch;
   5827 							sourceA += pitch;
   5828 							sourceB += pitch;
   5829 							sourceC += pitch;
   5830 							sourceD += pitch;
   5831 							sourceE += pitch;
   5832 							sourceF += pitch;
   5833 						}
   5834 					}
   5835 					else ASSERT(false);
   5836 				}
   5837 				else
   5838 			#endif
   5839 			{
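				// AVERAGE is the classic RGB565 rounding average: (x & y) + ((x ^ y) >> 1) masked with 0x7BEF
				// so shifted bits stay inside their own colour field, plus ((x ^ y) & 0x0821) to add the
				// rounding bit at the bottom of each field (bit 0 = blue, bit 5 = green, bit 11 = red).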
   5840 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
   5841 
   5842 				if(internal.depth == 2)
   5843 				{
   5844 					for(int y = 0; y < height; y++)
   5845 					{
   5846 						for(int x = 0; x < width; x++)
   5847 						{
   5848 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
   5849 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
   5850 
   5851 							c0 = AVERAGE(c0, c1);
   5852 
   5853 							*(unsigned short*)(source0 + 2 * x) = c0;
   5854 						}
   5855 
   5856 						source0 += pitch;
   5857 						source1 += pitch;
   5858 					}
   5859 				}
   5860 				else if(internal.depth == 4)
   5861 				{
   5862 					for(int y = 0; y < height; y++)
   5863 					{
   5864 						for(int x = 0; x < width; x++)
   5865 						{
   5866 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
   5867 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
   5868 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
   5869 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
   5870 
   5871 							c0 = AVERAGE(c0, c1);
   5872 							c2 = AVERAGE(c2, c3);
   5873 							c0 = AVERAGE(c0, c2);
   5874 
   5875 							*(unsigned short*)(source0 + 2 * x) = c0;
   5876 						}
   5877 
   5878 						source0 += pitch;
   5879 						source1 += pitch;
   5880 						source2 += pitch;
   5881 						source3 += pitch;
   5882 					}
   5883 				}
   5884 				else if(internal.depth == 8)
   5885 				{
   5886 					for(int y = 0; y < height; y++)
   5887 					{
   5888 						for(int x = 0; x < width; x++)
   5889 						{
   5890 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
   5891 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
   5892 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
   5893 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
   5894 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
   5895 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
   5896 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
   5897 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
   5898 
   5899 							c0 = AVERAGE(c0, c1);
   5900 							c2 = AVERAGE(c2, c3);
   5901 							c4 = AVERAGE(c4, c5);
   5902 							c6 = AVERAGE(c6, c7);
   5903 							c0 = AVERAGE(c0, c2);
   5904 							c4 = AVERAGE(c4, c6);
   5905 							c0 = AVERAGE(c0, c4);
   5906 
   5907 							*(unsigned short*)(source0 + 2 * x) = c0;
   5908 						}
   5909 
   5910 						source0 += pitch;
   5911 						source1 += pitch;
   5912 						source2 += pitch;
   5913 						source3 += pitch;
   5914 						source4 += pitch;
   5915 						source5 += pitch;
   5916 						source6 += pitch;
   5917 						source7 += pitch;
   5918 					}
   5919 				}
   5920 				else if(internal.depth == 16)
   5921 				{
   5922 					for(int y = 0; y < height; y++)
   5923 					{
   5924 						for(int x = 0; x < width; x++)
   5925 						{
   5926 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
   5927 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
   5928 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
   5929 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
   5930 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
   5931 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
   5932 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
   5933 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
   5934 							unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
   5935 							unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
   5936 							unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
   5937 							unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
   5938 							unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
   5939 							unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
   5940 							unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
   5941 							unsigned short cF = *(unsigned short*)(sourceF + 2 * x);
   5942 
   5943 							c0 = AVERAGE(c0, c1);
   5944 							c2 = AVERAGE(c2, c3);
   5945 							c4 = AVERAGE(c4, c5);
   5946 							c6 = AVERAGE(c6, c7);
   5947 							c8 = AVERAGE(c8, c9);
   5948 							cA = AVERAGE(cA, cB);
   5949 							cC = AVERAGE(cC, cD);
   5950 							cE = AVERAGE(cE, cF);
   5951 							c0 = AVERAGE(c0, c2);
   5952 							c4 = AVERAGE(c4, c6);
   5953 							c8 = AVERAGE(c8, cA);
   5954 							cC = AVERAGE(cC, cE);
   5955 							c0 = AVERAGE(c0, c4);
   5956 							c8 = AVERAGE(c8, cC);
   5957 							c0 = AVERAGE(c0, c8);
   5958 
   5959 							*(unsigned short*)(source0 + 2 * x) = c0;
   5960 						}
   5961 
   5962 						source0 += pitch;
   5963 						source1 += pitch;
   5964 						source2 += pitch;
   5965 						source3 += pitch;
   5966 						source4 += pitch;
   5967 						source5 += pitch;
   5968 						source6 += pitch;
   5969 						source7 += pitch;
   5970 						source8 += pitch;
   5971 						source9 += pitch;
   5972 						sourceA += pitch;
   5973 						sourceB += pitch;
   5974 						sourceC += pitch;
   5975 						sourceD += pitch;
   5976 						sourceE += pitch;
   5977 						sourceF += pitch;
   5978 					}
   5979 				}
   5980 				else ASSERT(false);
   5981 
   5982 				#undef AVERAGE
   5983 			}
   5984 		}
   5985 		else
   5986 		{
   5987 		//	UNIMPLEMENTED();
   5988 		}
   5989 	}
   5990 }
   5991