      1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //    http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #include "Surface.hpp"
     16 
     17 #include "Color.hpp"
     18 #include "Context.hpp"
     19 #include "ETC_Decoder.hpp"
     20 #include "Renderer.hpp"
     21 #include "Common/Half.hpp"
     22 #include "Common/Memory.hpp"
     23 #include "Common/CPUID.hpp"
     24 #include "Common/Resource.hpp"
     25 #include "Common/Debug.hpp"
     26 #include "Reactor/Reactor.hpp"
     27 
     28 #include <xmmintrin.h>
     29 #include <emmintrin.h>
     30 
     31 #undef min
     32 #undef max
     33 
     34 namespace sw
     35 {
     36 	extern bool quadLayoutEnabled;
     37 	extern bool complementaryDepthBuffer;
     38 	extern TranscendentalPrecision logPrecision;
     39 
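         	// Shared lookup table for the paletted formats (P8/A8P8). paletteID is assumed to
         	// identify the current palette contents, so lockInternal() can compare it against a
         	// surface's paletteUsed member and detect a conversion made with a stale palette.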
     40 	unsigned int *Surface::palette = 0;
     41 	unsigned int Surface::paletteID = 0;
     42 
     43 	void Rect::clip(int minX, int minY, int maxX, int maxY)
     44 	{
     45 		x0 = clamp(x0, minX, maxX);
     46 		y0 = clamp(y0, minY, maxY);
     47 		x1 = clamp(x1, minX, maxX);
     48 		y1 = clamp(y1, minY, maxY);
     49 	}
     50 
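         	// write() stores a floating-point color into a single texel. The (x, y, z) overloads
         	// compute the texel address from the per-pixel size (bytes), row pitch (pitchB) and
         	// slice pitch (sliceB); the void* overload packs the four channels according to 'format'.
         	// unorm<n>/snorm<n> convert a normalized float to an n-bit unsigned/signed fixed-point
         	// value (e.g. unorm<8>(1.0f) yields 0xFF), while ucast<n>/scast<n> cast to an n-bit
         	// unsigned/signed integer. For example, FORMAT_A8R8G8B8 packs opaque red (1, 0, 0, 1)
         	// as 0xFFFF0000.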
     51 	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
     52 	{
     53 		void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
     54 
     55 		write(element, color);
     56 	}
     57 
     58 	void Surface::Buffer::write(int x, int y, const Color<float> &color)
     59 	{
     60 		void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
     61 
     62 		write(element, color);
     63 	}
     64 
     65 	inline void Surface::Buffer::write(void *element, const Color<float> &color)
     66 	{
     67 		switch(format)
     68 		{
     69 		case FORMAT_A8:
     70 			*(unsigned char*)element = unorm<8>(color.a);
     71 			break;
     72 		case FORMAT_R8I_SNORM:
     73 			*(char*)element = snorm<8>(color.r);
     74 			break;
     75 		case FORMAT_R8:
     76 			*(unsigned char*)element = unorm<8>(color.r);
     77 			break;
     78 		case FORMAT_R8I:
     79 			*(char*)element = scast<8>(color.r);
     80 			break;
     81 		case FORMAT_R8UI:
     82 			*(unsigned char*)element = ucast<8>(color.r);
     83 			break;
     84 		case FORMAT_R16I:
     85 			*(short*)element = scast<16>(color.r);
     86 			break;
     87 		case FORMAT_R16UI:
     88 			*(unsigned short*)element = ucast<16>(color.r);
     89 			break;
     90 		case FORMAT_R32I:
     91 			*(int*)element = static_cast<int>(color.r);
     92 			break;
     93 		case FORMAT_R32UI:
     94 			*(unsigned int*)element = static_cast<unsigned int>(color.r);
     95 			break;
     96 		case FORMAT_R3G3B2:
     97 			*(unsigned char*)element = (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
     98 			break;
     99 		case FORMAT_A8R3G3B2:
    100 			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
    101 			break;
    102 		case FORMAT_X4R4G4B4:
    103 			*(unsigned short*)element = 0xF000 | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
    104 			break;
    105 		case FORMAT_A4R4G4B4:
    106 			*(unsigned short*)element = (unorm<4>(color.a) << 12) | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
    107 			break;
    108 		case FORMAT_R4G4B4A4:
    109 			*(unsigned short*)element = (unorm<4>(color.r) << 12) | (unorm<4>(color.g) << 8) | (unorm<4>(color.b) << 4) | (unorm<4>(color.a) << 0);
    110 			break;
    111 		case FORMAT_R5G6B5:
    112 			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<6>(color.g) << 5) | (unorm<5>(color.b) << 0);
    113 			break;
    114 		case FORMAT_A1R5G5B5:
    115 			*(unsigned short*)element = (unorm<1>(color.a) << 15) | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
    116 			break;
    117 		case FORMAT_R5G5B5A1:
     118 			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<5>(color.g) << 6) | (unorm<5>(color.b) << 1) | (unorm<1>(color.a) << 0);
    119 			break;
    120 		case FORMAT_X1R5G5B5:
    121 			*(unsigned short*)element = 0x8000 | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
    122 			break;
    123 		case FORMAT_A8R8G8B8:
    124 			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
    125 			break;
    126 		case FORMAT_X8R8G8B8:
    127 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
    128 			break;
    129 		case FORMAT_A8B8G8R8I_SNORM:
    130 			*(unsigned int*)element = (static_cast<unsigned int>(snorm<8>(color.a)) << 24) |
    131 			                          (static_cast<unsigned int>(snorm<8>(color.b)) << 16) |
    132 			                          (static_cast<unsigned int>(snorm<8>(color.g)) << 8) |
    133 			                          (static_cast<unsigned int>(snorm<8>(color.r)) << 0);
    134 			break;
    135 		case FORMAT_A8B8G8R8:
    136 		case FORMAT_SRGB8_A8:
    137 			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
    138 			break;
    139 		case FORMAT_A8B8G8R8I:
    140 			*(unsigned int*)element = (static_cast<unsigned int>(scast<8>(color.a)) << 24) |
    141 			                          (static_cast<unsigned int>(scast<8>(color.b)) << 16) |
    142 			                          (static_cast<unsigned int>(scast<8>(color.g)) << 8) |
    143 			                          (static_cast<unsigned int>(scast<8>(color.r)) << 0);
    144 			break;
    145 		case FORMAT_A8B8G8R8UI:
    146 			*(unsigned int*)element = (ucast<8>(color.a) << 24) | (ucast<8>(color.b) << 16) | (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
    147 			break;
    148 		case FORMAT_X8B8G8R8I_SNORM:
    149 			*(unsigned int*)element = 0x7F000000 |
    150 			                          (static_cast<unsigned int>(snorm<8>(color.b)) << 16) |
    151 			                          (static_cast<unsigned int>(snorm<8>(color.g)) << 8) |
    152 			                          (static_cast<unsigned int>(snorm<8>(color.r)) << 0);
    153 			break;
    154 		case FORMAT_X8B8G8R8:
    155 		case FORMAT_SRGB8_X8:
    156 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
    157 			break;
    158 		case FORMAT_X8B8G8R8I:
    159 			*(unsigned int*)element = 0x7F000000 |
    160 			                          (static_cast<unsigned int>(scast<8>(color.b)) << 16) |
    161 			                          (static_cast<unsigned int>(scast<8>(color.g)) << 8) |
     162 			                          (static_cast<unsigned int>(scast<8>(color.r)) << 0);
         			break;
     163 		case FORMAT_X8B8G8R8UI:
    164 			*(unsigned int*)element = 0xFF000000 | (ucast<8>(color.b) << 16) | (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
    165 			break;
    166 		case FORMAT_A2R10G10B10:
    167 			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.r) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.b) << 0);
    168 			break;
    169 		case FORMAT_A2B10G10R10:
    170 			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.b) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.r) << 0);
    171 			break;
    172 		case FORMAT_G8R8I_SNORM:
    173 			*(unsigned short*)element = (static_cast<unsigned short>(snorm<8>(color.g)) << 8) |
    174 			                            (static_cast<unsigned short>(snorm<8>(color.r)) << 0);
    175 			break;
    176 		case FORMAT_G8R8:
    177 			*(unsigned short*)element = (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
    178 			break;
    179 		case FORMAT_G8R8I:
    180 			*(unsigned short*)element = (static_cast<unsigned short>(scast<8>(color.g)) << 8) |
    181 			                            (static_cast<unsigned short>(scast<8>(color.r)) << 0);
    182 			break;
    183 		case FORMAT_G8R8UI:
    184 			*(unsigned short*)element = (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
    185 			break;
    186 		case FORMAT_G16R16:
    187 			*(unsigned int*)element = (unorm<16>(color.g) << 16) | (unorm<16>(color.r) << 0);
    188 			break;
    189 		case FORMAT_G16R16I:
    190 			*(unsigned int*)element = (static_cast<unsigned int>(scast<16>(color.g)) << 16) |
    191 			                          (static_cast<unsigned int>(scast<16>(color.r)) << 0);
    192 			break;
    193 		case FORMAT_G16R16UI:
    194 			*(unsigned int*)element = (ucast<16>(color.g) << 16) | (ucast<16>(color.r) << 0);
    195 			break;
    196 		case FORMAT_G32R32I:
    197 		case FORMAT_G32R32UI:
    198 			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
    199 			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
    200 			break;
    201 		case FORMAT_A16B16G16R16:
    202 			((unsigned short*)element)[0] = unorm<16>(color.r);
    203 			((unsigned short*)element)[1] = unorm<16>(color.g);
    204 			((unsigned short*)element)[2] = unorm<16>(color.b);
    205 			((unsigned short*)element)[3] = unorm<16>(color.a);
    206 			break;
    207 		case FORMAT_A16B16G16R16I:
    208 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(color.r));
    209 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(color.g));
    210 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(color.b));
    211 			((unsigned short*)element)[3] = static_cast<unsigned short>(scast<16>(color.a));
    212 			break;
    213 		case FORMAT_A16B16G16R16UI:
    214 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(color.r));
    215 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(color.g));
    216 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(color.b));
    217 			((unsigned short*)element)[3] = static_cast<unsigned short>(ucast<16>(color.a));
    218 			break;
    219 		case FORMAT_X16B16G16R16I:
    220 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(color.r));
    221 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(color.g));
    222 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(color.b));
    223 			break;
    224 		case FORMAT_X16B16G16R16UI:
    225 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(color.r));
    226 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(color.g));
    227 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(color.b));
    228 			break;
    229 		case FORMAT_A32B32G32R32I:
    230 		case FORMAT_A32B32G32R32UI:
    231 			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
    232 			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
    233 			((unsigned int*)element)[2] = static_cast<unsigned int>(color.b);
    234 			((unsigned int*)element)[3] = static_cast<unsigned int>(color.a);
    235 			break;
    236 		case FORMAT_X32B32G32R32I:
    237 		case FORMAT_X32B32G32R32UI:
    238 			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
    239 			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
    240 			((unsigned int*)element)[2] = static_cast<unsigned int>(color.b);
    241 			break;
    242 		case FORMAT_V8U8:
    243 			*(unsigned short*)element = (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
    244 			break;
    245 		case FORMAT_L6V5U5:
    246 			*(unsigned short*)element = (unorm<6>(color.b) << 10) | (snorm<5>(color.g) << 5) | (snorm<5>(color.r) << 0);
    247 			break;
    248 		case FORMAT_Q8W8V8U8:
    249 			*(unsigned int*)element = (snorm<8>(color.a) << 24) | (snorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
    250 			break;
    251 		case FORMAT_X8L8V8U8:
    252 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
    253 			break;
    254 		case FORMAT_V16U16:
    255 			*(unsigned int*)element = (snorm<16>(color.g) << 16) | (snorm<16>(color.r) << 0);
    256 			break;
    257 		case FORMAT_A2W10V10U10:
    258 			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (snorm<10>(color.b) << 20) | (snorm<10>(color.g) << 10) | (snorm<10>(color.r) << 0);
    259 			break;
    260 		case FORMAT_A16W16V16U16:
    261 			((unsigned short*)element)[0] = snorm<16>(color.r);
    262 			((unsigned short*)element)[1] = snorm<16>(color.g);
    263 			((unsigned short*)element)[2] = snorm<16>(color.b);
    264 			((unsigned short*)element)[3] = unorm<16>(color.a);
    265 			break;
    266 		case FORMAT_Q16W16V16U16:
    267 			((unsigned short*)element)[0] = snorm<16>(color.r);
    268 			((unsigned short*)element)[1] = snorm<16>(color.g);
    269 			((unsigned short*)element)[2] = snorm<16>(color.b);
    270 			((unsigned short*)element)[3] = snorm<16>(color.a);
    271 			break;
    272 		case FORMAT_R8G8B8:
    273 			((unsigned char*)element)[0] = unorm<8>(color.b);
    274 			((unsigned char*)element)[1] = unorm<8>(color.g);
    275 			((unsigned char*)element)[2] = unorm<8>(color.r);
    276 			break;
    277 		case FORMAT_B8G8R8:
    278 			((unsigned char*)element)[0] = unorm<8>(color.r);
    279 			((unsigned char*)element)[1] = unorm<8>(color.g);
    280 			((unsigned char*)element)[2] = unorm<8>(color.b);
    281 			break;
    282 		case FORMAT_R16F:
    283 			*(half*)element = (half)color.r;
    284 			break;
    285 		case FORMAT_A16F:
    286 			*(half*)element = (half)color.a;
    287 			break;
    288 		case FORMAT_G16R16F:
    289 			((half*)element)[0] = (half)color.r;
    290 			((half*)element)[1] = (half)color.g;
    291 			break;
    292 		case FORMAT_B16G16R16F:
    293 			((half*)element)[0] = (half)color.r;
    294 			((half*)element)[1] = (half)color.g;
    295 			((half*)element)[2] = (half)color.b;
    296 			break;
    297 		case FORMAT_A16B16G16R16F:
    298 			((half*)element)[0] = (half)color.r;
    299 			((half*)element)[1] = (half)color.g;
    300 			((half*)element)[2] = (half)color.b;
    301 			((half*)element)[3] = (half)color.a;
    302 			break;
    303 		case FORMAT_A32F:
    304 			*(float*)element = color.a;
    305 			break;
    306 		case FORMAT_R32F:
    307 			*(float*)element = color.r;
    308 			break;
    309 		case FORMAT_G32R32F:
    310 			((float*)element)[0] = color.r;
    311 			((float*)element)[1] = color.g;
    312 			break;
    313 		case FORMAT_X32B32G32R32F:
    314 			((float*)element)[3] = 1.0f;
    315 		case FORMAT_B32G32R32F:
    316 			((float*)element)[0] = color.r;
    317 			((float*)element)[1] = color.g;
    318 			((float*)element)[2] = color.b;
    319 			break;
    320 		case FORMAT_A32B32G32R32F:
    321 			((float*)element)[0] = color.r;
    322 			((float*)element)[1] = color.g;
    323 			((float*)element)[2] = color.b;
    324 			((float*)element)[3] = color.a;
    325 			break;
    326 		case FORMAT_D32F:
    327 		case FORMAT_D32F_LOCKABLE:
    328 		case FORMAT_D32FS8_TEXTURE:
    329 		case FORMAT_D32FS8_SHADOW:
    330 			*((float*)element) = color.r;
    331 			break;
    332 		case FORMAT_D32F_COMPLEMENTARY:
    333 			*((float*)element) = 1 - color.r;
    334 			break;
    335 		case FORMAT_S8:
    336 			*((unsigned char*)element) = unorm<8>(color.r);
    337 			break;
    338 		case FORMAT_L8:
    339 			*(unsigned char*)element = unorm<8>(color.r);
    340 			break;
    341 		case FORMAT_A4L4:
    342 			*(unsigned char*)element = (unorm<4>(color.a) << 4) | (unorm<4>(color.r) << 0);
    343 			break;
    344 		case FORMAT_L16:
    345 			*(unsigned short*)element = unorm<16>(color.r);
    346 			break;
    347 		case FORMAT_A8L8:
    348 			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<8>(color.r) << 0);
    349 			break;
    350 		case FORMAT_L16F:
    351 			*(half*)element = (half)color.r;
    352 			break;
    353 		case FORMAT_A16L16F:
    354 			((half*)element)[0] = (half)color.r;
    355 			((half*)element)[1] = (half)color.a;
    356 			break;
    357 		case FORMAT_L32F:
    358 			*(float*)element = color.r;
    359 			break;
    360 		case FORMAT_A32L32F:
    361 			((float*)element)[0] = color.r;
    362 			((float*)element)[1] = color.a;
    363 			break;
    364 		default:
    365 			ASSERT(false);
    366 		}
    367 	}
    368 
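         	// read() is the inverse of write(): it decodes one texel into normalized float channels,
         	// with missing channels defaulting to r = g = b = 0 and a = 1. For bit-packed formats the
         	// idiom (v & mask) * (1.0f / mask) extracts a field and rescales it to [0, 1] in a single
         	// step, e.g. the 5-bit red field of FORMAT_R5G6B5 is (rgb & 0xF800) * (1.0f / 0xF800).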
    369 	Color<float> Surface::Buffer::read(int x, int y, int z) const
    370 	{
    371 		void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
    372 
    373 		return read(element);
    374 	}
    375 
    376 	Color<float> Surface::Buffer::read(int x, int y) const
    377 	{
    378 		void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
    379 
    380 		return read(element);
    381 	}
    382 
    383 	inline Color<float> Surface::Buffer::read(void *element) const
    384 	{
    385 		float r = 0.0f;
    386 		float g = 0.0f;
    387 		float b = 0.0f;
    388 		float a = 1.0f;
    389 
    390 		switch(format)
    391 		{
    392 		case FORMAT_P8:
    393 			{
    394 				ASSERT(palette);
    395 
    396 				unsigned int abgr = palette[*(unsigned char*)element];
    397 
    398 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
    399 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
    400 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
    401 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
    402 			}
    403 			break;
    404 		case FORMAT_A8P8:
    405 			{
    406 				ASSERT(palette);
    407 
    408 				unsigned int bgr = palette[((unsigned char*)element)[0]];
    409 
    410 				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
    411 				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
    412 				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
    413 				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
    414 			}
    415 			break;
    416 		case FORMAT_A8:
    417 			r = 0;
    418 			g = 0;
    419 			b = 0;
    420 			a = *(unsigned char*)element * (1.0f / 0xFF);
    421 			break;
    422 		case FORMAT_R8I_SNORM:
    423 			r = max((*(signed char*)element) * (1.0f / 0x7F), -1.0f);
    424 			break;
    425 		case FORMAT_R8:
    426 			r = *(unsigned char*)element * (1.0f / 0xFF);
    427 			break;
    428 		case FORMAT_R8I:
    429 			r = *(signed char*)element;
    430 			break;
    431 		case FORMAT_R8UI:
    432 			r = *(unsigned char*)element;
    433 			break;
    434 		case FORMAT_R3G3B2:
    435 			{
    436 				unsigned char rgb = *(unsigned char*)element;
    437 
    438 				r = (rgb & 0xE0) * (1.0f / 0xE0);
    439 				g = (rgb & 0x1C) * (1.0f / 0x1C);
    440 				b = (rgb & 0x03) * (1.0f / 0x03);
    441 			}
    442 			break;
    443 		case FORMAT_A8R3G3B2:
    444 			{
    445 				unsigned short argb = *(unsigned short*)element;
    446 
    447 				a = (argb & 0xFF00) * (1.0f / 0xFF00);
    448 				r = (argb & 0x00E0) * (1.0f / 0x00E0);
    449 				g = (argb & 0x001C) * (1.0f / 0x001C);
    450 				b = (argb & 0x0003) * (1.0f / 0x0003);
    451 			}
    452 			break;
    453 		case FORMAT_X4R4G4B4:
    454 			{
    455 				unsigned short rgb = *(unsigned short*)element;
    456 
    457 				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
    458 				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
    459 				b = (rgb & 0x000F) * (1.0f / 0x000F);
    460 			}
    461 			break;
    462 		case FORMAT_A4R4G4B4:
    463 			{
    464 				unsigned short argb = *(unsigned short*)element;
    465 
    466 				a = (argb & 0xF000) * (1.0f / 0xF000);
    467 				r = (argb & 0x0F00) * (1.0f / 0x0F00);
    468 				g = (argb & 0x00F0) * (1.0f / 0x00F0);
    469 				b = (argb & 0x000F) * (1.0f / 0x000F);
    470 			}
    471 			break;
    472 		case FORMAT_R4G4B4A4:
    473 			{
    474 				unsigned short rgba = *(unsigned short*)element;
    475 
    476 				r = (rgba & 0xF000) * (1.0f / 0xF000);
    477 				g = (rgba & 0x0F00) * (1.0f / 0x0F00);
    478 				b = (rgba & 0x00F0) * (1.0f / 0x00F0);
    479 				a = (rgba & 0x000F) * (1.0f / 0x000F);
    480 			}
    481 			break;
    482 		case FORMAT_R5G6B5:
    483 			{
    484 				unsigned short rgb = *(unsigned short*)element;
    485 
    486 				r = (rgb & 0xF800) * (1.0f / 0xF800);
    487 				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
    488 				b = (rgb & 0x001F) * (1.0f / 0x001F);
    489 			}
    490 			break;
    491 		case FORMAT_A1R5G5B5:
    492 			{
    493 				unsigned short argb = *(unsigned short*)element;
    494 
    495 				a = (argb & 0x8000) * (1.0f / 0x8000);
    496 				r = (argb & 0x7C00) * (1.0f / 0x7C00);
    497 				g = (argb & 0x03E0) * (1.0f / 0x03E0);
    498 				b = (argb & 0x001F) * (1.0f / 0x001F);
    499 			}
    500 			break;
    501 		case FORMAT_R5G5B5A1:
    502 			{
    503 				unsigned short rgba = *(unsigned short*)element;
    504 
    505 				r = (rgba & 0xF800) * (1.0f / 0xF800);
    506 				g = (rgba & 0x07C0) * (1.0f / 0x07C0);
    507 				b = (rgba & 0x003E) * (1.0f / 0x003E);
    508 				a = (rgba & 0x0001) * (1.0f / 0x0001);
    509 			}
    510 			break;
    511 		case FORMAT_X1R5G5B5:
    512 			{
    513 				unsigned short xrgb = *(unsigned short*)element;
    514 
    515 				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
    516 				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
    517 				b = (xrgb & 0x001F) * (1.0f / 0x001F);
    518 			}
    519 			break;
    520 		case FORMAT_A8R8G8B8:
    521 			{
    522 				unsigned int argb = *(unsigned int*)element;
    523 
    524 				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
    525 				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
    526 				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
    527 				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
    528 			}
    529 			break;
    530 		case FORMAT_X8R8G8B8:
    531 			{
    532 				unsigned int xrgb = *(unsigned int*)element;
    533 
    534 				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
    535 				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
    536 				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
    537 			}
    538 			break;
    539 		case FORMAT_A8B8G8R8I_SNORM:
    540 			{
    541 				signed char* abgr = (signed char*)element;
    542 
    543 				r = max(abgr[0] * (1.0f / 0x7F), -1.0f);
    544 				g = max(abgr[1] * (1.0f / 0x7F), -1.0f);
    545 				b = max(abgr[2] * (1.0f / 0x7F), -1.0f);
    546 				a = max(abgr[3] * (1.0f / 0x7F), -1.0f);
    547 			}
    548 			break;
    549 		case FORMAT_A8B8G8R8:
    550 		case FORMAT_SRGB8_A8:
    551 			{
    552 				unsigned int abgr = *(unsigned int*)element;
    553 
    554 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
    555 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
    556 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
    557 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
    558 			}
    559 			break;
    560 		case FORMAT_A8B8G8R8I:
    561 			{
    562 				signed char* abgr = (signed char*)element;
    563 
    564 				r = abgr[0];
    565 				g = abgr[1];
    566 				b = abgr[2];
    567 				a = abgr[3];
    568 			}
    569 			break;
    570 		case FORMAT_A8B8G8R8UI:
    571 			{
    572 				unsigned char* abgr = (unsigned char*)element;
    573 
    574 				r = abgr[0];
    575 				g = abgr[1];
    576 				b = abgr[2];
    577 				a = abgr[3];
    578 			}
    579 			break;
    580 		case FORMAT_X8B8G8R8I_SNORM:
    581 			{
    582 				signed char* bgr = (signed char*)element;
    583 
    584 				r = max(bgr[0] * (1.0f / 0x7F), -1.0f);
    585 				g = max(bgr[1] * (1.0f / 0x7F), -1.0f);
    586 				b = max(bgr[2] * (1.0f / 0x7F), -1.0f);
    587 			}
    588 			break;
    589 		case FORMAT_X8B8G8R8:
    590 		case FORMAT_SRGB8_X8:
    591 			{
    592 				unsigned int xbgr = *(unsigned int*)element;
    593 
    594 				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
    595 				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
    596 				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
    597 			}
    598 			break;
    599 		case FORMAT_X8B8G8R8I:
    600 			{
    601 				signed char* bgr = (signed char*)element;
    602 
    603 				r = bgr[0];
    604 				g = bgr[1];
    605 				b = bgr[2];
    606 			}
    607 			break;
    608 		case FORMAT_X8B8G8R8UI:
    609 			{
    610 				unsigned char* bgr = (unsigned char*)element;
    611 
    612 				r = bgr[0];
    613 				g = bgr[1];
    614 				b = bgr[2];
    615 			}
    616 			break;
    617 		case FORMAT_G8R8I_SNORM:
    618 			{
    619 				signed char* gr = (signed char*)element;
    620 
     621 				r = max(gr[0] * (1.0f / 0x7F), -1.0f);
     622 				g = max(gr[1] * (1.0f / 0x7F), -1.0f);
    623 			}
    624 			break;
    625 		case FORMAT_G8R8:
    626 			{
    627 				unsigned short gr = *(unsigned short*)element;
    628 
    629 				g = (gr & 0xFF00) * (1.0f / 0xFF00);
    630 				r = (gr & 0x00FF) * (1.0f / 0x00FF);
    631 			}
    632 			break;
    633 		case FORMAT_G8R8I:
    634 			{
    635 				signed char* gr = (signed char*)element;
    636 
    637 				r = gr[0];
    638 				g = gr[1];
    639 			}
    640 			break;
    641 		case FORMAT_G8R8UI:
    642 			{
    643 				unsigned char* gr = (unsigned char*)element;
    644 
    645 				r = gr[0];
    646 				g = gr[1];
    647 			}
    648 			break;
    649 		case FORMAT_R16I:
    650 			r = *((short*)element);
    651 			break;
    652 		case FORMAT_R16UI:
    653 			r = *((unsigned short*)element);
    654 			break;
    655 		case FORMAT_G16R16I:
    656 			{
    657 				short* gr = (short*)element;
    658 
    659 				r = gr[0];
    660 				g = gr[1];
    661 			}
    662 			break;
    663 		case FORMAT_G16R16:
    664 			{
    665 				unsigned int gr = *(unsigned int*)element;
    666 
    667 				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
    668 				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
    669 			}
    670 			break;
    671 		case FORMAT_G16R16UI:
    672 			{
    673 				unsigned short* gr = (unsigned short*)element;
    674 
    675 				r = gr[0];
    676 				g = gr[1];
    677 			}
    678 			break;
    679 		case FORMAT_A2R10G10B10:
    680 			{
    681 				unsigned int argb = *(unsigned int*)element;
    682 
    683 				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
    684 				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
    685 				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
    686 				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
    687 			}
    688 			break;
    689 		case FORMAT_A2B10G10R10:
    690 			{
    691 				unsigned int abgr = *(unsigned int*)element;
    692 
    693 				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
    694 				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
    695 				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
    696 				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
    697 			}
    698 			break;
    699 		case FORMAT_A16B16G16R16I:
    700 			{
    701 				short* abgr = (short*)element;
    702 
    703 				r = abgr[0];
    704 				g = abgr[1];
    705 				b = abgr[2];
    706 				a = abgr[3];
    707 			}
    708 			break;
    709 		case FORMAT_A16B16G16R16:
    710 			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
    711 			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
    712 			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
    713 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
    714 			break;
    715 		case FORMAT_A16B16G16R16UI:
    716 			{
    717 				unsigned short* abgr = (unsigned short*)element;
    718 
    719 				r = abgr[0];
    720 				g = abgr[1];
    721 				b = abgr[2];
    722 				a = abgr[3];
    723 			}
    724 			break;
    725 		case FORMAT_X16B16G16R16I:
    726 			{
    727 				short* bgr = (short*)element;
    728 
    729 				r = bgr[0];
    730 				g = bgr[1];
    731 				b = bgr[2];
    732 			}
    733 			break;
    734 		case FORMAT_X16B16G16R16UI:
    735 			{
    736 				unsigned short* bgr = (unsigned short*)element;
    737 
    738 				r = bgr[0];
    739 				g = bgr[1];
    740 				b = bgr[2];
    741 			}
    742 			break;
    743 		case FORMAT_A32B32G32R32I:
    744 			{
    745 				int* abgr = (int*)element;
    746 
    747 				r = static_cast<float>(abgr[0]);
    748 				g = static_cast<float>(abgr[1]);
    749 				b = static_cast<float>(abgr[2]);
    750 				a = static_cast<float>(abgr[3]);
    751 			}
    752 			break;
    753 		case FORMAT_A32B32G32R32UI:
    754 			{
    755 				unsigned int* abgr = (unsigned int*)element;
    756 
    757 				r = static_cast<float>(abgr[0]);
    758 				g = static_cast<float>(abgr[1]);
    759 				b = static_cast<float>(abgr[2]);
    760 				a = static_cast<float>(abgr[3]);
    761 			}
    762 			break;
    763 		case FORMAT_X32B32G32R32I:
    764 			{
    765 				int* bgr = (int*)element;
    766 
    767 				r = static_cast<float>(bgr[0]);
    768 				g = static_cast<float>(bgr[1]);
    769 				b = static_cast<float>(bgr[2]);
    770 			}
    771 			break;
    772 		case FORMAT_X32B32G32R32UI:
    773 			{
    774 				unsigned int* bgr = (unsigned int*)element;
    775 
    776 				r = static_cast<float>(bgr[0]);
    777 				g = static_cast<float>(bgr[1]);
    778 				b = static_cast<float>(bgr[2]);
    779 			}
    780 			break;
    781 		case FORMAT_G32R32I:
    782 			{
    783 				int* gr = (int*)element;
    784 
    785 				r = static_cast<float>(gr[0]);
    786 				g = static_cast<float>(gr[1]);
    787 			}
    788 			break;
    789 		case FORMAT_G32R32UI:
    790 			{
    791 				unsigned int* gr = (unsigned int*)element;
    792 
    793 				r = static_cast<float>(gr[0]);
    794 				g = static_cast<float>(gr[1]);
    795 			}
    796 			break;
    797 		case FORMAT_R32I:
    798 			r = static_cast<float>(*((int*)element));
    799 			break;
    800 		case FORMAT_R32UI:
    801 			r = static_cast<float>(*((unsigned int*)element));
    802 			break;
    803 		case FORMAT_V8U8:
    804 			{
    805 				unsigned short vu = *(unsigned short*)element;
    806 
    807 				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
    808 				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
    809 			}
    810 			break;
    811 		case FORMAT_L6V5U5:
    812 			{
    813 				unsigned short lvu = *(unsigned short*)element;
    814 
    815 				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
    816 				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
    817 				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
    818 			}
    819 			break;
    820 		case FORMAT_Q8W8V8U8:
    821 			{
    822 				unsigned int qwvu = *(unsigned int*)element;
    823 
    824 				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
    825 				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
    826 				b = ((int)(qwvu & 0x00FF0000) << 8)  * (1.0f / 0x7F000000);
    827 				a = ((int)(qwvu & 0xFF000000) << 0)  * (1.0f / 0x7F000000);
    828 			}
    829 			break;
    830 		case FORMAT_X8L8V8U8:
    831 			{
    832 				unsigned int xlvu = *(unsigned int*)element;
    833 
    834 				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
    835 				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
    836 				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
    837 			}
    838 			break;
    839 		case FORMAT_R8G8B8:
    840 			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
    841 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
    842 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
    843 			break;
    844 		case FORMAT_B8G8R8:
    845 			r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
    846 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
    847 			b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
    848 			break;
    849 		case FORMAT_V16U16:
    850 			{
    851 				unsigned int vu = *(unsigned int*)element;
    852 
    853 				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
    854 				g = ((int)(vu & 0xFFFF0000) << 0)  * (1.0f / 0x7FFF0000);
    855 			}
    856 			break;
    857 		case FORMAT_A2W10V10U10:
    858 			{
    859 				unsigned int awvu = *(unsigned int*)element;
    860 
    861 				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
    862 				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
    863 				b = ((int)(awvu & 0x3FF00000) << 2)  * (1.0f / 0x7FC00000);
    864 				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
    865 			}
    866 			break;
    867 		case FORMAT_A16W16V16U16:
    868 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
    869 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
    870 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
    871 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
    872 			break;
    873 		case FORMAT_Q16W16V16U16:
    874 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
    875 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
    876 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
    877 			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
    878 			break;
    879 		case FORMAT_L8:
    880 			r =
    881 			g =
    882 			b = *(unsigned char*)element * (1.0f / 0xFF);
    883 			break;
    884 		case FORMAT_A4L4:
    885 			{
    886 				unsigned char al = *(unsigned char*)element;
    887 
    888 				r =
    889 				g =
    890 				b = (al & 0x0F) * (1.0f / 0x0F);
    891 				a = (al & 0xF0) * (1.0f / 0xF0);
    892 			}
    893 			break;
    894 		case FORMAT_L16:
    895 			r =
    896 			g =
    897 			b = *(unsigned short*)element * (1.0f / 0xFFFF);
    898 			break;
    899 		case FORMAT_A8L8:
    900 			r =
    901 			g =
    902 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
    903 			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
    904 			break;
    905 		case FORMAT_L16F:
    906 			r =
    907 			g =
    908 			b = *(half*)element;
    909 			break;
    910 		case FORMAT_A16L16F:
    911 			r =
    912 			g =
    913 			b = ((half*)element)[0];
    914 			a = ((half*)element)[1];
    915 			break;
    916 		case FORMAT_L32F:
    917 			r =
    918 			g =
    919 			b = *(float*)element;
    920 			break;
    921 		case FORMAT_A32L32F:
    922 			r =
    923 			g =
    924 			b = ((float*)element)[0];
    925 			a = ((float*)element)[1];
    926 			break;
    927 		case FORMAT_A16F:
    928 			a = *(half*)element;
    929 			break;
    930 		case FORMAT_R16F:
    931 			r = *(half*)element;
    932 			break;
    933 		case FORMAT_G16R16F:
    934 			r = ((half*)element)[0];
    935 			g = ((half*)element)[1];
    936 			break;
    937 		case FORMAT_B16G16R16F:
    938 			r = ((half*)element)[0];
    939 			g = ((half*)element)[1];
    940 			b = ((half*)element)[2];
    941 			break;
    942 		case FORMAT_A16B16G16R16F:
    943 			r = ((half*)element)[0];
    944 			g = ((half*)element)[1];
    945 			b = ((half*)element)[2];
    946 			a = ((half*)element)[3];
    947 			break;
    948 		case FORMAT_A32F:
    949 			a = *(float*)element;
    950 			break;
    951 		case FORMAT_R32F:
    952 			r = *(float*)element;
    953 			break;
    954 		case FORMAT_G32R32F:
    955 			r = ((float*)element)[0];
    956 			g = ((float*)element)[1];
    957 			break;
    958 		case FORMAT_X32B32G32R32F:
    959 		case FORMAT_B32G32R32F:
    960 			r = ((float*)element)[0];
    961 			g = ((float*)element)[1];
    962 			b = ((float*)element)[2];
    963 			break;
    964 		case FORMAT_A32B32G32R32F:
    965 			r = ((float*)element)[0];
    966 			g = ((float*)element)[1];
    967 			b = ((float*)element)[2];
    968 			a = ((float*)element)[3];
    969 			break;
    970 		case FORMAT_D32F:
    971 		case FORMAT_D32F_LOCKABLE:
    972 		case FORMAT_D32FS8_TEXTURE:
    973 		case FORMAT_D32FS8_SHADOW:
    974 			r = *(float*)element;
    975 			g = r;
    976 			b = r;
    977 			a = r;
    978 			break;
    979 		case FORMAT_D32F_COMPLEMENTARY:
    980 			r = 1.0f - *(float*)element;
    981 			g = r;
    982 			b = r;
    983 			a = r;
    984 			break;
    985 		case FORMAT_S8:
    986 			r = *(unsigned char*)element * (1.0f / 0xFF);
    987 			break;
    988 		default:
    989 			ASSERT(false);
    990 		}
    991 
    992 	//	if(sRGB)
    993 	//	{
    994 	//		r = sRGBtoLinear(r);
    995 	//		g = sRGBtoLinear(g);
    996 	//		b = sRGBtoLinear(b);
    997 	//	}
    998 
    999 		return Color<float>(r, g, b, a);
   1000 	}
   1001 
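         	// sample() performs trilinear (or, in the 2-D overload below, bilinear) filtering with
         	// clamp-to-edge addressing. The 0.5 offset moves from texel centers to the integer
         	// lattice; each of the eight (or four) neighbours is weighted by a product of f or
         	// (1 - f) per axis, where f is the fractional coordinate, so the weights sum to one.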
   1002 	Color<float> Surface::Buffer::sample(float x, float y, float z) const
   1003 	{
   1004 		x -= 0.5f;
   1005 		y -= 0.5f;
   1006 		z -= 0.5f;
   1007 
   1008 		int x0 = clamp((int)x, 0, width - 1);
   1009 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
   1010 
   1011 		int y0 = clamp((int)y, 0, height - 1);
   1012 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
   1013 
   1014 		int z0 = clamp((int)z, 0, depth - 1);
   1015 		int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
   1016 
   1017 		Color<float> c000 = read(x0, y0, z0);
   1018 		Color<float> c100 = read(x1, y0, z0);
   1019 		Color<float> c010 = read(x0, y1, z0);
   1020 		Color<float> c110 = read(x1, y1, z0);
   1021 		Color<float> c001 = read(x0, y0, z1);
   1022 		Color<float> c101 = read(x1, y0, z1);
   1023 		Color<float> c011 = read(x0, y1, z1);
   1024 		Color<float> c111 = read(x1, y1, z1);
   1025 
   1026 		float fx = x - x0;
   1027 		float fy = y - y0;
   1028 		float fz = z - z0;
   1029 
   1030 		c000 *= (1 - fx) * (1 - fy) * (1 - fz);
   1031 		c100 *= fx * (1 - fy) * (1 - fz);
   1032 		c010 *= (1 - fx) * fy * (1 - fz);
   1033 		c110 *= fx * fy * (1 - fz);
   1034 		c001 *= (1 - fx) * (1 - fy) * fz;
   1035 		c101 *= fx * (1 - fy) * fz;
   1036 		c011 *= (1 - fx) * fy * fz;
   1037 		c111 *= fx * fy * fz;
   1038 
   1039 		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
   1040 	}
   1041 
   1042 	Color<float> Surface::Buffer::sample(float x, float y) const
   1043 	{
   1044 		x -= 0.5f;
   1045 		y -= 0.5f;
   1046 
   1047 		int x0 = clamp((int)x, 0, width - 1);
   1048 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
   1049 
   1050 		int y0 = clamp((int)y, 0, height - 1);
   1051 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
   1052 
   1053 		Color<float> c00 = read(x0, y0);
   1054 		Color<float> c10 = read(x1, y0);
   1055 		Color<float> c01 = read(x0, y1);
   1056 		Color<float> c11 = read(x1, y1);
   1057 
   1058 		float fx = x - x0;
   1059 		float fy = y - y0;
   1060 
   1061 		c00 *= (1 - fx) * (1 - fy);
   1062 		c10 *= fx * (1 - fy);
   1063 		c01 *= (1 - fx) * fy;
   1064 		c11 *= fx * fy;
   1065 
   1066 		return c00 + c10 + c01 + c11;
   1067 	}
   1068 
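         	// lockRect() returns a pointer to the requested texel or, for block-compressed formats,
         	// to the block containing it. Blocks are 8 or 16 bytes and addressed as
         	// buffer + <block bytes> * (x / <block width>) + (y / <block height>) * pitchB + z * sliceB;
         	// e.g. a 16-byte 4x4 block format maps texel (x, y) to offset 16 * (x / 4) + (y / 4) * pitchB
         	// within the slice. Any writable lock marks the buffer dirty.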
   1069 	void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
   1070 	{
   1071 		this->lock = lock;
   1072 
   1073 		switch(lock)
   1074 		{
   1075 		case LOCK_UNLOCKED:
   1076 		case LOCK_READONLY:
   1077 			break;
   1078 		case LOCK_WRITEONLY:
   1079 		case LOCK_READWRITE:
   1080 		case LOCK_DISCARD:
   1081 			dirty = true;
   1082 			break;
   1083 		default:
   1084 			ASSERT(false);
   1085 		}
   1086 
   1087 		if(buffer)
   1088 		{
   1089 			switch(format)
   1090 			{
   1091 			#if S3TC_SUPPORT
   1092 			case FORMAT_DXT1:
   1093 			#endif
   1094 			case FORMAT_ATI1:
   1095 			case FORMAT_ETC1:
   1096 			case FORMAT_R11_EAC:
   1097 			case FORMAT_SIGNED_R11_EAC:
   1098 			case FORMAT_RGB8_ETC2:
   1099 			case FORMAT_SRGB8_ETC2:
   1100 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1101 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1102 				return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
   1103 			case FORMAT_RG11_EAC:
   1104 			case FORMAT_SIGNED_RG11_EAC:
   1105 			case FORMAT_RGBA8_ETC2_EAC:
   1106 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   1107 			case FORMAT_RGBA_ASTC_4x4_KHR:
   1108 			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   1109 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
   1110 			case FORMAT_RGBA_ASTC_5x4_KHR:
   1111 			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   1112 				return (unsigned char*)buffer + 16 * (x / 5) + (y / 4) * pitchB + z * sliceB;
   1113 			case FORMAT_RGBA_ASTC_5x5_KHR:
   1114 			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   1115 				return (unsigned char*)buffer + 16 * (x / 5) + (y / 5) * pitchB + z * sliceB;
   1116 			case FORMAT_RGBA_ASTC_6x5_KHR:
   1117 			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   1118 				return (unsigned char*)buffer + 16 * (x / 6) + (y / 5) * pitchB + z * sliceB;
   1119 			case FORMAT_RGBA_ASTC_6x6_KHR:
   1120 			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   1121 				return (unsigned char*)buffer + 16 * (x / 6) + (y / 6) * pitchB + z * sliceB;
   1122 			case FORMAT_RGBA_ASTC_8x5_KHR:
   1123 			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   1124 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 5) * pitchB + z * sliceB;
   1125 			case FORMAT_RGBA_ASTC_8x6_KHR:
   1126 			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   1127 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 6) * pitchB + z * sliceB;
   1128 			case FORMAT_RGBA_ASTC_8x8_KHR:
   1129 			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   1130 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 8) * pitchB + z * sliceB;
   1131 			case FORMAT_RGBA_ASTC_10x5_KHR:
   1132 			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   1133 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 5) * pitchB + z * sliceB;
   1134 			case FORMAT_RGBA_ASTC_10x6_KHR:
   1135 			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   1136 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 6) * pitchB + z * sliceB;
   1137 			case FORMAT_RGBA_ASTC_10x8_KHR:
   1138 			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   1139 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 8) * pitchB + z * sliceB;
   1140 			case FORMAT_RGBA_ASTC_10x10_KHR:
   1141 			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   1142 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 10) * pitchB + z * sliceB;
   1143 			case FORMAT_RGBA_ASTC_12x10_KHR:
   1144 			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   1145 				return (unsigned char*)buffer + 16 * (x / 12) + (y / 10) * pitchB + z * sliceB;
   1146 			case FORMAT_RGBA_ASTC_12x12_KHR:
   1147 			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   1148 				return (unsigned char*)buffer + 16 * (x / 12) + (y / 12) * pitchB + z * sliceB;
   1149 			#if S3TC_SUPPORT
   1150 			case FORMAT_DXT3:
   1151 			case FORMAT_DXT5:
   1152 			#endif
   1153 			case FORMAT_ATI2:
   1154 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
   1155 			default:
   1156 				return (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
   1157 			}
   1158 		}
   1159 
   1160 		return 0;
   1161 	}
   1162 
   1163 	void Surface::Buffer::unlockRect()
   1164 	{
   1165 		lock = LOCK_UNLOCKED;
   1166 	}
   1167 
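         	// A Surface maintains up to three buffers: 'external' holds the data in the client-visible
         	// format (and may alias caller-provided memory), 'internal' holds it in the format chosen
         	// by selectInternalFormat() with the renderer's own pitch/slice layout, and 'stencil' is a
         	// separate 8-bit (FORMAT_S8) stencil plane. The dirty flags record which copy is stale;
         	// update() converts between the external and internal copies on lock.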
   1168 	Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
   1169 	{
   1170 		resource = new Resource(0);
   1171 		hasParent = false;
   1172 		ownExternal = false;
   1173 		depth = max(1, depth);
   1174 
   1175 		external.buffer = pixels;
   1176 		external.width = width;
   1177 		external.height = height;
   1178 		external.depth = depth;
   1179 		external.format = format;
   1180 		external.bytes = bytes(external.format);
   1181 		external.pitchB = pitch;
   1182 		external.pitchP = external.bytes ? pitch / external.bytes : 0;
   1183 		external.sliceB = slice;
   1184 		external.sliceP = external.bytes ? slice / external.bytes : 0;
   1185 		external.lock = LOCK_UNLOCKED;
   1186 		external.dirty = true;
   1187 
   1188 		internal.buffer = 0;
   1189 		internal.width = width;
   1190 		internal.height = height;
   1191 		internal.depth = depth;
   1192 		internal.format = selectInternalFormat(format);
   1193 		internal.bytes = bytes(internal.format);
   1194 		internal.pitchB = pitchB(internal.width, internal.format, false);
   1195 		internal.pitchP = pitchP(internal.width, internal.format, false);
   1196 		internal.sliceB = sliceB(internal.width, internal.height, internal.format, false);
   1197 		internal.sliceP = sliceP(internal.width, internal.height, internal.format, false);
   1198 		internal.lock = LOCK_UNLOCKED;
   1199 		internal.dirty = false;
   1200 
   1201 		stencil.buffer = 0;
   1202 		stencil.width = width;
   1203 		stencil.height = height;
   1204 		stencil.depth = depth;
   1205 		stencil.format = FORMAT_S8;
   1206 		stencil.bytes = bytes(stencil.format);
   1207 		stencil.pitchB = pitchB(stencil.width, stencil.format, false);
   1208 		stencil.pitchP = pitchP(stencil.width, stencil.format, false);
   1209 		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, false);
   1210 		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, false);
   1211 		stencil.lock = LOCK_UNLOCKED;
   1212 		stencil.dirty = false;
   1213 
   1214 		dirtyMipmaps = true;
   1215 		paletteUsed = 0;
   1216 	}
   1217 
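         	// This constructor allocates lazily: external, internal and stencil storage is created on
         	// first lock. A non-zero pitchPprovided overrides the computed internal pitch (in pixels);
         	// otherwise pitch and slice sizes are derived from the dimensions, the format and the
         	// render-target flag.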
   1218 	Surface::Surface(Resource *texture, int width, int height, int depth, Format format, bool lockable, bool renderTarget, int pitchPprovided) : lockable(lockable), renderTarget(renderTarget)
   1219 	{
   1220 		resource = texture ? texture : new Resource(0);
   1221 		hasParent = texture != 0;
   1222 		ownExternal = true;
   1223 		depth = max(1, depth);
   1224 
   1225 		external.buffer = 0;
   1226 		external.width = width;
   1227 		external.height = height;
   1228 		external.depth = depth;
   1229 		external.format = format;
   1230 		external.bytes = bytes(external.format);
   1231 		external.pitchB = pitchB(external.width, external.format, renderTarget && !texture);
   1232 		external.pitchP = pitchP(external.width, external.format, renderTarget && !texture);
   1233 		external.sliceB = sliceB(external.width, external.height, external.format, renderTarget && !texture);
   1234 		external.sliceP = sliceP(external.width, external.height, external.format, renderTarget && !texture);
   1235 		external.lock = LOCK_UNLOCKED;
   1236 		external.dirty = false;
   1237 
   1238 		internal.buffer = 0;
   1239 		internal.width = width;
   1240 		internal.height = height;
   1241 		internal.depth = depth;
   1242 		internal.format = selectInternalFormat(format);
   1243 		internal.bytes = bytes(internal.format);
   1244 		internal.pitchB = !pitchPprovided ? pitchB(internal.width, internal.format, renderTarget) : pitchPprovided * internal.bytes;
   1245 		internal.pitchP = !pitchPprovided ? pitchP(internal.width, internal.format, renderTarget) : pitchPprovided;
   1246 		internal.sliceB = sliceB(internal.width, internal.height, internal.format, renderTarget);
   1247 		internal.sliceP = sliceP(internal.width, internal.height, internal.format, renderTarget);
   1248 		internal.lock = LOCK_UNLOCKED;
   1249 		internal.dirty = false;
   1250 
   1251 		stencil.buffer = 0;
   1252 		stencil.width = width;
   1253 		stencil.height = height;
   1254 		stencil.depth = depth;
   1255 		stencil.format = FORMAT_S8;
   1256 		stencil.bytes = bytes(stencil.format);
   1257 		stencil.pitchB = pitchB(stencil.width, stencil.format, renderTarget);
   1258 		stencil.pitchP = pitchP(stencil.width, stencil.format, renderTarget);
   1259 		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, renderTarget);
   1260 		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, renderTarget);
   1261 		stencil.lock = LOCK_UNLOCKED;
   1262 		stencil.dirty = false;
   1263 
   1264 		dirtyMipmaps = true;
   1265 		paletteUsed = 0;
   1266 	}
   1267 
   1268 	Surface::~Surface()
   1269 	{
   1270 		// Synchronize so we can deallocate the buffers below
   1271 		resource->lock(DESTRUCT);
   1272 		resource->unlock();
   1273 
   1274 		if(!hasParent)
   1275 		{
   1276 			resource->destruct();
   1277 		}
   1278 
   1279 		if(ownExternal)
   1280 		{
   1281 			deallocate(external.buffer);
   1282 		}
   1283 
   1284 		if(internal.buffer != external.buffer)
   1285 		{
   1286 			deallocate(internal.buffer);
   1287 		}
   1288 
   1289 		deallocate(stencil.buffer);
   1290 
   1291 		external.buffer = 0;
   1292 		internal.buffer = 0;
   1293 		stencil.buffer = 0;
   1294 	}
   1295 
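         	// lockExternal() gives the caller access to the client-format copy. If the internal copy
         	// has been written since the last sync (internal.dirty), it is converted back into the
         	// external buffer first, unless the lock discards the previous contents. Any writable
         	// lock invalidates the cached mipmaps.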
   1296 	void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
   1297 	{
   1298 		resource->lock(client);
   1299 
   1300 		if(!external.buffer)
   1301 		{
   1302 			if(internal.buffer && identicalFormats())
   1303 			{
   1304 				external.buffer = internal.buffer;
   1305 			}
   1306 			else
   1307 			{
   1308 				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.format);
   1309 			}
   1310 		}
   1311 
   1312 		if(internal.dirty)
   1313 		{
   1314 			if(lock != LOCK_DISCARD)
   1315 			{
   1316 				update(external, internal);
   1317 			}
   1318 
   1319 			internal.dirty = false;
   1320 		}
   1321 
   1322 		switch(lock)
   1323 		{
   1324 		case LOCK_READONLY:
   1325 			break;
   1326 		case LOCK_WRITEONLY:
   1327 		case LOCK_READWRITE:
   1328 		case LOCK_DISCARD:
   1329 			dirtyMipmaps = true;
   1330 			break;
   1331 		default:
   1332 			ASSERT(false);
   1333 		}
   1334 
   1335 		return external.lockRect(x, y, z, lock);
   1336 	}
   1337 
   1338 	void Surface::unlockExternal()
   1339 	{
   1340 		resource->unlock();
   1341 
   1342 		external.unlockRect();
   1343 	}
   1344 
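         	// lockInternal() is the renderer-side counterpart: it re-converts from the external copy
         	// when that copy is dirty, or when a paletted surface was last converted with an outdated
         	// palette. A read-only lock by a PUBLIC client calls resolve() before returning the pointer.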
   1345 	void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
   1346 	{
   1347 		if(lock != LOCK_UNLOCKED)
   1348 		{
   1349 			resource->lock(client);
   1350 		}
   1351 
   1352 		if(!internal.buffer)
   1353 		{
   1354 			if(external.buffer && identicalFormats())
   1355 			{
   1356 				internal.buffer = external.buffer;
   1357 			}
   1358 			else
   1359 			{
   1360 				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.format);
   1361 			}
   1362 		}
   1363 
   1364 		// FIXME: WHQL requires conversion to lower external precision and back
   1365 		if(logPrecision >= WHQL)
   1366 		{
   1367 			if(internal.dirty && renderTarget && internal.format != external.format)
   1368 			{
   1369 				if(lock != LOCK_DISCARD)
   1370 				{
   1371 					switch(external.format)
   1372 					{
   1373 					case FORMAT_R3G3B2:
   1374 					case FORMAT_A8R3G3B2:
   1375 					case FORMAT_A1R5G5B5:
   1376 					case FORMAT_A2R10G10B10:
   1377 					case FORMAT_A2B10G10R10:
   1378 						lockExternal(0, 0, 0, LOCK_READWRITE, client);
   1379 						unlockExternal();
   1380 						break;
   1381 					default:
   1382 						// Difference passes WHQL
   1383 						break;
   1384 					}
   1385 				}
   1386 			}
   1387 		}
   1388 
   1389 		if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
   1390 		{
   1391 			if(lock != LOCK_DISCARD)
   1392 			{
   1393 				update(internal, external);
   1394 			}
   1395 
   1396 			external.dirty = false;
   1397 			paletteUsed = Surface::paletteID;
   1398 		}
   1399 
   1400 		switch(lock)
   1401 		{
   1402 		case LOCK_UNLOCKED:
   1403 		case LOCK_READONLY:
   1404 			break;
   1405 		case LOCK_WRITEONLY:
   1406 		case LOCK_READWRITE:
   1407 		case LOCK_DISCARD:
   1408 			dirtyMipmaps = true;
   1409 			break;
   1410 		default:
   1411 			ASSERT(false);
   1412 		}
   1413 
   1414 		if(lock == LOCK_READONLY && client == PUBLIC)
   1415 		{
   1416 			resolve();
   1417 		}
   1418 
   1419 		return internal.lockRect(x, y, z, lock);
   1420 	}
   1421 
   1422 	void Surface::unlockInternal()
   1423 	{
   1424 		resource->unlock();
   1425 
   1426 		internal.unlockRect();
   1427 	}
   1428 
   1429 	void *Surface::lockStencil(int front, Accessor client)
   1430 	{
   1431 		resource->lock(client);
   1432 
   1433 		if(!stencil.buffer)
   1434 		{
   1435 			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.format);
   1436 		}
   1437 
   1438 		return stencil.lockRect(0, 0, front, LOCK_READWRITE);   // FIXME
   1439 	}
   1440 
   1441 	void Surface::unlockStencil()
   1442 	{
   1443 		resource->unlock();
   1444 
   1445 		stencil.unlockRect();
   1446 	}
   1447 
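         	// bytes() returns the storage size of one pixel. For the 4x4 block-compressed formats it
         	// instead returns the size of a single 4x1 column of pixels (one quarter of an 8- or
         	// 16-byte block); ASTC sizes are not implemented yet, and the planar YV12 formats report
         	// only the luma plane.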
   1448 	int Surface::bytes(Format format)
   1449 	{
   1450 		switch(format)
   1451 		{
   1452 		case FORMAT_NULL:				return 0;
   1453 		case FORMAT_P8:					return 1;
   1454 		case FORMAT_A8P8:				return 2;
   1455 		case FORMAT_A8:					return 1;
   1456 		case FORMAT_R8I:				return 1;
   1457 		case FORMAT_R8:					return 1;
   1458 		case FORMAT_R3G3B2:				return 1;
   1459 		case FORMAT_R16I:				return 2;
   1460 		case FORMAT_R16UI:				return 2;
   1461 		case FORMAT_A8R3G3B2:			return 2;
   1462 		case FORMAT_R5G6B5:				return 2;
   1463 		case FORMAT_A1R5G5B5:			return 2;
   1464 		case FORMAT_X1R5G5B5:			return 2;
   1465 		case FORMAT_R5G5B5A1:           return 2;
   1466 		case FORMAT_X4R4G4B4:			return 2;
   1467 		case FORMAT_A4R4G4B4:			return 2;
   1468 		case FORMAT_R4G4B4A4:           return 2;
   1469 		case FORMAT_R8G8B8:				return 3;
   1470 		case FORMAT_B8G8R8:             return 3;
   1471 		case FORMAT_R32I:				return 4;
   1472 		case FORMAT_R32UI:				return 4;
   1473 		case FORMAT_X8R8G8B8:			return 4;
   1474 	//	case FORMAT_X8G8R8B8Q:			return 4;
   1475 		case FORMAT_A8R8G8B8:			return 4;
   1476 	//	case FORMAT_A8G8R8B8Q:			return 4;
   1477 		case FORMAT_X8B8G8R8I:			return 4;
   1478 		case FORMAT_X8B8G8R8:			return 4;
   1479 		case FORMAT_SRGB8_X8:			return 4;
   1480 		case FORMAT_SRGB8_A8:			return 4;
   1481 		case FORMAT_A8B8G8R8I:			return 4;
   1482 		case FORMAT_R8UI:				return 1;
   1483 		case FORMAT_G8R8UI:				return 2;
   1484 		case FORMAT_X8B8G8R8UI:			return 4;
   1485 		case FORMAT_A8B8G8R8UI:			return 4;
   1486 		case FORMAT_A8B8G8R8:			return 4;
   1487 		case FORMAT_R8I_SNORM:			return 1;
   1488 		case FORMAT_G8R8I_SNORM:		return 2;
   1489 		case FORMAT_X8B8G8R8I_SNORM:	return 4;
   1490 		case FORMAT_A8B8G8R8I_SNORM:	return 4;
   1491 		case FORMAT_A2R10G10B10:		return 4;
   1492 		case FORMAT_A2B10G10R10:		return 4;
   1493 		case FORMAT_G8R8I:				return 2;
   1494 		case FORMAT_G8R8:				return 2;
   1495 		case FORMAT_G16R16I:			return 4;
   1496 		case FORMAT_G16R16UI:			return 4;
   1497 		case FORMAT_G16R16:				return 4;
   1498 		case FORMAT_G32R32I:			return 8;
   1499 		case FORMAT_G32R32UI:			return 8;
   1500 		case FORMAT_X16B16G16R16I:		return 8;
   1501 		case FORMAT_X16B16G16R16UI:		return 8;
   1502 		case FORMAT_A16B16G16R16I:		return 8;
   1503 		case FORMAT_A16B16G16R16UI:		return 8;
   1504 		case FORMAT_A16B16G16R16:		return 8;
   1505 		case FORMAT_X32B32G32R32I:		return 16;
   1506 		case FORMAT_X32B32G32R32UI:		return 16;
   1507 		case FORMAT_A32B32G32R32I:		return 16;
   1508 		case FORMAT_A32B32G32R32UI:		return 16;
   1509 		// Compressed formats
   1510 		#if S3TC_SUPPORT
   1511 		case FORMAT_DXT1:				return 2;   // Column of four pixels
   1512 		case FORMAT_DXT3:				return 4;   // Column of four pixels
   1513 		case FORMAT_DXT5:				return 4;   // Column of four pixels
   1514 		#endif
   1515 		case FORMAT_ATI1:				return 2;   // Column of four pixels
   1516 		case FORMAT_ATI2:				return 4;   // Column of four pixels
   1517 		case FORMAT_ETC1:				return 2;   // Column of four pixels
   1518 		case FORMAT_R11_EAC:			return 2;
   1519 		case FORMAT_SIGNED_R11_EAC:		return 2;
   1520 		case FORMAT_RG11_EAC:			return 4;
   1521 		case FORMAT_SIGNED_RG11_EAC:	return 4;
   1522 		case FORMAT_RGB8_ETC2:			return 2;
   1523 		case FORMAT_SRGB8_ETC2:			return 2;
   1524 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
   1525 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
   1526 		case FORMAT_RGBA8_ETC2_EAC:			return 4;
   1527 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:	return 4;
   1528 		case FORMAT_RGBA_ASTC_4x4_KHR:
   1529 		case FORMAT_RGBA_ASTC_5x4_KHR:
   1530 		case FORMAT_RGBA_ASTC_5x5_KHR:
   1531 		case FORMAT_RGBA_ASTC_6x5_KHR:
   1532 		case FORMAT_RGBA_ASTC_6x6_KHR:
   1533 		case FORMAT_RGBA_ASTC_8x5_KHR:
   1534 		case FORMAT_RGBA_ASTC_8x6_KHR:
   1535 		case FORMAT_RGBA_ASTC_8x8_KHR:
   1536 		case FORMAT_RGBA_ASTC_10x5_KHR:
   1537 		case FORMAT_RGBA_ASTC_10x6_KHR:
   1538 		case FORMAT_RGBA_ASTC_10x8_KHR:
   1539 		case FORMAT_RGBA_ASTC_10x10_KHR:
   1540 		case FORMAT_RGBA_ASTC_12x10_KHR:
   1541 		case FORMAT_RGBA_ASTC_12x12_KHR:
   1542 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   1543 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   1544 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   1545 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   1546 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   1547 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   1548 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   1549 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   1550 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   1551 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   1552 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   1553 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   1554 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   1555 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: return 0; // FIXME
   1556 		// Bumpmap formats
   1557 		case FORMAT_V8U8:				return 2;
   1558 		case FORMAT_L6V5U5:				return 2;
   1559 		case FORMAT_Q8W8V8U8:			return 4;
   1560 		case FORMAT_X8L8V8U8:			return 4;
   1561 		case FORMAT_A2W10V10U10:		return 4;
   1562 		case FORMAT_V16U16:				return 4;
   1563 		case FORMAT_A16W16V16U16:		return 8;
   1564 		case FORMAT_Q16W16V16U16:		return 8;
   1565 		// Luminance formats
   1566 		case FORMAT_L8:					return 1;
   1567 		case FORMAT_A4L4:				return 1;
   1568 		case FORMAT_L16:				return 2;
   1569 		case FORMAT_A8L8:				return 2;
   1570 		case FORMAT_L16F:               return 2;
   1571 		case FORMAT_A16L16F:            return 4;
   1572 		case FORMAT_L32F:               return 4;
   1573 		case FORMAT_A32L32F:            return 8;
   1574 		// Floating-point formats
   1575 		case FORMAT_A16F:				return 2;
   1576 		case FORMAT_R16F:				return 2;
   1577 		case FORMAT_G16R16F:			return 4;
   1578 		case FORMAT_B16G16R16F:			return 6;
   1579 		case FORMAT_A16B16G16R16F:		return 8;
   1580 		case FORMAT_A32F:				return 4;
   1581 		case FORMAT_R32F:				return 4;
   1582 		case FORMAT_G32R32F:			return 8;
   1583 		case FORMAT_B32G32R32F:			return 12;
   1584 		case FORMAT_X32B32G32R32F:		return 16;
   1585 		case FORMAT_A32B32G32R32F:		return 16;
   1586 		// Depth/stencil formats
   1587 		case FORMAT_D16:				return 2;
   1588 		case FORMAT_D32:				return 4;
   1589 		case FORMAT_D24X8:				return 4;
   1590 		case FORMAT_D24S8:				return 4;
   1591 		case FORMAT_D24FS8:				return 4;
   1592 		case FORMAT_D32F:				return 4;
   1593 		case FORMAT_D32F_COMPLEMENTARY:	return 4;
   1594 		case FORMAT_D32F_LOCKABLE:		return 4;
   1595 		case FORMAT_D32FS8_TEXTURE:		return 4;
   1596 		case FORMAT_D32FS8_SHADOW:		return 4;
   1597 		case FORMAT_DF24S8:				return 4;
   1598 		case FORMAT_DF16S8:				return 2;
   1599 		case FORMAT_INTZ:				return 4;
   1600 		case FORMAT_S8:					return 1;
   1601 		case FORMAT_YV12_BT601:         return 1;   // Y plane only
   1602 		case FORMAT_YV12_BT709:         return 1;   // Y plane only
   1603 		case FORMAT_YV12_JFIF:          return 1;   // Y plane only
   1604 		default:
   1605 			ASSERT(false);
   1606 		}
   1607 
   1608 		return 0;
   1609 	}
   1610 
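         	// pitchB returns the distance in bytes between the starts of two adjacent
         	// rows (for block-compressed formats, two adjacent rows of blocks).
         	// Illustrative example (not part of the original source): a 66-pixel-wide
         	// FORMAT_RGB8_ETC2 image spans (66 + 3) / 4 = 17 blocks per row at
         	// 8 bytes per block, giving a pitch of 136 bytes.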
   1611 	int Surface::pitchB(int width, Format format, bool target)
   1612 	{
   1613 		if(target || isDepth(format) || isStencil(format))
   1614 		{
   1615 			width = align(width, 2);
   1616 		}
   1617 
   1618 		switch(format)
   1619 		{
   1620 		#if S3TC_SUPPORT
   1621 		case FORMAT_DXT1:
   1622 		#endif
   1623 		case FORMAT_ETC1:
   1624 		case FORMAT_R11_EAC:
   1625 		case FORMAT_SIGNED_R11_EAC:
   1626 		case FORMAT_RGB8_ETC2:
   1627 		case FORMAT_SRGB8_ETC2:
   1628 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1629 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
    1630 			return 8 * ((width + 3) / 4);    // 64 bits per 4x4 block, computed per 4 rows
   1631 		case FORMAT_RG11_EAC:
   1632 		case FORMAT_SIGNED_RG11_EAC:
   1633 		case FORMAT_RGBA8_ETC2_EAC:
   1634 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   1635 		case FORMAT_RGBA_ASTC_4x4_KHR:
   1636 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
    1637 			return 16 * ((width + 3) / 4);    // 128 bits per 4x4 block, computed per 4 rows
   1638 		case FORMAT_RGBA_ASTC_5x4_KHR:
   1639 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   1640 		case FORMAT_RGBA_ASTC_5x5_KHR:
   1641 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   1642 			return 16 * ((width + 4) / 5);
   1643 		case FORMAT_RGBA_ASTC_6x5_KHR:
   1644 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   1645 		case FORMAT_RGBA_ASTC_6x6_KHR:
   1646 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   1647 			return 16 * ((width + 5) / 6);
   1648 		case FORMAT_RGBA_ASTC_8x5_KHR:
   1649 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   1650 		case FORMAT_RGBA_ASTC_8x6_KHR:
   1651 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   1652 		case FORMAT_RGBA_ASTC_8x8_KHR:
   1653 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   1654 			return 16 * ((width + 7) / 8);
   1655 		case FORMAT_RGBA_ASTC_10x5_KHR:
   1656 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   1657 		case FORMAT_RGBA_ASTC_10x6_KHR:
   1658 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   1659 		case FORMAT_RGBA_ASTC_10x8_KHR:
   1660 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   1661 		case FORMAT_RGBA_ASTC_10x10_KHR:
   1662 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   1663 			return 16 * ((width + 9) / 10);
   1664 		case FORMAT_RGBA_ASTC_12x10_KHR:
   1665 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   1666 		case FORMAT_RGBA_ASTC_12x12_KHR:
   1667 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   1668 			return 16 * ((width + 11) / 12);
   1669 		#if S3TC_SUPPORT
   1670 		case FORMAT_DXT3:
   1671 		case FORMAT_DXT5:
    1672 			return 16 * ((width + 3) / 4);   // 128 bits per 4x4 block, computed per 4 rows
   1673 		#endif
   1674 		case FORMAT_ATI1:
    1675 			return 2 * ((width + 3) / 4);    // 64 bits per 4x4 block, computed per row
   1676 		case FORMAT_ATI2:
    1677 			return 4 * ((width + 3) / 4);    // 128 bits per 4x4 block, computed per row
   1678 		case FORMAT_YV12_BT601:
   1679 		case FORMAT_YV12_BT709:
   1680 		case FORMAT_YV12_JFIF:
   1681 			return align(width, 16);
   1682 		default:
   1683 			return bytes(format) * width;
   1684 		}
   1685 	}
   1686 
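         	// pitchP expresses the row pitch in elements rather than bytes. Formats
         	// whose per-element size is reported as 0 (such as the ASTC formats above)
         	// yield a pitch of 0 rather than dividing by zero.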
   1687 	int Surface::pitchP(int width, Format format, bool target)
   1688 	{
   1689 		int B = bytes(format);
   1690 
   1691 		return B > 0 ? pitchB(width, format, target) / B : 0;
   1692 	}
   1693 
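         	// sliceB returns the size in bytes of a single 2D slice. For block-compressed
         	// formats the row pitch is multiplied by the number of block rows, so each
         	// divisor below corresponds to the block height of the format.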
   1694 	int Surface::sliceB(int width, int height, Format format, bool target)
   1695 	{
   1696 		if(target || isDepth(format) || isStencil(format))
   1697 		{
   1698 			height = ((height + 1) & ~1);
   1699 		}
   1700 
   1701 		switch(format)
   1702 		{
   1703 		#if S3TC_SUPPORT
   1704 		case FORMAT_DXT1:
   1705 		case FORMAT_DXT3:
   1706 		case FORMAT_DXT5:
   1707 		#endif
   1708 		case FORMAT_ETC1:
   1709 		case FORMAT_R11_EAC:
   1710 		case FORMAT_SIGNED_R11_EAC:
   1711 		case FORMAT_RG11_EAC:
   1712 		case FORMAT_SIGNED_RG11_EAC:
   1713 		case FORMAT_RGB8_ETC2:
   1714 		case FORMAT_SRGB8_ETC2:
   1715 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1716 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1717 		case FORMAT_RGBA8_ETC2_EAC:
   1718 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   1719 		case FORMAT_RGBA_ASTC_4x4_KHR:
   1720 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   1721 		case FORMAT_RGBA_ASTC_5x4_KHR:
   1722 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   1723 			return pitchB(width, format, target) * ((height + 3) / 4);   // Pitch computed per 4 rows
   1724 		case FORMAT_RGBA_ASTC_5x5_KHR:
   1725 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   1726 		case FORMAT_RGBA_ASTC_6x5_KHR:
   1727 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   1728 		case FORMAT_RGBA_ASTC_8x5_KHR:
   1729 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   1730 		case FORMAT_RGBA_ASTC_10x5_KHR:
   1731 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   1732 			return pitchB(width, format, target) * ((height + 4) / 5);   // Pitch computed per 5 rows
   1733 		case FORMAT_RGBA_ASTC_6x6_KHR:
   1734 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   1735 		case FORMAT_RGBA_ASTC_8x6_KHR:
   1736 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   1737 		case FORMAT_RGBA_ASTC_10x6_KHR:
   1738 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   1739 			return pitchB(width, format, target) * ((height + 5) / 6);   // Pitch computed per 6 rows
   1740 		case FORMAT_RGBA_ASTC_8x8_KHR:
   1741 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   1742 		case FORMAT_RGBA_ASTC_10x8_KHR:
   1743 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   1744 			return pitchB(width, format, target) * ((height + 7) / 8);   // Pitch computed per 8 rows
   1745 		case FORMAT_RGBA_ASTC_10x10_KHR:
   1746 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   1747 		case FORMAT_RGBA_ASTC_12x10_KHR:
   1748 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   1749 			return pitchB(width, format, target) * ((height + 9) / 10);   // Pitch computed per 10 rows
   1750 		case FORMAT_RGBA_ASTC_12x12_KHR:
   1751 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   1752 			return pitchB(width, format, target) * ((height + 11) / 12);   // Pitch computed per 12 rows
   1753 		case FORMAT_ATI1:
   1754 		case FORMAT_ATI2:
   1755 		default:
   1756 			return pitchB(width, format, target) * height;   // Pitch computed per row
   1757 		}
   1758 	}
   1759 
   1760 	int Surface::sliceP(int width, int height, Format format, bool target)
   1761 	{
   1762 		int B = bytes(format);
   1763 
   1764 		return B > 0 ? sliceB(width, height, format, target) / B : 0;
   1765 	}
   1766 
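         	// update copies the source (external) buffer into the destination (internal)
         	// buffer, decoding compressed or packed source formats along the way. Formats
         	// without a dedicated decoder fall through to genericUpdate, which converts
         	// texel by texel.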
   1767 	void Surface::update(Buffer &destination, Buffer &source)
   1768 	{
   1769 	//	ASSERT(source.lock != LOCK_UNLOCKED);
   1770 	//	ASSERT(destination.lock != LOCK_UNLOCKED);
   1771 
   1772 		if(destination.buffer != source.buffer)
   1773 		{
   1774 			ASSERT(source.dirty && !destination.dirty);
   1775 
   1776 			switch(source.format)
   1777 			{
   1778 			case FORMAT_R8G8B8:		decodeR8G8B8(destination, source);		break;   // FIXME: Check destination format
   1779 			case FORMAT_X1R5G5B5:	decodeX1R5G5B5(destination, source);	break;   // FIXME: Check destination format
   1780 			case FORMAT_A1R5G5B5:	decodeA1R5G5B5(destination, source);	break;   // FIXME: Check destination format
   1781 			case FORMAT_X4R4G4B4:	decodeX4R4G4B4(destination, source);	break;   // FIXME: Check destination format
   1782 			case FORMAT_A4R4G4B4:	decodeA4R4G4B4(destination, source);	break;   // FIXME: Check destination format
   1783 			case FORMAT_P8:			decodeP8(destination, source);			break;   // FIXME: Check destination format
   1784 			#if S3TC_SUPPORT
   1785 			case FORMAT_DXT1:		decodeDXT1(destination, source);		break;   // FIXME: Check destination format
   1786 			case FORMAT_DXT3:		decodeDXT3(destination, source);		break;   // FIXME: Check destination format
   1787 			case FORMAT_DXT5:		decodeDXT5(destination, source);		break;   // FIXME: Check destination format
   1788 			#endif
   1789 			case FORMAT_ATI1:		decodeATI1(destination, source);		break;   // FIXME: Check destination format
   1790 			case FORMAT_ATI2:		decodeATI2(destination, source);		break;   // FIXME: Check destination format
   1791 			case FORMAT_R11_EAC:         decodeEAC(destination, source, 1, false); break; // FIXME: Check destination format
   1792 			case FORMAT_SIGNED_R11_EAC:  decodeEAC(destination, source, 1, true);  break; // FIXME: Check destination format
   1793 			case FORMAT_RG11_EAC:        decodeEAC(destination, source, 2, false); break; // FIXME: Check destination format
   1794 			case FORMAT_SIGNED_RG11_EAC: decodeEAC(destination, source, 2, true);  break; // FIXME: Check destination format
   1795 			case FORMAT_ETC1:
   1796 			case FORMAT_RGB8_ETC2:                      decodeETC2(destination, source, 0, false); break; // FIXME: Check destination format
   1797 			case FORMAT_SRGB8_ETC2:                     decodeETC2(destination, source, 0, true);  break; // FIXME: Check destination format
   1798 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:  decodeETC2(destination, source, 1, false); break; // FIXME: Check destination format
   1799 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: decodeETC2(destination, source, 1, true);  break; // FIXME: Check destination format
   1800 			case FORMAT_RGBA8_ETC2_EAC:                 decodeETC2(destination, source, 8, false); break; // FIXME: Check destination format
   1801 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:          decodeETC2(destination, source, 8, true);  break; // FIXME: Check destination format
   1802 			case FORMAT_RGBA_ASTC_4x4_KHR:           decodeASTC(destination, source, 4,  4,  1, false); break; // FIXME: Check destination format
   1803 			case FORMAT_RGBA_ASTC_5x4_KHR:           decodeASTC(destination, source, 5,  4,  1, false); break; // FIXME: Check destination format
   1804 			case FORMAT_RGBA_ASTC_5x5_KHR:           decodeASTC(destination, source, 5,  5,  1, false); break; // FIXME: Check destination format
   1805 			case FORMAT_RGBA_ASTC_6x5_KHR:           decodeASTC(destination, source, 6,  5,  1, false); break; // FIXME: Check destination format
   1806 			case FORMAT_RGBA_ASTC_6x6_KHR:           decodeASTC(destination, source, 6,  6,  1, false); break; // FIXME: Check destination format
   1807 			case FORMAT_RGBA_ASTC_8x5_KHR:           decodeASTC(destination, source, 8,  5,  1, false); break; // FIXME: Check destination format
   1808 			case FORMAT_RGBA_ASTC_8x6_KHR:           decodeASTC(destination, source, 8,  6,  1, false); break; // FIXME: Check destination format
   1809 			case FORMAT_RGBA_ASTC_8x8_KHR:           decodeASTC(destination, source, 8,  8,  1, false); break; // FIXME: Check destination format
   1810 			case FORMAT_RGBA_ASTC_10x5_KHR:          decodeASTC(destination, source, 10, 5,  1, false); break; // FIXME: Check destination format
   1811 			case FORMAT_RGBA_ASTC_10x6_KHR:          decodeASTC(destination, source, 10, 6,  1, false); break; // FIXME: Check destination format
   1812 			case FORMAT_RGBA_ASTC_10x8_KHR:          decodeASTC(destination, source, 10, 8,  1, false); break; // FIXME: Check destination format
   1813 			case FORMAT_RGBA_ASTC_10x10_KHR:         decodeASTC(destination, source, 10, 10, 1, false); break; // FIXME: Check destination format
   1814 			case FORMAT_RGBA_ASTC_12x10_KHR:         decodeASTC(destination, source, 12, 10, 1, false); break; // FIXME: Check destination format
   1815 			case FORMAT_RGBA_ASTC_12x12_KHR:         decodeASTC(destination, source, 12, 12, 1, false); break; // FIXME: Check destination format
   1816 			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:   decodeASTC(destination, source, 4,  4,  1, true);  break; // FIXME: Check destination format
   1817 			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:   decodeASTC(destination, source, 5,  4,  1, true);  break; // FIXME: Check destination format
   1818 			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:   decodeASTC(destination, source, 5,  5,  1, true);  break; // FIXME: Check destination format
   1819 			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:   decodeASTC(destination, source, 6,  5,  1, true);  break; // FIXME: Check destination format
   1820 			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:   decodeASTC(destination, source, 6,  6,  1, true);  break; // FIXME: Check destination format
   1821 			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:   decodeASTC(destination, source, 8,  5,  1, true);  break; // FIXME: Check destination format
   1822 			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:   decodeASTC(destination, source, 8,  6,  1, true);  break; // FIXME: Check destination format
   1823 			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:   decodeASTC(destination, source, 8,  8,  1, true);  break; // FIXME: Check destination format
   1824 			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:  decodeASTC(destination, source, 10, 5,  1, true);  break; // FIXME: Check destination format
   1825 			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:  decodeASTC(destination, source, 10, 6,  1, true);  break; // FIXME: Check destination format
   1826 			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:  decodeASTC(destination, source, 10, 8,  1, true);  break; // FIXME: Check destination format
   1827 			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, true);  break; // FIXME: Check destination format
   1828 			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, true);  break; // FIXME: Check destination format
   1829 			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, true);  break; // FIXME: Check destination format
   1830 			default:				genericUpdate(destination, source);		break;
   1831 			}
   1832 		}
   1833 	}
   1834 
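         	// genericUpdate copies the overlapping region of the two buffers: matching
         	// formats are copied row by row with memcpy, while mismatched formats take
         	// the slower per-texel read()/write() conversion path.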
   1835 	void Surface::genericUpdate(Buffer &destination, Buffer &source)
   1836 	{
   1837 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
   1838 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
   1839 
   1840 		int depth = min(destination.depth, source.depth);
   1841 		int height = min(destination.height, source.height);
   1842 		int width = min(destination.width, source.width);
   1843 		int rowBytes = width * source.bytes;
   1844 
   1845 		for(int z = 0; z < depth; z++)
   1846 		{
   1847 			unsigned char *sourceRow = sourceSlice;
   1848 			unsigned char *destinationRow = destinationSlice;
   1849 
   1850 			for(int y = 0; y < height; y++)
   1851 			{
   1852 				if(source.format == destination.format)
   1853 				{
   1854 					memcpy(destinationRow, sourceRow, rowBytes);
   1855 				}
   1856 				else
   1857 				{
   1858 					unsigned char *sourceElement = sourceRow;
   1859 					unsigned char *destinationElement = destinationRow;
   1860 
   1861 					for(int x = 0; x < width; x++)
   1862 					{
   1863 						Color<float> color = source.read(sourceElement);
   1864 						destination.write(destinationElement, color);
   1865 
   1866 						sourceElement += source.bytes;
   1867 						destinationElement += destination.bytes;
   1868 					}
   1869 				}
   1870 
   1871 				sourceRow += source.pitchB;
   1872 				destinationRow += destination.pitchB;
   1873 			}
   1874 
   1875 			sourceSlice += source.sliceB;
   1876 			destinationSlice += destination.sliceB;
   1877 		}
   1878 	}
   1879 
   1880 	void Surface::decodeR8G8B8(Buffer &destination, const Buffer &source)
   1881 	{
   1882 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
   1883 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
   1884 
   1885 		for(int z = 0; z < destination.depth && z < source.depth; z++)
   1886 		{
   1887 			unsigned char *sourceRow = sourceSlice;
   1888 			unsigned char *destinationRow = destinationSlice;
   1889 
   1890 			for(int y = 0; y < destination.height && y < source.height; y++)
   1891 			{
   1892 				unsigned char *sourceElement = sourceRow;
   1893 				unsigned char *destinationElement = destinationRow;
   1894 
   1895 				for(int x = 0; x < destination.width && x < source.width; x++)
   1896 				{
   1897 					unsigned int b = sourceElement[0];
   1898 					unsigned int g = sourceElement[1];
   1899 					unsigned int r = sourceElement[2];
   1900 
   1901 					*(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
   1902 
   1903 					sourceElement += source.bytes;
   1904 					destinationElement += destination.bytes;
   1905 				}
   1906 
   1907 				sourceRow += source.pitchB;
   1908 				destinationRow += destination.pitchB;
   1909 			}
   1910 
   1911 			sourceSlice += source.sliceB;
   1912 			destinationSlice += destination.sliceB;
   1913 		}
   1914 	}
   1915 
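         	// The multipliers below are fixed-point approximations of 255/31, used to
         	// expand the 5-bit channels to 8 bits with rounding:
         	// 134771 ~= (255/31) * 2^14, 16846 ~= (255/31) * 2^11, 2106 ~= (255/31) * 2^8.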
   1916 	void Surface::decodeX1R5G5B5(Buffer &destination, const Buffer &source)
   1917 	{
   1918 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
   1919 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
   1920 
   1921 		for(int z = 0; z < destination.depth && z < source.depth; z++)
   1922 		{
   1923 			unsigned char *sourceRow = sourceSlice;
   1924 			unsigned char *destinationRow = destinationSlice;
   1925 
   1926 			for(int y = 0; y < destination.height && y < source.height; y++)
   1927 			{
   1928 				unsigned char *sourceElement = sourceRow;
   1929 				unsigned char *destinationElement = destinationRow;
   1930 
   1931 				for(int x = 0; x < destination.width && x < source.width; x++)
   1932 				{
   1933 					unsigned int xrgb = *(unsigned short*)sourceElement;
   1934 
   1935 					unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
   1936 					unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
   1937 					unsigned int b = (((xrgb & 0x001F) * 2106  + 0x80) >> 8);
   1938 
   1939 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
   1940 
   1941 					sourceElement += source.bytes;
   1942 					destinationElement += destination.bytes;
   1943 				}
   1944 
   1945 				sourceRow += source.pitchB;
   1946 				destinationRow += destination.pitchB;
   1947 			}
   1948 
   1949 			sourceSlice += source.sliceB;
   1950 			destinationSlice += destination.sliceB;
   1951 		}
   1952 	}
   1953 
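         	// Same 5-bit expansion as decodeX1R5G5B5, plus alpha: multiplying the single
         	// alpha bit by 130560 yields 0xFF000000 (32768 * 130560 == 0xFF000000).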
   1954 	void Surface::decodeA1R5G5B5(Buffer &destination, const Buffer &source)
   1955 	{
   1956 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
   1957 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
   1958 
   1959 		for(int z = 0; z < destination.depth && z < source.depth; z++)
   1960 		{
   1961 			unsigned char *sourceRow = sourceSlice;
   1962 			unsigned char *destinationRow = destinationSlice;
   1963 
   1964 			for(int y = 0; y < destination.height && y < source.height; y++)
   1965 			{
   1966 				unsigned char *sourceElement = sourceRow;
   1967 				unsigned char *destinationElement = destinationRow;
   1968 
   1969 				for(int x = 0; x < destination.width && x < source.width; x++)
   1970 				{
   1971 					unsigned int argb = *(unsigned short*)sourceElement;
   1972 
   1973 					unsigned int a =   (argb & 0x8000) * 130560;
   1974 					unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
   1975 					unsigned int g = (((argb & 0x03E0) * 16846  + 0x8000) >> 8) & 0x0000FF00;
   1976 					unsigned int b = (((argb & 0x001F) * 2106   + 0x80) >> 8);
   1977 
   1978 					*(unsigned int*)destinationElement = a | r | g | b;
   1979 
   1980 					sourceElement += source.bytes;
   1981 					destinationElement += destination.bytes;
   1982 				}
   1983 
   1984 				sourceRow += source.pitchB;
   1985 				destinationRow += destination.pitchB;
   1986 			}
   1987 
   1988 			sourceSlice += source.sliceB;
   1989 			destinationSlice += destination.sliceB;
   1990 		}
   1991 	}
   1992 
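         	// A 4-bit channel is expanded to 8 bits by multiplying with 0x11, which
         	// replicates the nibble (0xF * 0x11 == 0xFF); the shifted multipliers below
         	// additionally move each channel into its destination position.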
   1993 	void Surface::decodeX4R4G4B4(Buffer &destination, const Buffer &source)
   1994 	{
   1995 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
   1996 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
   1997 
   1998 		for(int z = 0; z < destination.depth && z < source.depth; z++)
   1999 		{
   2000 			unsigned char *sourceRow = sourceSlice;
   2001 			unsigned char *destinationRow = destinationSlice;
   2002 
   2003 			for(int y = 0; y < destination.height && y < source.height; y++)
   2004 			{
   2005 				unsigned char *sourceElement = sourceRow;
   2006 				unsigned char *destinationElement = destinationRow;
   2007 
   2008 				for(int x = 0; x < destination.width && x < source.width; x++)
   2009 				{
   2010 					unsigned int xrgb = *(unsigned short*)sourceElement;
   2011 
   2012 					unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000;
   2013 					unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00;
   2014 					unsigned int b =  (xrgb & 0x000F) * 0x00000011;
   2015 
   2016 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
   2017 
   2018 					sourceElement += source.bytes;
   2019 					destinationElement += destination.bytes;
   2020 				}
   2021 
   2022 				sourceRow += source.pitchB;
   2023 				destinationRow += destination.pitchB;
   2024 			}
   2025 
   2026 			sourceSlice += source.sliceB;
   2027 			destinationSlice += destination.sliceB;
   2028 		}
   2029 	}
   2030 
   2031 	void Surface::decodeA4R4G4B4(Buffer &destination, const Buffer &source)
   2032 	{
   2033 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
   2034 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
   2035 
   2036 		for(int z = 0; z < destination.depth && z < source.depth; z++)
   2037 		{
   2038 			unsigned char *sourceRow = sourceSlice;
   2039 			unsigned char *destinationRow = destinationSlice;
   2040 
   2041 			for(int y = 0; y < destination.height && y < source.height; y++)
   2042 			{
   2043 				unsigned char *sourceElement = sourceRow;
   2044 				unsigned char *destinationElement = destinationRow;
   2045 
   2046 				for(int x = 0; x < destination.width && x < source.width; x++)
   2047 				{
   2048 					unsigned int argb = *(unsigned short*)sourceElement;
   2049 
   2050 					unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000;
   2051 					unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000;
   2052 					unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00;
   2053 					unsigned int b =  (argb & 0x000F) * 0x00000011;
   2054 
   2055 					*(unsigned int*)destinationElement = a | r | g | b;
   2056 
   2057 					sourceElement += source.bytes;
   2058 					destinationElement += destination.bytes;
   2059 				}
   2060 
   2061 				sourceRow += source.pitchB;
   2062 				destinationRow += destination.pitchB;
   2063 			}
   2064 
   2065 			sourceSlice += source.sliceB;
   2066 			destinationSlice += destination.sliceB;
   2067 		}
   2068 	}
   2069 
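         	// P8 is a palettized format: each source byte indexes the shared palette,
         	// whose entries are laid out as A8B8G8R8 (hence the abgr naming) and are
         	// swizzled here into A8R8G8B8.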
   2070 	void Surface::decodeP8(Buffer &destination, const Buffer &source)
   2071 	{
   2072 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
   2073 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
   2074 
   2075 		for(int z = 0; z < destination.depth && z < source.depth; z++)
   2076 		{
   2077 			unsigned char *sourceRow = sourceSlice;
   2078 			unsigned char *destinationRow = destinationSlice;
   2079 
   2080 			for(int y = 0; y < destination.height && y < source.height; y++)
   2081 			{
   2082 				unsigned char *sourceElement = sourceRow;
   2083 				unsigned char *destinationElement = destinationRow;
   2084 
   2085 				for(int x = 0; x < destination.width && x < source.width; x++)
   2086 				{
   2087 					unsigned int abgr = palette[*(unsigned char*)sourceElement];
   2088 
   2089 					unsigned int r = (abgr & 0x000000FF) << 16;
   2090 					unsigned int g = (abgr & 0x0000FF00) << 0;
   2091 					unsigned int b = (abgr & 0x00FF0000) >> 16;
   2092 					unsigned int a = (abgr & 0xFF000000) >> 0;
   2093 
   2094 					*(unsigned int*)destinationElement = a | r | g | b;
   2095 
   2096 					sourceElement += source.bytes;
   2097 					destinationElement += destination.bytes;
   2098 				}
   2099 
   2100 				sourceRow += source.pitchB;
   2101 				destinationRow += destination.pitchB;
   2102 			}
   2103 
   2104 			sourceSlice += source.sliceB;
   2105 			destinationSlice += destination.sliceB;
   2106 		}
   2107 	}
   2108 
   2109 #if S3TC_SUPPORT
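         	// DXT1 (BC1): each 4x4 block stores two 16-bit endpoint colors and a 32-bit
         	// table of 2-bit indices. When c0 <= c1 the fourth palette entry becomes
         	// transparent black, which encodes the format's 1-bit alpha.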
   2110 	void Surface::decodeDXT1(Buffer &internal, const Buffer &external)
   2111 	{
   2112 		unsigned int *destSlice = (unsigned int*)internal.buffer;
   2113 		const DXT1 *source = (const DXT1*)external.buffer;
   2114 
   2115 		for(int z = 0; z < external.depth; z++)
   2116 		{
   2117 			unsigned int *dest = destSlice;
   2118 
   2119 			for(int y = 0; y < external.height; y += 4)
   2120 			{
   2121 				for(int x = 0; x < external.width; x += 4)
   2122 				{
   2123 					Color<byte> c[4];
   2124 
   2125 					c[0] = source->c0;
   2126 					c[1] = source->c1;
   2127 
   2128 					if(source->c0 > source->c1)   // No transparency
   2129 					{
   2130 						// c2 = 2 / 3 * c0 + 1 / 3 * c1
   2131 						c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
   2132 						c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
   2133 						c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
   2134 						c[2].a = 0xFF;
   2135 
   2136 						// c3 = 1 / 3 * c0 + 2 / 3 * c1
   2137 						c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
   2138 						c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
   2139 						c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
   2140 						c[3].a = 0xFF;
   2141 					}
   2142 					else   // c3 transparent
   2143 					{
   2144 						// c2 = 1 / 2 * c0 + 1 / 2 * c1
   2145 						c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
   2146 						c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
   2147 						c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
   2148 						c[2].a = 0xFF;
   2149 
   2150 						c[3].r = 0;
   2151 						c[3].g = 0;
   2152 						c[3].b = 0;
   2153 						c[3].a = 0;
   2154 					}
   2155 
   2156 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2157 					{
   2158 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2159 						{
   2160 							dest[(x + i) + (y + j) * internal.width] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
   2161 						}
   2162 					}
   2163 
   2164 					source++;
   2165 				}
   2166 			}
   2167 
   2168 			(byte*&)destSlice += internal.sliceB;
   2169 		}
   2170 	}
   2171 
   2172 	void Surface::decodeDXT3(Buffer &internal, const Buffer &external)
   2173 	{
   2174 		unsigned int *destSlice = (unsigned int*)internal.buffer;
   2175 		const DXT3 *source = (const DXT3*)external.buffer;
   2176 
   2177 		for(int z = 0; z < external.depth; z++)
   2178 		{
   2179 			unsigned int *dest = destSlice;
   2180 
   2181 			for(int y = 0; y < external.height; y += 4)
   2182 			{
   2183 				for(int x = 0; x < external.width; x += 4)
   2184 				{
   2185 					Color<byte> c[4];
   2186 
   2187 					c[0] = source->c0;
   2188 					c[1] = source->c1;
   2189 
   2190 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
   2191 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
   2192 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
   2193 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
   2194 
   2195 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
   2196 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
   2197 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
   2198 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
   2199 
   2200 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2201 					{
   2202 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2203 						{
   2204 							unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
   2205 							unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));
   2206 
   2207 							dest[(x + i) + (y + j) * internal.width] = color;
   2208 						}
   2209 					}
   2210 
   2211 					source++;
   2212 				}
   2213 			}
   2214 
   2215 			(byte*&)destSlice += internal.sliceB;
   2216 		}
   2217 	}
   2218 
   2219 	void Surface::decodeDXT5(Buffer &internal, const Buffer &external)
   2220 	{
   2221 		unsigned int *destSlice = (unsigned int*)internal.buffer;
   2222 		const DXT5 *source = (const DXT5*)external.buffer;
   2223 
   2224 		for(int z = 0; z < external.depth; z++)
   2225 		{
   2226 			unsigned int *dest = destSlice;
   2227 
   2228 			for(int y = 0; y < external.height; y += 4)
   2229 			{
   2230 				for(int x = 0; x < external.width; x += 4)
   2231 				{
   2232 					Color<byte> c[4];
   2233 
   2234 					c[0] = source->c0;
   2235 					c[1] = source->c1;
   2236 
   2237 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
   2238 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
   2239 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
   2240 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
   2241 
   2242 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
   2243 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
   2244 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
   2245 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
   2246 
   2247 					byte a[8];
   2248 
   2249 					a[0] = source->a0;
   2250 					a[1] = source->a1;
   2251 
   2252 					if(a[0] > a[1])
   2253 					{
   2254 						a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
   2255 						a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
   2256 						a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
   2257 						a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
   2258 						a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
   2259 						a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
   2260 					}
   2261 					else
   2262 					{
   2263 						a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
   2264 						a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
   2265 						a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
   2266 						a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
   2267 						a[6] = 0;
   2268 						a[7] = 0xFF;
   2269 					}
   2270 
   2271 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2272 					{
   2273 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2274 						{
   2275 							unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
   2276 							unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;
   2277 
   2278 							dest[(x + i) + (y + j) * internal.width] = color;
   2279 						}
   2280 					}
   2281 
   2282 					source++;
   2283 				}
   2284 			}
   2285 
   2286 			(byte*&)destSlice += internal.sliceB;
   2287 		}
   2288 	}
   2289 #endif
   2290 
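         	// ATI1/ATI2 (BC4/BC5): one or two independent channels per 4x4 block, each
         	// with two 8-bit endpoints and 3-bit indices. When the first endpoint is
         	// greater than the second, six values are interpolated between them;
         	// otherwise four are interpolated and the last two indices select 0 and 0xFF.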
   2291 	void Surface::decodeATI1(Buffer &internal, const Buffer &external)
   2292 	{
   2293 		byte *destSlice = (byte*)internal.buffer;
   2294 		const ATI1 *source = (const ATI1*)external.buffer;
   2295 
   2296 		for(int z = 0; z < external.depth; z++)
   2297 		{
   2298 			byte *dest = destSlice;
   2299 
   2300 			for(int y = 0; y < external.height; y += 4)
   2301 			{
   2302 				for(int x = 0; x < external.width; x += 4)
   2303 				{
   2304 					byte r[8];
   2305 
   2306 					r[0] = source->r0;
   2307 					r[1] = source->r1;
   2308 
   2309 					if(r[0] > r[1])
   2310 					{
   2311 						r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
   2312 						r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
   2313 						r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
   2314 						r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
   2315 						r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
   2316 						r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
   2317 					}
   2318 					else
   2319 					{
   2320 						r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
   2321 						r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
   2322 						r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
   2323 						r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
   2324 						r[6] = 0;
   2325 						r[7] = 0xFF;
   2326 					}
   2327 
   2328 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2329 					{
   2330 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2331 						{
   2332 							dest[(x + i) + (y + j) * internal.width] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
   2333 						}
   2334 					}
   2335 
   2336 					source++;
   2337 				}
   2338 			}
   2339 
   2340 			destSlice += internal.sliceB;
   2341 		}
   2342 	}
   2343 
   2344 	void Surface::decodeATI2(Buffer &internal, const Buffer &external)
   2345 	{
   2346 		word *destSlice = (word*)internal.buffer;
   2347 		const ATI2 *source = (const ATI2*)external.buffer;
   2348 
   2349 		for(int z = 0; z < external.depth; z++)
   2350 		{
   2351 			word *dest = destSlice;
   2352 
   2353 			for(int y = 0; y < external.height; y += 4)
   2354 			{
   2355 				for(int x = 0; x < external.width; x += 4)
   2356 				{
   2357 					byte X[8];
   2358 
   2359 					X[0] = source->x0;
   2360 					X[1] = source->x1;
   2361 
   2362 					if(X[0] > X[1])
   2363 					{
   2364 						X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
   2365 						X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
   2366 						X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
   2367 						X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
   2368 						X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
   2369 						X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
   2370 					}
   2371 					else
   2372 					{
   2373 						X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
   2374 						X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
   2375 						X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
   2376 						X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
   2377 						X[6] = 0;
   2378 						X[7] = 0xFF;
   2379 					}
   2380 
   2381 					byte Y[8];
   2382 
   2383 					Y[0] = source->y0;
   2384 					Y[1] = source->y1;
   2385 
   2386 					if(Y[0] > Y[1])
   2387 					{
   2388 						Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
   2389 						Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
   2390 						Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
   2391 						Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
   2392 						Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
   2393 						Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
   2394 					}
   2395 					else
   2396 					{
   2397 						Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
   2398 						Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
   2399 						Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
   2400 						Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
   2401 						Y[6] = 0;
   2402 						Y[7] = 0xFF;
   2403 					}
   2404 
   2405 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2406 					{
   2407 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2408 						{
   2409 							word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
   2410 							word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];
   2411 
   2412 							dest[(x + i) + (y + j) * internal.width] = (g << 8) + r;
   2413 						}
   2414 					}
   2415 
   2416 					source++;
   2417 				}
   2418 			}
   2419 
   2420 			(byte*&)destSlice += internal.sliceB;
   2421 		}
   2422 	}
   2423 
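         	// ETC1/ETC2 blocks are decompressed by ETC_Decoder; for the sRGB variants the
         	// result is converted to linear space in place, using a lazily initialized
         	// 256-entry lookup table.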
   2424 	void Surface::decodeETC2(Buffer &internal, const Buffer &external, int nbAlphaBits, bool isSRGB)
   2425 	{
   2426 		ETC_Decoder::Decode((const byte*)external.buffer, (byte*)internal.buffer, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
   2427 		                    (nbAlphaBits == 8) ? ETC_Decoder::ETC_RGBA : ((nbAlphaBits == 1) ? ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA : ETC_Decoder::ETC_RGB));
   2428 
   2429 		if(isSRGB)
   2430 		{
   2431 			static byte sRGBtoLinearTable[256];
   2432 			static bool sRGBtoLinearTableDirty = true;
   2433 			if(sRGBtoLinearTableDirty)
   2434 			{
   2435 				for(int i = 0; i < 256; i++)
   2436 				{
   2437 					sRGBtoLinearTable[i] = static_cast<byte>(sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
   2438 				}
   2439 				sRGBtoLinearTableDirty = false;
   2440 			}
   2441 
   2442 			// Perform sRGB conversion in place after decoding
   2443 			byte* src = (byte*)internal.buffer;
   2444 			for(int y = 0; y < internal.height; y++)
   2445 			{
   2446 				byte* srcRow = src + y * internal.pitchB;
    2447 				for(int x = 0; x < internal.width; x++)
   2448 				{
   2449 					byte* srcPix = srcRow + x * internal.bytes;
   2450 					for(int i = 0; i < 3; i++)
   2451 					{
   2452 						srcPix[i] = sRGBtoLinearTable[srcPix[i]];
   2453 					}
   2454 				}
   2455 			}
   2456 		}
   2457 	}
   2458 
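         	// EAC R11/RG11 blocks are decompressed by ETC_Decoder. As noted in the FIXME
         	// below, signed output is then expanded in place from 8-bit signed values to
         	// floats clamped to [-1, 1].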
   2459 	void Surface::decodeEAC(Buffer &internal, const Buffer &external, int nbChannels, bool isSigned)
   2460 	{
   2461 		ASSERT(nbChannels == 1 || nbChannels == 2);
   2462 
   2463 		ETC_Decoder::Decode((const byte*)external.buffer, (byte*)internal.buffer, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
   2464 		                    (nbChannels == 1) ? (isSigned ? ETC_Decoder::ETC_R_SIGNED : ETC_Decoder::ETC_R_UNSIGNED) : (isSigned ? ETC_Decoder::ETC_RG_SIGNED : ETC_Decoder::ETC_RG_UNSIGNED));
   2465 
   2466 		// FIXME: We convert signed data to float, until signed integer internal formats are supported
   2467 		//        This code can be removed if signed ETC2 images are decoded to internal 8 bit signed R/RG formats
   2468 		if(isSigned)
   2469 		{
   2470 			sbyte* src = (sbyte*)internal.buffer;
   2471 
   2472 			for(int y = 0; y < internal.height; y++)
   2473 			{
   2474 				sbyte* srcRow = src + y * internal.pitchB;
   2475 				for(int x = internal.width - 1; x >= 0; x--)
   2476 				{
   2477 					int dx = x & 0xFFFFFFFC;
   2478 					int mx = x - dx;
   2479 					sbyte* srcPix = srcRow + dx * internal.bytes + mx * nbChannels;
   2480 					float* dstPix = (float*)(srcRow + x * internal.bytes);
   2481 					for(int c = nbChannels - 1; c >= 0; c--)
   2482 					{
   2483 						static const float normalization = 1.0f / 127.875f;
   2484 						dstPix[c] = clamp(static_cast<float>(srcPix[c]) * normalization, -1.0f, 1.0f);
   2485 					}
   2486 				}
   2487 			}
   2488 		}
   2489 	}
   2490 
   2491 	void Surface::decodeASTC(Buffer &internal, const Buffer &external, int xBlockSize, int yBlockSize, int zBlockSize, bool isSRGB)
   2492 	{
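         		// Not implemented: ASTC decompression is currently a no-op and leaves the
         		// internal buffer untouched (see the FIXME for the ASTC cases in bytes()).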
   2493 	}
   2494 
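         	// size returns the amount of storage to allocate for the given dimensions.
         	// The YV12 formats allocate a full-resolution Y plane plus two
         	// quarter-resolution chroma planes, each with a 16-byte-aligned stride.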
   2495 	unsigned int Surface::size(int width, int height, int depth, Format format)
   2496 	{
   2497 		// Dimensions rounded up to multiples of 4, used for compressed formats
   2498 		int width4 = align(width, 4);
   2499 		int height4 = align(height, 4);
   2500 
   2501 		switch(format)
   2502 		{
   2503 		#if S3TC_SUPPORT
   2504 		case FORMAT_DXT1:
   2505 		#endif
   2506 		case FORMAT_ATI1:
   2507 		case FORMAT_ETC1:
   2508 		case FORMAT_R11_EAC:
   2509 		case FORMAT_SIGNED_R11_EAC:
   2510 		case FORMAT_RGB8_ETC2:
   2511 		case FORMAT_SRGB8_ETC2:
   2512 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   2513 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   2514 			return width4 * height4 * depth / 2;
   2515 		#if S3TC_SUPPORT
   2516 		case FORMAT_DXT3:
   2517 		case FORMAT_DXT5:
   2518 		#endif
   2519 		case FORMAT_ATI2:
   2520 		case FORMAT_RG11_EAC:
   2521 		case FORMAT_SIGNED_RG11_EAC:
   2522 		case FORMAT_RGBA8_ETC2_EAC:
   2523 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   2524 		case FORMAT_RGBA_ASTC_4x4_KHR:
   2525 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   2526 			return width4 * height4 * depth;
   2527 		case FORMAT_RGBA_ASTC_5x4_KHR:
   2528 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   2529 			return align(width, 5) * height4 * depth;
   2530 		case FORMAT_RGBA_ASTC_5x5_KHR:
   2531 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   2532 			return align(width, 5) * align(height, 5) * depth;
   2533 		case FORMAT_RGBA_ASTC_6x5_KHR:
   2534 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   2535 			return align(width, 6) * align(height, 5) * depth;
   2536 		case FORMAT_RGBA_ASTC_6x6_KHR:
   2537 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   2538 			return align(width, 6) * align(height, 6) * depth;
   2539 		case FORMAT_RGBA_ASTC_8x5_KHR:
   2540 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   2541 			return align(width, 8) * align(height, 5) * depth;
   2542 		case FORMAT_RGBA_ASTC_8x6_KHR:
   2543 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   2544 			return align(width, 8) * align(height, 6) * depth;
   2545 		case FORMAT_RGBA_ASTC_8x8_KHR:
   2546 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   2547 			return align(width, 8) * align(height, 8) * depth;
   2548 		case FORMAT_RGBA_ASTC_10x5_KHR:
   2549 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   2550 			return align(width, 10) * align(height, 5) * depth;
   2551 		case FORMAT_RGBA_ASTC_10x6_KHR:
   2552 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   2553 			return align(width, 10) * align(height, 6) * depth;
   2554 		case FORMAT_RGBA_ASTC_10x8_KHR:
   2555 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   2556 			return align(width, 10) * align(height, 8) * depth;
   2557 		case FORMAT_RGBA_ASTC_10x10_KHR:
   2558 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   2559 			return align(width, 10) * align(height, 10) * depth;
   2560 		case FORMAT_RGBA_ASTC_12x10_KHR:
   2561 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   2562 			return align(width, 12) * align(height, 10) * depth;
   2563 		case FORMAT_RGBA_ASTC_12x12_KHR:
   2564 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   2565 			return align(width, 12) * align(height, 12) * depth;
   2566 		case FORMAT_YV12_BT601:
   2567 		case FORMAT_YV12_BT709:
   2568 		case FORMAT_YV12_JFIF:
   2569 			{
   2570 				unsigned int YStride = align(width, 16);
   2571 				unsigned int YSize = YStride * height;
   2572 				unsigned int CStride = align(YStride / 2, 16);
   2573 				unsigned int CSize = CStride * height / 2;
   2574 
   2575 				return YSize + 2 * CSize;
   2576 			}
   2577 		default:
   2578 			return bytes(format) * width * height * depth;
   2579 		}
   2580 
   2581 		return 0;
   2582 	}
   2583 
   2584 	bool Surface::isStencil(Format format)
   2585 	{
   2586 		switch(format)
   2587 		{
   2588 		case FORMAT_D32:
   2589 		case FORMAT_D16:
   2590 		case FORMAT_D24X8:
   2591 		case FORMAT_D32F:
   2592 		case FORMAT_D32F_COMPLEMENTARY:
   2593 		case FORMAT_D32F_LOCKABLE:
   2594 			return false;
   2595 		case FORMAT_D24S8:
   2596 		case FORMAT_D24FS8:
   2597 		case FORMAT_S8:
   2598 		case FORMAT_DF24S8:
   2599 		case FORMAT_DF16S8:
   2600 		case FORMAT_D32FS8_TEXTURE:
   2601 		case FORMAT_D32FS8_SHADOW:
   2602 		case FORMAT_INTZ:
   2603 			return true;
   2604 		default:
   2605 			return false;
   2606 		}
   2607 	}
   2608 
   2609 	bool Surface::isDepth(Format format)
   2610 	{
   2611 		switch(format)
   2612 		{
   2613 		case FORMAT_D32:
   2614 		case FORMAT_D16:
   2615 		case FORMAT_D24X8:
   2616 		case FORMAT_D24S8:
   2617 		case FORMAT_D24FS8:
   2618 		case FORMAT_D32F:
   2619 		case FORMAT_D32F_COMPLEMENTARY:
   2620 		case FORMAT_D32F_LOCKABLE:
   2621 		case FORMAT_DF24S8:
   2622 		case FORMAT_DF16S8:
   2623 		case FORMAT_D32FS8_TEXTURE:
   2624 		case FORMAT_D32FS8_SHADOW:
   2625 		case FORMAT_INTZ:
   2626 			return true;
   2627 		case FORMAT_S8:
   2628 			return false;
   2629 		default:
   2630 			return false;
   2631 		}
   2632 	}
   2633 
   2634 	bool Surface::isPalette(Format format)
   2635 	{
   2636 		switch(format)
   2637 		{
   2638 		case FORMAT_P8:
   2639 		case FORMAT_A8P8:
   2640 			return true;
   2641 		default:
   2642 			return false;
   2643 		}
   2644 	}
   2645 
   2646 	bool Surface::isFloatFormat(Format format)
   2647 	{
   2648 		switch(format)
   2649 		{
   2650 		case FORMAT_R5G6B5:
   2651 		case FORMAT_R8G8B8:
   2652 		case FORMAT_B8G8R8:
   2653 		case FORMAT_X8R8G8B8:
   2654 		case FORMAT_X8B8G8R8I:
   2655 		case FORMAT_X8B8G8R8:
   2656 		case FORMAT_A8R8G8B8:
   2657 		case FORMAT_SRGB8_X8:
   2658 		case FORMAT_SRGB8_A8:
   2659 		case FORMAT_A8B8G8R8I:
   2660 		case FORMAT_R8UI:
   2661 		case FORMAT_G8R8UI:
   2662 		case FORMAT_X8B8G8R8UI:
   2663 		case FORMAT_A8B8G8R8UI:
   2664 		case FORMAT_A8B8G8R8:
   2665 		case FORMAT_G8R8I:
   2666 		case FORMAT_G8R8:
   2667 		case FORMAT_A2B10G10R10:
   2668 		case FORMAT_R8I_SNORM:
   2669 		case FORMAT_G8R8I_SNORM:
   2670 		case FORMAT_X8B8G8R8I_SNORM:
   2671 		case FORMAT_A8B8G8R8I_SNORM:
   2672 		case FORMAT_R16I:
   2673 		case FORMAT_R16UI:
   2674 		case FORMAT_G16R16I:
   2675 		case FORMAT_G16R16UI:
   2676 		case FORMAT_G16R16:
   2677 		case FORMAT_X16B16G16R16I:
   2678 		case FORMAT_X16B16G16R16UI:
   2679 		case FORMAT_A16B16G16R16I:
   2680 		case FORMAT_A16B16G16R16UI:
   2681 		case FORMAT_A16B16G16R16:
   2682 		case FORMAT_V8U8:
   2683 		case FORMAT_Q8W8V8U8:
   2684 		case FORMAT_X8L8V8U8:
   2685 		case FORMAT_V16U16:
   2686 		case FORMAT_A16W16V16U16:
   2687 		case FORMAT_Q16W16V16U16:
   2688 		case FORMAT_A8:
   2689 		case FORMAT_R8I:
   2690 		case FORMAT_R8:
   2691 		case FORMAT_L8:
   2692 		case FORMAT_L16:
   2693 		case FORMAT_A8L8:
   2694 		case FORMAT_YV12_BT601:
   2695 		case FORMAT_YV12_BT709:
   2696 		case FORMAT_YV12_JFIF:
   2697 		case FORMAT_R32I:
   2698 		case FORMAT_R32UI:
   2699 		case FORMAT_G32R32I:
   2700 		case FORMAT_G32R32UI:
   2701 		case FORMAT_X32B32G32R32I:
   2702 		case FORMAT_X32B32G32R32UI:
   2703 		case FORMAT_A32B32G32R32I:
   2704 		case FORMAT_A32B32G32R32UI:
   2705 			return false;
   2706 		case FORMAT_R32F:
   2707 		case FORMAT_G32R32F:
   2708 		case FORMAT_X32B32G32R32F:
   2709 		case FORMAT_A32B32G32R32F:
   2710 		case FORMAT_D32F:
   2711 		case FORMAT_D32F_COMPLEMENTARY:
   2712 		case FORMAT_D32F_LOCKABLE:
   2713 		case FORMAT_D32FS8_TEXTURE:
   2714 		case FORMAT_D32FS8_SHADOW:
   2715 		case FORMAT_L16F:
   2716 		case FORMAT_A16L16F:
   2717 		case FORMAT_L32F:
   2718 		case FORMAT_A32L32F:
   2719 			return true;
   2720 		default:
   2721 			ASSERT(false);
   2722 		}
   2723 
   2724 		return false;
   2725 	}
   2726 
   2727 	bool Surface::isUnsignedComponent(Format format, int component)
   2728 	{
   2729 		switch(format)
   2730 		{
   2731 		case FORMAT_NULL:
   2732 		case FORMAT_R5G6B5:
   2733 		case FORMAT_R8G8B8:
   2734 		case FORMAT_B8G8R8:
   2735 		case FORMAT_X8R8G8B8:
   2736 		case FORMAT_X8B8G8R8:
   2737 		case FORMAT_A8R8G8B8:
   2738 		case FORMAT_A8B8G8R8:
   2739 		case FORMAT_SRGB8_X8:
   2740 		case FORMAT_SRGB8_A8:
   2741 		case FORMAT_G8R8:
   2742 		case FORMAT_A2B10G10R10:
   2743 		case FORMAT_R16UI:
   2744 		case FORMAT_G16R16:
   2745 		case FORMAT_G16R16UI:
   2746 		case FORMAT_X16B16G16R16UI:
   2747 		case FORMAT_A16B16G16R16:
   2748 		case FORMAT_A16B16G16R16UI:
   2749 		case FORMAT_R32UI:
   2750 		case FORMAT_G32R32UI:
   2751 		case FORMAT_X32B32G32R32UI:
   2752 		case FORMAT_A32B32G32R32UI:
   2753 		case FORMAT_R8UI:
   2754 		case FORMAT_G8R8UI:
   2755 		case FORMAT_X8B8G8R8UI:
   2756 		case FORMAT_A8B8G8R8UI:
   2757 		case FORMAT_D32F:
   2758 		case FORMAT_D32F_COMPLEMENTARY:
   2759 		case FORMAT_D32F_LOCKABLE:
   2760 		case FORMAT_D32FS8_TEXTURE:
   2761 		case FORMAT_D32FS8_SHADOW:
   2762 		case FORMAT_A8:
   2763 		case FORMAT_R8:
   2764 		case FORMAT_L8:
   2765 		case FORMAT_L16:
   2766 		case FORMAT_A8L8:
   2767 		case FORMAT_YV12_BT601:
   2768 		case FORMAT_YV12_BT709:
   2769 		case FORMAT_YV12_JFIF:
   2770 			return true;
   2771 		case FORMAT_A8B8G8R8I:
   2772 		case FORMAT_A16B16G16R16I:
   2773 		case FORMAT_A32B32G32R32I:
   2774 		case FORMAT_A8B8G8R8I_SNORM:
   2775 		case FORMAT_Q8W8V8U8:
   2776 		case FORMAT_Q16W16V16U16:
   2777 		case FORMAT_A32B32G32R32F:
   2778 			return false;
   2779 		case FORMAT_R32F:
   2780 		case FORMAT_R8I:
   2781 		case FORMAT_R16I:
   2782 		case FORMAT_R32I:
   2783 		case FORMAT_R8I_SNORM:
   2784 			return component >= 1;
   2785 		case FORMAT_V8U8:
   2786 		case FORMAT_X8L8V8U8:
   2787 		case FORMAT_V16U16:
   2788 		case FORMAT_G32R32F:
   2789 		case FORMAT_G8R8I:
   2790 		case FORMAT_G16R16I:
   2791 		case FORMAT_G32R32I:
   2792 		case FORMAT_G8R8I_SNORM:
   2793 			return component >= 2;
   2794 		case FORMAT_A16W16V16U16:
   2795 		case FORMAT_X32B32G32R32F:
   2796 		case FORMAT_X8B8G8R8I:
   2797 		case FORMAT_X16B16G16R16I:
   2798 		case FORMAT_X32B32G32R32I:
   2799 		case FORMAT_X8B8G8R8I_SNORM:
   2800 			return component >= 3;
   2801 		default:
   2802 			ASSERT(false);
   2803 		}
   2804 
   2805 		return false;
   2806 	}
   2807 
   2808 	bool Surface::isSRGBreadable(Format format)
   2809 	{
   2810 		// Keep in sync with Capabilities::isSRGBreadable
   2811 		switch(format)
   2812 		{
   2813 		case FORMAT_L8:
   2814 		case FORMAT_A8L8:
   2815 		case FORMAT_R8G8B8:
   2816 		case FORMAT_A8R8G8B8:
   2817 		case FORMAT_X8R8G8B8:
   2818 		case FORMAT_A8B8G8R8:
   2819 		case FORMAT_X8B8G8R8:
   2820 		case FORMAT_SRGB8_X8:
   2821 		case FORMAT_SRGB8_A8:
   2822 		case FORMAT_R5G6B5:
   2823 		case FORMAT_X1R5G5B5:
   2824 		case FORMAT_A1R5G5B5:
   2825 		case FORMAT_A4R4G4B4:
   2826 		#if S3TC_SUPPORT
   2827 		case FORMAT_DXT1:
   2828 		case FORMAT_DXT3:
   2829 		case FORMAT_DXT5:
   2830 		#endif
   2831 		case FORMAT_ATI1:
   2832 		case FORMAT_ATI2:
   2833 			return true;
   2834 		default:
   2835 			return false;
   2836 		}
   2837 
   2838 		return false;
   2839 	}
   2840 
   2841 	bool Surface::isSRGBwritable(Format format)
   2842 	{
   2843 		// Keep in sync with Capabilities::isSRGBwritable
   2844 		switch(format)
   2845 		{
   2846 		case FORMAT_NULL:
   2847 		case FORMAT_A8R8G8B8:
   2848 		case FORMAT_X8R8G8B8:
   2849 		case FORMAT_A8B8G8R8:
   2850 		case FORMAT_X8B8G8R8:
   2851 		case FORMAT_SRGB8_X8:
   2852 		case FORMAT_SRGB8_A8:
   2853 		case FORMAT_R5G6B5:
   2854 			return true;
   2855 		default:
   2856 			return false;
   2857 		}
   2858 	}
   2859 
   2860 	bool Surface::isCompressed(Format format)
   2861 	{
   2862 		switch(format)
   2863 		{
   2864 		#if S3TC_SUPPORT
   2865 		case FORMAT_DXT1:
   2866 		case FORMAT_DXT3:
   2867 		case FORMAT_DXT5:
   2868 		#endif
   2869 		case FORMAT_ATI1:
   2870 		case FORMAT_ATI2:
   2871 		case FORMAT_ETC1:
   2872 		case FORMAT_R11_EAC:
   2873 		case FORMAT_SIGNED_R11_EAC:
   2874 		case FORMAT_RG11_EAC:
   2875 		case FORMAT_SIGNED_RG11_EAC:
   2876 		case FORMAT_RGB8_ETC2:
   2877 		case FORMAT_SRGB8_ETC2:
   2878 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   2879 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   2880 		case FORMAT_RGBA8_ETC2_EAC:
   2881 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   2882 		case FORMAT_RGBA_ASTC_4x4_KHR:
   2883 		case FORMAT_RGBA_ASTC_5x4_KHR:
   2884 		case FORMAT_RGBA_ASTC_5x5_KHR:
   2885 		case FORMAT_RGBA_ASTC_6x5_KHR:
   2886 		case FORMAT_RGBA_ASTC_6x6_KHR:
   2887 		case FORMAT_RGBA_ASTC_8x5_KHR:
   2888 		case FORMAT_RGBA_ASTC_8x6_KHR:
   2889 		case FORMAT_RGBA_ASTC_8x8_KHR:
   2890 		case FORMAT_RGBA_ASTC_10x5_KHR:
   2891 		case FORMAT_RGBA_ASTC_10x6_KHR:
   2892 		case FORMAT_RGBA_ASTC_10x8_KHR:
   2893 		case FORMAT_RGBA_ASTC_10x10_KHR:
   2894 		case FORMAT_RGBA_ASTC_12x10_KHR:
   2895 		case FORMAT_RGBA_ASTC_12x12_KHR:
   2896 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   2897 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   2898 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   2899 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   2900 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   2901 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   2902 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   2903 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   2904 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   2905 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   2906 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   2907 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   2908 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   2909 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   2910 			return true;
   2911 		default:
   2912 			return false;
   2913 		}
   2914 	}
   2915 
   2916 	bool Surface::isNonNormalizedInteger(Format format)
   2917 	{
   2918 		switch(format)
   2919 		{
   2920 		case FORMAT_A8B8G8R8I:
   2921 		case FORMAT_X8B8G8R8I:
   2922 		case FORMAT_G8R8I:
   2923 		case FORMAT_R8I:
   2924 		case FORMAT_A8B8G8R8UI:
   2925 		case FORMAT_X8B8G8R8UI:
   2926 		case FORMAT_G8R8UI:
   2927 		case FORMAT_R8UI:
   2928 		case FORMAT_A16B16G16R16I:
   2929 		case FORMAT_X16B16G16R16I:
   2930 		case FORMAT_G16R16I:
   2931 		case FORMAT_R16I:
   2932 		case FORMAT_A16B16G16R16UI:
   2933 		case FORMAT_X16B16G16R16UI:
   2934 		case FORMAT_G16R16UI:
   2935 		case FORMAT_R16UI:
   2936 		case FORMAT_A32B32G32R32I:
   2937 		case FORMAT_X32B32G32R32I:
   2938 		case FORMAT_G32R32I:
   2939 		case FORMAT_R32I:
   2940 		case FORMAT_A32B32G32R32UI:
   2941 		case FORMAT_X32B32G32R32UI:
   2942 		case FORMAT_G32R32UI:
   2943 		case FORMAT_R32UI:
   2944 			return true;
   2945 		default:
   2946 			return false;
   2947 		}
   2948 	}
   2949 
   2950 	int Surface::componentCount(Format format)
   2951 	{
   2952 		switch(format)
   2953 		{
   2954 		case FORMAT_R5G6B5:         return 3;
   2955 		case FORMAT_X8R8G8B8:       return 3;
   2956 		case FORMAT_X8B8G8R8I:      return 3;
   2957 		case FORMAT_X8B8G8R8:       return 3;
   2958 		case FORMAT_A8R8G8B8:       return 4;
   2959 		case FORMAT_SRGB8_X8:       return 3;
   2960 		case FORMAT_SRGB8_A8:       return 4;
   2961 		case FORMAT_A8B8G8R8I:      return 4;
   2962 		case FORMAT_A8B8G8R8:       return 4;
   2963 		case FORMAT_G8R8I:          return 2;
   2964 		case FORMAT_G8R8:           return 2;
   2965 		case FORMAT_R8I_SNORM:      return 1;
   2966 		case FORMAT_G8R8I_SNORM:    return 2;
   2967 		case FORMAT_X8B8G8R8I_SNORM:return 3;
   2968 		case FORMAT_A8B8G8R8I_SNORM:return 4;
   2969 		case FORMAT_R8UI:           return 1;
   2970 		case FORMAT_G8R8UI:         return 2;
   2971 		case FORMAT_X8B8G8R8UI:     return 3;
   2972 		case FORMAT_A8B8G8R8UI:     return 4;
   2973 		case FORMAT_A2B10G10R10:    return 4;
   2974 		case FORMAT_G16R16I:        return 2;
   2975 		case FORMAT_G16R16UI:       return 2;
   2976 		case FORMAT_G16R16:         return 2;
   2977 		case FORMAT_G32R32I:        return 2;
   2978 		case FORMAT_G32R32UI:       return 2;
   2979 		case FORMAT_X16B16G16R16I:  return 3;
   2980 		case FORMAT_X16B16G16R16UI: return 3;
   2981 		case FORMAT_A16B16G16R16I:  return 4;
   2982 		case FORMAT_A16B16G16R16UI: return 4;
   2983 		case FORMAT_A16B16G16R16:   return 4;
   2984 		case FORMAT_X32B32G32R32I:  return 3;
   2985 		case FORMAT_X32B32G32R32UI: return 3;
   2986 		case FORMAT_A32B32G32R32I:  return 4;
   2987 		case FORMAT_A32B32G32R32UI: return 4;
   2988 		case FORMAT_V8U8:           return 2;
   2989 		case FORMAT_Q8W8V8U8:       return 4;
   2990 		case FORMAT_X8L8V8U8:       return 3;
   2991 		case FORMAT_V16U16:         return 2;
   2992 		case FORMAT_A16W16V16U16:   return 4;
   2993 		case FORMAT_Q16W16V16U16:   return 4;
   2994 		case FORMAT_R32F:           return 1;
   2995 		case FORMAT_G32R32F:        return 2;
   2996 		case FORMAT_X32B32G32R32F:  return 3;
   2997 		case FORMAT_A32B32G32R32F:  return 4;
   2998 		case FORMAT_D32F:           return 1;
   2999 		case FORMAT_D32F_LOCKABLE:  return 1;
   3000 		case FORMAT_D32FS8_TEXTURE: return 1;
   3001 		case FORMAT_D32FS8_SHADOW:  return 1;
   3002 		case FORMAT_A8:             return 1;
   3003 		case FORMAT_R8I:            return 1;
   3004 		case FORMAT_R8:             return 1;
   3005 		case FORMAT_R16I:           return 1;
   3006 		case FORMAT_R16UI:          return 1;
   3007 		case FORMAT_R32I:           return 1;
   3008 		case FORMAT_R32UI:          return 1;
   3009 		case FORMAT_L8:             return 1;
   3010 		case FORMAT_L16:            return 1;
   3011 		case FORMAT_A8L8:           return 2;
   3012 		case FORMAT_YV12_BT601:     return 3;
   3013 		case FORMAT_YV12_BT709:     return 3;
   3014 		case FORMAT_YV12_JFIF:      return 3;
   3015 		default:
   3016 			ASSERT(false);
   3017 		}
   3018 
   3019 		return 1;
   3020 	}
   3021 
   3022 	void *Surface::allocateBuffer(int width, int height, int depth, Format format)
   3023 	{
   3024 		// Render targets require 2x2 quads
   3025 		int width2 = (width + 1) & ~1;
   3026 		int height2 = (height + 1) & ~1;
   3027 
   3028 		// FIXME: Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
   3029 		// so we have to allocate 4 extra bytes to avoid buffer overruns.
   3030 		return allocateZero(size(width2, height2, depth, format) + 4);
   3031 	}
   3032 
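	// Fills 'bytes' bytes at 'buffer' with a repeating 4-byte 'pattern'. The head is
	// written with byte/word stores until the pointer is aligned, the aligned bulk uses
	// SSE non-temporal stores (64 bytes per iteration) when available, and any remaining
	// tail is written with scalar stores.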
   3033 	void Surface::memfill4(void *buffer, int pattern, int bytes)
   3034 	{
   3035 		while((size_t)buffer & 0x1 && bytes >= 1)
   3036 		{
   3037 			*(char*)buffer = (char)pattern;
   3038 			(char*&)buffer += 1;
   3039 			bytes -= 1;
   3040 		}
   3041 
   3042 		while((size_t)buffer & 0x3 && bytes >= 2)
   3043 		{
   3044 			*(short*)buffer = (short)pattern;
   3045 			(short*&)buffer += 1;
   3046 			bytes -= 2;
   3047 		}
   3048 
   3049 		if(CPUID::supportsSSE())
   3050 		{
   3051 			while((size_t)buffer & 0xF && bytes >= 4)
   3052 			{
   3053 				*(int*)buffer = pattern;
   3054 				(int*&)buffer += 1;
   3055 				bytes -= 4;
   3056 			}
   3057 
   3058 			__m128 quad = _mm_set_ps1((float&)pattern);
   3059 
   3060 			float *pointer = (float*)buffer;
   3061 			int qxwords = bytes / 64;
   3062 			bytes -= qxwords * 64;
   3063 
   3064 			while(qxwords--)
   3065 			{
   3066 				_mm_stream_ps(pointer + 0, quad);
   3067 				_mm_stream_ps(pointer + 4, quad);
   3068 				_mm_stream_ps(pointer + 8, quad);
   3069 				_mm_stream_ps(pointer + 12, quad);
   3070 
   3071 				pointer += 16;
   3072 			}
   3073 
   3074 			buffer = pointer;
   3075 		}
   3076 
   3077 		while(bytes >= 4)
   3078 		{
   3079 			*(int*)buffer = (int)pattern;
   3080 			(int*&)buffer += 1;
   3081 			bytes -= 4;
   3082 		}
   3083 
   3084 		while(bytes >= 2)
   3085 		{
   3086 			*(short*)buffer = (short)pattern;
   3087 			(short*&)buffer += 1;
   3088 			bytes -= 2;
   3089 		}
   3090 
   3091 		while(bytes >= 1)
   3092 		{
   3093 			*(char*)buffer = (char)pattern;
   3094 			(char*&)buffer += 1;
   3095 			bytes -= 1;
   3096 		}
   3097 	}
   3098 
   3099 	bool Surface::isEntire(const SliceRect& rect) const
   3100 	{
   3101 		return (rect.x0 == 0 && rect.y0 == 0 && rect.x1 == internal.width && rect.y1 == internal.height && internal.depth == 1);
   3102 	}
   3103 
   3104 	SliceRect Surface::getRect() const
   3105 	{
   3106 		return SliceRect(0, 0, internal.width, internal.height, 0);
   3107 	}
   3108 
   3109 	void Surface::clearDepth(float depth, int x0, int y0, int width, int height)
   3110 	{
   3111 		if(width == 0 || height == 0) return;
   3112 
   3113 		// Not overlapping
   3114 		if(x0 > internal.width) return;
   3115 		if(y0 > internal.height) return;
   3116 		if(x0 + width < 0) return;
   3117 		if(y0 + height < 0) return;
   3118 
   3119 		// Clip against dimensions
   3120 		if(x0 < 0) {width += x0; x0 = 0;}
   3121 		if(x0 + width > internal.width) width = internal.width - x0;
   3122 		if(y0 < 0) {height += y0; y0 = 0;}
   3123 		if(y0 + height > internal.height) height = internal.height - y0;
   3124 
   3125 		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
   3126 		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
   3127 
   3128 		int width2 = (internal.width + 1) & ~1;
   3129 
   3130 		int x1 = x0 + width;
   3131 		int y1 = y0 + height;
   3132 
   3133 		if(internal.format == FORMAT_D32F_LOCKABLE ||
   3134 		   internal.format == FORMAT_D32FS8_TEXTURE ||
   3135 		   internal.format == FORMAT_D32FS8_SHADOW)
   3136 		{
   3137 			float *target = (float*)lockInternal(0, 0, 0, lock, PUBLIC) + x0 + width2 * y0;
   3138 
   3139 			for(int z = 0; z < internal.depth; z++)
   3140 			{
   3141 				for(int y = y0; y < y1; y++)
   3142 				{
   3143 					memfill4(target, (int&)depth, 4 * width);
   3144 					target += width2;
   3145 				}
   3146 			}
   3147 
   3148 			unlockInternal();
   3149 		}
   3150 		else   // Quad layout
   3151 		{
   3152 			if(complementaryDepthBuffer)
   3153 			{
   3154 				depth = 1 - depth;
   3155 			}
   3156 
   3157 			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
   3158 
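			// Quad layout: pixels are stored as 2x2 quads, so the element for (x, y) lives at
			// (y & ~1) * width2 + (y & 1) * 2 + (x & ~1) * 2 + (x & 1), where width2 is the
			// even-padded width. oddX0/oddX1 address a possibly unaligned left/right column,
			// while [evenX0, oddX1) is the quad-aligned middle span filled with memfill4.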
   3159 			int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
   3160 			int oddX1 = (x1 & ~1) * 2;
   3161 			int evenX0 = ((x0 + 1) & ~1) * 2;
   3162 			int evenBytes = (oddX1 - evenX0) * sizeof(float);
   3163 
   3164 			for(int z = 0; z < internal.depth; z++)
   3165 			{
   3166 				for(int y = y0; y < y1; y++)
   3167 				{
   3168 					float *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
   3169 
   3170 					if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
   3171 					{
   3172 						if((x0 & 1) != 0)
   3173 						{
   3174 							target[oddX0 + 0] = depth;
   3175 							target[oddX0 + 2] = depth;
   3176 						}
   3177 
   3178 					//	for(int x2 = evenX0; x2 < x1 * 2; x2 += 4)
   3179 					//	{
   3180 					//		target[x2 + 0] = depth;
   3181 					//		target[x2 + 1] = depth;
   3182 					//		target[x2 + 2] = depth;
   3183 					//		target[x2 + 3] = depth;
   3184 					//	}
   3185 
   3186 					//	__asm
   3187 					//	{
   3188 					//		movss xmm0, depth
   3189 					//		shufps xmm0, xmm0, 0x00
   3190 					//
   3191 					//		mov eax, x0
   3192 					//		add eax, 1
   3193 					//		and eax, 0xFFFFFFFE
   3194 					//		cmp eax, x1
   3195 					//		jge qEnd
   3196 					//
   3197 					//		mov edi, target
   3198 					//
   3199 					//	qLoop:
   3200 					//		movntps [edi+8*eax], xmm0
   3201 					//
   3202 					//		add eax, 2
   3203 					//		cmp eax, x1
   3204 					//		jl qLoop
   3205 					//	qEnd:
   3206 					//	}
   3207 
   3208 						memfill4(&target[evenX0], (int&)depth, evenBytes);
   3209 
   3210 						if((x1 & 1) != 0)
   3211 						{
   3212 							target[oddX1 + 0] = depth;
   3213 							target[oddX1 + 2] = depth;
   3214 						}
   3215 
   3216 						y++;
   3217 					}
   3218 					else
   3219 					{
   3220 						for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
   3221 						{
   3222 							target[i] = depth;
   3223 						}
   3224 					}
   3225 				}
   3226 
   3227 				buffer += internal.sliceP;
   3228 			}
   3229 
   3230 			unlockInternal();
   3231 		}
   3232 	}
   3233 
   3234 	void Surface::clearStencil(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
   3235 	{
   3236 		if(mask == 0 || width == 0 || height == 0) return;
   3237 
   3238 		// Not overlapping
   3239 		if(x0 > internal.width) return;
   3240 		if(y0 > internal.height) return;
   3241 		if(x0 + width < 0) return;
   3242 		if(y0 + height < 0) return;
   3243 
   3244 		// Clip against dimensions
   3245 		if(x0 < 0) {width += x0; x0 = 0;}
   3246 		if(x0 + width > internal.width) width = internal.width - x0;
   3247 		if(y0 < 0) {height += y0; y0 = 0;}
   3248 		if(y0 + height > internal.height) height = internal.height - y0;
   3249 
   3250 		int width2 = (internal.width + 1) & ~1;
   3251 
   3252 		int x1 = x0 + width;
   3253 		int y1 = y0 + height;
   3254 
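		// Quad-layout addressing, as in clearDepth() above, but with byte-sized stencil elements.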
   3255 		int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
   3256 		int oddX1 = (x1 & ~1) * 2;
   3257 		int evenX0 = ((x0 + 1) & ~1) * 2;
   3258 		int evenBytes = oddX1 - evenX0;
   3259 
   3260 		unsigned char maskedS = s & mask;
   3261 		unsigned char invMask = ~mask;
   3262 		unsigned int fill = maskedS;
3263 		fill = fill | (fill << 8) | (fill << 16) | (fill << 24);   // Replicate the masked stencil value into all four bytes
   3264 
   3265 		char *buffer = (char*)lockStencil(0, PUBLIC);
   3266 
   3267 		// Stencil buffers are assumed to use quad layout
   3268 		for(int z = 0; z < stencil.depth; z++)
   3269 		{
   3270 			for(int y = y0; y < y1; y++)
   3271 			{
   3272 				char *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
   3273 
   3274 				if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF)   // Fill quad line at once
   3275 				{
   3276 					if((x0 & 1) != 0)
   3277 					{
   3278 						target[oddX0 + 0] = fill;
   3279 						target[oddX0 + 2] = fill;
   3280 					}
   3281 
   3282 					memfill4(&target[evenX0], fill, evenBytes);
   3283 
   3284 					if((x1 & 1) != 0)
   3285 					{
   3286 						target[oddX1 + 0] = fill;
   3287 						target[oddX1 + 2] = fill;
   3288 					}
   3289 
   3290 					y++;
   3291 				}
   3292 				else
   3293 				{
   3294 					for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
   3295 					{
   3296 						target[i] = maskedS | (target[i] & invMask);
   3297 					}
   3298 				}
   3299 			}
   3300 
   3301 			buffer += stencil.sliceP;
   3302 		}
   3303 
   3304 		unlockStencil();
   3305 	}
   3306 
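	// Fills a rectangle with a constant color. Formats of at most 4 bytes per pixel take a
	// fast path: the color is encoded once, replicated into a 32-bit pattern, and each row is
	// written with memfill4. Wider formats fall back to encoding the color for every pixel.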
   3307 	void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
   3308 	{
   3309 		unsigned char *row;
   3310 		Buffer *buffer;
   3311 
   3312 		if(internal.dirty)
   3313 		{
   3314 			row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
   3315 			buffer = &internal;
   3316 		}
   3317 		else
   3318 		{
   3319 			row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
   3320 			buffer = &external;
   3321 		}
   3322 
   3323 		if(buffer->bytes <= 4)
   3324 		{
3325 			int c = 0;   // Zero-initialized so the byte/word replication below doesn't OR in indeterminate bits
   3326 			buffer->write(&c, color);
   3327 
   3328 			if(buffer->bytes <= 1) c = (c << 8)  | c;
   3329 			if(buffer->bytes <= 2) c = (c << 16) | c;
   3330 
   3331 			for(int y = 0; y < height; y++)
   3332 			{
   3333 				memfill4(row, c, width * buffer->bytes);
   3334 
   3335 				row += buffer->pitchB;
   3336 			}
   3337 		}
   3338 		else   // Generic
   3339 		{
   3340 			for(int y = 0; y < height; y++)
   3341 			{
   3342 				unsigned char *element = row;
   3343 
   3344 				for(int x = 0; x < width; x++)
   3345 				{
   3346 					buffer->write(element, color);
   3347 
   3348 					element += buffer->bytes;
   3349 				}
   3350 
   3351 				row += buffer->pitchB;
   3352 			}
   3353 		}
   3354 
   3355 		if(buffer == &internal)
   3356 		{
   3357 			unlockInternal();
   3358 		}
   3359 		else
   3360 		{
   3361 			unlockExternal();
   3362 		}
   3363 	}
   3364 
   3365 	void Surface::copyInternal(const Surface* source, int x, int y, float srcX, float srcY, bool filter)
   3366 	{
   3367 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
   3368 
   3369 		sw::Color<float> color;
   3370 
   3371 		if(!filter)
   3372 		{
   3373 			color = source->internal.read((int)srcX, (int)srcY);
   3374 		}
   3375 		else   // Bilinear filtering
   3376 		{
   3377 			color = source->internal.sample(srcX, srcY);
   3378 		}
   3379 
   3380 		internal.write(x, y, color);
   3381 	}
   3382 
   3383 	void Surface::copyInternal(const Surface* source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
   3384 	{
   3385 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
   3386 
   3387 		sw::Color<float> color;
   3388 
   3389 		if(!filter)
   3390 		{
3391 			color = source->internal.read((int)srcX, (int)srcY, (int)srcZ);
   3392 		}
   3393 		else   // Bilinear filtering
   3394 		{
   3395 			color = source->internal.sample(srcX, srcY, srcZ);
   3396 		}
   3397 
   3398 		internal.write(x, y, z, color);
   3399 	}
   3400 
   3401 	bool Surface::hasStencil() const
   3402 	{
   3403 		return isStencil(external.format);
   3404 	}
   3405 
   3406 	bool Surface::hasDepth() const
   3407 	{
   3408 		return isDepth(external.format);
   3409 	}
   3410 
   3411 	bool Surface::hasPalette() const
   3412 	{
   3413 		return isPalette(external.format);
   3414 	}
   3415 
   3416 	bool Surface::isRenderTarget() const
   3417 	{
   3418 		return renderTarget;
   3419 	}
   3420 
   3421 	bool Surface::hasDirtyMipmaps() const
   3422 	{
   3423 		return dirtyMipmaps;
   3424 	}
   3425 
   3426 	void Surface::cleanMipmaps()
   3427 	{
   3428 		dirtyMipmaps = false;
   3429 	}
   3430 
   3431 	Resource *Surface::getResource()
   3432 	{
   3433 		return resource;
   3434 	}
   3435 
   3436 	bool Surface::identicalFormats() const
   3437 	{
   3438 		return external.format == internal.format &&
   3439 		       external.width  == internal.width &&
   3440 		       external.height == internal.height &&
   3441 		       external.depth  == internal.depth &&
   3442 		       external.pitchB == internal.pitchB &&
   3443 		       external.sliceB == internal.sliceB;
   3444 	}
   3445 
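	// Maps an external (API-visible) format to the format used for internal storage and
	// rendering. Compressed and tightly packed formats are expanded to a directly renderable
	// representation.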
   3446 	Format Surface::selectInternalFormat(Format format) const
   3447 	{
   3448 		switch(format)
   3449 		{
   3450 		case FORMAT_NULL:
   3451 			return FORMAT_NULL;
   3452 		case FORMAT_P8:
   3453 		case FORMAT_A8P8:
   3454 		case FORMAT_A4R4G4B4:
   3455 		case FORMAT_A1R5G5B5:
   3456 		case FORMAT_A8R3G3B2:
   3457 			return FORMAT_A8R8G8B8;
   3458 		case FORMAT_A8:
   3459 			return FORMAT_A8;
   3460 		case FORMAT_R8I:
   3461 			return FORMAT_R8I;
   3462 		case FORMAT_R8UI:
   3463 			return FORMAT_R8UI;
   3464 		case FORMAT_R8I_SNORM:
   3465 			return FORMAT_R8I_SNORM;
   3466 		case FORMAT_R8:
   3467 			return FORMAT_R8;
   3468 		case FORMAT_R16I:
   3469 			return FORMAT_R16I;
   3470 		case FORMAT_R16UI:
   3471 			return FORMAT_R16UI;
   3472 		case FORMAT_R32I:
   3473 			return FORMAT_R32I;
   3474 		case FORMAT_R32UI:
   3475 			return FORMAT_R32UI;
   3476 		case FORMAT_X16B16G16R16I:
   3477 		case FORMAT_A16B16G16R16I:
   3478 			return FORMAT_A16B16G16R16I;
   3479 		case FORMAT_X16B16G16R16UI:
   3480 		case FORMAT_A16B16G16R16UI:
   3481 			return FORMAT_A16B16G16R16UI;
   3482 		case FORMAT_A2R10G10B10:
   3483 		case FORMAT_A2B10G10R10:
   3484 		case FORMAT_A16B16G16R16:
   3485 			return FORMAT_A16B16G16R16;
   3486 		case FORMAT_X32B32G32R32I:
   3487 		case FORMAT_A32B32G32R32I:
   3488 			return FORMAT_A32B32G32R32I;
   3489 		case FORMAT_X32B32G32R32UI:
   3490 		case FORMAT_A32B32G32R32UI:
   3491 			return FORMAT_A32B32G32R32UI;
   3492 		case FORMAT_G8R8I:
   3493 			return FORMAT_G8R8I;
   3494 		case FORMAT_G8R8UI:
   3495 			return FORMAT_G8R8UI;
   3496 		case FORMAT_G8R8I_SNORM:
   3497 			return FORMAT_G8R8I_SNORM;
   3498 		case FORMAT_G8R8:
   3499 			return FORMAT_G8R8;
   3500 		case FORMAT_G16R16I:
   3501 			return FORMAT_G16R16I;
   3502 		case FORMAT_G16R16UI:
   3503 			return FORMAT_G16R16UI;
   3504 		case FORMAT_G16R16:
   3505 			return FORMAT_G16R16;
   3506 		case FORMAT_G32R32I:
   3507 			return FORMAT_G32R32I;
   3508 		case FORMAT_G32R32UI:
   3509 			return FORMAT_G32R32UI;
   3510 		case FORMAT_A8R8G8B8:
   3511 			if(lockable || !quadLayoutEnabled)
   3512 			{
   3513 				return FORMAT_A8R8G8B8;
   3514 			}
   3515 			else
   3516 			{
   3517 				return FORMAT_A8G8R8B8Q;
   3518 			}
   3519 		case FORMAT_A8B8G8R8I:
   3520 			return FORMAT_A8B8G8R8I;
   3521 		case FORMAT_A8B8G8R8UI:
   3522 			return FORMAT_A8B8G8R8UI;
   3523 		case FORMAT_A8B8G8R8I_SNORM:
   3524 			return FORMAT_A8B8G8R8I_SNORM;
   3525 		case FORMAT_R5G5B5A1:
   3526 		case FORMAT_R4G4B4A4:
   3527 		case FORMAT_A8B8G8R8:
   3528 			return FORMAT_A8B8G8R8;
   3529 		case FORMAT_R5G6B5:
   3530 			return FORMAT_R5G6B5;
   3531 		case FORMAT_R3G3B2:
   3532 		case FORMAT_R8G8B8:
   3533 		case FORMAT_X4R4G4B4:
   3534 		case FORMAT_X1R5G5B5:
   3535 		case FORMAT_X8R8G8B8:
   3536 			if(lockable || !quadLayoutEnabled)
   3537 			{
   3538 				return FORMAT_X8R8G8B8;
   3539 			}
   3540 			else
   3541 			{
   3542 				return FORMAT_X8G8R8B8Q;
   3543 			}
   3544 		case FORMAT_X8B8G8R8I:
   3545 			return FORMAT_X8B8G8R8I;
   3546 		case FORMAT_X8B8G8R8UI:
   3547 			return FORMAT_X8B8G8R8UI;
   3548 		case FORMAT_X8B8G8R8I_SNORM:
   3549 			return FORMAT_X8B8G8R8I_SNORM;
   3550 		case FORMAT_B8G8R8:
   3551 		case FORMAT_X8B8G8R8:
   3552 			return FORMAT_X8B8G8R8;
   3553 		case FORMAT_SRGB8_X8:
   3554 			return FORMAT_SRGB8_X8;
   3555 		case FORMAT_SRGB8_A8:
   3556 			return FORMAT_SRGB8_A8;
   3557 		// Compressed formats
   3558 		#if S3TC_SUPPORT
   3559 		case FORMAT_DXT1:
   3560 		case FORMAT_DXT3:
   3561 		case FORMAT_DXT5:
   3562 		#endif
   3563 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   3564 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   3565 		case FORMAT_RGBA8_ETC2_EAC:
   3566 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   3567 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   3568 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   3569 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   3570 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   3571 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   3572 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   3573 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   3574 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   3575 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   3576 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   3577 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   3578 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   3579 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   3580 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   3581 			return FORMAT_A8R8G8B8;
   3582 		case FORMAT_RGBA_ASTC_4x4_KHR:
   3583 		case FORMAT_RGBA_ASTC_5x4_KHR:
   3584 		case FORMAT_RGBA_ASTC_5x5_KHR:
   3585 		case FORMAT_RGBA_ASTC_6x5_KHR:
   3586 		case FORMAT_RGBA_ASTC_6x6_KHR:
   3587 		case FORMAT_RGBA_ASTC_8x5_KHR:
   3588 		case FORMAT_RGBA_ASTC_8x6_KHR:
   3589 		case FORMAT_RGBA_ASTC_8x8_KHR:
   3590 		case FORMAT_RGBA_ASTC_10x5_KHR:
   3591 		case FORMAT_RGBA_ASTC_10x6_KHR:
   3592 		case FORMAT_RGBA_ASTC_10x8_KHR:
   3593 		case FORMAT_RGBA_ASTC_10x10_KHR:
   3594 		case FORMAT_RGBA_ASTC_12x10_KHR:
   3595 		case FORMAT_RGBA_ASTC_12x12_KHR:
   3596 			// ASTC supports HDR, so a floating point format is required to represent it properly
   3597 			return FORMAT_A32B32G32R32F; // FIXME: 16FP is probably sufficient, but it's currently unsupported
   3598 		case FORMAT_ATI1:
   3599 		case FORMAT_R11_EAC:
   3600 			return FORMAT_R8;
   3601 		case FORMAT_SIGNED_R11_EAC:
   3602 			return FORMAT_R32F; // FIXME: Signed 8bit format would be sufficient
   3603 		case FORMAT_ATI2:
   3604 		case FORMAT_RG11_EAC:
   3605 			return FORMAT_G8R8;
   3606 		case FORMAT_SIGNED_RG11_EAC:
   3607 			return FORMAT_G32R32F; // FIXME: Signed 8bit format would be sufficient
   3608 		case FORMAT_ETC1:
   3609 		case FORMAT_RGB8_ETC2:
   3610 		case FORMAT_SRGB8_ETC2:
   3611 			return FORMAT_X8R8G8B8;
   3612 		// Bumpmap formats
   3613 		case FORMAT_V8U8:			return FORMAT_V8U8;
   3614 		case FORMAT_L6V5U5:			return FORMAT_X8L8V8U8;
   3615 		case FORMAT_Q8W8V8U8:		return FORMAT_Q8W8V8U8;
   3616 		case FORMAT_X8L8V8U8:		return FORMAT_X8L8V8U8;
   3617 		case FORMAT_V16U16:			return FORMAT_V16U16;
   3618 		case FORMAT_A2W10V10U10:	return FORMAT_A16W16V16U16;
   3619 		case FORMAT_Q16W16V16U16:	return FORMAT_Q16W16V16U16;
   3620 		// Floating-point formats
   3621 		case FORMAT_A16F:			return FORMAT_A32B32G32R32F;
   3622 		case FORMAT_R16F:			return FORMAT_R32F;
   3623 		case FORMAT_G16R16F:		return FORMAT_G32R32F;
   3624 		case FORMAT_B16G16R16F:     return FORMAT_X32B32G32R32F;
   3625 		case FORMAT_A16B16G16R16F:	return FORMAT_A32B32G32R32F;
   3626 		case FORMAT_A32F:			return FORMAT_A32B32G32R32F;
   3627 		case FORMAT_R32F:			return FORMAT_R32F;
   3628 		case FORMAT_G32R32F:		return FORMAT_G32R32F;
   3629 		case FORMAT_B32G32R32F:     return FORMAT_X32B32G32R32F;
   3630 		case FORMAT_X32B32G32R32F:  return FORMAT_X32B32G32R32F;
   3631 		case FORMAT_A32B32G32R32F:	return FORMAT_A32B32G32R32F;
   3632 		// Luminance formats
   3633 		case FORMAT_L8:				return FORMAT_L8;
   3634 		case FORMAT_A4L4:			return FORMAT_A8L8;
   3635 		case FORMAT_L16:			return FORMAT_L16;
   3636 		case FORMAT_A8L8:			return FORMAT_A8L8;
   3637 		case FORMAT_L16F:           return FORMAT_X32B32G32R32F;
   3638 		case FORMAT_A16L16F:        return FORMAT_A32B32G32R32F;
   3639 		case FORMAT_L32F:           return FORMAT_X32B32G32R32F;
   3640 		case FORMAT_A32L32F:        return FORMAT_A32B32G32R32F;
   3641 		// Depth/stencil formats
   3642 		case FORMAT_D16:
   3643 		case FORMAT_D32:
   3644 		case FORMAT_D24X8:
   3645 		case FORMAT_D24S8:
   3646 		case FORMAT_D24FS8:
   3647 			if(hasParent)   // Texture
   3648 			{
   3649 				return FORMAT_D32FS8_SHADOW;
   3650 			}
   3651 			else if(complementaryDepthBuffer)
   3652 			{
   3653 				return FORMAT_D32F_COMPLEMENTARY;
   3654 			}
   3655 			else
   3656 			{
   3657 				return FORMAT_D32F;
   3658 			}
   3659 		case FORMAT_D32F:           return FORMAT_D32F;
   3660 		case FORMAT_D32F_LOCKABLE:  return FORMAT_D32F_LOCKABLE;
   3661 		case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
   3662 		case FORMAT_INTZ:           return FORMAT_D32FS8_TEXTURE;
   3663 		case FORMAT_DF24S8:         return FORMAT_D32FS8_SHADOW;
   3664 		case FORMAT_DF16S8:         return FORMAT_D32FS8_SHADOW;
   3665 		case FORMAT_YV12_BT601:     return FORMAT_YV12_BT601;
   3666 		case FORMAT_YV12_BT709:     return FORMAT_YV12_BT709;
   3667 		case FORMAT_YV12_JFIF:      return FORMAT_YV12_JFIF;
   3668 		default:
   3669 			ASSERT(false);
   3670 		}
   3671 
   3672 		return FORMAT_NULL;
   3673 	}
   3674 
   3675 	void Surface::setTexturePalette(unsigned int *palette)
   3676 	{
   3677 		Surface::palette = palette;
   3678 		Surface::paletteID++;
   3679 	}
   3680 
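	// Resolves a multisampled render target by averaging all sample slices into slice 0.
	// Integer color formats use SSE2 packed rounding averages (_mm_avg_epu8/_mm_avg_epu16)
	// when available, with a scalar bit-twiddling fallback; floating-point formats are
	// averaged with SSE adds and a final scale.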
   3681 	void Surface::resolve()
   3682 	{
   3683 		if(internal.depth <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
   3684 		{
   3685 			return;
   3686 		}
   3687 
   3688 		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
   3689 
   3690 		int quality = internal.depth;
   3691 		int width = internal.width;
   3692 		int height = internal.height;
   3693 		int pitch = internal.pitchB;
   3694 		int slice = internal.sliceB;
   3695 
   3696 		unsigned char *source0 = (unsigned char*)source;
   3697 		unsigned char *source1 = source0 + slice;
   3698 		unsigned char *source2 = source1 + slice;
   3699 		unsigned char *source3 = source2 + slice;
   3700 		unsigned char *source4 = source3 + slice;
   3701 		unsigned char *source5 = source4 + slice;
   3702 		unsigned char *source6 = source5 + slice;
   3703 		unsigned char *source7 = source6 + slice;
   3704 		unsigned char *source8 = source7 + slice;
   3705 		unsigned char *source9 = source8 + slice;
   3706 		unsigned char *sourceA = source9 + slice;
   3707 		unsigned char *sourceB = sourceA + slice;
   3708 		unsigned char *sourceC = sourceB + slice;
   3709 		unsigned char *sourceD = sourceC + slice;
   3710 		unsigned char *sourceE = sourceD + slice;
   3711 		unsigned char *sourceF = sourceE + slice;
   3712 
   3713 		if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 ||
   3714 		   internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8 ||
   3715 		   internal.format == FORMAT_SRGB8_X8 || internal.format == FORMAT_SRGB8_A8)
   3716 		{
   3717 			if(CPUID::supportsSSE2() && (width % 4) == 0)
   3718 			{
   3719 				if(internal.depth == 2)
   3720 				{
   3721 					for(int y = 0; y < height; y++)
   3722 					{
   3723 						for(int x = 0; x < width; x += 4)
   3724 						{
   3725 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   3726 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   3727 
   3728 							c0 = _mm_avg_epu8(c0, c1);
   3729 
   3730 							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   3731 						}
   3732 
   3733 						source0 += pitch;
   3734 						source1 += pitch;
   3735 					}
   3736 				}
   3737 				else if(internal.depth == 4)
   3738 				{
   3739 					for(int y = 0; y < height; y++)
   3740 					{
   3741 						for(int x = 0; x < width; x += 4)
   3742 						{
   3743 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   3744 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   3745 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   3746 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   3747 
   3748 							c0 = _mm_avg_epu8(c0, c1);
   3749 							c2 = _mm_avg_epu8(c2, c3);
   3750 							c0 = _mm_avg_epu8(c0, c2);
   3751 
   3752 							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   3753 						}
   3754 
   3755 						source0 += pitch;
   3756 						source1 += pitch;
   3757 						source2 += pitch;
   3758 						source3 += pitch;
   3759 					}
   3760 				}
   3761 				else if(internal.depth == 8)
   3762 				{
   3763 					for(int y = 0; y < height; y++)
   3764 					{
   3765 						for(int x = 0; x < width; x += 4)
   3766 						{
   3767 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   3768 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   3769 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   3770 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   3771 							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
   3772 							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
   3773 							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
   3774 							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
   3775 
   3776 							c0 = _mm_avg_epu8(c0, c1);
   3777 							c2 = _mm_avg_epu8(c2, c3);
   3778 							c4 = _mm_avg_epu8(c4, c5);
   3779 							c6 = _mm_avg_epu8(c6, c7);
   3780 							c0 = _mm_avg_epu8(c0, c2);
   3781 							c4 = _mm_avg_epu8(c4, c6);
   3782 							c0 = _mm_avg_epu8(c0, c4);
   3783 
   3784 							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   3785 						}
   3786 
   3787 						source0 += pitch;
   3788 						source1 += pitch;
   3789 						source2 += pitch;
   3790 						source3 += pitch;
   3791 						source4 += pitch;
   3792 						source5 += pitch;
   3793 						source6 += pitch;
   3794 						source7 += pitch;
   3795 					}
   3796 				}
   3797 				else if(internal.depth == 16)
   3798 				{
   3799 					for(int y = 0; y < height; y++)
   3800 					{
   3801 						for(int x = 0; x < width; x += 4)
   3802 						{
   3803 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   3804 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   3805 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   3806 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   3807 							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
   3808 							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
   3809 							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
   3810 							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
   3811 							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
   3812 							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
   3813 							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
   3814 							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
   3815 							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
   3816 							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
   3817 							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
   3818 							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
   3819 
   3820 							c0 = _mm_avg_epu8(c0, c1);
   3821 							c2 = _mm_avg_epu8(c2, c3);
   3822 							c4 = _mm_avg_epu8(c4, c5);
   3823 							c6 = _mm_avg_epu8(c6, c7);
   3824 							c8 = _mm_avg_epu8(c8, c9);
   3825 							cA = _mm_avg_epu8(cA, cB);
   3826 							cC = _mm_avg_epu8(cC, cD);
   3827 							cE = _mm_avg_epu8(cE, cF);
   3828 							c0 = _mm_avg_epu8(c0, c2);
   3829 							c4 = _mm_avg_epu8(c4, c6);
   3830 							c8 = _mm_avg_epu8(c8, cA);
   3831 							cC = _mm_avg_epu8(cC, cE);
   3832 							c0 = _mm_avg_epu8(c0, c4);
   3833 							c8 = _mm_avg_epu8(c8, cC);
   3834 							c0 = _mm_avg_epu8(c0, c8);
   3835 
   3836 							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   3837 						}
   3838 
   3839 						source0 += pitch;
   3840 						source1 += pitch;
   3841 						source2 += pitch;
   3842 						source3 += pitch;
   3843 						source4 += pitch;
   3844 						source5 += pitch;
   3845 						source6 += pitch;
   3846 						source7 += pitch;
   3847 						source8 += pitch;
   3848 						source9 += pitch;
   3849 						sourceA += pitch;
   3850 						sourceB += pitch;
   3851 						sourceC += pitch;
   3852 						sourceD += pitch;
   3853 						sourceE += pitch;
   3854 						sourceF += pitch;
   3855 					}
   3856 				}
   3857 				else ASSERT(false);
   3858 			}
   3859 			else
   3860 			{
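				// Scalar fallback: AVERAGE(x, y) computes the rounded average of each 8-bit channel
				// of two packed 32-bit values without unpacking, matching _mm_avg_epu8:
				// (x & y) keeps the common bits, ((x ^ y) >> 1) masked per byte adds half the
				// differing bits, and ((x ^ y) & 0x01010101) rounds up. The 16-bit variants used
				// for the G16R16 and A16B16G16R16 paths below follow the same pattern per 16-bit lane.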
   3861 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
   3862 
   3863 				if(internal.depth == 2)
   3864 				{
   3865 					for(int y = 0; y < height; y++)
   3866 					{
   3867 						for(int x = 0; x < width; x++)
   3868 						{
   3869 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   3870 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   3871 
   3872 							c0 = AVERAGE(c0, c1);
   3873 
   3874 							*(unsigned int*)(source0 + 4 * x) = c0;
   3875 						}
   3876 
   3877 						source0 += pitch;
   3878 						source1 += pitch;
   3879 					}
   3880 				}
   3881 				else if(internal.depth == 4)
   3882 				{
   3883 					for(int y = 0; y < height; y++)
   3884 					{
   3885 						for(int x = 0; x < width; x++)
   3886 						{
   3887 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   3888 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   3889 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   3890 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   3891 
   3892 							c0 = AVERAGE(c0, c1);
   3893 							c2 = AVERAGE(c2, c3);
   3894 							c0 = AVERAGE(c0, c2);
   3895 
   3896 							*(unsigned int*)(source0 + 4 * x) = c0;
   3897 						}
   3898 
   3899 						source0 += pitch;
   3900 						source1 += pitch;
   3901 						source2 += pitch;
   3902 						source3 += pitch;
   3903 					}
   3904 				}
   3905 				else if(internal.depth == 8)
   3906 				{
   3907 					for(int y = 0; y < height; y++)
   3908 					{
   3909 						for(int x = 0; x < width; x++)
   3910 						{
   3911 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   3912 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   3913 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   3914 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   3915 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   3916 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   3917 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   3918 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   3919 
   3920 							c0 = AVERAGE(c0, c1);
   3921 							c2 = AVERAGE(c2, c3);
   3922 							c4 = AVERAGE(c4, c5);
   3923 							c6 = AVERAGE(c6, c7);
   3924 							c0 = AVERAGE(c0, c2);
   3925 							c4 = AVERAGE(c4, c6);
   3926 							c0 = AVERAGE(c0, c4);
   3927 
   3928 							*(unsigned int*)(source0 + 4 * x) = c0;
   3929 						}
   3930 
   3931 						source0 += pitch;
   3932 						source1 += pitch;
   3933 						source2 += pitch;
   3934 						source3 += pitch;
   3935 						source4 += pitch;
   3936 						source5 += pitch;
   3937 						source6 += pitch;
   3938 						source7 += pitch;
   3939 					}
   3940 				}
   3941 				else if(internal.depth == 16)
   3942 				{
   3943 					for(int y = 0; y < height; y++)
   3944 					{
   3945 						for(int x = 0; x < width; x++)
   3946 						{
   3947 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   3948 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   3949 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   3950 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   3951 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   3952 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   3953 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   3954 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   3955 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
   3956 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
   3957 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
   3958 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
   3959 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
   3960 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
   3961 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
   3962 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
   3963 
   3964 							c0 = AVERAGE(c0, c1);
   3965 							c2 = AVERAGE(c2, c3);
   3966 							c4 = AVERAGE(c4, c5);
   3967 							c6 = AVERAGE(c6, c7);
   3968 							c8 = AVERAGE(c8, c9);
   3969 							cA = AVERAGE(cA, cB);
   3970 							cC = AVERAGE(cC, cD);
   3971 							cE = AVERAGE(cE, cF);
   3972 							c0 = AVERAGE(c0, c2);
   3973 							c4 = AVERAGE(c4, c6);
   3974 							c8 = AVERAGE(c8, cA);
   3975 							cC = AVERAGE(cC, cE);
   3976 							c0 = AVERAGE(c0, c4);
   3977 							c8 = AVERAGE(c8, cC);
   3978 							c0 = AVERAGE(c0, c8);
   3979 
   3980 							*(unsigned int*)(source0 + 4 * x) = c0;
   3981 						}
   3982 
   3983 						source0 += pitch;
   3984 						source1 += pitch;
   3985 						source2 += pitch;
   3986 						source3 += pitch;
   3987 						source4 += pitch;
   3988 						source5 += pitch;
   3989 						source6 += pitch;
   3990 						source7 += pitch;
   3991 						source8 += pitch;
   3992 						source9 += pitch;
   3993 						sourceA += pitch;
   3994 						sourceB += pitch;
   3995 						sourceC += pitch;
   3996 						sourceD += pitch;
   3997 						sourceE += pitch;
   3998 						sourceF += pitch;
   3999 					}
   4000 				}
   4001 				else ASSERT(false);
   4002 
   4003 				#undef AVERAGE
   4004 			}
   4005 		}
   4006 		else if(internal.format == FORMAT_G16R16)
   4007 		{
   4008 			if(CPUID::supportsSSE2() && (width % 4) == 0)
   4009 			{
   4010 				if(internal.depth == 2)
   4011 				{
   4012 					for(int y = 0; y < height; y++)
   4013 					{
   4014 						for(int x = 0; x < width; x += 4)
   4015 						{
   4016 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4017 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4018 
   4019 							c0 = _mm_avg_epu16(c0, c1);
   4020 
   4021 							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4022 						}
   4023 
   4024 						source0 += pitch;
   4025 						source1 += pitch;
   4026 					}
   4027 				}
   4028 				else if(internal.depth == 4)
   4029 				{
   4030 					for(int y = 0; y < height; y++)
   4031 					{
   4032 						for(int x = 0; x < width; x += 4)
   4033 						{
   4034 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4035 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4036 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   4037 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   4038 
   4039 							c0 = _mm_avg_epu16(c0, c1);
   4040 							c2 = _mm_avg_epu16(c2, c3);
   4041 							c0 = _mm_avg_epu16(c0, c2);
   4042 
   4043 							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4044 						}
   4045 
   4046 						source0 += pitch;
   4047 						source1 += pitch;
   4048 						source2 += pitch;
   4049 						source3 += pitch;
   4050 					}
   4051 				}
   4052 				else if(internal.depth == 8)
   4053 				{
   4054 					for(int y = 0; y < height; y++)
   4055 					{
   4056 						for(int x = 0; x < width; x += 4)
   4057 						{
   4058 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4059 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4060 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   4061 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   4062 							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
   4063 							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
   4064 							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
   4065 							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
   4066 
   4067 							c0 = _mm_avg_epu16(c0, c1);
   4068 							c2 = _mm_avg_epu16(c2, c3);
   4069 							c4 = _mm_avg_epu16(c4, c5);
   4070 							c6 = _mm_avg_epu16(c6, c7);
   4071 							c0 = _mm_avg_epu16(c0, c2);
   4072 							c4 = _mm_avg_epu16(c4, c6);
   4073 							c0 = _mm_avg_epu16(c0, c4);
   4074 
   4075 							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4076 						}
   4077 
   4078 						source0 += pitch;
   4079 						source1 += pitch;
   4080 						source2 += pitch;
   4081 						source3 += pitch;
   4082 						source4 += pitch;
   4083 						source5 += pitch;
   4084 						source6 += pitch;
   4085 						source7 += pitch;
   4086 					}
   4087 				}
   4088 				else if(internal.depth == 16)
   4089 				{
   4090 					for(int y = 0; y < height; y++)
   4091 					{
   4092 						for(int x = 0; x < width; x += 4)
   4093 						{
   4094 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4095 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4096 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   4097 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   4098 							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
   4099 							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
   4100 							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
   4101 							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
   4102 							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
   4103 							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
   4104 							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
   4105 							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
   4106 							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
   4107 							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
   4108 							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
   4109 							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
   4110 
   4111 							c0 = _mm_avg_epu16(c0, c1);
   4112 							c2 = _mm_avg_epu16(c2, c3);
   4113 							c4 = _mm_avg_epu16(c4, c5);
   4114 							c6 = _mm_avg_epu16(c6, c7);
   4115 							c8 = _mm_avg_epu16(c8, c9);
   4116 							cA = _mm_avg_epu16(cA, cB);
   4117 							cC = _mm_avg_epu16(cC, cD);
   4118 							cE = _mm_avg_epu16(cE, cF);
   4119 							c0 = _mm_avg_epu16(c0, c2);
   4120 							c4 = _mm_avg_epu16(c4, c6);
   4121 							c8 = _mm_avg_epu16(c8, cA);
   4122 							cC = _mm_avg_epu16(cC, cE);
   4123 							c0 = _mm_avg_epu16(c0, c4);
   4124 							c8 = _mm_avg_epu16(c8, cC);
   4125 							c0 = _mm_avg_epu16(c0, c8);
   4126 
   4127 							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4128 						}
   4129 
   4130 						source0 += pitch;
   4131 						source1 += pitch;
   4132 						source2 += pitch;
   4133 						source3 += pitch;
   4134 						source4 += pitch;
   4135 						source5 += pitch;
   4136 						source6 += pitch;
   4137 						source7 += pitch;
   4138 						source8 += pitch;
   4139 						source9 += pitch;
   4140 						sourceA += pitch;
   4141 						sourceB += pitch;
   4142 						sourceC += pitch;
   4143 						sourceD += pitch;
   4144 						sourceE += pitch;
   4145 						sourceF += pitch;
   4146 					}
   4147 				}
   4148 				else ASSERT(false);
   4149 			}
   4150 			else
   4151 			{
   4152 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
   4153 
   4154 				if(internal.depth == 2)
   4155 				{
   4156 					for(int y = 0; y < height; y++)
   4157 					{
   4158 						for(int x = 0; x < width; x++)
   4159 						{
   4160 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4161 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4162 
   4163 							c0 = AVERAGE(c0, c1);
   4164 
   4165 							*(unsigned int*)(source0 + 4 * x) = c0;
   4166 						}
   4167 
   4168 						source0 += pitch;
   4169 						source1 += pitch;
   4170 					}
   4171 				}
   4172 				else if(internal.depth == 4)
   4173 				{
   4174 					for(int y = 0; y < height; y++)
   4175 					{
   4176 						for(int x = 0; x < width; x++)
   4177 						{
   4178 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4179 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4180 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4181 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4182 
   4183 							c0 = AVERAGE(c0, c1);
   4184 							c2 = AVERAGE(c2, c3);
   4185 							c0 = AVERAGE(c0, c2);
   4186 
   4187 							*(unsigned int*)(source0 + 4 * x) = c0;
   4188 						}
   4189 
   4190 						source0 += pitch;
   4191 						source1 += pitch;
   4192 						source2 += pitch;
   4193 						source3 += pitch;
   4194 					}
   4195 				}
   4196 				else if(internal.depth == 8)
   4197 				{
   4198 					for(int y = 0; y < height; y++)
   4199 					{
   4200 						for(int x = 0; x < width; x++)
   4201 						{
   4202 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4203 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4204 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4205 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4206 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4207 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4208 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4209 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4210 
   4211 							c0 = AVERAGE(c0, c1);
   4212 							c2 = AVERAGE(c2, c3);
   4213 							c4 = AVERAGE(c4, c5);
   4214 							c6 = AVERAGE(c6, c7);
   4215 							c0 = AVERAGE(c0, c2);
   4216 							c4 = AVERAGE(c4, c6);
   4217 							c0 = AVERAGE(c0, c4);
   4218 
   4219 							*(unsigned int*)(source0 + 4 * x) = c0;
   4220 						}
   4221 
   4222 						source0 += pitch;
   4223 						source1 += pitch;
   4224 						source2 += pitch;
   4225 						source3 += pitch;
   4226 						source4 += pitch;
   4227 						source5 += pitch;
   4228 						source6 += pitch;
   4229 						source7 += pitch;
   4230 					}
   4231 				}
   4232 				else if(internal.depth == 16)
   4233 				{
   4234 					for(int y = 0; y < height; y++)
   4235 					{
   4236 						for(int x = 0; x < width; x++)
   4237 						{
   4238 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4239 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4240 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4241 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4242 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4243 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4244 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4245 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4246 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
   4247 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
   4248 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
   4249 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
   4250 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
   4251 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
   4252 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
   4253 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
   4254 
   4255 							c0 = AVERAGE(c0, c1);
   4256 							c2 = AVERAGE(c2, c3);
   4257 							c4 = AVERAGE(c4, c5);
   4258 							c6 = AVERAGE(c6, c7);
   4259 							c8 = AVERAGE(c8, c9);
   4260 							cA = AVERAGE(cA, cB);
   4261 							cC = AVERAGE(cC, cD);
   4262 							cE = AVERAGE(cE, cF);
   4263 							c0 = AVERAGE(c0, c2);
   4264 							c4 = AVERAGE(c4, c6);
   4265 							c8 = AVERAGE(c8, cA);
   4266 							cC = AVERAGE(cC, cE);
   4267 							c0 = AVERAGE(c0, c4);
   4268 							c8 = AVERAGE(c8, cC);
   4269 							c0 = AVERAGE(c0, c8);
   4270 
   4271 							*(unsigned int*)(source0 + 4 * x) = c0;
   4272 						}
   4273 
   4274 						source0 += pitch;
   4275 						source1 += pitch;
   4276 						source2 += pitch;
   4277 						source3 += pitch;
   4278 						source4 += pitch;
   4279 						source5 += pitch;
   4280 						source6 += pitch;
   4281 						source7 += pitch;
   4282 						source8 += pitch;
   4283 						source9 += pitch;
   4284 						sourceA += pitch;
   4285 						sourceB += pitch;
   4286 						sourceC += pitch;
   4287 						sourceD += pitch;
   4288 						sourceE += pitch;
   4289 						sourceF += pitch;
   4290 					}
   4291 				}
   4292 				else ASSERT(false);
   4293 
   4294 				#undef AVERAGE
   4295 			}
   4296 		}
   4297 		else if(internal.format == FORMAT_A16B16G16R16)
   4298 		{
   4299 			if(CPUID::supportsSSE2() && (width % 2) == 0)
   4300 			{
   4301 				if(internal.depth == 2)
   4302 				{
   4303 					for(int y = 0; y < height; y++)
   4304 					{
   4305 						for(int x = 0; x < width; x += 2)
   4306 						{
   4307 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
   4308 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
   4309 
   4310 							c0 = _mm_avg_epu16(c0, c1);
   4311 
   4312 							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
   4313 						}
   4314 
   4315 						source0 += pitch;
   4316 						source1 += pitch;
   4317 					}
   4318 				}
   4319 				else if(internal.depth == 4)
   4320 				{
   4321 					for(int y = 0; y < height; y++)
   4322 					{
   4323 						for(int x = 0; x < width; x += 2)
   4324 						{
   4325 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
   4326 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
   4327 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
   4328 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
   4329 
   4330 							c0 = _mm_avg_epu16(c0, c1);
   4331 							c2 = _mm_avg_epu16(c2, c3);
   4332 							c0 = _mm_avg_epu16(c0, c2);
   4333 
   4334 							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
   4335 						}
   4336 
   4337 						source0 += pitch;
   4338 						source1 += pitch;
   4339 						source2 += pitch;
   4340 						source3 += pitch;
   4341 					}
   4342 				}
   4343 				else if(internal.depth == 8)
   4344 				{
   4345 					for(int y = 0; y < height; y++)
   4346 					{
   4347 						for(int x = 0; x < width; x += 2)
   4348 						{
   4349 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
   4350 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
   4351 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
   4352 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
   4353 							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
   4354 							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
   4355 							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
   4356 							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
   4357 
   4358 							c0 = _mm_avg_epu16(c0, c1);
   4359 							c2 = _mm_avg_epu16(c2, c3);
   4360 							c4 = _mm_avg_epu16(c4, c5);
   4361 							c6 = _mm_avg_epu16(c6, c7);
   4362 							c0 = _mm_avg_epu16(c0, c2);
   4363 							c4 = _mm_avg_epu16(c4, c6);
   4364 							c0 = _mm_avg_epu16(c0, c4);
   4365 
   4366 							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
   4367 						}
   4368 
   4369 						source0 += pitch;
   4370 						source1 += pitch;
   4371 						source2 += pitch;
   4372 						source3 += pitch;
   4373 						source4 += pitch;
   4374 						source5 += pitch;
   4375 						source6 += pitch;
   4376 						source7 += pitch;
   4377 					}
   4378 				}
   4379 				else if(internal.depth == 16)
   4380 				{
   4381 					for(int y = 0; y < height; y++)
   4382 					{
   4383 						for(int x = 0; x < width; x += 2)
   4384 						{
   4385 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
   4386 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
   4387 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
   4388 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
   4389 							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
   4390 							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
   4391 							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
   4392 							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
   4393 							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
   4394 							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
   4395 							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
   4396 							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
   4397 							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
   4398 							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
   4399 							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
   4400 							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
   4401 
   4402 							c0 = _mm_avg_epu16(c0, c1);
   4403 							c2 = _mm_avg_epu16(c2, c3);
   4404 							c4 = _mm_avg_epu16(c4, c5);
   4405 							c6 = _mm_avg_epu16(c6, c7);
   4406 							c8 = _mm_avg_epu16(c8, c9);
   4407 							cA = _mm_avg_epu16(cA, cB);
   4408 							cC = _mm_avg_epu16(cC, cD);
   4409 							cE = _mm_avg_epu16(cE, cF);
   4410 							c0 = _mm_avg_epu16(c0, c2);
   4411 							c4 = _mm_avg_epu16(c4, c6);
   4412 							c8 = _mm_avg_epu16(c8, cA);
   4413 							cC = _mm_avg_epu16(cC, cE);
   4414 							c0 = _mm_avg_epu16(c0, c4);
   4415 							c8 = _mm_avg_epu16(c8, cC);
   4416 							c0 = _mm_avg_epu16(c0, c8);
   4417 
   4418 							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
   4419 						}
   4420 
   4421 						source0 += pitch;
   4422 						source1 += pitch;
   4423 						source2 += pitch;
   4424 						source3 += pitch;
   4425 						source4 += pitch;
   4426 						source5 += pitch;
   4427 						source6 += pitch;
   4428 						source7 += pitch;
   4429 						source8 += pitch;
   4430 						source9 += pitch;
   4431 						sourceA += pitch;
   4432 						sourceB += pitch;
   4433 						sourceC += pitch;
   4434 						sourceD += pitch;
   4435 						sourceE += pitch;
   4436 						sourceF += pitch;
   4437 					}
   4438 				}
   4439 				else ASSERT(false);
   4440 			}
   4441 			else
   4442 			{
   4443 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
   4444 
   4445 				if(internal.depth == 2)
   4446 				{
   4447 					for(int y = 0; y < height; y++)
   4448 					{
   4449 						for(int x = 0; x < 2 * width; x++)
   4450 						{
   4451 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4452 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4453 
   4454 							c0 = AVERAGE(c0, c1);
   4455 
   4456 							*(unsigned int*)(source0 + 4 * x) = c0;
   4457 						}
   4458 
   4459 						source0 += pitch;
   4460 						source1 += pitch;
   4461 					}
   4462 				}
   4463 				else if(internal.depth == 4)
   4464 				{
   4465 					for(int y = 0; y < height; y++)
   4466 					{
   4467 						for(int x = 0; x < 2 * width; x++)
   4468 						{
   4469 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4470 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4471 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4472 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4473 
   4474 							c0 = AVERAGE(c0, c1);
   4475 							c2 = AVERAGE(c2, c3);
   4476 							c0 = AVERAGE(c0, c2);
   4477 
   4478 							*(unsigned int*)(source0 + 4 * x) = c0;
   4479 						}
   4480 
   4481 						source0 += pitch;
   4482 						source1 += pitch;
   4483 						source2 += pitch;
   4484 						source3 += pitch;
   4485 					}
   4486 				}
   4487 				else if(internal.depth == 8)
   4488 				{
   4489 					for(int y = 0; y < height; y++)
   4490 					{
   4491 						for(int x = 0; x < 2 * width; x++)
   4492 						{
   4493 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4494 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4495 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4496 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4497 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4498 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4499 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4500 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4501 
   4502 							c0 = AVERAGE(c0, c1);
   4503 							c2 = AVERAGE(c2, c3);
   4504 							c4 = AVERAGE(c4, c5);
   4505 							c6 = AVERAGE(c6, c7);
   4506 							c0 = AVERAGE(c0, c2);
   4507 							c4 = AVERAGE(c4, c6);
   4508 							c0 = AVERAGE(c0, c4);
   4509 
   4510 							*(unsigned int*)(source0 + 4 * x) = c0;
   4511 						}
   4512 
   4513 						source0 += pitch;
   4514 						source1 += pitch;
   4515 						source2 += pitch;
   4516 						source3 += pitch;
   4517 						source4 += pitch;
   4518 						source5 += pitch;
   4519 						source6 += pitch;
   4520 						source7 += pitch;
   4521 					}
   4522 				}
   4523 				else if(internal.depth == 16)
   4524 				{
   4525 					for(int y = 0; y < height; y++)
   4526 					{
   4527 						for(int x = 0; x < 2 * width; x++)
   4528 						{
   4529 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4530 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4531 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4532 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4533 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4534 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4535 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4536 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4537 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
   4538 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
   4539 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
   4540 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
   4541 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
   4542 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
   4543 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
   4544 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
   4545 
   4546 							c0 = AVERAGE(c0, c1);
   4547 							c2 = AVERAGE(c2, c3);
   4548 							c4 = AVERAGE(c4, c5);
   4549 							c6 = AVERAGE(c6, c7);
   4550 							c8 = AVERAGE(c8, c9);
   4551 							cA = AVERAGE(cA, cB);
   4552 							cC = AVERAGE(cC, cD);
   4553 							cE = AVERAGE(cE, cF);
   4554 							c0 = AVERAGE(c0, c2);
   4555 							c4 = AVERAGE(c4, c6);
   4556 							c8 = AVERAGE(c8, cA);
   4557 							cC = AVERAGE(cC, cE);
   4558 							c0 = AVERAGE(c0, c4);
   4559 							c8 = AVERAGE(c8, cC);
   4560 							c0 = AVERAGE(c0, c8);
   4561 
   4562 							*(unsigned int*)(source0 + 4 * x) = c0;
   4563 						}
   4564 
   4565 						source0 += pitch;
   4566 						source1 += pitch;
   4567 						source2 += pitch;
   4568 						source3 += pitch;
   4569 						source4 += pitch;
   4570 						source5 += pitch;
   4571 						source6 += pitch;
   4572 						source7 += pitch;
   4573 						source8 += pitch;
   4574 						source9 += pitch;
   4575 						sourceA += pitch;
   4576 						sourceB += pitch;
   4577 						sourceC += pitch;
   4578 						sourceD += pitch;
   4579 						sourceE += pitch;
   4580 						sourceF += pitch;
   4581 					}
   4582 				}
   4583 				else ASSERT(false);
   4584 
   4585 				#undef AVERAGE
   4586 			}
   4587 		}
   4588 		else if(internal.format == FORMAT_R32F)
   4589 		{
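         			// Single-channel float: samples are summed and scaled by the reciprocal of
         			// the sample count (internal.depth). The SSE path processes four pixels per
         			// __m128 and therefore requires the width to be a multiple of four;
         			// otherwise the scalar loop below is used.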
   4590 			if(CPUID::supportsSSE() && (width % 4) == 0)
   4591 			{
   4592 				if(internal.depth == 2)
   4593 				{
   4594 					for(int y = 0; y < height; y++)
   4595 					{
   4596 						for(int x = 0; x < width; x += 4)
   4597 						{
   4598 							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
   4599 							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
   4600 
   4601 							c0 = _mm_add_ps(c0, c1);
   4602 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
   4603 
   4604 							_mm_store_ps((float*)(source0 + 4 * x), c0);
   4605 						}
   4606 
   4607 						source0 += pitch;
   4608 						source1 += pitch;
   4609 					}
   4610 				}
   4611 				else if(internal.depth == 4)
   4612 				{
   4613 					for(int y = 0; y < height; y++)
   4614 					{
   4615 						for(int x = 0; x < width; x += 4)
   4616 						{
   4617 							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
   4618 							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
   4619 							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
   4620 							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
   4621 
   4622 							c0 = _mm_add_ps(c0, c1);
   4623 							c2 = _mm_add_ps(c2, c3);
   4624 							c0 = _mm_add_ps(c0, c2);
   4625 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
   4626 
   4627 							_mm_store_ps((float*)(source0 + 4 * x), c0);
   4628 						}
   4629 
   4630 						source0 += pitch;
   4631 						source1 += pitch;
   4632 						source2 += pitch;
   4633 						source3 += pitch;
   4634 					}
   4635 				}
   4636 				else if(internal.depth == 8)
   4637 				{
   4638 					for(int y = 0; y < height; y++)
   4639 					{
   4640 						for(int x = 0; x < width; x += 4)
   4641 						{
   4642 							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
   4643 							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
   4644 							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
   4645 							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
   4646 							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
   4647 							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
   4648 							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
   4649 							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
   4650 
   4651 							c0 = _mm_add_ps(c0, c1);
   4652 							c2 = _mm_add_ps(c2, c3);
   4653 							c4 = _mm_add_ps(c4, c5);
   4654 							c6 = _mm_add_ps(c6, c7);
   4655 							c0 = _mm_add_ps(c0, c2);
   4656 							c4 = _mm_add_ps(c4, c6);
   4657 							c0 = _mm_add_ps(c0, c4);
   4658 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
   4659 
   4660 							_mm_store_ps((float*)(source0 + 4 * x), c0);
   4661 						}
   4662 
   4663 						source0 += pitch;
   4664 						source1 += pitch;
   4665 						source2 += pitch;
   4666 						source3 += pitch;
   4667 						source4 += pitch;
   4668 						source5 += pitch;
   4669 						source6 += pitch;
   4670 						source7 += pitch;
   4671 					}
   4672 				}
   4673 				else if(internal.depth == 16)
   4674 				{
   4675 					for(int y = 0; y < height; y++)
   4676 					{
   4677 						for(int x = 0; x < width; x += 4)
   4678 						{
   4679 							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
   4680 							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
   4681 							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
   4682 							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
   4683 							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
   4684 							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
   4685 							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
   4686 							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
   4687 							__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
   4688 							__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
   4689 							__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
   4690 							__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
   4691 							__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
   4692 							__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
   4693 							__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
   4694 							__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
   4695 
   4696 							c0 = _mm_add_ps(c0, c1);
   4697 							c2 = _mm_add_ps(c2, c3);
   4698 							c4 = _mm_add_ps(c4, c5);
   4699 							c6 = _mm_add_ps(c6, c7);
   4700 							c8 = _mm_add_ps(c8, c9);
   4701 							cA = _mm_add_ps(cA, cB);
   4702 							cC = _mm_add_ps(cC, cD);
   4703 							cE = _mm_add_ps(cE, cF);
   4704 							c0 = _mm_add_ps(c0, c2);
   4705 							c4 = _mm_add_ps(c4, c6);
   4706 							c8 = _mm_add_ps(c8, cA);
   4707 							cC = _mm_add_ps(cC, cE);
   4708 							c0 = _mm_add_ps(c0, c4);
   4709 							c8 = _mm_add_ps(c8, cC);
   4710 							c0 = _mm_add_ps(c0, c8);
   4711 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
   4712 
   4713 							_mm_store_ps((float*)(source0 + 4 * x), c0);
   4714 						}
   4715 
   4716 						source0 += pitch;
   4717 						source1 += pitch;
   4718 						source2 += pitch;
   4719 						source3 += pitch;
   4720 						source4 += pitch;
   4721 						source5 += pitch;
   4722 						source6 += pitch;
   4723 						source7 += pitch;
   4724 						source8 += pitch;
   4725 						source9 += pitch;
   4726 						sourceA += pitch;
   4727 						sourceB += pitch;
   4728 						sourceC += pitch;
   4729 						sourceD += pitch;
   4730 						sourceE += pitch;
   4731 						sourceF += pitch;
   4732 					}
   4733 				}
   4734 				else ASSERT(false);
   4735 			}
   4736 			else
   4737 			{
   4738 				if(internal.depth == 2)
   4739 				{
   4740 					for(int y = 0; y < height; y++)
   4741 					{
   4742 						for(int x = 0; x < width; x++)
   4743 						{
   4744 							float c0 = *(float*)(source0 + 4 * x);
   4745 							float c1 = *(float*)(source1 + 4 * x);
   4746 
   4747 							c0 = c0 + c1;
   4748 							c0 *= 1.0f / 2.0f;
   4749 
   4750 							*(float*)(source0 + 4 * x) = c0;
   4751 						}
   4752 
   4753 						source0 += pitch;
   4754 						source1 += pitch;
   4755 					}
   4756 				}
   4757 				else if(internal.depth == 4)
   4758 				{
   4759 					for(int y = 0; y < height; y++)
   4760 					{
   4761 						for(int x = 0; x < width; x++)
   4762 						{
   4763 							float c0 = *(float*)(source0 + 4 * x);
   4764 							float c1 = *(float*)(source1 + 4 * x);
   4765 							float c2 = *(float*)(source2 + 4 * x);
   4766 							float c3 = *(float*)(source3 + 4 * x);
   4767 
   4768 							c0 = c0 + c1;
   4769 							c2 = c2 + c3;
   4770 							c0 = c0 + c2;
   4771 							c0 *= 1.0f / 4.0f;
   4772 
   4773 							*(float*)(source0 + 4 * x) = c0;
   4774 						}
   4775 
   4776 						source0 += pitch;
   4777 						source1 += pitch;
   4778 						source2 += pitch;
   4779 						source3 += pitch;
   4780 					}
   4781 				}
   4782 				else if(internal.depth == 8)
   4783 				{
   4784 					for(int y = 0; y < height; y++)
   4785 					{
   4786 						for(int x = 0; x < width; x++)
   4787 						{
   4788 							float c0 = *(float*)(source0 + 4 * x);
   4789 							float c1 = *(float*)(source1 + 4 * x);
   4790 							float c2 = *(float*)(source2 + 4 * x);
   4791 							float c3 = *(float*)(source3 + 4 * x);
   4792 							float c4 = *(float*)(source4 + 4 * x);
   4793 							float c5 = *(float*)(source5 + 4 * x);
   4794 							float c6 = *(float*)(source6 + 4 * x);
   4795 							float c7 = *(float*)(source7 + 4 * x);
   4796 
   4797 							c0 = c0 + c1;
   4798 							c2 = c2 + c3;
   4799 							c4 = c4 + c5;
   4800 							c6 = c6 + c7;
   4801 							c0 = c0 + c2;
   4802 							c4 = c4 + c6;
   4803 							c0 = c0 + c4;
   4804 							c0 *= 1.0f / 8.0f;
   4805 
   4806 							*(float*)(source0 + 4 * x) = c0;
   4807 						}
   4808 
   4809 						source0 += pitch;
   4810 						source1 += pitch;
   4811 						source2 += pitch;
   4812 						source3 += pitch;
   4813 						source4 += pitch;
   4814 						source5 += pitch;
   4815 						source6 += pitch;
   4816 						source7 += pitch;
   4817 					}
   4818 				}
   4819 				else if(internal.depth == 16)
   4820 				{
   4821 					for(int y = 0; y < height; y++)
   4822 					{
   4823 						for(int x = 0; x < width; x++)
   4824 						{
   4825 							float c0 = *(float*)(source0 + 4 * x);
   4826 							float c1 = *(float*)(source1 + 4 * x);
   4827 							float c2 = *(float*)(source2 + 4 * x);
   4828 							float c3 = *(float*)(source3 + 4 * x);
   4829 							float c4 = *(float*)(source4 + 4 * x);
   4830 							float c5 = *(float*)(source5 + 4 * x);
   4831 							float c6 = *(float*)(source6 + 4 * x);
   4832 							float c7 = *(float*)(source7 + 4 * x);
   4833 							float c8 = *(float*)(source8 + 4 * x);
   4834 							float c9 = *(float*)(source9 + 4 * x);
   4835 							float cA = *(float*)(sourceA + 4 * x);
   4836 							float cB = *(float*)(sourceB + 4 * x);
   4837 							float cC = *(float*)(sourceC + 4 * x);
   4838 							float cD = *(float*)(sourceD + 4 * x);
   4839 							float cE = *(float*)(sourceE + 4 * x);
   4840 							float cF = *(float*)(sourceF + 4 * x);
   4841 
   4842 							c0 = c0 + c1;
   4843 							c2 = c2 + c3;
   4844 							c4 = c4 + c5;
   4845 							c6 = c6 + c7;
   4846 							c8 = c8 + c9;
   4847 							cA = cA + cB;
   4848 							cC = cC + cD;
   4849 							cE = cE + cF;
   4850 							c0 = c0 + c2;
   4851 							c4 = c4 + c6;
   4852 							c8 = c8 + cA;
   4853 							cC = cC + cE;
   4854 							c0 = c0 + c4;
   4855 							c8 = c8 + cC;
   4856 							c0 = c0 + c8;
   4857 							c0 *= 1.0f / 16.0f;
   4858 
   4859 							*(float*)(source0 + 4 * x) = c0;
   4860 						}
   4861 
   4862 						source0 += pitch;
   4863 						source1 += pitch;
   4864 						source2 += pitch;
   4865 						source3 += pitch;
   4866 						source4 += pitch;
   4867 						source5 += pitch;
   4868 						source6 += pitch;
   4869 						source7 += pitch;
   4870 						source8 += pitch;
   4871 						source9 += pitch;
   4872 						sourceA += pitch;
   4873 						sourceB += pitch;
   4874 						sourceC += pitch;
   4875 						sourceD += pitch;
   4876 						sourceE += pitch;
   4877 						sourceF += pitch;
   4878 					}
   4879 				}
   4880 				else ASSERT(false);
   4881 			}
   4882 		}
   4883 		else if(internal.format == FORMAT_G32R32F)
   4884 		{
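         			// Two floats per pixel: one __m128 covers two pixels, so the SSE path needs
         			// an even width. The scalar fallback simply averages 2 * width floats per row.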
   4885 			if(CPUID::supportsSSE() && (width % 2) == 0)
   4886 			{
   4887 				if(internal.depth == 2)
   4888 				{
   4889 					for(int y = 0; y < height; y++)
   4890 					{
   4891 						for(int x = 0; x < width; x += 2)
   4892 						{
   4893 							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
   4894 							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
   4895 
   4896 							c0 = _mm_add_ps(c0, c1);
   4897 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
   4898 
   4899 							_mm_store_ps((float*)(source0 + 8 * x), c0);
   4900 						}
   4901 
   4902 						source0 += pitch;
   4903 						source1 += pitch;
   4904 					}
   4905 				}
   4906 				else if(internal.depth == 4)
   4907 				{
   4908 					for(int y = 0; y < height; y++)
   4909 					{
   4910 						for(int x = 0; x < width; x += 2)
   4911 						{
   4912 							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
   4913 							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
   4914 							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
   4915 							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
   4916 
   4917 							c0 = _mm_add_ps(c0, c1);
   4918 							c2 = _mm_add_ps(c2, c3);
   4919 							c0 = _mm_add_ps(c0, c2);
   4920 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
   4921 
   4922 							_mm_store_ps((float*)(source0 + 8 * x), c0);
   4923 						}
   4924 
   4925 						source0 += pitch;
   4926 						source1 += pitch;
   4927 						source2 += pitch;
   4928 						source3 += pitch;
   4929 					}
   4930 				}
   4931 				else if(internal.depth == 8)
   4932 				{
   4933 					for(int y = 0; y < height; y++)
   4934 					{
   4935 						for(int x = 0; x < width; x += 2)
   4936 						{
   4937 							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
   4938 							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
   4939 							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
   4940 							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
   4941 							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
   4942 							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
   4943 							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
   4944 							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
   4945 
   4946 							c0 = _mm_add_ps(c0, c1);
   4947 							c2 = _mm_add_ps(c2, c3);
   4948 							c4 = _mm_add_ps(c4, c5);
   4949 							c6 = _mm_add_ps(c6, c7);
   4950 							c0 = _mm_add_ps(c0, c2);
   4951 							c4 = _mm_add_ps(c4, c6);
   4952 							c0 = _mm_add_ps(c0, c4);
   4953 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
   4954 
   4955 							_mm_store_ps((float*)(source0 + 8 * x), c0);
   4956 						}
   4957 
   4958 						source0 += pitch;
   4959 						source1 += pitch;
   4960 						source2 += pitch;
   4961 						source3 += pitch;
   4962 						source4 += pitch;
   4963 						source5 += pitch;
   4964 						source6 += pitch;
   4965 						source7 += pitch;
   4966 					}
   4967 				}
   4968 				else if(internal.depth == 16)
   4969 				{
   4970 					for(int y = 0; y < height; y++)
   4971 					{
   4972 						for(int x = 0; x < width; x += 2)
   4973 						{
   4974 							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
   4975 							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
   4976 							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
   4977 							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
   4978 							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
   4979 							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
   4980 							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
   4981 							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
   4982 							__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
   4983 							__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
   4984 							__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
   4985 							__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
   4986 							__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
   4987 							__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
   4988 							__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
   4989 							__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
   4990 
   4991 							c0 = _mm_add_ps(c0, c1);
   4992 							c2 = _mm_add_ps(c2, c3);
   4993 							c4 = _mm_add_ps(c4, c5);
   4994 							c6 = _mm_add_ps(c6, c7);
   4995 							c8 = _mm_add_ps(c8, c9);
   4996 							cA = _mm_add_ps(cA, cB);
   4997 							cC = _mm_add_ps(cC, cD);
   4998 							cE = _mm_add_ps(cE, cF);
   4999 							c0 = _mm_add_ps(c0, c2);
   5000 							c4 = _mm_add_ps(c4, c6);
   5001 							c8 = _mm_add_ps(c8, cA);
   5002 							cC = _mm_add_ps(cC, cE);
   5003 							c0 = _mm_add_ps(c0, c4);
   5004 							c8 = _mm_add_ps(c8, cC);
   5005 							c0 = _mm_add_ps(c0, c8);
   5006 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
   5007 
   5008 							_mm_store_ps((float*)(source0 + 8 * x), c0);
   5009 						}
   5010 
   5011 						source0 += pitch;
   5012 						source1 += pitch;
   5013 						source2 += pitch;
   5014 						source3 += pitch;
   5015 						source4 += pitch;
   5016 						source5 += pitch;
   5017 						source6 += pitch;
   5018 						source7 += pitch;
   5019 						source8 += pitch;
   5020 						source9 += pitch;
   5021 						sourceA += pitch;
   5022 						sourceB += pitch;
   5023 						sourceC += pitch;
   5024 						sourceD += pitch;
   5025 						sourceE += pitch;
   5026 						sourceF += pitch;
   5027 					}
   5028 				}
   5029 				else ASSERT(false);
   5030 			}
   5031 			else
   5032 			{
   5033 				if(internal.depth == 2)
   5034 				{
   5035 					for(int y = 0; y < height; y++)
   5036 					{
   5037 						for(int x = 0; x < 2 * width; x++)
   5038 						{
   5039 							float c0 = *(float*)(source0 + 4 * x);
   5040 							float c1 = *(float*)(source1 + 4 * x);
   5041 
   5042 							c0 = c0 + c1;
   5043 							c0 *= 1.0f / 2.0f;
   5044 
   5045 							*(float*)(source0 + 4 * x) = c0;
   5046 						}
   5047 
   5048 						source0 += pitch;
   5049 						source1 += pitch;
   5050 					}
   5051 				}
   5052 				else if(internal.depth == 4)
   5053 				{
   5054 					for(int y = 0; y < height; y++)
   5055 					{
   5056 						for(int x = 0; x < 2 * width; x++)
   5057 						{
   5058 							float c0 = *(float*)(source0 + 4 * x);
   5059 							float c1 = *(float*)(source1 + 4 * x);
   5060 							float c2 = *(float*)(source2 + 4 * x);
   5061 							float c3 = *(float*)(source3 + 4 * x);
   5062 
   5063 							c0 = c0 + c1;
   5064 							c2 = c2 + c3;
   5065 							c0 = c0 + c2;
   5066 							c0 *= 1.0f / 4.0f;
   5067 
   5068 							*(float*)(source0 + 4 * x) = c0;
   5069 						}
   5070 
   5071 						source0 += pitch;
   5072 						source1 += pitch;
   5073 						source2 += pitch;
   5074 						source3 += pitch;
   5075 					}
   5076 				}
   5077 				else if(internal.depth == 8)
   5078 				{
   5079 					for(int y = 0; y < height; y++)
   5080 					{
   5081 						for(int x = 0; x < 2 * width; x++)
   5082 						{
   5083 							float c0 = *(float*)(source0 + 4 * x);
   5084 							float c1 = *(float*)(source1 + 4 * x);
   5085 							float c2 = *(float*)(source2 + 4 * x);
   5086 							float c3 = *(float*)(source3 + 4 * x);
   5087 							float c4 = *(float*)(source4 + 4 * x);
   5088 							float c5 = *(float*)(source5 + 4 * x);
   5089 							float c6 = *(float*)(source6 + 4 * x);
   5090 							float c7 = *(float*)(source7 + 4 * x);
   5091 
   5092 							c0 = c0 + c1;
   5093 							c2 = c2 + c3;
   5094 							c4 = c4 + c5;
   5095 							c6 = c6 + c7;
   5096 							c0 = c0 + c2;
   5097 							c4 = c4 + c6;
   5098 							c0 = c0 + c4;
   5099 							c0 *= 1.0f / 8.0f;
   5100 
   5101 							*(float*)(source0 + 4 * x) = c0;
   5102 						}
   5103 
   5104 						source0 += pitch;
   5105 						source1 += pitch;
   5106 						source2 += pitch;
   5107 						source3 += pitch;
   5108 						source4 += pitch;
   5109 						source5 += pitch;
   5110 						source6 += pitch;
   5111 						source7 += pitch;
   5112 					}
   5113 				}
   5114 				else if(internal.depth == 16)
   5115 				{
   5116 					for(int y = 0; y < height; y++)
   5117 					{
   5118 						for(int x = 0; x < 2 * width; x++)
   5119 						{
   5120 							float c0 = *(float*)(source0 + 4 * x);
   5121 							float c1 = *(float*)(source1 + 4 * x);
   5122 							float c2 = *(float*)(source2 + 4 * x);
   5123 							float c3 = *(float*)(source3 + 4 * x);
   5124 							float c4 = *(float*)(source4 + 4 * x);
   5125 							float c5 = *(float*)(source5 + 4 * x);
   5126 							float c6 = *(float*)(source6 + 4 * x);
   5127 							float c7 = *(float*)(source7 + 4 * x);
   5128 							float c8 = *(float*)(source8 + 4 * x);
   5129 							float c9 = *(float*)(source9 + 4 * x);
   5130 							float cA = *(float*)(sourceA + 4 * x);
   5131 							float cB = *(float*)(sourceB + 4 * x);
   5132 							float cC = *(float*)(sourceC + 4 * x);
   5133 							float cD = *(float*)(sourceD + 4 * x);
   5134 							float cE = *(float*)(sourceE + 4 * x);
   5135 							float cF = *(float*)(sourceF + 4 * x);
   5136 
   5137 							c0 = c0 + c1;
   5138 							c2 = c2 + c3;
   5139 							c4 = c4 + c5;
   5140 							c6 = c6 + c7;
   5141 							c8 = c8 + c9;
   5142 							cA = cA + cB;
   5143 							cC = cC + cD;
   5144 							cE = cE + cF;
   5145 							c0 = c0 + c2;
   5146 							c4 = c4 + c6;
   5147 							c8 = c8 + cA;
   5148 							cC = cC + cE;
   5149 							c0 = c0 + c4;
   5150 							c8 = c8 + cC;
   5151 							c0 = c0 + c8;
   5152 							c0 *= 1.0f / 16.0f;
   5153 
   5154 							*(float*)(source0 + 4 * x) = c0;
   5155 						}
   5156 
   5157 						source0 += pitch;
   5158 						source1 += pitch;
   5159 						source2 += pitch;
   5160 						source3 += pitch;
   5161 						source4 += pitch;
   5162 						source5 += pitch;
   5163 						source6 += pitch;
   5164 						source7 += pitch;
   5165 						source8 += pitch;
   5166 						source9 += pitch;
   5167 						sourceA += pitch;
   5168 						sourceB += pitch;
   5169 						sourceC += pitch;
   5170 						sourceD += pitch;
   5171 						sourceE += pitch;
   5172 						sourceF += pitch;
   5173 					}
   5174 				}
   5175 				else ASSERT(false);
   5176 			}
   5177 		}
   5178 		else if(internal.format == FORMAT_A32B32G32R32F || internal.format == FORMAT_X32B32G32R32F)
   5179 		{
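         			// Four floats per pixel: each pixel already fills a whole __m128, so the SSE
         			// path imposes no alignment requirement on the width.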
   5180 			if(CPUID::supportsSSE())
   5181 			{
   5182 				if(internal.depth == 2)
   5183 				{
   5184 					for(int y = 0; y < height; y++)
   5185 					{
   5186 						for(int x = 0; x < width; x++)
   5187 						{
   5188 							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
   5189 							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
   5190 
   5191 							c0 = _mm_add_ps(c0, c1);
   5192 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
   5193 
   5194 							_mm_store_ps((float*)(source0 + 16 * x), c0);
   5195 						}
   5196 
   5197 						source0 += pitch;
   5198 						source1 += pitch;
   5199 					}
   5200 				}
   5201 				else if(internal.depth == 4)
   5202 				{
   5203 					for(int y = 0; y < height; y++)
   5204 					{
   5205 						for(int x = 0; x < width; x++)
   5206 						{
   5207 							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
   5208 							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
   5209 							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
   5210 							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
   5211 
   5212 							c0 = _mm_add_ps(c0, c1);
   5213 							c2 = _mm_add_ps(c2, c3);
   5214 							c0 = _mm_add_ps(c0, c2);
   5215 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
   5216 
   5217 							_mm_store_ps((float*)(source0 + 16 * x), c0);
   5218 						}
   5219 
   5220 						source0 += pitch;
   5221 						source1 += pitch;
   5222 						source2 += pitch;
   5223 						source3 += pitch;
   5224 					}
   5225 				}
   5226 				else if(internal.depth == 8)
   5227 				{
   5228 					for(int y = 0; y < height; y++)
   5229 					{
   5230 						for(int x = 0; x < width; x++)
   5231 						{
   5232 							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
   5233 							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
   5234 							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
   5235 							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
   5236 							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
   5237 							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
   5238 							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
   5239 							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
   5240 
   5241 							c0 = _mm_add_ps(c0, c1);
   5242 							c2 = _mm_add_ps(c2, c3);
   5243 							c4 = _mm_add_ps(c4, c5);
   5244 							c6 = _mm_add_ps(c6, c7);
   5245 							c0 = _mm_add_ps(c0, c2);
   5246 							c4 = _mm_add_ps(c4, c6);
   5247 							c0 = _mm_add_ps(c0, c4);
   5248 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
   5249 
   5250 							_mm_store_ps((float*)(source0 + 16 * x), c0);
   5251 						}
   5252 
   5253 						source0 += pitch;
   5254 						source1 += pitch;
   5255 						source2 += pitch;
   5256 						source3 += pitch;
   5257 						source4 += pitch;
   5258 						source5 += pitch;
   5259 						source6 += pitch;
   5260 						source7 += pitch;
   5261 					}
   5262 				}
   5263 				else if(internal.depth == 16)
   5264 				{
   5265 					for(int y = 0; y < height; y++)
   5266 					{
   5267 						for(int x = 0; x < width; x++)
   5268 						{
   5269 							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
   5270 							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
   5271 							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
   5272 							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
   5273 							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
   5274 							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
   5275 							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
   5276 							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
   5277 							__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
   5278 							__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
   5279 							__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
   5280 							__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
   5281 							__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
   5282 							__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
   5283 							__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
   5284 							__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
   5285 
   5286 							c0 = _mm_add_ps(c0, c1);
   5287 							c2 = _mm_add_ps(c2, c3);
   5288 							c4 = _mm_add_ps(c4, c5);
   5289 							c6 = _mm_add_ps(c6, c7);
   5290 							c8 = _mm_add_ps(c8, c9);
   5291 							cA = _mm_add_ps(cA, cB);
   5292 							cC = _mm_add_ps(cC, cD);
   5293 							cE = _mm_add_ps(cE, cF);
   5294 							c0 = _mm_add_ps(c0, c2);
   5295 							c4 = _mm_add_ps(c4, c6);
   5296 							c8 = _mm_add_ps(c8, cA);
   5297 							cC = _mm_add_ps(cC, cE);
   5298 							c0 = _mm_add_ps(c0, c4);
   5299 							c8 = _mm_add_ps(c8, cC);
   5300 							c0 = _mm_add_ps(c0, c8);
   5301 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
   5302 
   5303 							_mm_store_ps((float*)(source0 + 16 * x), c0);
   5304 						}
   5305 
   5306 						source0 += pitch;
   5307 						source1 += pitch;
   5308 						source2 += pitch;
   5309 						source3 += pitch;
   5310 						source4 += pitch;
   5311 						source5 += pitch;
   5312 						source6 += pitch;
   5313 						source7 += pitch;
   5314 						source8 += pitch;
   5315 						source9 += pitch;
   5316 						sourceA += pitch;
   5317 						sourceB += pitch;
   5318 						sourceC += pitch;
   5319 						sourceD += pitch;
   5320 						sourceE += pitch;
   5321 						sourceF += pitch;
   5322 					}
   5323 				}
   5324 				else ASSERT(false);
   5325 			}
   5326 			else
   5327 			{
   5328 				if(internal.depth == 2)
   5329 				{
   5330 					for(int y = 0; y < height; y++)
   5331 					{
   5332 						for(int x = 0; x < 4 * width; x++)
   5333 						{
   5334 							float c0 = *(float*)(source0 + 4 * x);
   5335 							float c1 = *(float*)(source1 + 4 * x);
   5336 
   5337 							c0 = c0 + c1;
   5338 							c0 *= 1.0f / 2.0f;
   5339 
   5340 							*(float*)(source0 + 4 * x) = c0;
   5341 						}
   5342 
   5343 						source0 += pitch;
   5344 						source1 += pitch;
   5345 					}
   5346 				}
   5347 				else if(internal.depth == 4)
   5348 				{
   5349 					for(int y = 0; y < height; y++)
   5350 					{
   5351 						for(int x = 0; x < 4 * width; x++)
   5352 						{
   5353 							float c0 = *(float*)(source0 + 4 * x);
   5354 							float c1 = *(float*)(source1 + 4 * x);
   5355 							float c2 = *(float*)(source2 + 4 * x);
   5356 							float c3 = *(float*)(source3 + 4 * x);
   5357 
   5358 							c0 = c0 + c1;
   5359 							c2 = c2 + c3;
   5360 							c0 = c0 + c2;
   5361 							c0 *= 1.0f / 4.0f;
   5362 
   5363 							*(float*)(source0 + 4 * x) = c0;
   5364 						}
   5365 
   5366 						source0 += pitch;
   5367 						source1 += pitch;
   5368 						source2 += pitch;
   5369 						source3 += pitch;
   5370 					}
   5371 				}
   5372 				else if(internal.depth == 8)
   5373 				{
   5374 					for(int y = 0; y < height; y++)
   5375 					{
   5376 						for(int x = 0; x < 4 * width; x++)
   5377 						{
   5378 							float c0 = *(float*)(source0 + 4 * x);
   5379 							float c1 = *(float*)(source1 + 4 * x);
   5380 							float c2 = *(float*)(source2 + 4 * x);
   5381 							float c3 = *(float*)(source3 + 4 * x);
   5382 							float c4 = *(float*)(source4 + 4 * x);
   5383 							float c5 = *(float*)(source5 + 4 * x);
   5384 							float c6 = *(float*)(source6 + 4 * x);
   5385 							float c7 = *(float*)(source7 + 4 * x);
   5386 
   5387 							c0 = c0 + c1;
   5388 							c2 = c2 + c3;
   5389 							c4 = c4 + c5;
   5390 							c6 = c6 + c7;
   5391 							c0 = c0 + c2;
   5392 							c4 = c4 + c6;
   5393 							c0 = c0 + c4;
   5394 							c0 *= 1.0f / 8.0f;
   5395 
   5396 							*(float*)(source0 + 4 * x) = c0;
   5397 						}
   5398 
   5399 						source0 += pitch;
   5400 						source1 += pitch;
   5401 						source2 += pitch;
   5402 						source3 += pitch;
   5403 						source4 += pitch;
   5404 						source5 += pitch;
   5405 						source6 += pitch;
   5406 						source7 += pitch;
   5407 					}
   5408 				}
   5409 				else if(internal.depth == 16)
   5410 				{
   5411 					for(int y = 0; y < height; y++)
   5412 					{
   5413 						for(int x = 0; x < 4 * width; x++)
   5414 						{
   5415 							float c0 = *(float*)(source0 + 4 * x);
   5416 							float c1 = *(float*)(source1 + 4 * x);
   5417 							float c2 = *(float*)(source2 + 4 * x);
   5418 							float c3 = *(float*)(source3 + 4 * x);
   5419 							float c4 = *(float*)(source4 + 4 * x);
   5420 							float c5 = *(float*)(source5 + 4 * x);
   5421 							float c6 = *(float*)(source6 + 4 * x);
   5422 							float c7 = *(float*)(source7 + 4 * x);
   5423 							float c8 = *(float*)(source8 + 4 * x);
   5424 							float c9 = *(float*)(source9 + 4 * x);
   5425 							float cA = *(float*)(sourceA + 4 * x);
   5426 							float cB = *(float*)(sourceB + 4 * x);
   5427 							float cC = *(float*)(sourceC + 4 * x);
   5428 							float cD = *(float*)(sourceD + 4 * x);
   5429 							float cE = *(float*)(sourceE + 4 * x);
   5430 							float cF = *(float*)(sourceF + 4 * x);
   5431 
   5432 							c0 = c0 + c1;
   5433 							c2 = c2 + c3;
   5434 							c4 = c4 + c5;
   5435 							c6 = c6 + c7;
   5436 							c8 = c8 + c9;
   5437 							cA = cA + cB;
   5438 							cC = cC + cD;
   5439 							cE = cE + cF;
   5440 							c0 = c0 + c2;
   5441 							c4 = c4 + c6;
   5442 							c8 = c8 + cA;
   5443 							cC = cC + cE;
   5444 							c0 = c0 + c4;
   5445 							c8 = c8 + cC;
   5446 							c0 = c0 + c8;
   5447 							c0 *= 1.0f / 16.0f;
   5448 
   5449 							*(float*)(source0 + 4 * x) = c0;
   5450 						}
   5451 
   5452 						source0 += pitch;
   5453 						source1 += pitch;
   5454 						source2 += pitch;
   5455 						source3 += pitch;
   5456 						source4 += pitch;
   5457 						source5 += pitch;
   5458 						source6 += pitch;
   5459 						source7 += pitch;
   5460 						source8 += pitch;
   5461 						source9 += pitch;
   5462 						sourceA += pitch;
   5463 						sourceB += pitch;
   5464 						sourceC += pitch;
   5465 						sourceD += pitch;
   5466 						sourceE += pitch;
   5467 						sourceF += pitch;
   5468 					}
   5469 				}
   5470 				else ASSERT(false);
   5471 			}
   5472 		}
   5473 		else if(internal.format == FORMAT_R5G6B5)
   5474 		{
   5475 			if(CPUID::supportsSSE2() && (width % 8) == 0)
   5476 			{
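         				// Each 16-bit pixel is split with two masks: 0xF81F keeps red and blue,
         				// 0x07E0 keeps green. Red and blue never cross a byte boundary, so they can
         				// be averaged per byte; green straddles both bytes and is averaged in
         				// 16-bit lanes. The masks are re-applied after averaging and the two halves
         				// are OR'ed back together.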
   5477 				if(internal.depth == 2)
   5478 				{
   5479 					for(int y = 0; y < height; y++)
   5480 					{
   5481 						for(int x = 0; x < width; x += 8)
   5482 						{
   5483 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
   5484 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
   5485 
   5486 							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
   5487 							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
   5488 							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5489 							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
   5490 							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
   5491 							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5492 
   5493 							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
   5494 							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5495 							c1 = _mm_avg_epu16(c0__g_, c1__g_);
   5496 							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5497 							c0 = _mm_or_si128(c0, c1);
   5498 
   5499 							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
   5500 						}
   5501 
   5502 						source0 += pitch;
   5503 						source1 += pitch;
   5504 					}
   5505 				}
   5506 				else if(internal.depth == 4)
   5507 				{
   5508 					for(int y = 0; y < height; y++)
   5509 					{
   5510 						for(int x = 0; x < width; x += 8)
   5511 						{
   5512 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
   5513 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
   5514 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
   5515 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
   5516 
   5517 							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
   5518 							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
   5519 							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5520 							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
   5521 							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
   5522 							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5523 							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
   5524 							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
   5525 							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
   5526 							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
   5527 
   5528 							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
   5529 							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
   5530 							c0 = _mm_avg_epu8(c0, c2);
   5531 							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5532 							c1 = _mm_avg_epu16(c0__g_, c1__g_);
   5533 							c3 = _mm_avg_epu16(c2__g_, c3__g_);
   5534 							c1 = _mm_avg_epu16(c1, c3);
   5535 							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5536 							c0 = _mm_or_si128(c0, c1);
   5537 
   5538 							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
   5539 						}
   5540 
   5541 						source0 += pitch;
   5542 						source1 += pitch;
   5543 						source2 += pitch;
   5544 						source3 += pitch;
   5545 					}
   5546 				}
   5547 				else if(internal.depth == 8)
   5548 				{
   5549 					for(int y = 0; y < height; y++)
   5550 					{
   5551 						for(int x = 0; x < width; x += 8)
   5552 						{
   5553 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
   5554 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
   5555 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
   5556 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
   5557 							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
   5558 							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
   5559 							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
   5560 							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
   5561 
   5562 							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
   5563 							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
   5564 							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5565 							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
   5566 							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
   5567 							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5568 							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
   5569 							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
   5570 							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
   5571 							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
   5572 							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
   5573 							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
   5574 							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
   5575 							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
   5576 							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
   5577 							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
   5578 							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
   5579 							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
   5580 
   5581 							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
   5582 							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
   5583 							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
   5584 							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
   5585 							c0 = _mm_avg_epu8(c0, c2);
   5586 							c4 = _mm_avg_epu8(c4, c6);
   5587 							c0 = _mm_avg_epu8(c0, c4);
   5588 							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5589 							c1 = _mm_avg_epu16(c0__g_, c1__g_);
   5590 							c3 = _mm_avg_epu16(c2__g_, c3__g_);
   5591 							c5 = _mm_avg_epu16(c4__g_, c5__g_);
   5592 							c7 = _mm_avg_epu16(c6__g_, c7__g_);
   5593 							c1 = _mm_avg_epu16(c1, c3);
   5594 							c5 = _mm_avg_epu16(c5, c7);
   5595 							c1 = _mm_avg_epu16(c1, c5);
   5596 							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5597 							c0 = _mm_or_si128(c0, c1);
   5598 
   5599 							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
   5600 						}
   5601 
   5602 						source0 += pitch;
   5603 						source1 += pitch;
   5604 						source2 += pitch;
   5605 						source3 += pitch;
   5606 						source4 += pitch;
   5607 						source5 += pitch;
   5608 						source6 += pitch;
   5609 						source7 += pitch;
   5610 					}
   5611 				}
   5612 				else if(internal.depth == 16)
   5613 				{
   5614 					for(int y = 0; y < height; y++)
   5615 					{
   5616 						for(int x = 0; x < width; x += 8)
   5617 						{
   5618 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
   5619 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
   5620 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
   5621 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
   5622 							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
   5623 							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
   5624 							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
   5625 							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
   5626 							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
   5627 							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
   5628 							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
   5629 							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
   5630 							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
   5631 							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
   5632 							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
   5633 							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
   5634 
   5635 							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
   5636 							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
   5637 							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5638 							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
   5639 							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
   5640 							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5641 							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
   5642 							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
   5643 							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
   5644 							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
   5645 							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
   5646 							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
   5647 							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
   5648 							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
   5649 							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
   5650 							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
   5651 							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
   5652 							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
   5653 							__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
   5654 							__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
   5655 							__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
   5656 							__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
   5657 							__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
   5658 							__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
   5659 							__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
   5660 							__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
   5661 							__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
   5662 							__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
   5663 							__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
   5664 							__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
   5665 							__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
   5666 							__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
   5667 							__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
   5668 							__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
   5669 
   5670 							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
   5671 							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
   5672 							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
   5673 							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
   5674 							c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
   5675 							cA = _mm_avg_epu8(cA_r_b, cB_r_b);
   5676 							cC = _mm_avg_epu8(cC_r_b, cD_r_b);
   5677 							cE = _mm_avg_epu8(cE_r_b, cF_r_b);
   5678 							c0 = _mm_avg_epu8(c0, c2);
   5679 							c4 = _mm_avg_epu8(c4, c6);
   5680 							c8 = _mm_avg_epu8(c8, cA);
   5681 							cC = _mm_avg_epu8(cC, cE);
   5682 							c0 = _mm_avg_epu8(c0, c4);
   5683 							c8 = _mm_avg_epu8(c8, cC);
   5684 							c0 = _mm_avg_epu8(c0, c8);
   5685 							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5686 							c1 = _mm_avg_epu16(c0__g_, c1__g_);
   5687 							c3 = _mm_avg_epu16(c2__g_, c3__g_);
   5688 							c5 = _mm_avg_epu16(c4__g_, c5__g_);
   5689 							c7 = _mm_avg_epu16(c6__g_, c7__g_);
   5690 							c9 = _mm_avg_epu16(c8__g_, c9__g_);
   5691 							cB = _mm_avg_epu16(cA__g_, cB__g_);
   5692 							cD = _mm_avg_epu16(cC__g_, cD__g_);
   5693 							cF = _mm_avg_epu16(cE__g_, cF__g_);
         							// Green occupies bits 5-10 and straddles both bytes of each 16-bit
         							// pixel, so the reduction must stay in 16-bit lanes.
    5694 							c1 = _mm_avg_epu16(c1, c3);
    5695 							c5 = _mm_avg_epu16(c5, c7);
    5696 							c9 = _mm_avg_epu16(c9, cB);
    5697 							cD = _mm_avg_epu16(cD, cF);
    5698 							c1 = _mm_avg_epu16(c1, c5);
    5699 							c9 = _mm_avg_epu16(c9, cD);
    5700 							c1 = _mm_avg_epu16(c1, c9);
   5701 							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5702 							c0 = _mm_or_si128(c0, c1);
   5703 
   5704 							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
   5705 						}
   5706 
   5707 						source0 += pitch;
   5708 						source1 += pitch;
   5709 						source2 += pitch;
   5710 						source3 += pitch;
   5711 						source4 += pitch;
   5712 						source5 += pitch;
   5713 						source6 += pitch;
   5714 						source7 += pitch;
   5715 						source8 += pitch;
   5716 						source9 += pitch;
   5717 						sourceA += pitch;
   5718 						sourceB += pitch;
   5719 						sourceC += pitch;
   5720 						sourceD += pitch;
   5721 						sourceE += pitch;
   5722 						sourceF += pitch;
   5723 					}
   5724 				}
   5725 				else ASSERT(false);
   5726 			}
   5727 			else
   5728 			{
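         				// Scalar fallback for 5:6:5 pixels. The 0x7BEF mask strips the bits that
         				// shift down from the green and red LSBs into the neighbouring field, and
         				// adding (x ^ y) & 0x0821 rounds each of the three channels up.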
   5729 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
   5730 
   5731 				if(internal.depth == 2)
   5732 				{
   5733 					for(int y = 0; y < height; y++)
   5734 					{
   5735 						for(int x = 0; x < width; x++)
   5736 						{
   5737 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
   5738 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
   5739 
   5740 							c0 = AVERAGE(c0, c1);
   5741 
   5742 							*(unsigned short*)(source0 + 2 * x) = c0;
   5743 						}
   5744 
   5745 						source0 += pitch;
   5746 						source1 += pitch;
   5747 					}
   5748 				}
   5749 				else if(internal.depth == 4)
   5750 				{
   5751 					for(int y = 0; y < height; y++)
   5752 					{
   5753 						for(int x = 0; x < width; x++)
   5754 						{
   5755 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
   5756 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
   5757 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
   5758 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
   5759 
   5760 							c0 = AVERAGE(c0, c1);
   5761 							c2 = AVERAGE(c2, c3);
   5762 							c0 = AVERAGE(c0, c2);
   5763 
   5764 							*(unsigned short*)(source0 + 2 * x) = c0;
   5765 						}
   5766 
   5767 						source0 += pitch;
   5768 						source1 += pitch;
   5769 						source2 += pitch;
   5770 						source3 += pitch;
   5771 					}
   5772 				}
   5773 				else if(internal.depth == 8)
   5774 				{
   5775 					for(int y = 0; y < height; y++)
   5776 					{
   5777 						for(int x = 0; x < width; x++)
   5778 						{
   5779 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
   5780 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
   5781 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
   5782 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
   5783 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
   5784 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
   5785 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
   5786 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
   5787 
   5788 							c0 = AVERAGE(c0, c1);
   5789 							c2 = AVERAGE(c2, c3);
   5790 							c4 = AVERAGE(c4, c5);
   5791 							c6 = AVERAGE(c6, c7);
   5792 							c0 = AVERAGE(c0, c2);
   5793 							c4 = AVERAGE(c4, c6);
   5794 							c0 = AVERAGE(c0, c4);
   5795 
   5796 							*(unsigned short*)(source0 + 2 * x) = c0;
   5797 						}
   5798 
   5799 						source0 += pitch;
   5800 						source1 += pitch;
   5801 						source2 += pitch;
   5802 						source3 += pitch;
   5803 						source4 += pitch;
   5804 						source5 += pitch;
   5805 						source6 += pitch;
   5806 						source7 += pitch;
   5807 					}
   5808 				}
   5809 				else if(internal.depth == 16)
   5810 				{
   5811 					for(int y = 0; y < height; y++)
   5812 					{
   5813 						for(int x = 0; x < width; x++)
   5814 						{
   5815 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
   5816 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
   5817 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
   5818 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
   5819 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
   5820 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
   5821 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
   5822 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
   5823 							unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
   5824 							unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
   5825 							unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
   5826 							unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
   5827 							unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
   5828 							unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
   5829 							unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
   5830 							unsigned short cF = *(unsigned short*)(sourceF + 2 * x);
   5831 
   5832 							c0 = AVERAGE(c0, c1);
   5833 							c2 = AVERAGE(c2, c3);
   5834 							c4 = AVERAGE(c4, c5);
   5835 							c6 = AVERAGE(c6, c7);
   5836 							c8 = AVERAGE(c8, c9);
   5837 							cA = AVERAGE(cA, cB);
   5838 							cC = AVERAGE(cC, cD);
   5839 							cE = AVERAGE(cE, cF);
   5840 							c0 = AVERAGE(c0, c2);
   5841 							c4 = AVERAGE(c4, c6);
   5842 							c8 = AVERAGE(c8, cA);
   5843 							cC = AVERAGE(cC, cE);
   5844 							c0 = AVERAGE(c0, c4);
   5845 							c8 = AVERAGE(c8, cC);
   5846 							c0 = AVERAGE(c0, c8);
   5847 
   5848 							*(unsigned short*)(source0 + 2 * x) = c0;
   5849 						}
   5850 
   5851 						source0 += pitch;
   5852 						source1 += pitch;
   5853 						source2 += pitch;
   5854 						source3 += pitch;
   5855 						source4 += pitch;
   5856 						source5 += pitch;
   5857 						source6 += pitch;
   5858 						source7 += pitch;
   5859 						source8 += pitch;
   5860 						source9 += pitch;
   5861 						sourceA += pitch;
   5862 						sourceB += pitch;
   5863 						sourceC += pitch;
   5864 						sourceD += pitch;
   5865 						sourceE += pitch;
   5866 						sourceF += pitch;
   5867 					}
   5868 				}
   5869 				else ASSERT(false);
   5870 
   5871 				#undef AVERAGE
   5872 			}
   5873 		}
   5874 		else
   5875 		{
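         		// Resolve is not implemented for the remaining formats; the contents of
         		// sample 0 are left unchanged.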
   5876 		//	UNIMPLEMENTED();
   5877 		}
   5878 	}
   5879 }
   5880