Home | History | Annotate | Download | only in Renderer
      1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //    http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #include "Surface.hpp"
     16 
     17 #include "Color.hpp"
     18 #include "Context.hpp"
     19 #include "ETC_Decoder.hpp"
     20 #include "Renderer.hpp"
     21 #include "Common/Half.hpp"
     22 #include "Common/Memory.hpp"
     23 #include "Common/CPUID.hpp"
     24 #include "Common/Resource.hpp"
     25 #include "Common/Debug.hpp"
     26 #include "Reactor/Reactor.hpp"
     27 
     28 #if defined(__i386__) || defined(__x86_64__)
     29 	#include <xmmintrin.h>
     30 	#include <emmintrin.h>
     31 #endif
     32 
     33 #undef min
     34 #undef max
     35 
     36 namespace sw
     37 {
     38 	extern bool quadLayoutEnabled;
     39 	extern bool complementaryDepthBuffer;
     40 	extern TranscendentalPrecision logPrecision;
     41 
     42 	unsigned int *Surface::palette = 0;
     43 	unsigned int Surface::paletteID = 0;
     44 
     45 	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
     46 	{
     47 		ASSERT((x >= -border) && (x < (width + border)));
     48 		ASSERT((y >= -border) && (y < (height + border)));
     49 		ASSERT((z >= 0) && (z < depth));
     50 
     51 		byte *element = (byte*)buffer + (x + border) * bytes + (y + border) * pitchB + z * samples * sliceB;
     52 
     53 		for(int i = 0; i < samples; i++)
     54 		{
     55 			write(element, color);
     56 			element += sliceB;
     57 		}
     58 	}
     59 
     60 	void Surface::Buffer::write(int x, int y, const Color<float> &color)
     61 	{
     62 		ASSERT((x >= -border) && (x < (width + border)));
     63 		ASSERT((y >= -border) && (y < (height + border)));
     64 
     65 		byte *element = (byte*)buffer + (x + border) * bytes + (y + border) * pitchB;
     66 
     67 		for(int i = 0; i < samples; i++)
     68 		{
     69 			write(element, color);
     70 			element += sliceB;
     71 		}
     72 	}
     73 
     74 	inline void Surface::Buffer::write(void *element, const Color<float> &color)
     75 	{
     76 		float r = color.r;
     77 		float g = color.g;
     78 		float b = color.b;
     79 		float a = color.a;
     80 
     81 		if(isSRGBformat(format))
     82 		{
     83 			r = linearToSRGB(r);
     84 			g = linearToSRGB(g);
     85 			b = linearToSRGB(b);
     86 		}
     87 
     88 		switch(format)
     89 		{
     90 		case FORMAT_A8:
     91 			*(unsigned char*)element = unorm<8>(a);
     92 			break;
     93 		case FORMAT_R8_SNORM:
     94 			*(char*)element = snorm<8>(r);
     95 			break;
     96 		case FORMAT_R8:
     97 			*(unsigned char*)element = unorm<8>(r);
     98 			break;
     99 		case FORMAT_R8I:
    100 			*(char*)element = scast<8>(r);
    101 			break;
    102 		case FORMAT_R8UI:
    103 			*(unsigned char*)element = ucast<8>(r);
    104 			break;
    105 		case FORMAT_R16I:
    106 			*(short*)element = scast<16>(r);
    107 			break;
    108 		case FORMAT_R16UI:
    109 			*(unsigned short*)element = ucast<16>(r);
    110 			break;
    111 		case FORMAT_R32I:
    112 			*(int*)element = static_cast<int>(r);
    113 			break;
    114 		case FORMAT_R32UI:
    115 			*(unsigned int*)element = static_cast<unsigned int>(r);
    116 			break;
    117 		case FORMAT_R3G3B2:
    118 			*(unsigned char*)element = (unorm<3>(r) << 5) | (unorm<3>(g) << 2) | (unorm<2>(b) << 0);
    119 			break;
    120 		case FORMAT_A8R3G3B2:
    121 			*(unsigned short*)element = (unorm<8>(a) << 8) | (unorm<3>(r) << 5) | (unorm<3>(g) << 2) | (unorm<2>(b) << 0);
    122 			break;
    123 		case FORMAT_X4R4G4B4:
    124 			*(unsigned short*)element = 0xF000 | (unorm<4>(r) << 8) | (unorm<4>(g) << 4) | (unorm<4>(b) << 0);
    125 			break;
    126 		case FORMAT_A4R4G4B4:
    127 			*(unsigned short*)element = (unorm<4>(a) << 12) | (unorm<4>(r) << 8) | (unorm<4>(g) << 4) | (unorm<4>(b) << 0);
    128 			break;
    129 		case FORMAT_R4G4B4A4:
    130 			*(unsigned short*)element = (unorm<4>(r) << 12) | (unorm<4>(g) << 8) | (unorm<4>(b) << 4) | (unorm<4>(a) << 0);
    131 			break;
    132 		case FORMAT_R5G6B5:
    133 			*(unsigned short*)element = (unorm<5>(r) << 11) | (unorm<6>(g) << 5) | (unorm<5>(b) << 0);
    134 			break;
    135 		case FORMAT_A1R5G5B5:
    136 			*(unsigned short*)element = (unorm<1>(a) << 15) | (unorm<5>(r) << 10) | (unorm<5>(g) << 5) | (unorm<5>(b) << 0);
    137 			break;
    138 		case FORMAT_R5G5B5A1:
    139 			*(unsigned short*)element = (unorm<5>(r) << 11) | (unorm<5>(g) << 6) | (unorm<5>(b) << 1) | (unorm<5>(a) << 0);
    140 			break;
    141 		case FORMAT_X1R5G5B5:
    142 			*(unsigned short*)element = 0x8000 | (unorm<5>(r) << 10) | (unorm<5>(g) << 5) | (unorm<5>(b) << 0);
    143 			break;
    144 		case FORMAT_A8R8G8B8:
    145 			*(unsigned int*)element = (unorm<8>(a) << 24) | (unorm<8>(r) << 16) | (unorm<8>(g) << 8) | (unorm<8>(b) << 0);
    146 			break;
    147 		case FORMAT_X8R8G8B8:
    148 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(r) << 16) | (unorm<8>(g) << 8) | (unorm<8>(b) << 0);
    149 			break;
    150 		case FORMAT_A8B8G8R8_SNORM:
    151 			*(unsigned int*)element = (static_cast<unsigned int>(snorm<8>(a)) << 24) |
    152 			                          (static_cast<unsigned int>(snorm<8>(b)) << 16) |
    153 			                          (static_cast<unsigned int>(snorm<8>(g)) << 8) |
    154 			                          (static_cast<unsigned int>(snorm<8>(r)) << 0);
    155 			break;
    156 		case FORMAT_A8B8G8R8:
    157 		case FORMAT_SRGB8_A8:
    158 			*(unsigned int*)element = (unorm<8>(a) << 24) | (unorm<8>(b) << 16) | (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
    159 			break;
    160 		case FORMAT_A8B8G8R8I:
    161 			*(unsigned int*)element = (static_cast<unsigned int>(scast<8>(a)) << 24) |
    162 			                          (static_cast<unsigned int>(scast<8>(b)) << 16) |
    163 			                          (static_cast<unsigned int>(scast<8>(g)) << 8) |
    164 			                          (static_cast<unsigned int>(scast<8>(r)) << 0);
    165 			break;
    166 		case FORMAT_A8B8G8R8UI:
    167 			*(unsigned int*)element = (ucast<8>(a) << 24) | (ucast<8>(b) << 16) | (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
    168 			break;
    169 		case FORMAT_X8B8G8R8_SNORM:
    170 			*(unsigned int*)element = 0x7F000000 |
    171 			                          (static_cast<unsigned int>(snorm<8>(b)) << 16) |
    172 			                          (static_cast<unsigned int>(snorm<8>(g)) << 8) |
    173 			                          (static_cast<unsigned int>(snorm<8>(r)) << 0);
    174 			break;
    175 		case FORMAT_X8B8G8R8:
    176 		case FORMAT_SRGB8_X8:
    177 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(b) << 16) | (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
    178 			break;
    179 		case FORMAT_X8B8G8R8I:
    180 			*(unsigned int*)element = 0x7F000000 |
    181 			                          (static_cast<unsigned int>(scast<8>(b)) << 16) |
    182 			                          (static_cast<unsigned int>(scast<8>(g)) << 8) |
    183 			                          (static_cast<unsigned int>(scast<8>(r)) << 0);
    184 		case FORMAT_X8B8G8R8UI:
    185 			*(unsigned int*)element = 0xFF000000 | (ucast<8>(b) << 16) | (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
    186 			break;
    187 		case FORMAT_A2R10G10B10:
    188 			*(unsigned int*)element = (unorm<2>(a) << 30) | (unorm<10>(r) << 20) | (unorm<10>(g) << 10) | (unorm<10>(b) << 0);
    189 			break;
    190 		case FORMAT_A2B10G10R10:
    191 		case FORMAT_A2B10G10R10UI:
    192 			*(unsigned int*)element = (unorm<2>(a) << 30) | (unorm<10>(b) << 20) | (unorm<10>(g) << 10) | (unorm<10>(r) << 0);
    193 			break;
    194 		case FORMAT_G8R8_SNORM:
    195 			*(unsigned short*)element = (static_cast<unsigned short>(snorm<8>(g)) << 8) |
    196 			                            (static_cast<unsigned short>(snorm<8>(r)) << 0);
    197 			break;
    198 		case FORMAT_G8R8:
    199 			*(unsigned short*)element = (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
    200 			break;
    201 		case FORMAT_G8R8I:
    202 			*(unsigned short*)element = (static_cast<unsigned short>(scast<8>(g)) << 8) |
    203 			                            (static_cast<unsigned short>(scast<8>(r)) << 0);
    204 			break;
    205 		case FORMAT_G8R8UI:
    206 			*(unsigned short*)element = (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
    207 			break;
    208 		case FORMAT_G16R16:
    209 			*(unsigned int*)element = (unorm<16>(g) << 16) | (unorm<16>(r) << 0);
    210 			break;
    211 		case FORMAT_G16R16I:
    212 			*(unsigned int*)element = (static_cast<unsigned int>(scast<16>(g)) << 16) |
    213 			                          (static_cast<unsigned int>(scast<16>(r)) << 0);
    214 			break;
    215 		case FORMAT_G16R16UI:
    216 			*(unsigned int*)element = (ucast<16>(g) << 16) | (ucast<16>(r) << 0);
    217 			break;
    218 		case FORMAT_G32R32I:
    219 		case FORMAT_G32R32UI:
    220 			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
    221 			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
    222 			break;
    223 		case FORMAT_A16B16G16R16:
    224 			((unsigned short*)element)[0] = unorm<16>(r);
    225 			((unsigned short*)element)[1] = unorm<16>(g);
    226 			((unsigned short*)element)[2] = unorm<16>(b);
    227 			((unsigned short*)element)[3] = unorm<16>(a);
    228 			break;
    229 		case FORMAT_A16B16G16R16I:
    230 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(r));
    231 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(g));
    232 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(b));
    233 			((unsigned short*)element)[3] = static_cast<unsigned short>(scast<16>(a));
    234 			break;
    235 		case FORMAT_A16B16G16R16UI:
    236 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(r));
    237 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(g));
    238 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(b));
    239 			((unsigned short*)element)[3] = static_cast<unsigned short>(ucast<16>(a));
    240 			break;
    241 		case FORMAT_X16B16G16R16I:
    242 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(r));
    243 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(g));
    244 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(b));
    245 			break;
    246 		case FORMAT_X16B16G16R16UI:
    247 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(r));
    248 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(g));
    249 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(b));
    250 			break;
    251 		case FORMAT_A32B32G32R32I:
    252 		case FORMAT_A32B32G32R32UI:
    253 			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
    254 			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
    255 			((unsigned int*)element)[2] = static_cast<unsigned int>(b);
    256 			((unsigned int*)element)[3] = static_cast<unsigned int>(a);
    257 			break;
    258 		case FORMAT_X32B32G32R32I:
    259 		case FORMAT_X32B32G32R32UI:
    260 			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
    261 			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
    262 			((unsigned int*)element)[2] = static_cast<unsigned int>(b);
    263 			break;
    264 		case FORMAT_V8U8:
    265 			*(unsigned short*)element = (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
    266 			break;
    267 		case FORMAT_L6V5U5:
    268 			*(unsigned short*)element = (unorm<6>(b) << 10) | (snorm<5>(g) << 5) | (snorm<5>(r) << 0);
    269 			break;
    270 		case FORMAT_Q8W8V8U8:
    271 			*(unsigned int*)element = (snorm<8>(a) << 24) | (snorm<8>(b) << 16) | (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
    272 			break;
    273 		case FORMAT_X8L8V8U8:
    274 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(b) << 16) | (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
    275 			break;
    276 		case FORMAT_V16U16:
    277 			*(unsigned int*)element = (snorm<16>(g) << 16) | (snorm<16>(r) << 0);
    278 			break;
    279 		case FORMAT_A2W10V10U10:
    280 			*(unsigned int*)element = (unorm<2>(a) << 30) | (snorm<10>(b) << 20) | (snorm<10>(g) << 10) | (snorm<10>(r) << 0);
    281 			break;
    282 		case FORMAT_A16W16V16U16:
    283 			((unsigned short*)element)[0] = snorm<16>(r);
    284 			((unsigned short*)element)[1] = snorm<16>(g);
    285 			((unsigned short*)element)[2] = snorm<16>(b);
    286 			((unsigned short*)element)[3] = unorm<16>(a);
    287 			break;
    288 		case FORMAT_Q16W16V16U16:
    289 			((unsigned short*)element)[0] = snorm<16>(r);
    290 			((unsigned short*)element)[1] = snorm<16>(g);
    291 			((unsigned short*)element)[2] = snorm<16>(b);
    292 			((unsigned short*)element)[3] = snorm<16>(a);
    293 			break;
    294 		case FORMAT_R8G8B8:
    295 			((unsigned char*)element)[0] = unorm<8>(b);
    296 			((unsigned char*)element)[1] = unorm<8>(g);
    297 			((unsigned char*)element)[2] = unorm<8>(r);
    298 			break;
    299 		case FORMAT_B8G8R8:
    300 			((unsigned char*)element)[0] = unorm<8>(r);
    301 			((unsigned char*)element)[1] = unorm<8>(g);
    302 			((unsigned char*)element)[2] = unorm<8>(b);
    303 			break;
    304 		case FORMAT_R16F:
    305 			*(half*)element = (half)r;
    306 			break;
    307 		case FORMAT_A16F:
    308 			*(half*)element = (half)a;
    309 			break;
    310 		case FORMAT_G16R16F:
    311 			((half*)element)[0] = (half)r;
    312 			((half*)element)[1] = (half)g;
    313 			break;
    314 		case FORMAT_X16B16G16R16F_UNSIGNED:
    315 			r = max(r, 0.0f); g = max(g, 0.0f); b = max(b, 0.0f);
    316 			// Fall through to FORMAT_X16B16G16R16F.
    317 		case FORMAT_X16B16G16R16F:
    318 			((half*)element)[3] = 1.0f;
    319 			// Fall through to FORMAT_B16G16R16F.
    320 		case FORMAT_B16G16R16F:
    321 			((half*)element)[0] = (half)r;
    322 			((half*)element)[1] = (half)g;
    323 			((half*)element)[2] = (half)b;
    324 			break;
    325 		case FORMAT_A16B16G16R16F:
    326 			((half*)element)[0] = (half)r;
    327 			((half*)element)[1] = (half)g;
    328 			((half*)element)[2] = (half)b;
    329 			((half*)element)[3] = (half)a;
    330 			break;
    331 		case FORMAT_A32F:
    332 			*(float*)element = a;
    333 			break;
    334 		case FORMAT_R32F:
    335 			*(float*)element = r;
    336 			break;
    337 		case FORMAT_G32R32F:
    338 			((float*)element)[0] = r;
    339 			((float*)element)[1] = g;
    340 			break;
    341 		case FORMAT_X32B32G32R32F_UNSIGNED:
    342 			r = max(r, 0.0f); g = max(g, 0.0f); b = max(b, 0.0f);
    343 			// Fall through to FORMAT_X32B32G32R32F.
    344 		case FORMAT_X32B32G32R32F:
    345 			((float*)element)[3] = 1.0f;
    346 			// Fall through to FORMAT_B32G32R32F.
    347 		case FORMAT_B32G32R32F:
    348 			((float*)element)[0] = r;
    349 			((float*)element)[1] = g;
    350 			((float*)element)[2] = b;
    351 			break;
    352 		case FORMAT_A32B32G32R32F:
    353 			((float*)element)[0] = r;
    354 			((float*)element)[1] = g;
    355 			((float*)element)[2] = b;
    356 			((float*)element)[3] = a;
    357 			break;
    358 		case FORMAT_D32F:
    359 		case FORMAT_D32FS8:
    360 		case FORMAT_D32F_LOCKABLE:
    361 		case FORMAT_D32FS8_TEXTURE:
    362 		case FORMAT_D32F_SHADOW:
    363 		case FORMAT_D32FS8_SHADOW:
    364 			*((float*)element) = r;
    365 			break;
    366 		case FORMAT_D32F_COMPLEMENTARY:
    367 		case FORMAT_D32FS8_COMPLEMENTARY:
    368 			*((float*)element) = 1 - r;
    369 			break;
    370 		case FORMAT_S8:
    371 			*((unsigned char*)element) = unorm<8>(r);
    372 			break;
    373 		case FORMAT_L8:
    374 			*(unsigned char*)element = unorm<8>(r);
    375 			break;
    376 		case FORMAT_A4L4:
    377 			*(unsigned char*)element = (unorm<4>(a) << 4) | (unorm<4>(r) << 0);
    378 			break;
    379 		case FORMAT_L16:
    380 			*(unsigned short*)element = unorm<16>(r);
    381 			break;
    382 		case FORMAT_A8L8:
    383 			*(unsigned short*)element = (unorm<8>(a) << 8) | (unorm<8>(r) << 0);
    384 			break;
    385 		case FORMAT_L16F:
    386 			*(half*)element = (half)r;
    387 			break;
    388 		case FORMAT_A16L16F:
    389 			((half*)element)[0] = (half)r;
    390 			((half*)element)[1] = (half)a;
    391 			break;
    392 		case FORMAT_L32F:
    393 			*(float*)element = r;
    394 			break;
    395 		case FORMAT_A32L32F:
    396 			((float*)element)[0] = r;
    397 			((float*)element)[1] = a;
    398 			break;
    399 		default:
    400 			ASSERT(false);
    401 		}
    402 	}
    403 
    404 	Color<float> Surface::Buffer::read(int x, int y, int z) const
    405 	{
    406 		ASSERT((x >= -border) && (x < (width + border)));
    407 		ASSERT((y >= -border) && (y < (height + border)));
    408 		ASSERT((z >= 0) && (z < depth));
    409 
    410 		void *element = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB + z * samples * sliceB;
    411 
    412 		return read(element);
    413 	}
    414 
    415 	Color<float> Surface::Buffer::read(int x, int y) const
    416 	{
    417 		ASSERT((x >= -border) && (x < (width + border)));
    418 		ASSERT((y >= -border) && (y < (height + border)));
    419 
    420 		void *element = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB;
    421 
    422 		return read(element);
    423 	}
    424 
    425 	inline Color<float> Surface::Buffer::read(void *element) const
    426 	{
    427 		float r = 0.0f;
    428 		float g = 0.0f;
    429 		float b = 0.0f;
    430 		float a = 1.0f;
    431 
    432 		switch(format)
    433 		{
    434 		case FORMAT_P8:
    435 			{
    436 				ASSERT(palette);
    437 
    438 				unsigned int abgr = palette[*(unsigned char*)element];
    439 
    440 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
    441 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
    442 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
    443 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
    444 			}
    445 			break;
    446 		case FORMAT_A8P8:
    447 			{
    448 				ASSERT(palette);
    449 
    450 				unsigned int bgr = palette[((unsigned char*)element)[0]];
    451 
    452 				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
    453 				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
    454 				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
    455 				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
    456 			}
    457 			break;
    458 		case FORMAT_A8:
    459 			r = 0;
    460 			g = 0;
    461 			b = 0;
    462 			a = *(unsigned char*)element * (1.0f / 0xFF);
    463 			break;
    464 		case FORMAT_R8_SNORM:
    465 			r = max((*(signed char*)element) * (1.0f / 0x7F), -1.0f);
    466 			break;
    467 		case FORMAT_R8:
    468 			r = *(unsigned char*)element * (1.0f / 0xFF);
    469 			break;
    470 		case FORMAT_R8I:
    471 			r = *(signed char*)element;
    472 			break;
    473 		case FORMAT_R8UI:
    474 			r = *(unsigned char*)element;
    475 			break;
    476 		case FORMAT_R3G3B2:
    477 			{
    478 				unsigned char rgb = *(unsigned char*)element;
    479 
    480 				r = (rgb & 0xE0) * (1.0f / 0xE0);
    481 				g = (rgb & 0x1C) * (1.0f / 0x1C);
    482 				b = (rgb & 0x03) * (1.0f / 0x03);
    483 			}
    484 			break;
    485 		case FORMAT_A8R3G3B2:
    486 			{
    487 				unsigned short argb = *(unsigned short*)element;
    488 
    489 				a = (argb & 0xFF00) * (1.0f / 0xFF00);
    490 				r = (argb & 0x00E0) * (1.0f / 0x00E0);
    491 				g = (argb & 0x001C) * (1.0f / 0x001C);
    492 				b = (argb & 0x0003) * (1.0f / 0x0003);
    493 			}
    494 			break;
    495 		case FORMAT_X4R4G4B4:
    496 			{
    497 				unsigned short rgb = *(unsigned short*)element;
    498 
    499 				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
    500 				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
    501 				b = (rgb & 0x000F) * (1.0f / 0x000F);
    502 			}
    503 			break;
    504 		case FORMAT_A4R4G4B4:
    505 			{
    506 				unsigned short argb = *(unsigned short*)element;
    507 
    508 				a = (argb & 0xF000) * (1.0f / 0xF000);
    509 				r = (argb & 0x0F00) * (1.0f / 0x0F00);
    510 				g = (argb & 0x00F0) * (1.0f / 0x00F0);
    511 				b = (argb & 0x000F) * (1.0f / 0x000F);
    512 			}
    513 			break;
    514 		case FORMAT_R4G4B4A4:
    515 			{
    516 				unsigned short rgba = *(unsigned short*)element;
    517 
    518 				r = (rgba & 0xF000) * (1.0f / 0xF000);
    519 				g = (rgba & 0x0F00) * (1.0f / 0x0F00);
    520 				b = (rgba & 0x00F0) * (1.0f / 0x00F0);
    521 				a = (rgba & 0x000F) * (1.0f / 0x000F);
    522 			}
    523 			break;
    524 		case FORMAT_R5G6B5:
    525 			{
    526 				unsigned short rgb = *(unsigned short*)element;
    527 
    528 				r = (rgb & 0xF800) * (1.0f / 0xF800);
    529 				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
    530 				b = (rgb & 0x001F) * (1.0f / 0x001F);
    531 			}
    532 			break;
    533 		case FORMAT_A1R5G5B5:
    534 			{
    535 				unsigned short argb = *(unsigned short*)element;
    536 
    537 				a = (argb & 0x8000) * (1.0f / 0x8000);
    538 				r = (argb & 0x7C00) * (1.0f / 0x7C00);
    539 				g = (argb & 0x03E0) * (1.0f / 0x03E0);
    540 				b = (argb & 0x001F) * (1.0f / 0x001F);
    541 			}
    542 			break;
    543 		case FORMAT_R5G5B5A1:
    544 			{
    545 				unsigned short rgba = *(unsigned short*)element;
    546 
    547 				r = (rgba & 0xF800) * (1.0f / 0xF800);
    548 				g = (rgba & 0x07C0) * (1.0f / 0x07C0);
    549 				b = (rgba & 0x003E) * (1.0f / 0x003E);
    550 				a = (rgba & 0x0001) * (1.0f / 0x0001);
    551 			}
    552 			break;
    553 		case FORMAT_X1R5G5B5:
    554 			{
    555 				unsigned short xrgb = *(unsigned short*)element;
    556 
    557 				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
    558 				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
    559 				b = (xrgb & 0x001F) * (1.0f / 0x001F);
    560 			}
    561 			break;
    562 		case FORMAT_A8R8G8B8:
    563 			{
    564 				unsigned int argb = *(unsigned int*)element;
    565 
    566 				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
    567 				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
    568 				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
    569 				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
    570 			}
    571 			break;
    572 		case FORMAT_X8R8G8B8:
    573 			{
    574 				unsigned int xrgb = *(unsigned int*)element;
    575 
    576 				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
    577 				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
    578 				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
    579 			}
    580 			break;
    581 		case FORMAT_A8B8G8R8_SNORM:
    582 			{
    583 				signed char* abgr = (signed char*)element;
    584 
    585 				r = max(abgr[0] * (1.0f / 0x7F), -1.0f);
    586 				g = max(abgr[1] * (1.0f / 0x7F), -1.0f);
    587 				b = max(abgr[2] * (1.0f / 0x7F), -1.0f);
    588 				a = max(abgr[3] * (1.0f / 0x7F), -1.0f);
    589 			}
    590 			break;
    591 		case FORMAT_A8B8G8R8:
    592 		case FORMAT_SRGB8_A8:
    593 			{
    594 				unsigned int abgr = *(unsigned int*)element;
    595 
    596 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
    597 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
    598 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
    599 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
    600 			}
    601 			break;
    602 		case FORMAT_A8B8G8R8I:
    603 			{
    604 				signed char* abgr = (signed char*)element;
    605 
    606 				r = abgr[0];
    607 				g = abgr[1];
    608 				b = abgr[2];
    609 				a = abgr[3];
    610 			}
    611 			break;
    612 		case FORMAT_A8B8G8R8UI:
    613 			{
    614 				unsigned char* abgr = (unsigned char*)element;
    615 
    616 				r = abgr[0];
    617 				g = abgr[1];
    618 				b = abgr[2];
    619 				a = abgr[3];
    620 			}
    621 			break;
    622 		case FORMAT_X8B8G8R8_SNORM:
    623 			{
    624 				signed char* bgr = (signed char*)element;
    625 
    626 				r = max(bgr[0] * (1.0f / 0x7F), -1.0f);
    627 				g = max(bgr[1] * (1.0f / 0x7F), -1.0f);
    628 				b = max(bgr[2] * (1.0f / 0x7F), -1.0f);
    629 			}
    630 			break;
    631 		case FORMAT_X8B8G8R8:
    632 		case FORMAT_SRGB8_X8:
    633 			{
    634 				unsigned int xbgr = *(unsigned int*)element;
    635 
    636 				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
    637 				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
    638 				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
    639 			}
    640 			break;
    641 		case FORMAT_X8B8G8R8I:
    642 			{
    643 				signed char* bgr = (signed char*)element;
    644 
    645 				r = bgr[0];
    646 				g = bgr[1];
    647 				b = bgr[2];
    648 			}
    649 			break;
    650 		case FORMAT_X8B8G8R8UI:
    651 			{
    652 				unsigned char* bgr = (unsigned char*)element;
    653 
    654 				r = bgr[0];
    655 				g = bgr[1];
    656 				b = bgr[2];
    657 			}
    658 			break;
    659 		case FORMAT_G8R8_SNORM:
    660 			{
    661 				signed char* gr = (signed char*)element;
    662 
    663 				r = (gr[0] & 0xFF00) * (1.0f / 0xFF00);
    664 				g = (gr[1] & 0x00FF) * (1.0f / 0x00FF);
    665 			}
    666 			break;
    667 		case FORMAT_G8R8:
    668 			{
    669 				unsigned short gr = *(unsigned short*)element;
    670 
    671 				g = (gr & 0xFF00) * (1.0f / 0xFF00);
    672 				r = (gr & 0x00FF) * (1.0f / 0x00FF);
    673 			}
    674 			break;
    675 		case FORMAT_G8R8I:
    676 			{
    677 				signed char* gr = (signed char*)element;
    678 
    679 				r = gr[0];
    680 				g = gr[1];
    681 			}
    682 			break;
    683 		case FORMAT_G8R8UI:
    684 			{
    685 				unsigned char* gr = (unsigned char*)element;
    686 
    687 				r = gr[0];
    688 				g = gr[1];
    689 			}
    690 			break;
    691 		case FORMAT_R16I:
    692 			r = *((short*)element);
    693 			break;
    694 		case FORMAT_R16UI:
    695 			r = *((unsigned short*)element);
    696 			break;
    697 		case FORMAT_G16R16I:
    698 			{
    699 				short* gr = (short*)element;
    700 
    701 				r = gr[0];
    702 				g = gr[1];
    703 			}
    704 			break;
    705 		case FORMAT_G16R16:
    706 			{
    707 				unsigned int gr = *(unsigned int*)element;
    708 
    709 				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
    710 				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
    711 			}
    712 			break;
    713 		case FORMAT_G16R16UI:
    714 			{
    715 				unsigned short* gr = (unsigned short*)element;
    716 
    717 				r = gr[0];
    718 				g = gr[1];
    719 			}
    720 			break;
    721 		case FORMAT_A2R10G10B10:
    722 			{
    723 				unsigned int argb = *(unsigned int*)element;
    724 
    725 				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
    726 				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
    727 				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
    728 				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
    729 			}
    730 			break;
    731 		case FORMAT_A2B10G10R10:
    732 			{
    733 				unsigned int abgr = *(unsigned int*)element;
    734 
    735 				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
    736 				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
    737 				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
    738 				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
    739 			}
    740 			break;
    741 		case FORMAT_A2B10G10R10UI:
    742 			{
    743 				unsigned int abgr = *(unsigned int*)element;
    744 
    745 				a = static_cast<float>((abgr & 0xC0000000) >> 30);
    746 				b = static_cast<float>((abgr & 0x3FF00000) >> 20);
    747 				g = static_cast<float>((abgr & 0x000FFC00) >> 10);
    748 				r = static_cast<float>(abgr & 0x000003FF);
    749 			}
    750 			break;
    751 		case FORMAT_A16B16G16R16I:
    752 			{
    753 				short* abgr = (short*)element;
    754 
    755 				r = abgr[0];
    756 				g = abgr[1];
    757 				b = abgr[2];
    758 				a = abgr[3];
    759 			}
    760 			break;
    761 		case FORMAT_A16B16G16R16:
    762 			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
    763 			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
    764 			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
    765 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
    766 			break;
    767 		case FORMAT_A16B16G16R16UI:
    768 			{
    769 				unsigned short* abgr = (unsigned short*)element;
    770 
    771 				r = abgr[0];
    772 				g = abgr[1];
    773 				b = abgr[2];
    774 				a = abgr[3];
    775 			}
    776 			break;
    777 		case FORMAT_X16B16G16R16I:
    778 			{
    779 				short* bgr = (short*)element;
    780 
    781 				r = bgr[0];
    782 				g = bgr[1];
    783 				b = bgr[2];
    784 			}
    785 			break;
    786 		case FORMAT_X16B16G16R16UI:
    787 			{
    788 				unsigned short* bgr = (unsigned short*)element;
    789 
    790 				r = bgr[0];
    791 				g = bgr[1];
    792 				b = bgr[2];
    793 			}
    794 			break;
    795 		case FORMAT_A32B32G32R32I:
    796 			{
    797 				int* abgr = (int*)element;
    798 
    799 				r = static_cast<float>(abgr[0]);
    800 				g = static_cast<float>(abgr[1]);
    801 				b = static_cast<float>(abgr[2]);
    802 				a = static_cast<float>(abgr[3]);
    803 			}
    804 			break;
    805 		case FORMAT_A32B32G32R32UI:
    806 			{
    807 				unsigned int* abgr = (unsigned int*)element;
    808 
    809 				r = static_cast<float>(abgr[0]);
    810 				g = static_cast<float>(abgr[1]);
    811 				b = static_cast<float>(abgr[2]);
    812 				a = static_cast<float>(abgr[3]);
    813 			}
    814 			break;
    815 		case FORMAT_X32B32G32R32I:
    816 			{
    817 				int* bgr = (int*)element;
    818 
    819 				r = static_cast<float>(bgr[0]);
    820 				g = static_cast<float>(bgr[1]);
    821 				b = static_cast<float>(bgr[2]);
    822 			}
    823 			break;
    824 		case FORMAT_X32B32G32R32UI:
    825 			{
    826 				unsigned int* bgr = (unsigned int*)element;
    827 
    828 				r = static_cast<float>(bgr[0]);
    829 				g = static_cast<float>(bgr[1]);
    830 				b = static_cast<float>(bgr[2]);
    831 			}
    832 			break;
    833 		case FORMAT_G32R32I:
    834 			{
    835 				int* gr = (int*)element;
    836 
    837 				r = static_cast<float>(gr[0]);
    838 				g = static_cast<float>(gr[1]);
    839 			}
    840 			break;
    841 		case FORMAT_G32R32UI:
    842 			{
    843 				unsigned int* gr = (unsigned int*)element;
    844 
    845 				r = static_cast<float>(gr[0]);
    846 				g = static_cast<float>(gr[1]);
    847 			}
    848 			break;
    849 		case FORMAT_R32I:
    850 			r = static_cast<float>(*((int*)element));
    851 			break;
    852 		case FORMAT_R32UI:
    853 			r = static_cast<float>(*((unsigned int*)element));
    854 			break;
    855 		case FORMAT_V8U8:
    856 			{
    857 				unsigned short vu = *(unsigned short*)element;
    858 
    859 				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
    860 				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
    861 			}
    862 			break;
    863 		case FORMAT_L6V5U5:
    864 			{
    865 				unsigned short lvu = *(unsigned short*)element;
    866 
    867 				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
    868 				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
    869 				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
    870 			}
    871 			break;
    872 		case FORMAT_Q8W8V8U8:
    873 			{
    874 				unsigned int qwvu = *(unsigned int*)element;
    875 
    876 				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
    877 				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
    878 				b = ((int)(qwvu & 0x00FF0000) << 8)  * (1.0f / 0x7F000000);
    879 				a = ((int)(qwvu & 0xFF000000) << 0)  * (1.0f / 0x7F000000);
    880 			}
    881 			break;
    882 		case FORMAT_X8L8V8U8:
    883 			{
    884 				unsigned int xlvu = *(unsigned int*)element;
    885 
    886 				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
    887 				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
    888 				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
    889 			}
    890 			break;
    891 		case FORMAT_R8G8B8:
    892 			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
    893 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
    894 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
    895 			break;
    896 		case FORMAT_B8G8R8:
    897 			r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
    898 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
    899 			b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
    900 			break;
    901 		case FORMAT_V16U16:
    902 			{
    903 				unsigned int vu = *(unsigned int*)element;
    904 
    905 				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
    906 				g = ((int)(vu & 0xFFFF0000) << 0)  * (1.0f / 0x7FFF0000);
    907 			}
    908 			break;
    909 		case FORMAT_A2W10V10U10:
    910 			{
    911 				unsigned int awvu = *(unsigned int*)element;
    912 
    913 				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
    914 				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
    915 				b = ((int)(awvu & 0x3FF00000) << 2)  * (1.0f / 0x7FC00000);
    916 				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
    917 			}
    918 			break;
    919 		case FORMAT_A16W16V16U16:
    920 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
    921 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
    922 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
    923 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
    924 			break;
    925 		case FORMAT_Q16W16V16U16:
    926 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
    927 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
    928 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
    929 			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
    930 			break;
    931 		case FORMAT_L8:
    932 			r =
    933 			g =
    934 			b = *(unsigned char*)element * (1.0f / 0xFF);
    935 			break;
    936 		case FORMAT_A4L4:
    937 			{
    938 				unsigned char al = *(unsigned char*)element;
    939 
    940 				r =
    941 				g =
    942 				b = (al & 0x0F) * (1.0f / 0x0F);
    943 				a = (al & 0xF0) * (1.0f / 0xF0);
    944 			}
    945 			break;
    946 		case FORMAT_L16:
    947 			r =
    948 			g =
    949 			b = *(unsigned short*)element * (1.0f / 0xFFFF);
    950 			break;
    951 		case FORMAT_A8L8:
    952 			r =
    953 			g =
    954 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
    955 			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
    956 			break;
    957 		case FORMAT_L16F:
    958 			r =
    959 			g =
    960 			b = *(half*)element;
    961 			break;
    962 		case FORMAT_A16L16F:
    963 			r =
    964 			g =
    965 			b = ((half*)element)[0];
    966 			a = ((half*)element)[1];
    967 			break;
    968 		case FORMAT_L32F:
    969 			r =
    970 			g =
    971 			b = *(float*)element;
    972 			break;
    973 		case FORMAT_A32L32F:
    974 			r =
    975 			g =
    976 			b = ((float*)element)[0];
    977 			a = ((float*)element)[1];
    978 			break;
    979 		case FORMAT_A16F:
    980 			a = *(half*)element;
    981 			break;
    982 		case FORMAT_R16F:
    983 			r = *(half*)element;
    984 			break;
    985 		case FORMAT_G16R16F:
    986 			r = ((half*)element)[0];
    987 			g = ((half*)element)[1];
    988 			break;
    989 		case FORMAT_X16B16G16R16F:
    990 		case FORMAT_X16B16G16R16F_UNSIGNED:
    991 		case FORMAT_B16G16R16F:
    992 			r = ((half*)element)[0];
    993 			g = ((half*)element)[1];
    994 			b = ((half*)element)[2];
    995 			break;
    996 		case FORMAT_A16B16G16R16F:
    997 			r = ((half*)element)[0];
    998 			g = ((half*)element)[1];
    999 			b = ((half*)element)[2];
   1000 			a = ((half*)element)[3];
   1001 			break;
   1002 		case FORMAT_A32F:
   1003 			a = *(float*)element;
   1004 			break;
   1005 		case FORMAT_R32F:
   1006 			r = *(float*)element;
   1007 			break;
   1008 		case FORMAT_G32R32F:
   1009 			r = ((float*)element)[0];
   1010 			g = ((float*)element)[1];
   1011 			break;
   1012 		case FORMAT_X32B32G32R32F:
   1013 		case FORMAT_X32B32G32R32F_UNSIGNED:
   1014 		case FORMAT_B32G32R32F:
   1015 			r = ((float*)element)[0];
   1016 			g = ((float*)element)[1];
   1017 			b = ((float*)element)[2];
   1018 			break;
   1019 		case FORMAT_A32B32G32R32F:
   1020 			r = ((float*)element)[0];
   1021 			g = ((float*)element)[1];
   1022 			b = ((float*)element)[2];
   1023 			a = ((float*)element)[3];
   1024 			break;
   1025 		case FORMAT_D32F:
   1026 		case FORMAT_D32FS8:
   1027 		case FORMAT_D32F_LOCKABLE:
   1028 		case FORMAT_D32FS8_TEXTURE:
   1029 		case FORMAT_D32F_SHADOW:
   1030 		case FORMAT_D32FS8_SHADOW:
   1031 			r = *(float*)element;
   1032 			g = r;
   1033 			b = r;
   1034 			a = r;
   1035 			break;
   1036 		case FORMAT_D32F_COMPLEMENTARY:
   1037 		case FORMAT_D32FS8_COMPLEMENTARY:
   1038 			r = 1.0f - *(float*)element;
   1039 			g = r;
   1040 			b = r;
   1041 			a = r;
   1042 			break;
   1043 		case FORMAT_S8:
   1044 			r = *(unsigned char*)element * (1.0f / 0xFF);
   1045 			break;
   1046 		default:
   1047 			ASSERT(false);
   1048 		}
   1049 
   1050 		if(isSRGBformat(format))
   1051 		{
   1052 			r = sRGBtoLinear(r);
   1053 			g = sRGBtoLinear(g);
   1054 			b = sRGBtoLinear(b);
   1055 		}
   1056 
   1057 		return Color<float>(r, g, b, a);
   1058 	}
   1059 
   1060 	Color<float> Surface::Buffer::sample(float x, float y, float z) const
   1061 	{
   1062 		x -= 0.5f;
   1063 		y -= 0.5f;
   1064 		z -= 0.5f;
   1065 
   1066 		int x0 = clamp((int)x, 0, width - 1);
   1067 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
   1068 
   1069 		int y0 = clamp((int)y, 0, height - 1);
   1070 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
   1071 
   1072 		int z0 = clamp((int)z, 0, depth - 1);
   1073 		int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
   1074 
   1075 		Color<float> c000 = read(x0, y0, z0);
   1076 		Color<float> c100 = read(x1, y0, z0);
   1077 		Color<float> c010 = read(x0, y1, z0);
   1078 		Color<float> c110 = read(x1, y1, z0);
   1079 		Color<float> c001 = read(x0, y0, z1);
   1080 		Color<float> c101 = read(x1, y0, z1);
   1081 		Color<float> c011 = read(x0, y1, z1);
   1082 		Color<float> c111 = read(x1, y1, z1);
   1083 
   1084 		float fx = x - x0;
   1085 		float fy = y - y0;
   1086 		float fz = z - z0;
   1087 
   1088 		c000 *= (1 - fx) * (1 - fy) * (1 - fz);
   1089 		c100 *= fx * (1 - fy) * (1 - fz);
   1090 		c010 *= (1 - fx) * fy * (1 - fz);
   1091 		c110 *= fx * fy * (1 - fz);
   1092 		c001 *= (1 - fx) * (1 - fy) * fz;
   1093 		c101 *= fx * (1 - fy) * fz;
   1094 		c011 *= (1 - fx) * fy * fz;
   1095 		c111 *= fx * fy * fz;
   1096 
   1097 		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
   1098 	}
   1099 
   1100 	Color<float> Surface::Buffer::sample(float x, float y, int layer) const
   1101 	{
   1102 		x -= 0.5f;
   1103 		y -= 0.5f;
   1104 
   1105 		int x0 = clamp((int)x, 0, width - 1);
   1106 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
   1107 
   1108 		int y0 = clamp((int)y, 0, height - 1);
   1109 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
   1110 
   1111 		Color<float> c00 = read(x0, y0, layer);
   1112 		Color<float> c10 = read(x1, y0, layer);
   1113 		Color<float> c01 = read(x0, y1, layer);
   1114 		Color<float> c11 = read(x1, y1, layer);
   1115 
   1116 		float fx = x - x0;
   1117 		float fy = y - y0;
   1118 
   1119 		c00 *= (1 - fx) * (1 - fy);
   1120 		c10 *= fx * (1 - fy);
   1121 		c01 *= (1 - fx) * fy;
   1122 		c11 *= fx * fy;
   1123 
   1124 		return c00 + c10 + c01 + c11;
   1125 	}
   1126 
   1127 	void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
   1128 	{
   1129 		this->lock = lock;
   1130 
   1131 		switch(lock)
   1132 		{
   1133 		case LOCK_UNLOCKED:
   1134 		case LOCK_READONLY:
   1135 		case LOCK_UPDATE:
   1136 			break;
   1137 		case LOCK_WRITEONLY:
   1138 		case LOCK_READWRITE:
   1139 		case LOCK_DISCARD:
   1140 			dirty = true;
   1141 			break;
   1142 		default:
   1143 			ASSERT(false);
   1144 		}
   1145 
   1146 		if(buffer)
   1147 		{
   1148 			x += border;
   1149 			y += border;
   1150 
   1151 			switch(format)
   1152 			{
   1153 			case FORMAT_DXT1:
   1154 			case FORMAT_ATI1:
   1155 			case FORMAT_ETC1:
   1156 			case FORMAT_R11_EAC:
   1157 			case FORMAT_SIGNED_R11_EAC:
   1158 			case FORMAT_RGB8_ETC2:
   1159 			case FORMAT_SRGB8_ETC2:
   1160 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1161 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1162 				return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
   1163 			case FORMAT_RG11_EAC:
   1164 			case FORMAT_SIGNED_RG11_EAC:
   1165 			case FORMAT_RGBA8_ETC2_EAC:
   1166 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   1167 			case FORMAT_RGBA_ASTC_4x4_KHR:
   1168 			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   1169 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
   1170 			case FORMAT_RGBA_ASTC_5x4_KHR:
   1171 			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   1172 				return (unsigned char*)buffer + 16 * (x / 5) + (y / 4) * pitchB + z * sliceB;
   1173 			case FORMAT_RGBA_ASTC_5x5_KHR:
   1174 			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   1175 				return (unsigned char*)buffer + 16 * (x / 5) + (y / 5) * pitchB + z * sliceB;
   1176 			case FORMAT_RGBA_ASTC_6x5_KHR:
   1177 			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   1178 				return (unsigned char*)buffer + 16 * (x / 6) + (y / 5) * pitchB + z * sliceB;
   1179 			case FORMAT_RGBA_ASTC_6x6_KHR:
   1180 			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   1181 				return (unsigned char*)buffer + 16 * (x / 6) + (y / 6) * pitchB + z * sliceB;
   1182 			case FORMAT_RGBA_ASTC_8x5_KHR:
   1183 			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   1184 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 5) * pitchB + z * sliceB;
   1185 			case FORMAT_RGBA_ASTC_8x6_KHR:
   1186 			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   1187 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 6) * pitchB + z * sliceB;
   1188 			case FORMAT_RGBA_ASTC_8x8_KHR:
   1189 			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   1190 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 8) * pitchB + z * sliceB;
   1191 			case FORMAT_RGBA_ASTC_10x5_KHR:
   1192 			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   1193 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 5) * pitchB + z * sliceB;
   1194 			case FORMAT_RGBA_ASTC_10x6_KHR:
   1195 			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   1196 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 6) * pitchB + z * sliceB;
   1197 			case FORMAT_RGBA_ASTC_10x8_KHR:
   1198 			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   1199 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 8) * pitchB + z * sliceB;
   1200 			case FORMAT_RGBA_ASTC_10x10_KHR:
   1201 			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   1202 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 10) * pitchB + z * sliceB;
   1203 			case FORMAT_RGBA_ASTC_12x10_KHR:
   1204 			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   1205 				return (unsigned char*)buffer + 16 * (x / 12) + (y / 10) * pitchB + z * sliceB;
   1206 			case FORMAT_RGBA_ASTC_12x12_KHR:
   1207 			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   1208 				return (unsigned char*)buffer + 16 * (x / 12) + (y / 12) * pitchB + z * sliceB;
   1209 			case FORMAT_DXT3:
   1210 			case FORMAT_DXT5:
   1211 			case FORMAT_ATI2:
   1212 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
   1213 			default:
   1214 				return (unsigned char*)buffer + x * bytes + y * pitchB + z * samples * sliceB;
   1215 			}
   1216 		}
   1217 
   1218 		return nullptr;
   1219 	}
   1220 
   1221 	void Surface::Buffer::unlockRect()
   1222 	{
   1223 		lock = LOCK_UNLOCKED;
   1224 	}
   1225 
   1226 	class SurfaceImplementation : public Surface
   1227 	{
   1228 	public:
   1229 		SurfaceImplementation(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
   1230 			: Surface(width, height, depth, format, pixels, pitch, slice) {}
   1231 		SurfaceImplementation(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchP = 0)
   1232 			: Surface(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchP) {}
   1233 		~SurfaceImplementation() override {};
   1234 
   1235 		void *lockInternal(int x, int y, int z, Lock lock, Accessor client) override
   1236 		{
   1237 			return Surface::lockInternal(x, y, z, lock, client);
   1238 		}
   1239 
   1240 		void unlockInternal() override
   1241 		{
   1242 			Surface::unlockInternal();
   1243 		}
   1244 	};
   1245 
   1246 	Surface *Surface::create(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
   1247 	{
   1248 		return new SurfaceImplementation(width, height, depth, format, pixels, pitch, slice);
   1249 	}
   1250 
   1251 	Surface *Surface::create(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided)
   1252 	{
   1253 		return new SurfaceImplementation(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchPprovided);
   1254 	}
   1255 
   1256 	Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
   1257 	{
   1258 		resource = new Resource(0);
   1259 		hasParent = false;
   1260 		ownExternal = false;
   1261 		depth = max(1, depth);
   1262 
   1263 		external.buffer = pixels;
   1264 		external.width = width;
   1265 		external.height = height;
   1266 		external.depth = depth;
   1267 		external.samples = 1;
   1268 		external.format = format;
   1269 		external.bytes = bytes(external.format);
   1270 		external.pitchB = pitch;
   1271 		external.pitchP = external.bytes ? pitch / external.bytes : 0;
   1272 		external.sliceB = slice;
   1273 		external.sliceP = external.bytes ? slice / external.bytes : 0;
   1274 		external.border = 0;
   1275 		external.lock = LOCK_UNLOCKED;
   1276 		external.dirty = true;
   1277 
   1278 		internal.buffer = nullptr;
   1279 		internal.width = width;
   1280 		internal.height = height;
   1281 		internal.depth = depth;
   1282 		internal.samples = 1;
   1283 		internal.format = selectInternalFormat(format);
   1284 		internal.bytes = bytes(internal.format);
   1285 		internal.pitchB = pitchB(internal.width, 0, internal.format, false);
   1286 		internal.pitchP = pitchP(internal.width, 0, internal.format, false);
   1287 		internal.sliceB = sliceB(internal.width, internal.height, 0, internal.format, false);
   1288 		internal.sliceP = sliceP(internal.width, internal.height, 0, internal.format, false);
   1289 		internal.border = 0;
   1290 		internal.lock = LOCK_UNLOCKED;
   1291 		internal.dirty = false;
   1292 
   1293 		stencil.buffer = nullptr;
   1294 		stencil.width = width;
   1295 		stencil.height = height;
   1296 		stencil.depth = depth;
   1297 		stencil.samples = 1;
   1298 		stencil.format = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
   1299 		stencil.bytes = bytes(stencil.format);
   1300 		stencil.pitchB = pitchB(stencil.width, 0, stencil.format, false);
   1301 		stencil.pitchP = pitchP(stencil.width, 0, stencil.format, false);
   1302 		stencil.sliceB = sliceB(stencil.width, stencil.height, 0, stencil.format, false);
   1303 		stencil.sliceP = sliceP(stencil.width, stencil.height, 0, stencil.format, false);
   1304 		stencil.border = 0;
   1305 		stencil.lock = LOCK_UNLOCKED;
   1306 		stencil.dirty = false;
   1307 
   1308 		dirtyContents = true;
   1309 		paletteUsed = 0;
   1310 	}
   1311 
   1312 	Surface::Surface(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided) : lockable(lockable), renderTarget(renderTarget)
   1313 	{
   1314 		resource = texture ? texture : new Resource(0);
   1315 		hasParent = texture != nullptr;
   1316 		ownExternal = true;
   1317 		depth = max(1, depth);
   1318 		samples = max(1, samples);
   1319 
   1320 		external.buffer = nullptr;
   1321 		external.width = width;
   1322 		external.height = height;
   1323 		external.depth = depth;
   1324 		external.samples = (short)samples;
   1325 		external.format = format;
   1326 		external.bytes = bytes(external.format);
   1327 		external.pitchB = !pitchPprovided ? pitchB(external.width, 0, external.format, renderTarget && !texture) : pitchPprovided * external.bytes;
   1328 		external.pitchP = !pitchPprovided ? pitchP(external.width, 0, external.format, renderTarget && !texture) : pitchPprovided;
   1329 		external.sliceB = sliceB(external.width, external.height, 0, external.format, renderTarget && !texture);
   1330 		external.sliceP = sliceP(external.width, external.height, 0, external.format, renderTarget && !texture);
   1331 		external.border = 0;
   1332 		external.lock = LOCK_UNLOCKED;
   1333 		external.dirty = false;
   1334 
   1335 		internal.buffer = nullptr;
   1336 		internal.width = width;
   1337 		internal.height = height;
   1338 		internal.depth = depth;
   1339 		internal.samples = (short)samples;
   1340 		internal.format = selectInternalFormat(format);
   1341 		internal.bytes = bytes(internal.format);
   1342 		internal.pitchB = !pitchPprovided ? pitchB(internal.width, border, internal.format, renderTarget) : pitchPprovided * internal.bytes;
   1343 		internal.pitchP = !pitchPprovided ? pitchP(internal.width, border, internal.format, renderTarget) : pitchPprovided;
   1344 		internal.sliceB = sliceB(internal.width, internal.height, border, internal.format, renderTarget);
   1345 		internal.sliceP = sliceP(internal.width, internal.height, border, internal.format, renderTarget);
   1346 		internal.border = (short)border;
   1347 		internal.lock = LOCK_UNLOCKED;
   1348 		internal.dirty = false;
   1349 
   1350 		stencil.buffer = nullptr;
   1351 		stencil.width = width;
   1352 		stencil.height = height;
   1353 		stencil.depth = depth;
   1354 		stencil.samples = (short)samples;
   1355 		stencil.format = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
   1356 		stencil.bytes = bytes(stencil.format);
   1357 		stencil.pitchB = pitchB(stencil.width, 0, stencil.format, renderTarget);
   1358 		stencil.pitchP = pitchP(stencil.width, 0, stencil.format, renderTarget);
   1359 		stencil.sliceB = sliceB(stencil.width, stencil.height, 0, stencil.format, renderTarget);
   1360 		stencil.sliceP = sliceP(stencil.width, stencil.height, 0, stencil.format, renderTarget);
   1361 		stencil.border = 0;
   1362 		stencil.lock = LOCK_UNLOCKED;
   1363 		stencil.dirty = false;
   1364 
   1365 		dirtyContents = true;
   1366 		paletteUsed = 0;
   1367 	}
   1368 
   1369 	Surface::~Surface()
   1370 	{
   1371 		// sync() must be called before this destructor to ensure all locks have been released.
   1372 		// We can't call it here because the parent resource may already have been destroyed.
   1373 		ASSERT(isUnlocked());
   1374 
   1375 		if(!hasParent)
   1376 		{
   1377 			resource->destruct();
   1378 		}
   1379 
   1380 		if(ownExternal)
   1381 		{
   1382 			deallocate(external.buffer);
   1383 		}
   1384 
   1385 		if(internal.buffer != external.buffer)
   1386 		{
   1387 			deallocate(internal.buffer);
   1388 		}
   1389 
   1390 		deallocate(stencil.buffer);
   1391 
   1392 		external.buffer = nullptr;
   1393 		internal.buffer = nullptr;
   1394 		stencil.buffer = nullptr;
   1395 	}
   1396 
   1397 	void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
   1398 	{
   1399 		resource->lock(client);
   1400 
   1401 		if(!external.buffer)
   1402 		{
   1403 			if(internal.buffer && identicalBuffers())
   1404 			{
   1405 				external.buffer = internal.buffer;
   1406 			}
   1407 			else
   1408 			{
   1409 				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.border, external.samples, external.format);
   1410 			}
   1411 		}
   1412 
   1413 		if(internal.dirty)
   1414 		{
   1415 			if(lock != LOCK_DISCARD)
   1416 			{
   1417 				update(external, internal);
   1418 			}
   1419 
   1420 			internal.dirty = false;
   1421 		}
   1422 
   1423 		switch(lock)
   1424 		{
   1425 		case LOCK_READONLY:
   1426 			break;
   1427 		case LOCK_WRITEONLY:
   1428 		case LOCK_READWRITE:
   1429 		case LOCK_DISCARD:
   1430 			dirtyContents = true;
   1431 			break;
   1432 		default:
   1433 			ASSERT(false);
   1434 		}
   1435 
   1436 		return external.lockRect(x, y, z, lock);
   1437 	}
   1438 
   1439 	void Surface::unlockExternal()
   1440 	{
   1441 		external.unlockRect();
   1442 
   1443 		resource->unlock();
   1444 	}
   1445 
   1446 	void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
   1447 	{
   1448 		if(lock != LOCK_UNLOCKED)
   1449 		{
   1450 			resource->lock(client);
   1451 		}
   1452 
   1453 		if(!internal.buffer)
   1454 		{
   1455 			if(external.buffer && identicalBuffers())
   1456 			{
   1457 				internal.buffer = external.buffer;
   1458 			}
   1459 			else
   1460 			{
   1461 				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.border, internal.samples, internal.format);
   1462 			}
   1463 		}
   1464 
   1465 		// FIXME: WHQL requires conversion to lower external precision and back
   1466 		if(logPrecision >= WHQL)
   1467 		{
   1468 			if(internal.dirty && renderTarget && internal.format != external.format)
   1469 			{
   1470 				if(lock != LOCK_DISCARD)
   1471 				{
   1472 					switch(external.format)
   1473 					{
   1474 					case FORMAT_R3G3B2:
   1475 					case FORMAT_A8R3G3B2:
   1476 					case FORMAT_A1R5G5B5:
   1477 					case FORMAT_A2R10G10B10:
   1478 					case FORMAT_A2B10G10R10:
   1479 						lockExternal(0, 0, 0, LOCK_READWRITE, client);
   1480 						unlockExternal();
   1481 						break;
   1482 					default:
   1483 						// Difference passes WHQL
   1484 						break;
   1485 					}
   1486 				}
   1487 			}
   1488 		}
   1489 
   1490 		if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
   1491 		{
   1492 			if(lock != LOCK_DISCARD)
   1493 			{
   1494 				update(internal, external);
   1495 			}
   1496 
   1497 			external.dirty = false;
   1498 			paletteUsed = Surface::paletteID;
   1499 		}
   1500 
   1501 		switch(lock)
   1502 		{
   1503 		case LOCK_UNLOCKED:
   1504 		case LOCK_READONLY:
   1505 			break;
   1506 		case LOCK_WRITEONLY:
   1507 		case LOCK_READWRITE:
   1508 		case LOCK_DISCARD:
   1509 			dirtyContents = true;
   1510 			break;
   1511 		default:
   1512 			ASSERT(false);
   1513 		}
   1514 
   1515 		if(lock == LOCK_READONLY && client == PUBLIC)
   1516 		{
   1517 			resolve();
   1518 		}
   1519 
   1520 		return internal.lockRect(x, y, z, lock);
   1521 	}
   1522 
   1523 	void Surface::unlockInternal()
   1524 	{
   1525 		internal.unlockRect();
   1526 
   1527 		resource->unlock();
   1528 	}
   1529 
   1530 	void *Surface::lockStencil(int x, int y, int front, Accessor client)
   1531 	{
   1532 		resource->lock(client);
   1533 
   1534 		if(stencil.format == FORMAT_NULL)
   1535 		{
   1536 			return nullptr;
   1537 		}
   1538 
   1539 		if(!stencil.buffer)
   1540 		{
   1541 			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.border, stencil.samples, stencil.format);
   1542 		}
   1543 
   1544 		return stencil.lockRect(x, y, front, LOCK_READWRITE);   // FIXME
   1545 	}
   1546 
   1547 	void Surface::unlockStencil()
   1548 	{
   1549 		stencil.unlockRect();
   1550 
   1551 		resource->unlock();
   1552 	}
   1553 
   1554 	int Surface::bytes(Format format)
   1555 	{
   1556 		switch(format)
   1557 		{
   1558 		case FORMAT_NULL:				return 0;
   1559 		case FORMAT_P8:					return 1;
   1560 		case FORMAT_A8P8:				return 2;
   1561 		case FORMAT_A8:					return 1;
   1562 		case FORMAT_R8I:				return 1;
   1563 		case FORMAT_R8:					return 1;
   1564 		case FORMAT_R3G3B2:				return 1;
   1565 		case FORMAT_R16I:				return 2;
   1566 		case FORMAT_R16UI:				return 2;
   1567 		case FORMAT_A8R3G3B2:			return 2;
   1568 		case FORMAT_R5G6B5:				return 2;
   1569 		case FORMAT_A1R5G5B5:			return 2;
   1570 		case FORMAT_X1R5G5B5:			return 2;
   1571 		case FORMAT_R5G5B5A1:           return 2;
   1572 		case FORMAT_X4R4G4B4:			return 2;
   1573 		case FORMAT_A4R4G4B4:			return 2;
   1574 		case FORMAT_R4G4B4A4:           return 2;
   1575 		case FORMAT_R8G8B8:				return 3;
   1576 		case FORMAT_B8G8R8:             return 3;
   1577 		case FORMAT_R32I:				return 4;
   1578 		case FORMAT_R32UI:				return 4;
   1579 		case FORMAT_X8R8G8B8:			return 4;
   1580 	//	case FORMAT_X8G8R8B8Q:			return 4;
   1581 		case FORMAT_A8R8G8B8:			return 4;
   1582 	//	case FORMAT_A8G8R8B8Q:			return 4;
   1583 		case FORMAT_X8B8G8R8I:			return 4;
   1584 		case FORMAT_X8B8G8R8:			return 4;
   1585 		case FORMAT_SRGB8_X8:			return 4;
   1586 		case FORMAT_SRGB8_A8:			return 4;
   1587 		case FORMAT_A8B8G8R8I:			return 4;
   1588 		case FORMAT_R8UI:				return 1;
   1589 		case FORMAT_G8R8UI:				return 2;
   1590 		case FORMAT_X8B8G8R8UI:			return 4;
   1591 		case FORMAT_A8B8G8R8UI:			return 4;
   1592 		case FORMAT_A8B8G8R8:			return 4;
   1593 		case FORMAT_R8_SNORM:			return 1;
   1594 		case FORMAT_G8R8_SNORM:		return 2;
   1595 		case FORMAT_X8B8G8R8_SNORM:	return 4;
   1596 		case FORMAT_A8B8G8R8_SNORM:	return 4;
   1597 		case FORMAT_A2R10G10B10:		return 4;
   1598 		case FORMAT_A2B10G10R10:		return 4;
   1599 		case FORMAT_A2B10G10R10UI:		return 4;
   1600 		case FORMAT_G8R8I:				return 2;
   1601 		case FORMAT_G8R8:				return 2;
   1602 		case FORMAT_G16R16I:			return 4;
   1603 		case FORMAT_G16R16UI:			return 4;
   1604 		case FORMAT_G16R16:				return 4;
   1605 		case FORMAT_G32R32I:			return 8;
   1606 		case FORMAT_G32R32UI:			return 8;
   1607 		case FORMAT_X16B16G16R16I:		return 8;
   1608 		case FORMAT_X16B16G16R16UI:		return 8;
   1609 		case FORMAT_A16B16G16R16I:		return 8;
   1610 		case FORMAT_A16B16G16R16UI:		return 8;
   1611 		case FORMAT_A16B16G16R16:		return 8;
   1612 		case FORMAT_X32B32G32R32I:		return 16;
   1613 		case FORMAT_X32B32G32R32UI:		return 16;
   1614 		case FORMAT_A32B32G32R32I:		return 16;
   1615 		case FORMAT_A32B32G32R32UI:		return 16;
   1616 		// Compressed formats
   1617 		case FORMAT_DXT1:				return 2;   // Column of four pixels
   1618 		case FORMAT_DXT3:				return 4;   // Column of four pixels
   1619 		case FORMAT_DXT5:				return 4;   // Column of four pixels
   1620 		case FORMAT_ATI1:				return 2;   // Column of four pixels
   1621 		case FORMAT_ATI2:				return 4;   // Column of four pixels
   1622 		case FORMAT_ETC1:				return 2;   // Column of four pixels
   1623 		case FORMAT_R11_EAC:			return 2;
   1624 		case FORMAT_SIGNED_R11_EAC:		return 2;
   1625 		case FORMAT_RG11_EAC:			return 4;
   1626 		case FORMAT_SIGNED_RG11_EAC:	return 4;
   1627 		case FORMAT_RGB8_ETC2:			return 2;
   1628 		case FORMAT_SRGB8_ETC2:			return 2;
   1629 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
   1630 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
   1631 		case FORMAT_RGBA8_ETC2_EAC:			return 4;
   1632 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:	return 4;
   1633 		case FORMAT_RGBA_ASTC_4x4_KHR:
   1634 		case FORMAT_RGBA_ASTC_5x4_KHR:
   1635 		case FORMAT_RGBA_ASTC_5x5_KHR:
   1636 		case FORMAT_RGBA_ASTC_6x5_KHR:
   1637 		case FORMAT_RGBA_ASTC_6x6_KHR:
   1638 		case FORMAT_RGBA_ASTC_8x5_KHR:
   1639 		case FORMAT_RGBA_ASTC_8x6_KHR:
   1640 		case FORMAT_RGBA_ASTC_8x8_KHR:
   1641 		case FORMAT_RGBA_ASTC_10x5_KHR:
   1642 		case FORMAT_RGBA_ASTC_10x6_KHR:
   1643 		case FORMAT_RGBA_ASTC_10x8_KHR:
   1644 		case FORMAT_RGBA_ASTC_10x10_KHR:
   1645 		case FORMAT_RGBA_ASTC_12x10_KHR:
   1646 		case FORMAT_RGBA_ASTC_12x12_KHR:
   1647 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   1648 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   1649 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   1650 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   1651 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   1652 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   1653 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   1654 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   1655 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   1656 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   1657 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   1658 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   1659 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   1660 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: return 0; // FIXME
   1661 		// Bumpmap formats
   1662 		case FORMAT_V8U8:				return 2;
   1663 		case FORMAT_L6V5U5:				return 2;
   1664 		case FORMAT_Q8W8V8U8:			return 4;
   1665 		case FORMAT_X8L8V8U8:			return 4;
   1666 		case FORMAT_A2W10V10U10:		return 4;
   1667 		case FORMAT_V16U16:				return 4;
   1668 		case FORMAT_A16W16V16U16:		return 8;
   1669 		case FORMAT_Q16W16V16U16:		return 8;
   1670 		// Luminance formats
   1671 		case FORMAT_L8:					return 1;
   1672 		case FORMAT_A4L4:				return 1;
   1673 		case FORMAT_L16:				return 2;
   1674 		case FORMAT_A8L8:				return 2;
   1675 		case FORMAT_L16F:               return 2;
   1676 		case FORMAT_A16L16F:            return 4;
   1677 		case FORMAT_L32F:               return 4;
   1678 		case FORMAT_A32L32F:            return 8;
   1679 		// Floating-point formats
   1680 		case FORMAT_A16F:				return 2;
   1681 		case FORMAT_R16F:				return 2;
   1682 		case FORMAT_G16R16F:			return 4;
   1683 		case FORMAT_B16G16R16F:			return 6;
   1684 		case FORMAT_X16B16G16R16F:		return 8;
   1685 		case FORMAT_A16B16G16R16F:		return 8;
   1686 		case FORMAT_X16B16G16R16F_UNSIGNED: return 8;
   1687 		case FORMAT_A32F:				return 4;
   1688 		case FORMAT_R32F:				return 4;
   1689 		case FORMAT_G32R32F:			return 8;
   1690 		case FORMAT_B32G32R32F:			return 12;
   1691 		case FORMAT_X32B32G32R32F:		return 16;
   1692 		case FORMAT_A32B32G32R32F:		return 16;
   1693 		case FORMAT_X32B32G32R32F_UNSIGNED: return 16;
   1694 		// Depth/stencil formats
   1695 		case FORMAT_D16:				return 2;
   1696 		case FORMAT_D32:				return 4;
   1697 		case FORMAT_D24X8:				return 4;
   1698 		case FORMAT_D24S8:				return 4;
   1699 		case FORMAT_D24FS8:				return 4;
   1700 		case FORMAT_D32F:				return 4;
   1701 		case FORMAT_D32FS8:				return 4;
   1702 		case FORMAT_D32F_COMPLEMENTARY:	return 4;
   1703 		case FORMAT_D32FS8_COMPLEMENTARY: return 4;
   1704 		case FORMAT_D32F_LOCKABLE:		return 4;
   1705 		case FORMAT_D32FS8_TEXTURE:		return 4;
   1706 		case FORMAT_D32F_SHADOW:		return 4;
   1707 		case FORMAT_D32FS8_SHADOW:		return 4;
   1708 		case FORMAT_DF24S8:				return 4;
   1709 		case FORMAT_DF16S8:				return 2;
   1710 		case FORMAT_INTZ:				return 4;
   1711 		case FORMAT_S8:					return 1;
   1712 		case FORMAT_YV12_BT601:         return 1;   // Y plane only
   1713 		case FORMAT_YV12_BT709:         return 1;   // Y plane only
   1714 		case FORMAT_YV12_JFIF:          return 1;   // Y plane only
   1715 		default:
   1716 			ASSERT(false);
   1717 		}
   1718 
   1719 		return 0;
   1720 	}
   1721 
   1722 	int Surface::pitchB(int width, int border, Format format, bool target)
   1723 	{
   1724 		width += 2 * border;
   1725 
   1726 		// Render targets require 2x2 quads
   1727 		if(target || isDepth(format) || isStencil(format))
   1728 		{
   1729 			width = align<2>(width);
   1730 		}
   1731 
   1732 		switch(format)
   1733 		{
   1734 		case FORMAT_DXT1:
   1735 		case FORMAT_ETC1:
   1736 		case FORMAT_R11_EAC:
   1737 		case FORMAT_SIGNED_R11_EAC:
   1738 		case FORMAT_RGB8_ETC2:
   1739 		case FORMAT_SRGB8_ETC2:
   1740 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1741 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1742 			return 8 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per 4 rows
   1743 		case FORMAT_RG11_EAC:
   1744 		case FORMAT_SIGNED_RG11_EAC:
   1745 		case FORMAT_RGBA8_ETC2_EAC:
   1746 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   1747 		case FORMAT_RGBA_ASTC_4x4_KHR:
   1748 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   1749 			return 16 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per 4 rows
   1750 		case FORMAT_RGBA_ASTC_5x4_KHR:
   1751 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   1752 		case FORMAT_RGBA_ASTC_5x5_KHR:
   1753 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   1754 			return 16 * ((width + 4) / 5);
   1755 		case FORMAT_RGBA_ASTC_6x5_KHR:
   1756 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   1757 		case FORMAT_RGBA_ASTC_6x6_KHR:
   1758 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   1759 			return 16 * ((width + 5) / 6);
   1760 		case FORMAT_RGBA_ASTC_8x5_KHR:
   1761 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   1762 		case FORMAT_RGBA_ASTC_8x6_KHR:
   1763 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   1764 		case FORMAT_RGBA_ASTC_8x8_KHR:
   1765 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   1766 			return 16 * ((width + 7) / 8);
   1767 		case FORMAT_RGBA_ASTC_10x5_KHR:
   1768 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   1769 		case FORMAT_RGBA_ASTC_10x6_KHR:
   1770 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   1771 		case FORMAT_RGBA_ASTC_10x8_KHR:
   1772 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   1773 		case FORMAT_RGBA_ASTC_10x10_KHR:
   1774 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   1775 			return 16 * ((width + 9) / 10);
   1776 		case FORMAT_RGBA_ASTC_12x10_KHR:
   1777 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   1778 		case FORMAT_RGBA_ASTC_12x12_KHR:
   1779 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   1780 			return 16 * ((width + 11) / 12);
   1781 		case FORMAT_DXT3:
   1782 		case FORMAT_DXT5:
   1783 			return 16 * ((width + 3) / 4);   // 128 bit per 4x4 block, computed per 4 rows
   1784 		case FORMAT_ATI1:
   1785 			return 2 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per row
   1786 		case FORMAT_ATI2:
   1787 			return 4 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per row
   1788 		case FORMAT_YV12_BT601:
   1789 		case FORMAT_YV12_BT709:
   1790 		case FORMAT_YV12_JFIF:
   1791 			return align<16>(width);
   1792 		default:
   1793 			return bytes(format) * width;
   1794 		}
   1795 	}
   1796 
   1797 	int Surface::pitchP(int width, int border, Format format, bool target)
   1798 	{
   1799 		int B = bytes(format);
   1800 
   1801 		return B > 0 ? pitchB(width, border, format, target) / B : 0;
   1802 	}
   1803 
   1804 	int Surface::sliceB(int width, int height, int border, Format format, bool target)
   1805 	{
   1806 		height += 2 * border;
   1807 
   1808 		// Render targets require 2x2 quads
   1809 		if(target || isDepth(format) || isStencil(format))
   1810 		{
   1811 			height = align<2>(height);
   1812 		}
   1813 
   1814 		switch(format)
   1815 		{
   1816 		case FORMAT_DXT1:
   1817 		case FORMAT_DXT3:
   1818 		case FORMAT_DXT5:
   1819 		case FORMAT_ETC1:
   1820 		case FORMAT_R11_EAC:
   1821 		case FORMAT_SIGNED_R11_EAC:
   1822 		case FORMAT_RG11_EAC:
   1823 		case FORMAT_SIGNED_RG11_EAC:
   1824 		case FORMAT_RGB8_ETC2:
   1825 		case FORMAT_SRGB8_ETC2:
   1826 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1827 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1828 		case FORMAT_RGBA8_ETC2_EAC:
   1829 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   1830 		case FORMAT_RGBA_ASTC_4x4_KHR:
   1831 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   1832 		case FORMAT_RGBA_ASTC_5x4_KHR:
   1833 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   1834 			return pitchB(width, border, format, target) * ((height + 3) / 4);   // Pitch computed per 4 rows
   1835 		case FORMAT_RGBA_ASTC_5x5_KHR:
   1836 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   1837 		case FORMAT_RGBA_ASTC_6x5_KHR:
   1838 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   1839 		case FORMAT_RGBA_ASTC_8x5_KHR:
   1840 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   1841 		case FORMAT_RGBA_ASTC_10x5_KHR:
   1842 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   1843 			return pitchB(width, border, format, target) * ((height + 4) / 5);   // Pitch computed per 5 rows
   1844 		case FORMAT_RGBA_ASTC_6x6_KHR:
   1845 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   1846 		case FORMAT_RGBA_ASTC_8x6_KHR:
   1847 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   1848 		case FORMAT_RGBA_ASTC_10x6_KHR:
   1849 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   1850 			return pitchB(width, border, format, target) * ((height + 5) / 6);   // Pitch computed per 6 rows
   1851 		case FORMAT_RGBA_ASTC_8x8_KHR:
   1852 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   1853 		case FORMAT_RGBA_ASTC_10x8_KHR:
   1854 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   1855 			return pitchB(width, border, format, target) * ((height + 7) / 8);   // Pitch computed per 8 rows
   1856 		case FORMAT_RGBA_ASTC_10x10_KHR:
   1857 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   1858 		case FORMAT_RGBA_ASTC_12x10_KHR:
   1859 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   1860 			return pitchB(width, border, format, target) * ((height + 9) / 10);   // Pitch computed per 10 rows
   1861 		case FORMAT_RGBA_ASTC_12x12_KHR:
   1862 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   1863 			return pitchB(width, border, format, target) * ((height + 11) / 12);   // Pitch computed per 12 rows
   1864 		case FORMAT_ATI1:
   1865 		case FORMAT_ATI2:
   1866 			return pitchB(width, border, format, target) * align<4>(height);   // Pitch computed per row
   1867 		default:
   1868 			return pitchB(width, border, format, target) * height;   // Pitch computed per row
   1869 		}
   1870 	}
   1871 
   1872 	int Surface::sliceP(int width, int height, int border, Format format, bool target)
   1873 	{
   1874 		int B = bytes(format);
   1875 
   1876 		return B > 0 ? sliceB(width, height, border, format, target) / B : 0;
   1877 	}
   1878 
   1879 	void Surface::update(Buffer &destination, Buffer &source)
   1880 	{
   1881 	//	ASSERT(source.lock != LOCK_UNLOCKED);
   1882 	//	ASSERT(destination.lock != LOCK_UNLOCKED);
   1883 
   1884 		if(destination.buffer != source.buffer)
   1885 		{
   1886 			ASSERT(source.dirty && !destination.dirty);
   1887 
   1888 			switch(source.format)
   1889 			{
   1890 			case FORMAT_R8G8B8:		decodeR8G8B8(destination, source);		break;   // FIXME: Check destination format
   1891 			case FORMAT_X1R5G5B5:	decodeX1R5G5B5(destination, source);	break;   // FIXME: Check destination format
   1892 			case FORMAT_A1R5G5B5:	decodeA1R5G5B5(destination, source);	break;   // FIXME: Check destination format
   1893 			case FORMAT_X4R4G4B4:	decodeX4R4G4B4(destination, source);	break;   // FIXME: Check destination format
   1894 			case FORMAT_A4R4G4B4:	decodeA4R4G4B4(destination, source);	break;   // FIXME: Check destination format
   1895 			case FORMAT_P8:			decodeP8(destination, source);			break;   // FIXME: Check destination format
   1896 			case FORMAT_DXT1:		decodeDXT1(destination, source);		break;   // FIXME: Check destination format
   1897 			case FORMAT_DXT3:		decodeDXT3(destination, source);		break;   // FIXME: Check destination format
   1898 			case FORMAT_DXT5:		decodeDXT5(destination, source);		break;   // FIXME: Check destination format
   1899 			case FORMAT_ATI1:		decodeATI1(destination, source);		break;   // FIXME: Check destination format
   1900 			case FORMAT_ATI2:		decodeATI2(destination, source);		break;   // FIXME: Check destination format
   1901 			case FORMAT_R11_EAC:         decodeEAC(destination, source, 1, false); break; // FIXME: Check destination format
   1902 			case FORMAT_SIGNED_R11_EAC:  decodeEAC(destination, source, 1, true);  break; // FIXME: Check destination format
   1903 			case FORMAT_RG11_EAC:        decodeEAC(destination, source, 2, false); break; // FIXME: Check destination format
   1904 			case FORMAT_SIGNED_RG11_EAC: decodeEAC(destination, source, 2, true);  break; // FIXME: Check destination format
   1905 			case FORMAT_ETC1:
   1906 			case FORMAT_RGB8_ETC2:                      decodeETC2(destination, source, 0, false); break; // FIXME: Check destination format
   1907 			case FORMAT_SRGB8_ETC2:                     decodeETC2(destination, source, 0, true);  break; // FIXME: Check destination format
   1908 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:  decodeETC2(destination, source, 1, false); break; // FIXME: Check destination format
   1909 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: decodeETC2(destination, source, 1, true);  break; // FIXME: Check destination format
   1910 			case FORMAT_RGBA8_ETC2_EAC:                 decodeETC2(destination, source, 8, false); break; // FIXME: Check destination format
   1911 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:          decodeETC2(destination, source, 8, true);  break; // FIXME: Check destination format
   1912 			case FORMAT_RGBA_ASTC_4x4_KHR:           decodeASTC(destination, source, 4,  4,  1, false); break; // FIXME: Check destination format
   1913 			case FORMAT_RGBA_ASTC_5x4_KHR:           decodeASTC(destination, source, 5,  4,  1, false); break; // FIXME: Check destination format
   1914 			case FORMAT_RGBA_ASTC_5x5_KHR:           decodeASTC(destination, source, 5,  5,  1, false); break; // FIXME: Check destination format
   1915 			case FORMAT_RGBA_ASTC_6x5_KHR:           decodeASTC(destination, source, 6,  5,  1, false); break; // FIXME: Check destination format
   1916 			case FORMAT_RGBA_ASTC_6x6_KHR:           decodeASTC(destination, source, 6,  6,  1, false); break; // FIXME: Check destination format
   1917 			case FORMAT_RGBA_ASTC_8x5_KHR:           decodeASTC(destination, source, 8,  5,  1, false); break; // FIXME: Check destination format
   1918 			case FORMAT_RGBA_ASTC_8x6_KHR:           decodeASTC(destination, source, 8,  6,  1, false); break; // FIXME: Check destination format
   1919 			case FORMAT_RGBA_ASTC_8x8_KHR:           decodeASTC(destination, source, 8,  8,  1, false); break; // FIXME: Check destination format
   1920 			case FORMAT_RGBA_ASTC_10x5_KHR:          decodeASTC(destination, source, 10, 5,  1, false); break; // FIXME: Check destination format
   1921 			case FORMAT_RGBA_ASTC_10x6_KHR:          decodeASTC(destination, source, 10, 6,  1, false); break; // FIXME: Check destination format
   1922 			case FORMAT_RGBA_ASTC_10x8_KHR:          decodeASTC(destination, source, 10, 8,  1, false); break; // FIXME: Check destination format
   1923 			case FORMAT_RGBA_ASTC_10x10_KHR:         decodeASTC(destination, source, 10, 10, 1, false); break; // FIXME: Check destination format
   1924 			case FORMAT_RGBA_ASTC_12x10_KHR:         decodeASTC(destination, source, 12, 10, 1, false); break; // FIXME: Check destination format
   1925 			case FORMAT_RGBA_ASTC_12x12_KHR:         decodeASTC(destination, source, 12, 12, 1, false); break; // FIXME: Check destination format
   1926 			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:   decodeASTC(destination, source, 4,  4,  1, true);  break; // FIXME: Check destination format
   1927 			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:   decodeASTC(destination, source, 5,  4,  1, true);  break; // FIXME: Check destination format
   1928 			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:   decodeASTC(destination, source, 5,  5,  1, true);  break; // FIXME: Check destination format
   1929 			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:   decodeASTC(destination, source, 6,  5,  1, true);  break; // FIXME: Check destination format
   1930 			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:   decodeASTC(destination, source, 6,  6,  1, true);  break; // FIXME: Check destination format
   1931 			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:   decodeASTC(destination, source, 8,  5,  1, true);  break; // FIXME: Check destination format
   1932 			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:   decodeASTC(destination, source, 8,  6,  1, true);  break; // FIXME: Check destination format
   1933 			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:   decodeASTC(destination, source, 8,  8,  1, true);  break; // FIXME: Check destination format
   1934 			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:  decodeASTC(destination, source, 10, 5,  1, true);  break; // FIXME: Check destination format
   1935 			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:  decodeASTC(destination, source, 10, 6,  1, true);  break; // FIXME: Check destination format
   1936 			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:  decodeASTC(destination, source, 10, 8,  1, true);  break; // FIXME: Check destination format
   1937 			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, true);  break; // FIXME: Check destination format
   1938 			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, true);  break; // FIXME: Check destination format
   1939 			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, true);  break; // FIXME: Check destination format
   1940 			default:				genericUpdate(destination, source);		break;
   1941 			}
   1942 		}
   1943 	}
   1944 
   1945 	void Surface::genericUpdate(Buffer &destination, Buffer &source)
   1946 	{
   1947 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
   1948 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
   1949 
   1950 		int depth = min(destination.depth, source.depth);
   1951 		int height = min(destination.height, source.height);
   1952 		int width = min(destination.width, source.width);
   1953 		int rowBytes = width * source.bytes;
   1954 
   1955 		for(int z = 0; z < depth; z++)
   1956 		{
   1957 			unsigned char *sourceRow = sourceSlice;
   1958 			unsigned char *destinationRow = destinationSlice;
   1959 
   1960 			for(int y = 0; y < height; y++)
   1961 			{
   1962 				if(source.format == destination.format)
   1963 				{
   1964 					memcpy(destinationRow, sourceRow, rowBytes);
   1965 				}
   1966 				else
   1967 				{
   1968 					unsigned char *sourceElement = sourceRow;
   1969 					unsigned char *destinationElement = destinationRow;
   1970 
   1971 					for(int x = 0; x < width; x++)
   1972 					{
   1973 						Color<float> color = source.read(sourceElement);
   1974 						destination.write(destinationElement, color);
   1975 
   1976 						sourceElement += source.bytes;
   1977 						destinationElement += destination.bytes;
   1978 					}
   1979 				}
   1980 
   1981 				sourceRow += source.pitchB;
   1982 				destinationRow += destination.pitchB;
   1983 			}
   1984 
   1985 			sourceSlice += source.sliceB;
   1986 			destinationSlice += destination.sliceB;
   1987 		}
   1988 
   1989 		source.unlockRect();
   1990 		destination.unlockRect();
   1991 	}
   1992 
   1993 	void Surface::decodeR8G8B8(Buffer &destination, Buffer &source)
   1994 	{
   1995 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
   1996 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
   1997 
   1998 		int depth = min(destination.depth, source.depth);
   1999 		int height = min(destination.height, source.height);
   2000 		int width = min(destination.width, source.width);
   2001 
   2002 		for(int z = 0; z < depth; z++)
   2003 		{
   2004 			unsigned char *sourceRow = sourceSlice;
   2005 			unsigned char *destinationRow = destinationSlice;
   2006 
   2007 			for(int y = 0; y < height; y++)
   2008 			{
   2009 				unsigned char *sourceElement = sourceRow;
   2010 				unsigned char *destinationElement = destinationRow;
   2011 
   2012 				for(int x = 0; x < width; x++)
   2013 				{
   2014 					unsigned int b = sourceElement[0];
   2015 					unsigned int g = sourceElement[1];
   2016 					unsigned int r = sourceElement[2];
   2017 
   2018 					*(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
   2019 
   2020 					sourceElement += source.bytes;
   2021 					destinationElement += destination.bytes;
   2022 				}
   2023 
   2024 				sourceRow += source.pitchB;
   2025 				destinationRow += destination.pitchB;
   2026 			}
   2027 
   2028 			sourceSlice += source.sliceB;
   2029 			destinationSlice += destination.sliceB;
   2030 		}
   2031 
   2032 		source.unlockRect();
   2033 		destination.unlockRect();
   2034 	}
   2035 
   2036 	void Surface::decodeX1R5G5B5(Buffer &destination, Buffer &source)
   2037 	{
   2038 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
   2039 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
   2040 
   2041 		int depth = min(destination.depth, source.depth);
   2042 		int height = min(destination.height, source.height);
   2043 		int width = min(destination.width, source.width);
   2044 
   2045 		for(int z = 0; z < depth; z++)
   2046 		{
   2047 			unsigned char *sourceRow = sourceSlice;
   2048 			unsigned char *destinationRow = destinationSlice;
   2049 
   2050 			for(int y = 0; y < height; y++)
   2051 			{
   2052 				unsigned char *sourceElement = sourceRow;
   2053 				unsigned char *destinationElement = destinationRow;
   2054 
   2055 				for(int x = 0; x < width; x++)
   2056 				{
   2057 					unsigned int xrgb = *(unsigned short*)sourceElement;
   2058 
   2059 					unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
   2060 					unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
   2061 					unsigned int b = (((xrgb & 0x001F) * 2106  + 0x80) >> 8);
   2062 
   2063 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
   2064 
   2065 					sourceElement += source.bytes;
   2066 					destinationElement += destination.bytes;
   2067 				}
   2068 
   2069 				sourceRow += source.pitchB;
   2070 				destinationRow += destination.pitchB;
   2071 			}
   2072 
   2073 			sourceSlice += source.sliceB;
   2074 			destinationSlice += destination.sliceB;
   2075 		}
   2076 
   2077 		source.unlockRect();
   2078 		destination.unlockRect();
   2079 	}
   2080 
   2081 	void Surface::decodeA1R5G5B5(Buffer &destination, Buffer &source)
   2082 	{
   2083 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
   2084 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
   2085 
   2086 		int depth = min(destination.depth, source.depth);
   2087 		int height = min(destination.height, source.height);
   2088 		int width = min(destination.width, source.width);
   2089 
   2090 		for(int z = 0; z < depth; z++)
   2091 		{
   2092 			unsigned char *sourceRow = sourceSlice;
   2093 			unsigned char *destinationRow = destinationSlice;
   2094 
   2095 			for(int y = 0; y < height; y++)
   2096 			{
   2097 				unsigned char *sourceElement = sourceRow;
   2098 				unsigned char *destinationElement = destinationRow;
   2099 
   2100 				for(int x = 0; x < width; x++)
   2101 				{
   2102 					unsigned int argb = *(unsigned short*)sourceElement;
   2103 
   2104 					unsigned int a =   (argb & 0x8000) * 130560;
   2105 					unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
   2106 					unsigned int g = (((argb & 0x03E0) * 16846  + 0x8000) >> 8) & 0x0000FF00;
   2107 					unsigned int b = (((argb & 0x001F) * 2106   + 0x80) >> 8);
   2108 
   2109 					*(unsigned int*)destinationElement = a | r | g | b;
   2110 
   2111 					sourceElement += source.bytes;
   2112 					destinationElement += destination.bytes;
   2113 				}
   2114 
   2115 				sourceRow += source.pitchB;
   2116 				destinationRow += destination.pitchB;
   2117 			}
   2118 
   2119 			sourceSlice += source.sliceB;
   2120 			destinationSlice += destination.sliceB;
   2121 		}
   2122 
   2123 		source.unlockRect();
   2124 		destination.unlockRect();
   2125 	}
   2126 
   2127 	void Surface::decodeX4R4G4B4(Buffer &destination, Buffer &source)
   2128 	{
   2129 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
   2130 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
   2131 
   2132 		int depth = min(destination.depth, source.depth);
   2133 		int height = min(destination.height, source.height);
   2134 		int width = min(destination.width, source.width);
   2135 
   2136 		for(int z = 0; z < depth; z++)
   2137 		{
   2138 			unsigned char *sourceRow = sourceSlice;
   2139 			unsigned char *destinationRow = destinationSlice;
   2140 
   2141 			for(int y = 0; y < height; y++)
   2142 			{
   2143 				unsigned char *sourceElement = sourceRow;
   2144 				unsigned char *destinationElement = destinationRow;
   2145 
   2146 				for(int x = 0; x < width; x++)
   2147 				{
   2148 					unsigned int xrgb = *(unsigned short*)sourceElement;
   2149 
   2150 					unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000;
   2151 					unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00;
   2152 					unsigned int b =  (xrgb & 0x000F) * 0x00000011;
   2153 
   2154 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
   2155 
   2156 					sourceElement += source.bytes;
   2157 					destinationElement += destination.bytes;
   2158 				}
   2159 
   2160 				sourceRow += source.pitchB;
   2161 				destinationRow += destination.pitchB;
   2162 			}
   2163 
   2164 			sourceSlice += source.sliceB;
   2165 			destinationSlice += destination.sliceB;
   2166 		}
   2167 
   2168 		source.unlockRect();
   2169 		destination.unlockRect();
   2170 	}
   2171 
   2172 	void Surface::decodeA4R4G4B4(Buffer &destination, Buffer &source)
   2173 	{
   2174 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
   2175 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
   2176 
   2177 		int depth = min(destination.depth, source.depth);
   2178 		int height = min(destination.height, source.height);
   2179 		int width = min(destination.width, source.width);
   2180 
   2181 		for(int z = 0; z < depth; z++)
   2182 		{
   2183 			unsigned char *sourceRow = sourceSlice;
   2184 			unsigned char *destinationRow = destinationSlice;
   2185 
   2186 			for(int y = 0; y < height; y++)
   2187 			{
   2188 				unsigned char *sourceElement = sourceRow;
   2189 				unsigned char *destinationElement = destinationRow;
   2190 
   2191 				for(int x = 0; x < width; x++)
   2192 				{
   2193 					unsigned int argb = *(unsigned short*)sourceElement;
   2194 
   2195 					unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000;
   2196 					unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000;
   2197 					unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00;
   2198 					unsigned int b =  (argb & 0x000F) * 0x00000011;
   2199 
   2200 					*(unsigned int*)destinationElement = a | r | g | b;
   2201 
   2202 					sourceElement += source.bytes;
   2203 					destinationElement += destination.bytes;
   2204 				}
   2205 
   2206 				sourceRow += source.pitchB;
   2207 				destinationRow += destination.pitchB;
   2208 			}
   2209 
   2210 			sourceSlice += source.sliceB;
   2211 			destinationSlice += destination.sliceB;
   2212 		}
   2213 
   2214 		source.unlockRect();
   2215 		destination.unlockRect();
   2216 	}
   2217 
   2218 	void Surface::decodeP8(Buffer &destination, Buffer &source)
   2219 	{
   2220 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
   2221 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
   2222 
   2223 		int depth = min(destination.depth, source.depth);
   2224 		int height = min(destination.height, source.height);
   2225 		int width = min(destination.width, source.width);
   2226 
   2227 		for(int z = 0; z < depth; z++)
   2228 		{
   2229 			unsigned char *sourceRow = sourceSlice;
   2230 			unsigned char *destinationRow = destinationSlice;
   2231 
   2232 			for(int y = 0; y < height; y++)
   2233 			{
   2234 				unsigned char *sourceElement = sourceRow;
   2235 				unsigned char *destinationElement = destinationRow;
   2236 
   2237 				for(int x = 0; x < width; x++)
   2238 				{
   2239 					unsigned int abgr = palette[*(unsigned char*)sourceElement];
   2240 
   2241 					unsigned int r = (abgr & 0x000000FF) << 16;
   2242 					unsigned int g = (abgr & 0x0000FF00) << 0;
   2243 					unsigned int b = (abgr & 0x00FF0000) >> 16;
   2244 					unsigned int a = (abgr & 0xFF000000) >> 0;
   2245 
   2246 					*(unsigned int*)destinationElement = a | r | g | b;
   2247 
   2248 					sourceElement += source.bytes;
   2249 					destinationElement += destination.bytes;
   2250 				}
   2251 
   2252 				sourceRow += source.pitchB;
   2253 				destinationRow += destination.pitchB;
   2254 			}
   2255 
   2256 			sourceSlice += source.sliceB;
   2257 			destinationSlice += destination.sliceB;
   2258 		}
   2259 
   2260 		source.unlockRect();
   2261 		destination.unlockRect();
   2262 	}
   2263 
   2264 	void Surface::decodeDXT1(Buffer &internal, Buffer &external)
   2265 	{
   2266 		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
   2267 		const DXT1 *source = (const DXT1*)external.lockRect(0, 0, 0, LOCK_READONLY);
   2268 
   2269 		for(int z = 0; z < external.depth; z++)
   2270 		{
   2271 			unsigned int *dest = destSlice;
   2272 
   2273 			for(int y = 0; y < external.height; y += 4)
   2274 			{
   2275 				for(int x = 0; x < external.width; x += 4)
   2276 				{
   2277 					Color<byte> c[4];
   2278 
   2279 					c[0] = source->c0;
   2280 					c[1] = source->c1;
   2281 
   2282 					if(source->c0 > source->c1)   // No transparency
   2283 					{
   2284 						// c2 = 2 / 3 * c0 + 1 / 3 * c1
   2285 						c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
   2286 						c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
   2287 						c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
   2288 						c[2].a = 0xFF;
   2289 
   2290 						// c3 = 1 / 3 * c0 + 2 / 3 * c1
   2291 						c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
   2292 						c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
   2293 						c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
   2294 						c[3].a = 0xFF;
   2295 					}
   2296 					else   // c3 transparent
   2297 					{
   2298 						// c2 = 1 / 2 * c0 + 1 / 2 * c1
   2299 						c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
   2300 						c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
   2301 						c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
   2302 						c[2].a = 0xFF;
   2303 
   2304 						c[3].r = 0;
   2305 						c[3].g = 0;
   2306 						c[3].b = 0;
   2307 						c[3].a = 0;
   2308 					}
   2309 
   2310 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2311 					{
   2312 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2313 						{
   2314 							dest[(x + i) + (y + j) * internal.pitchP] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
   2315 						}
   2316 					}
   2317 
   2318 					source++;
   2319 				}
   2320 			}
   2321 
   2322 			(byte*&)destSlice += internal.sliceB;
   2323 		}
   2324 
   2325 		external.unlockRect();
   2326 		internal.unlockRect();
   2327 	}
   2328 
   2329 	void Surface::decodeDXT3(Buffer &internal, Buffer &external)
   2330 	{
   2331 		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
   2332 		const DXT3 *source = (const DXT3*)external.lockRect(0, 0, 0, LOCK_READONLY);
   2333 
   2334 		for(int z = 0; z < external.depth; z++)
   2335 		{
   2336 			unsigned int *dest = destSlice;
   2337 
   2338 			for(int y = 0; y < external.height; y += 4)
   2339 			{
   2340 				for(int x = 0; x < external.width; x += 4)
   2341 				{
   2342 					Color<byte> c[4];
   2343 
   2344 					c[0] = source->c0;
   2345 					c[1] = source->c1;
   2346 
   2347 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
   2348 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
   2349 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
   2350 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
   2351 
   2352 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
   2353 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
   2354 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
   2355 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
   2356 
   2357 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2358 					{
   2359 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2360 						{
   2361 							unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
   2362 							unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));
   2363 
   2364 							dest[(x + i) + (y + j) * internal.pitchP] = color;
   2365 						}
   2366 					}
   2367 
   2368 					source++;
   2369 				}
   2370 			}
   2371 
   2372 			(byte*&)destSlice += internal.sliceB;
   2373 		}
   2374 
   2375 		external.unlockRect();
   2376 		internal.unlockRect();
   2377 	}
   2378 
   2379 	void Surface::decodeDXT5(Buffer &internal, Buffer &external)
   2380 	{
   2381 		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
   2382 		const DXT5 *source = (const DXT5*)external.lockRect(0, 0, 0, LOCK_READONLY);
   2383 
   2384 		for(int z = 0; z < external.depth; z++)
   2385 		{
   2386 			unsigned int *dest = destSlice;
   2387 
   2388 			for(int y = 0; y < external.height; y += 4)
   2389 			{
   2390 				for(int x = 0; x < external.width; x += 4)
   2391 				{
   2392 					Color<byte> c[4];
   2393 
   2394 					c[0] = source->c0;
   2395 					c[1] = source->c1;
   2396 
   2397 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
   2398 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
   2399 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
   2400 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
   2401 
   2402 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
   2403 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
   2404 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
   2405 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
   2406 
   2407 					byte a[8];
   2408 
   2409 					a[0] = source->a0;
   2410 					a[1] = source->a1;
   2411 
   2412 					if(a[0] > a[1])
   2413 					{
   2414 						a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
   2415 						a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
   2416 						a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
   2417 						a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
   2418 						a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
   2419 						a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
   2420 					}
   2421 					else
   2422 					{
   2423 						a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
   2424 						a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
   2425 						a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
   2426 						a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
   2427 						a[6] = 0;
   2428 						a[7] = 0xFF;
   2429 					}
   2430 
   2431 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2432 					{
   2433 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2434 						{
   2435 							unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
   2436 							unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;
   2437 
   2438 							dest[(x + i) + (y + j) * internal.pitchP] = color;
   2439 						}
   2440 					}
   2441 
   2442 					source++;
   2443 				}
   2444 			}
   2445 
   2446 			(byte*&)destSlice += internal.sliceB;
   2447 		}
   2448 
   2449 		external.unlockRect();
   2450 		internal.unlockRect();
   2451 	}
   2452 
   2453 	void Surface::decodeATI1(Buffer &internal, Buffer &external)
   2454 	{
   2455 		byte *destSlice = (byte*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
   2456 		const ATI1 *source = (const ATI1*)external.lockRect(0, 0, 0, LOCK_READONLY);
   2457 
   2458 		for(int z = 0; z < external.depth; z++)
   2459 		{
   2460 			byte *dest = destSlice;
   2461 
   2462 			for(int y = 0; y < external.height; y += 4)
   2463 			{
   2464 				for(int x = 0; x < external.width; x += 4)
   2465 				{
   2466 					byte r[8];
   2467 
   2468 					r[0] = source->r0;
   2469 					r[1] = source->r1;
   2470 
   2471 					if(r[0] > r[1])
   2472 					{
   2473 						r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
   2474 						r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
   2475 						r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
   2476 						r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
   2477 						r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
   2478 						r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
   2479 					}
   2480 					else
   2481 					{
   2482 						r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
   2483 						r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
   2484 						r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
   2485 						r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
   2486 						r[6] = 0;
   2487 						r[7] = 0xFF;
   2488 					}
   2489 
   2490 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2491 					{
   2492 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2493 						{
   2494 							dest[(x + i) + (y + j) * internal.pitchP] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
   2495 						}
   2496 					}
   2497 
   2498 					source++;
   2499 				}
   2500 			}
   2501 
   2502 			destSlice += internal.sliceB;
   2503 		}
   2504 
   2505 		external.unlockRect();
   2506 		internal.unlockRect();
   2507 	}
   2508 
   2509 	void Surface::decodeATI2(Buffer &internal, Buffer &external)
   2510 	{
   2511 		word *destSlice = (word*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
   2512 		const ATI2 *source = (const ATI2*)external.lockRect(0, 0, 0, LOCK_READONLY);
   2513 
   2514 		for(int z = 0; z < external.depth; z++)
   2515 		{
   2516 			word *dest = destSlice;
   2517 
   2518 			for(int y = 0; y < external.height; y += 4)
   2519 			{
   2520 				for(int x = 0; x < external.width; x += 4)
   2521 				{
   2522 					byte X[8];
   2523 
   2524 					X[0] = source->x0;
   2525 					X[1] = source->x1;
   2526 
   2527 					if(X[0] > X[1])
   2528 					{
   2529 						X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
   2530 						X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
   2531 						X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
   2532 						X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
   2533 						X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
   2534 						X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
   2535 					}
   2536 					else
   2537 					{
   2538 						X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
   2539 						X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
   2540 						X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
   2541 						X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
   2542 						X[6] = 0;
   2543 						X[7] = 0xFF;
   2544 					}
   2545 
   2546 					byte Y[8];
   2547 
   2548 					Y[0] = source->y0;
   2549 					Y[1] = source->y1;
   2550 
   2551 					if(Y[0] > Y[1])
   2552 					{
   2553 						Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
   2554 						Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
   2555 						Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
   2556 						Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
   2557 						Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
   2558 						Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
   2559 					}
   2560 					else
   2561 					{
   2562 						Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
   2563 						Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
   2564 						Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
   2565 						Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
   2566 						Y[6] = 0;
   2567 						Y[7] = 0xFF;
   2568 					}
   2569 
   2570 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2571 					{
   2572 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2573 						{
   2574 							word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
   2575 							word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];
   2576 
   2577 							dest[(x + i) + (y + j) * internal.pitchP] = (g << 8) + r;
   2578 						}
   2579 					}
   2580 
   2581 					source++;
   2582 				}
   2583 			}
   2584 
   2585 			(byte*&)destSlice += internal.sliceB;
   2586 		}
   2587 
   2588 		external.unlockRect();
   2589 		internal.unlockRect();
   2590 	}
   2591 
   2592 	void Surface::decodeETC2(Buffer &internal, Buffer &external, int nbAlphaBits, bool isSRGB)
   2593 	{
   2594 		ETC_Decoder::Decode((const byte*)external.lockRect(0, 0, 0, LOCK_READONLY), (byte*)internal.lockRect(0, 0, 0, LOCK_UPDATE), external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
   2595 		                    (nbAlphaBits == 8) ? ETC_Decoder::ETC_RGBA : ((nbAlphaBits == 1) ? ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA : ETC_Decoder::ETC_RGB));
   2596 		external.unlockRect();
   2597 		internal.unlockRect();
   2598 
   2599 		if(isSRGB)
   2600 		{
   2601 			static byte sRGBtoLinearTable[256];
   2602 			static bool sRGBtoLinearTableDirty = true;
   2603 			if(sRGBtoLinearTableDirty)
   2604 			{
   2605 				for(int i = 0; i < 256; i++)
   2606 				{
   2607 					sRGBtoLinearTable[i] = static_cast<byte>(sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
   2608 				}
   2609 				sRGBtoLinearTableDirty = false;
   2610 			}
   2611 
   2612 			// Perform sRGB conversion in place after decoding
   2613 			byte *src = (byte*)internal.lockRect(0, 0, 0, LOCK_READWRITE);
   2614 			for(int y = 0; y < internal.height; y++)
   2615 			{
   2616 				byte *srcRow = src + y * internal.pitchB;
   2617 				for(int x = 0; x <  internal.width; x++)
   2618 				{
   2619 					byte *srcPix = srcRow + x * internal.bytes;
   2620 					for(int i = 0; i < 3; i++)
   2621 					{
   2622 						srcPix[i] = sRGBtoLinearTable[srcPix[i]];
   2623 					}
   2624 				}
   2625 			}
   2626 			internal.unlockRect();
   2627 		}
   2628 	}
   2629 
   2630 	void Surface::decodeEAC(Buffer &internal, Buffer &external, int nbChannels, bool isSigned)
   2631 	{
   2632 		ASSERT(nbChannels == 1 || nbChannels == 2);
   2633 
   2634 		byte *src = (byte*)internal.lockRect(0, 0, 0, LOCK_READWRITE);
   2635 		ETC_Decoder::Decode((const byte*)external.lockRect(0, 0, 0, LOCK_READONLY), src, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
   2636 		                    (nbChannels == 1) ? (isSigned ? ETC_Decoder::ETC_R_SIGNED : ETC_Decoder::ETC_R_UNSIGNED) : (isSigned ? ETC_Decoder::ETC_RG_SIGNED : ETC_Decoder::ETC_RG_UNSIGNED));
   2637 		external.unlockRect();
   2638 
   2639 		// FIXME: We convert EAC data to float, until signed short internal formats are supported
   2640 		//        This code can be removed if ETC2 images are decoded to internal 16 bit signed R/RG formats
   2641 		const float normalization = isSigned ? (1.0f / (8.0f * 127.875f)) : (1.0f / (8.0f * 255.875f));
   2642 		for(int y = 0; y < internal.height; y++)
   2643 		{
   2644 			byte* srcRow = src + y * internal.pitchB;
   2645 			for(int x = internal.width - 1; x >= 0; x--)
   2646 			{
   2647 				int* srcPix = reinterpret_cast<int*>(srcRow + x * internal.bytes);
   2648 				float* dstPix = reinterpret_cast<float*>(srcPix);
   2649 				for(int c = nbChannels - 1; c >= 0; c--)
   2650 				{
   2651 					dstPix[c] = clamp(static_cast<float>(srcPix[c]) * normalization, -1.0f, 1.0f);
   2652 				}
   2653 			}
   2654 		}
   2655 
   2656 		internal.unlockRect();
   2657 	}
   2658 
   2659 	void Surface::decodeASTC(Buffer &internal, Buffer &external, int xBlockSize, int yBlockSize, int zBlockSize, bool isSRGB)
   2660 	{
   2661 	}
   2662 
   2663 	size_t Surface::size(int width, int height, int depth, int border, int samples, Format format)
   2664 	{
   2665 		samples = max(1, samples);
   2666 
   2667 		switch(format)
   2668 		{
   2669 		default:
   2670 			{
   2671 				uint64_t size = (uint64_t)sliceB(width, height, border, format, true) * depth * samples;
   2672 
   2673 				// FIXME: Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
   2674 				// and stencil operations also read 8 bytes per four 8-bit stencil values,
   2675 				// so we have to allocate 4 extra bytes to avoid buffer overruns.
   2676 				size += 4;
   2677 
   2678 				// We can only sample buffers smaller than 2 GiB.
   2679 				// Force an out-of-memory if larger, or let the caller report an error.
   2680 				return size < 0x80000000u ? (size_t)size : std::numeric_limits<size_t>::max();
   2681 			}
   2682 		case FORMAT_YV12_BT601:
   2683 		case FORMAT_YV12_BT709:
   2684 		case FORMAT_YV12_JFIF:
   2685 			{
   2686 				width += 2 * border;
   2687 				height += 2 * border;
   2688 
   2689 				size_t YStride = align<16>(width);
   2690 				size_t YSize = YStride * height;
   2691 				size_t CStride = align<16>(YStride / 2);
   2692 				size_t CSize = CStride * height / 2;
   2693 
   2694 				return YSize + 2 * CSize;
   2695 			}
   2696 		}
   2697 	}
   2698 
   2699 	bool Surface::isStencil(Format format)
   2700 	{
   2701 		switch(format)
   2702 		{
   2703 		case FORMAT_D32:
   2704 		case FORMAT_D16:
   2705 		case FORMAT_D24X8:
   2706 		case FORMAT_D32F:
   2707 		case FORMAT_D32F_COMPLEMENTARY:
   2708 		case FORMAT_D32F_LOCKABLE:
   2709 		case FORMAT_D32F_SHADOW:
   2710 			return false;
   2711 		case FORMAT_D24S8:
   2712 		case FORMAT_D24FS8:
   2713 		case FORMAT_S8:
   2714 		case FORMAT_DF24S8:
   2715 		case FORMAT_DF16S8:
   2716 		case FORMAT_D32FS8_TEXTURE:
   2717 		case FORMAT_D32FS8_SHADOW:
   2718 		case FORMAT_D32FS8:
   2719 		case FORMAT_D32FS8_COMPLEMENTARY:
   2720 		case FORMAT_INTZ:
   2721 			return true;
   2722 		default:
   2723 			return false;
   2724 		}
   2725 	}
   2726 
   2727 	bool Surface::isDepth(Format format)
   2728 	{
   2729 		switch(format)
   2730 		{
   2731 		case FORMAT_D32:
   2732 		case FORMAT_D16:
   2733 		case FORMAT_D24X8:
   2734 		case FORMAT_D24S8:
   2735 		case FORMAT_D24FS8:
   2736 		case FORMAT_D32F:
   2737 		case FORMAT_D32FS8:
   2738 		case FORMAT_D32F_COMPLEMENTARY:
   2739 		case FORMAT_D32FS8_COMPLEMENTARY:
   2740 		case FORMAT_D32F_LOCKABLE:
   2741 		case FORMAT_DF24S8:
   2742 		case FORMAT_DF16S8:
   2743 		case FORMAT_D32FS8_TEXTURE:
   2744 		case FORMAT_D32F_SHADOW:
   2745 		case FORMAT_D32FS8_SHADOW:
   2746 		case FORMAT_INTZ:
   2747 			return true;
   2748 		case FORMAT_S8:
   2749 			return false;
   2750 		default:
   2751 			return false;
   2752 		}
   2753 	}
   2754 
   2755 	bool Surface::hasQuadLayout(Format format)
   2756 	{
   2757 		switch(format)
   2758 		{
   2759 		case FORMAT_D32:
   2760 		case FORMAT_D16:
   2761 		case FORMAT_D24X8:
   2762 		case FORMAT_D24S8:
   2763 		case FORMAT_D24FS8:
   2764 		case FORMAT_D32F:
   2765 		case FORMAT_D32FS8:
   2766 		case FORMAT_D32F_COMPLEMENTARY:
   2767 		case FORMAT_D32FS8_COMPLEMENTARY:
   2768 		case FORMAT_DF24S8:
   2769 		case FORMAT_DF16S8:
   2770 		case FORMAT_INTZ:
   2771 		case FORMAT_S8:
   2772 		case FORMAT_A8G8R8B8Q:
   2773 		case FORMAT_X8G8R8B8Q:
   2774 			return true;
   2775 		case FORMAT_D32F_LOCKABLE:
   2776 		case FORMAT_D32FS8_TEXTURE:
   2777 		case FORMAT_D32F_SHADOW:
   2778 		case FORMAT_D32FS8_SHADOW:
   2779 		default:
   2780 			break;
   2781 		}
   2782 
   2783 		return false;
   2784 	}
   2785 
   2786 	bool Surface::isPalette(Format format)
   2787 	{
   2788 		switch(format)
   2789 		{
   2790 		case FORMAT_P8:
   2791 		case FORMAT_A8P8:
   2792 			return true;
   2793 		default:
   2794 			return false;
   2795 		}
   2796 	}
   2797 
   2798 	bool Surface::isFloatFormat(Format format)
   2799 	{
   2800 		switch(format)
   2801 		{
   2802 		case FORMAT_R5G6B5:
   2803 		case FORMAT_R8G8B8:
   2804 		case FORMAT_B8G8R8:
   2805 		case FORMAT_X8R8G8B8:
   2806 		case FORMAT_X8B8G8R8I:
   2807 		case FORMAT_X8B8G8R8:
   2808 		case FORMAT_A8R8G8B8:
   2809 		case FORMAT_SRGB8_X8:
   2810 		case FORMAT_SRGB8_A8:
   2811 		case FORMAT_A8B8G8R8I:
   2812 		case FORMAT_R8UI:
   2813 		case FORMAT_G8R8UI:
   2814 		case FORMAT_X8B8G8R8UI:
   2815 		case FORMAT_A8B8G8R8UI:
   2816 		case FORMAT_A8B8G8R8:
   2817 		case FORMAT_G8R8I:
   2818 		case FORMAT_G8R8:
   2819 		case FORMAT_A2B10G10R10:
   2820 		case FORMAT_A2B10G10R10UI:
   2821 		case FORMAT_R8_SNORM:
   2822 		case FORMAT_G8R8_SNORM:
   2823 		case FORMAT_X8B8G8R8_SNORM:
   2824 		case FORMAT_A8B8G8R8_SNORM:
   2825 		case FORMAT_R16I:
   2826 		case FORMAT_R16UI:
   2827 		case FORMAT_G16R16I:
   2828 		case FORMAT_G16R16UI:
   2829 		case FORMAT_G16R16:
   2830 		case FORMAT_X16B16G16R16I:
   2831 		case FORMAT_X16B16G16R16UI:
   2832 		case FORMAT_A16B16G16R16I:
   2833 		case FORMAT_A16B16G16R16UI:
   2834 		case FORMAT_A16B16G16R16:
   2835 		case FORMAT_V8U8:
   2836 		case FORMAT_Q8W8V8U8:
   2837 		case FORMAT_X8L8V8U8:
   2838 		case FORMAT_V16U16:
   2839 		case FORMAT_A16W16V16U16:
   2840 		case FORMAT_Q16W16V16U16:
   2841 		case FORMAT_A8:
   2842 		case FORMAT_R8I:
   2843 		case FORMAT_R8:
   2844 		case FORMAT_S8:
   2845 		case FORMAT_L8:
   2846 		case FORMAT_L16:
   2847 		case FORMAT_A8L8:
   2848 		case FORMAT_YV12_BT601:
   2849 		case FORMAT_YV12_BT709:
   2850 		case FORMAT_YV12_JFIF:
   2851 		case FORMAT_R32I:
   2852 		case FORMAT_R32UI:
   2853 		case FORMAT_G32R32I:
   2854 		case FORMAT_G32R32UI:
   2855 		case FORMAT_X32B32G32R32I:
   2856 		case FORMAT_X32B32G32R32UI:
   2857 		case FORMAT_A32B32G32R32I:
   2858 		case FORMAT_A32B32G32R32UI:
   2859 			return false;
   2860 		case FORMAT_R16F:
   2861 		case FORMAT_G16R16F:
   2862 		case FORMAT_B16G16R16F:
   2863 		case FORMAT_X16B16G16R16F:
   2864 		case FORMAT_A16B16G16R16F:
   2865 		case FORMAT_X16B16G16R16F_UNSIGNED:
   2866 		case FORMAT_R32F:
   2867 		case FORMAT_G32R32F:
   2868 		case FORMAT_B32G32R32F:
   2869 		case FORMAT_X32B32G32R32F:
   2870 		case FORMAT_A32B32G32R32F:
   2871 		case FORMAT_X32B32G32R32F_UNSIGNED:
   2872 		case FORMAT_D32F:
   2873 		case FORMAT_D32FS8:
   2874 		case FORMAT_D32F_COMPLEMENTARY:
   2875 		case FORMAT_D32FS8_COMPLEMENTARY:
   2876 		case FORMAT_D32F_LOCKABLE:
   2877 		case FORMAT_D32FS8_TEXTURE:
   2878 		case FORMAT_D32F_SHADOW:
   2879 		case FORMAT_D32FS8_SHADOW:
   2880 		case FORMAT_L16F:
   2881 		case FORMAT_A16L16F:
   2882 		case FORMAT_L32F:
   2883 		case FORMAT_A32L32F:
   2884 			return true;
   2885 		default:
   2886 			ASSERT(false);
   2887 		}
   2888 
   2889 		return false;
   2890 	}
   2891 
   2892 	bool Surface::isUnsignedComponent(Format format, int component)
   2893 	{
   2894 		switch(format)
   2895 		{
   2896 		case FORMAT_NULL:
   2897 		case FORMAT_R5G6B5:
   2898 		case FORMAT_R8G8B8:
   2899 		case FORMAT_B8G8R8:
   2900 		case FORMAT_X8R8G8B8:
   2901 		case FORMAT_X8B8G8R8:
   2902 		case FORMAT_A8R8G8B8:
   2903 		case FORMAT_A8B8G8R8:
   2904 		case FORMAT_SRGB8_X8:
   2905 		case FORMAT_SRGB8_A8:
   2906 		case FORMAT_G8R8:
   2907 		case FORMAT_A2B10G10R10:
   2908 		case FORMAT_A2B10G10R10UI:
   2909 		case FORMAT_R16UI:
   2910 		case FORMAT_G16R16:
   2911 		case FORMAT_G16R16UI:
   2912 		case FORMAT_X16B16G16R16UI:
   2913 		case FORMAT_A16B16G16R16:
   2914 		case FORMAT_A16B16G16R16UI:
   2915 		case FORMAT_R32UI:
   2916 		case FORMAT_G32R32UI:
   2917 		case FORMAT_X32B32G32R32UI:
   2918 		case FORMAT_A32B32G32R32UI:
   2919 		case FORMAT_X32B32G32R32F_UNSIGNED:
   2920 		case FORMAT_R8UI:
   2921 		case FORMAT_G8R8UI:
   2922 		case FORMAT_X8B8G8R8UI:
   2923 		case FORMAT_A8B8G8R8UI:
   2924 		case FORMAT_D32F:
   2925 		case FORMAT_D32FS8:
   2926 		case FORMAT_D32F_COMPLEMENTARY:
   2927 		case FORMAT_D32FS8_COMPLEMENTARY:
   2928 		case FORMAT_D32F_LOCKABLE:
   2929 		case FORMAT_D32FS8_TEXTURE:
   2930 		case FORMAT_D32F_SHADOW:
   2931 		case FORMAT_D32FS8_SHADOW:
   2932 		case FORMAT_A8:
   2933 		case FORMAT_R8:
   2934 		case FORMAT_L8:
   2935 		case FORMAT_L16:
   2936 		case FORMAT_A8L8:
   2937 		case FORMAT_YV12_BT601:
   2938 		case FORMAT_YV12_BT709:
   2939 		case FORMAT_YV12_JFIF:
   2940 			return true;
   2941 		case FORMAT_A8B8G8R8I:
   2942 		case FORMAT_A16B16G16R16I:
   2943 		case FORMAT_A32B32G32R32I:
   2944 		case FORMAT_A8B8G8R8_SNORM:
   2945 		case FORMAT_Q8W8V8U8:
   2946 		case FORMAT_Q16W16V16U16:
   2947 		case FORMAT_A32B32G32R32F:
   2948 			return false;
   2949 		case FORMAT_R32F:
   2950 		case FORMAT_R8I:
   2951 		case FORMAT_R16I:
   2952 		case FORMAT_R32I:
   2953 		case FORMAT_R8_SNORM:
   2954 			return component >= 1;
   2955 		case FORMAT_V8U8:
   2956 		case FORMAT_X8L8V8U8:
   2957 		case FORMAT_V16U16:
   2958 		case FORMAT_G32R32F:
   2959 		case FORMAT_G8R8I:
   2960 		case FORMAT_G16R16I:
   2961 		case FORMAT_G32R32I:
   2962 		case FORMAT_G8R8_SNORM:
   2963 			return component >= 2;
   2964 		case FORMAT_A16W16V16U16:
   2965 		case FORMAT_B32G32R32F:
   2966 		case FORMAT_X32B32G32R32F:
   2967 		case FORMAT_X8B8G8R8I:
   2968 		case FORMAT_X16B16G16R16I:
   2969 		case FORMAT_X32B32G32R32I:
   2970 		case FORMAT_X8B8G8R8_SNORM:
   2971 			return component >= 3;
   2972 		default:
   2973 			ASSERT(false);
   2974 		}
   2975 
   2976 		return false;
   2977 	}
   2978 
   2979 	bool Surface::isSRGBreadable(Format format)
   2980 	{
   2981 		// Keep in sync with Capabilities::isSRGBreadable
   2982 		switch(format)
   2983 		{
   2984 		case FORMAT_L8:
   2985 		case FORMAT_A8L8:
   2986 		case FORMAT_R8G8B8:
   2987 		case FORMAT_A8R8G8B8:
   2988 		case FORMAT_X8R8G8B8:
   2989 		case FORMAT_A8B8G8R8:
   2990 		case FORMAT_X8B8G8R8:
   2991 		case FORMAT_SRGB8_X8:
   2992 		case FORMAT_SRGB8_A8:
   2993 		case FORMAT_R5G6B5:
   2994 		case FORMAT_X1R5G5B5:
   2995 		case FORMAT_A1R5G5B5:
   2996 		case FORMAT_A4R4G4B4:
   2997 		case FORMAT_DXT1:
   2998 		case FORMAT_DXT3:
   2999 		case FORMAT_DXT5:
   3000 		case FORMAT_ATI1:
   3001 		case FORMAT_ATI2:
   3002 			return true;
   3003 		default:
   3004 			return false;
   3005 		}
   3006 	}
   3007 
   3008 	bool Surface::isSRGBwritable(Format format)
   3009 	{
   3010 		// Keep in sync with Capabilities::isSRGBwritable
   3011 		switch(format)
   3012 		{
   3013 		case FORMAT_NULL:
   3014 		case FORMAT_A8R8G8B8:
   3015 		case FORMAT_X8R8G8B8:
   3016 		case FORMAT_A8B8G8R8:
   3017 		case FORMAT_X8B8G8R8:
   3018 		case FORMAT_SRGB8_X8:
   3019 		case FORMAT_SRGB8_A8:
   3020 		case FORMAT_R5G6B5:
   3021 			return true;
   3022 		default:
   3023 			return false;
   3024 		}
   3025 	}
   3026 
   3027 	bool Surface::isSRGBformat(Format format)
   3028 	{
   3029 		switch(format)
   3030 		{
   3031 		case FORMAT_SRGB8_X8:
   3032 		case FORMAT_SRGB8_A8:
   3033 			return true;
   3034 		default:
   3035 			return false;
   3036 		}
   3037 	}
   3038 
   3039 	bool Surface::isCompressed(Format format)
   3040 	{
   3041 		switch(format)
   3042 		{
   3043 		case FORMAT_DXT1:
   3044 		case FORMAT_DXT3:
   3045 		case FORMAT_DXT5:
   3046 		case FORMAT_ATI1:
   3047 		case FORMAT_ATI2:
   3048 		case FORMAT_ETC1:
   3049 		case FORMAT_R11_EAC:
   3050 		case FORMAT_SIGNED_R11_EAC:
   3051 		case FORMAT_RG11_EAC:
   3052 		case FORMAT_SIGNED_RG11_EAC:
   3053 		case FORMAT_RGB8_ETC2:
   3054 		case FORMAT_SRGB8_ETC2:
   3055 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   3056 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   3057 		case FORMAT_RGBA8_ETC2_EAC:
   3058 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   3059 		case FORMAT_RGBA_ASTC_4x4_KHR:
   3060 		case FORMAT_RGBA_ASTC_5x4_KHR:
   3061 		case FORMAT_RGBA_ASTC_5x5_KHR:
   3062 		case FORMAT_RGBA_ASTC_6x5_KHR:
   3063 		case FORMAT_RGBA_ASTC_6x6_KHR:
   3064 		case FORMAT_RGBA_ASTC_8x5_KHR:
   3065 		case FORMAT_RGBA_ASTC_8x6_KHR:
   3066 		case FORMAT_RGBA_ASTC_8x8_KHR:
   3067 		case FORMAT_RGBA_ASTC_10x5_KHR:
   3068 		case FORMAT_RGBA_ASTC_10x6_KHR:
   3069 		case FORMAT_RGBA_ASTC_10x8_KHR:
   3070 		case FORMAT_RGBA_ASTC_10x10_KHR:
   3071 		case FORMAT_RGBA_ASTC_12x10_KHR:
   3072 		case FORMAT_RGBA_ASTC_12x12_KHR:
   3073 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   3074 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   3075 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   3076 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   3077 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   3078 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   3079 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   3080 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   3081 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   3082 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   3083 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   3084 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   3085 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   3086 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   3087 			return true;
   3088 		default:
   3089 			return false;
   3090 		}
   3091 	}
   3092 
   3093 	bool Surface::isSignedNonNormalizedInteger(Format format)
   3094 	{
   3095 		switch(format)
   3096 		{
   3097 		case FORMAT_A8B8G8R8I:
   3098 		case FORMAT_X8B8G8R8I:
   3099 		case FORMAT_G8R8I:
   3100 		case FORMAT_R8I:
   3101 		case FORMAT_A16B16G16R16I:
   3102 		case FORMAT_X16B16G16R16I:
   3103 		case FORMAT_G16R16I:
   3104 		case FORMAT_R16I:
   3105 		case FORMAT_A32B32G32R32I:
   3106 		case FORMAT_X32B32G32R32I:
   3107 		case FORMAT_G32R32I:
   3108 		case FORMAT_R32I:
   3109 			return true;
   3110 		default:
   3111 			return false;
   3112 		}
   3113 	}
   3114 
   3115 	bool Surface::isUnsignedNonNormalizedInteger(Format format)
   3116 	{
   3117 		switch(format)
   3118 		{
   3119 		case FORMAT_A8B8G8R8UI:
   3120 		case FORMAT_X8B8G8R8UI:
   3121 		case FORMAT_G8R8UI:
   3122 		case FORMAT_R8UI:
   3123 		case FORMAT_A16B16G16R16UI:
   3124 		case FORMAT_X16B16G16R16UI:
   3125 		case FORMAT_G16R16UI:
   3126 		case FORMAT_R16UI:
   3127 		case FORMAT_A32B32G32R32UI:
   3128 		case FORMAT_X32B32G32R32UI:
   3129 		case FORMAT_G32R32UI:
   3130 		case FORMAT_R32UI:
   3131 			return true;
   3132 		default:
   3133 			return false;
   3134 		}
   3135 	}
   3136 
   3137 	bool Surface::isNonNormalizedInteger(Format format)
   3138 	{
   3139 		return isSignedNonNormalizedInteger(format) ||
   3140 		       isUnsignedNonNormalizedInteger(format);
   3141 	}
   3142 
   3143 	bool Surface::isNormalizedInteger(Format format)
   3144 	{
   3145 		return !isFloatFormat(format) &&
   3146 		       !isNonNormalizedInteger(format) &&
   3147 		       !isCompressed(format) &&
   3148 		       !isDepth(format) &&
   3149 		       !isStencil(format);
   3150 	}
   3151 
   3152 	int Surface::componentCount(Format format)
   3153 	{
   3154 		switch(format)
   3155 		{
   3156 		case FORMAT_R5G6B5:         return 3;
   3157 		case FORMAT_X8R8G8B8:       return 3;
   3158 		case FORMAT_X8B8G8R8I:      return 3;
   3159 		case FORMAT_X8B8G8R8:       return 3;
   3160 		case FORMAT_A8R8G8B8:       return 4;
   3161 		case FORMAT_SRGB8_X8:       return 3;
   3162 		case FORMAT_SRGB8_A8:       return 4;
   3163 		case FORMAT_A8B8G8R8I:      return 4;
   3164 		case FORMAT_A8B8G8R8:       return 4;
   3165 		case FORMAT_G8R8I:          return 2;
   3166 		case FORMAT_G8R8:           return 2;
   3167 		case FORMAT_R8_SNORM:      return 1;
   3168 		case FORMAT_G8R8_SNORM:    return 2;
   3169 		case FORMAT_X8B8G8R8_SNORM:return 3;
   3170 		case FORMAT_A8B8G8R8_SNORM:return 4;
   3171 		case FORMAT_R8UI:           return 1;
   3172 		case FORMAT_G8R8UI:         return 2;
   3173 		case FORMAT_X8B8G8R8UI:     return 3;
   3174 		case FORMAT_A8B8G8R8UI:     return 4;
   3175 		case FORMAT_A2B10G10R10:    return 4;
   3176 		case FORMAT_A2B10G10R10UI:  return 4;
   3177 		case FORMAT_G16R16I:        return 2;
   3178 		case FORMAT_G16R16UI:       return 2;
   3179 		case FORMAT_G16R16:         return 2;
   3180 		case FORMAT_G32R32I:        return 2;
   3181 		case FORMAT_G32R32UI:       return 2;
   3182 		case FORMAT_X16B16G16R16I:  return 3;
   3183 		case FORMAT_X16B16G16R16UI: return 3;
   3184 		case FORMAT_A16B16G16R16I:  return 4;
   3185 		case FORMAT_A16B16G16R16UI: return 4;
   3186 		case FORMAT_A16B16G16R16:   return 4;
   3187 		case FORMAT_X32B32G32R32I:  return 3;
   3188 		case FORMAT_X32B32G32R32UI: return 3;
   3189 		case FORMAT_A32B32G32R32I:  return 4;
   3190 		case FORMAT_A32B32G32R32UI: return 4;
   3191 		case FORMAT_V8U8:           return 2;
   3192 		case FORMAT_Q8W8V8U8:       return 4;
   3193 		case FORMAT_X8L8V8U8:       return 3;
   3194 		case FORMAT_V16U16:         return 2;
   3195 		case FORMAT_A16W16V16U16:   return 4;
   3196 		case FORMAT_Q16W16V16U16:   return 4;
   3197 		case FORMAT_R32F:           return 1;
   3198 		case FORMAT_G32R32F:        return 2;
   3199 		case FORMAT_X32B32G32R32F:  return 3;
   3200 		case FORMAT_A32B32G32R32F:  return 4;
   3201 		case FORMAT_X32B32G32R32F_UNSIGNED: return 3;
   3202 		case FORMAT_D32F:           return 1;
   3203 		case FORMAT_D32FS8:         return 1;
   3204 		case FORMAT_D32F_LOCKABLE:  return 1;
   3205 		case FORMAT_D32FS8_TEXTURE: return 1;
   3206 		case FORMAT_D32F_SHADOW:    return 1;
   3207 		case FORMAT_D32FS8_SHADOW:  return 1;
   3208 		case FORMAT_A8:             return 1;
   3209 		case FORMAT_R8I:            return 1;
   3210 		case FORMAT_R8:             return 1;
   3211 		case FORMAT_R16I:           return 1;
   3212 		case FORMAT_R16UI:          return 1;
   3213 		case FORMAT_R32I:           return 1;
   3214 		case FORMAT_R32UI:          return 1;
   3215 		case FORMAT_L8:             return 1;
   3216 		case FORMAT_L16:            return 1;
   3217 		case FORMAT_A8L8:           return 2;
   3218 		case FORMAT_YV12_BT601:     return 3;
   3219 		case FORMAT_YV12_BT709:     return 3;
   3220 		case FORMAT_YV12_JFIF:      return 3;
   3221 		default:
   3222 			ASSERT(false);
   3223 		}
   3224 
   3225 		return 1;
   3226 	}
   3227 
   3228 	void *Surface::allocateBuffer(int width, int height, int depth, int border, int samples, Format format)
   3229 	{
   3230 		return allocate(size(width, height, depth, border, samples, format));
   3231 	}
   3232 
   3233 	void Surface::memfill4(void *buffer, int pattern, int bytes)
   3234 	{
   3235 		while((size_t)buffer & 0x1 && bytes >= 1)
   3236 		{
   3237 			*(char*)buffer = (char)pattern;
   3238 			(char*&)buffer += 1;
   3239 			bytes -= 1;
   3240 		}
   3241 
   3242 		while((size_t)buffer & 0x3 && bytes >= 2)
   3243 		{
   3244 			*(short*)buffer = (short)pattern;
   3245 			(short*&)buffer += 1;
   3246 			bytes -= 2;
   3247 		}
   3248 
   3249 		#if defined(__i386__) || defined(__x86_64__)
   3250 			if(CPUID::supportsSSE())
   3251 			{
   3252 				while((size_t)buffer & 0xF && bytes >= 4)
   3253 				{
   3254 					*(int*)buffer = pattern;
   3255 					(int*&)buffer += 1;
   3256 					bytes -= 4;
   3257 				}
   3258 
   3259 				__m128 quad = _mm_set_ps1((float&)pattern);
   3260 
   3261 				float *pointer = (float*)buffer;
   3262 				int qxwords = bytes / 64;
   3263 				bytes -= qxwords * 64;
   3264 
   3265 				while(qxwords--)
   3266 				{
   3267 					_mm_stream_ps(pointer + 0, quad);
   3268 					_mm_stream_ps(pointer + 4, quad);
   3269 					_mm_stream_ps(pointer + 8, quad);
   3270 					_mm_stream_ps(pointer + 12, quad);
   3271 
   3272 					pointer += 16;
   3273 				}
   3274 
   3275 				buffer = pointer;
   3276 			}
   3277 		#endif
   3278 
   3279 		while(bytes >= 4)
   3280 		{
   3281 			*(int*)buffer = (int)pattern;
   3282 			(int*&)buffer += 1;
   3283 			bytes -= 4;
   3284 		}
   3285 
   3286 		while(bytes >= 2)
   3287 		{
   3288 			*(short*)buffer = (short)pattern;
   3289 			(short*&)buffer += 1;
   3290 			bytes -= 2;
   3291 		}
   3292 
   3293 		while(bytes >= 1)
   3294 		{
   3295 			*(char*)buffer = (char)pattern;
   3296 			(char*&)buffer += 1;
   3297 			bytes -= 1;
   3298 		}
   3299 	}
   3300 
   3301 	void Surface::sync()
   3302 	{
   3303 		resource->lock(EXCLUSIVE);
   3304 		resource->unlock();
   3305 	}
   3306 
   3307 	bool Surface::isEntire(const Rect& rect) const
   3308 	{
   3309 		return (rect.x0 == 0 && rect.y0 == 0 && rect.x1 == internal.width && rect.y1 == internal.height && internal.depth == 1);
   3310 	}
   3311 
   3312 	Rect Surface::getRect() const
   3313 	{
   3314 		return Rect(0, 0, internal.width, internal.height);
   3315 	}
   3316 
   3317 	void Surface::clearDepth(float depth, int x0, int y0, int width, int height)
   3318 	{
   3319 		if(width == 0 || height == 0)
   3320 		{
   3321 			return;
   3322 		}
   3323 
   3324 		if(internal.format == FORMAT_NULL)
   3325 		{
   3326 			return;
   3327 		}
   3328 
   3329 		// Not overlapping
   3330 		if(x0 > internal.width) return;
   3331 		if(y0 > internal.height) return;
   3332 		if(x0 + width < 0) return;
   3333 		if(y0 + height < 0) return;
   3334 
   3335 		// Clip against dimensions
   3336 		if(x0 < 0) {width += x0; x0 = 0;}
   3337 		if(x0 + width > internal.width) width = internal.width - x0;
   3338 		if(y0 < 0) {height += y0; y0 = 0;}
   3339 		if(y0 + height > internal.height) height = internal.height - y0;
   3340 
   3341 		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
   3342 		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
   3343 
   3344 		int x1 = x0 + width;
   3345 		int y1 = y0 + height;
   3346 
   3347 		if(!hasQuadLayout(internal.format))
   3348 		{
   3349 			float *target = (float*)lockInternal(x0, y0, 0, lock, PUBLIC);
   3350 
   3351 			for(int z = 0; z < internal.samples; z++)
   3352 			{
   3353 				float *row = target;
   3354 				for(int y = y0; y < y1; y++)
   3355 				{
   3356 					memfill4(row, (int&)depth, width * sizeof(float));
   3357 					row += internal.pitchP;
   3358 				}
   3359 				target += internal.sliceP;
   3360 			}
   3361 
   3362 			unlockInternal();
   3363 		}
   3364 		else   // Quad layout
   3365 		{
   3366 			if(complementaryDepthBuffer)
   3367 			{
   3368 				depth = 1 - depth;
   3369 			}
   3370 
   3371 			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
   3372 
   3373 			int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
   3374 			int oddX1 = (x1 & ~1) * 2;
   3375 			int evenX0 = ((x0 + 1) & ~1) * 2;
   3376 			int evenBytes = (oddX1 - evenX0) * sizeof(float);
   3377 
   3378 			for(int z = 0; z < internal.samples; z++)
   3379 			{
   3380 				for(int y = y0; y < y1; y++)
   3381 				{
   3382 					float *target = buffer + (y & ~1) * internal.pitchP + (y & 1) * 2;
   3383 
   3384 					if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
   3385 					{
   3386 						if((x0 & 1) != 0)
   3387 						{
   3388 							target[oddX0 + 0] = depth;
   3389 							target[oddX0 + 2] = depth;
   3390 						}
   3391 
   3392 					//	for(int x2 = evenX0; x2 < x1 * 2; x2 += 4)
   3393 					//	{
   3394 					//		target[x2 + 0] = depth;
   3395 					//		target[x2 + 1] = depth;
   3396 					//		target[x2 + 2] = depth;
   3397 					//		target[x2 + 3] = depth;
   3398 					//	}
   3399 
   3400 					//	__asm
   3401 					//	{
   3402 					//		movss xmm0, depth
   3403 					//		shufps xmm0, xmm0, 0x00
   3404 					//
   3405 					//		mov eax, x0
   3406 					//		add eax, 1
   3407 					//		and eax, 0xFFFFFFFE
   3408 					//		cmp eax, x1
   3409 					//		jge qEnd
   3410 					//
   3411 					//		mov edi, target
   3412 					//
   3413 					//	qLoop:
   3414 					//		movntps [edi+8*eax], xmm0
   3415 					//
   3416 					//		add eax, 2
   3417 					//		cmp eax, x1
   3418 					//		jl qLoop
   3419 					//	qEnd:
   3420 					//	}
   3421 
   3422 						memfill4(&target[evenX0], (int&)depth, evenBytes);
   3423 
   3424 						if((x1 & 1) != 0)
   3425 						{
   3426 							target[oddX1 + 0] = depth;
   3427 							target[oddX1 + 2] = depth;
   3428 						}
   3429 
   3430 						y++;
   3431 					}
   3432 					else
   3433 					{
   3434 						for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
   3435 						{
   3436 							target[i] = depth;
   3437 						}
   3438 					}
   3439 				}
   3440 
   3441 				buffer += internal.sliceP;
   3442 			}
   3443 
   3444 			unlockInternal();
   3445 		}
   3446 	}
   3447 
   3448 	void Surface::clearStencil(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
   3449 	{
   3450 		if(mask == 0 || width == 0 || height == 0)
   3451 		{
   3452 			return;
   3453 		}
   3454 
   3455 		if(stencil.format == FORMAT_NULL)
   3456 		{
   3457 			return;
   3458 		}
   3459 
   3460 		// Not overlapping
   3461 		if(x0 > internal.width) return;
   3462 		if(y0 > internal.height) return;
   3463 		if(x0 + width < 0) return;
   3464 		if(y0 + height < 0) return;
   3465 
   3466 		// Clip against dimensions
   3467 		if(x0 < 0) {width += x0; x0 = 0;}
   3468 		if(x0 + width > internal.width) width = internal.width - x0;
   3469 		if(y0 < 0) {height += y0; y0 = 0;}
   3470 		if(y0 + height > internal.height) height = internal.height - y0;
   3471 
   3472 		int x1 = x0 + width;
   3473 		int y1 = y0 + height;
   3474 
   3475 		int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
   3476 		int oddX1 = (x1 & ~1) * 2;
   3477 		int evenX0 = ((x0 + 1) & ~1) * 2;
   3478 		int evenBytes = oddX1 - evenX0;
   3479 
   3480 		unsigned char maskedS = s & mask;
   3481 		unsigned char invMask = ~mask;
   3482 		unsigned int fill = maskedS;
   3483 		fill = fill | (fill << 8) | (fill << 16) | (fill << 24);
   3484 
   3485 		char *buffer = (char*)lockStencil(0, 0, 0, PUBLIC);
   3486 
   3487 		// Stencil buffers are assumed to use quad layout
   3488 		for(int z = 0; z < stencil.samples; z++)
   3489 		{
   3490 			for(int y = y0; y < y1; y++)
   3491 			{
   3492 				char *target = buffer + (y & ~1) * stencil.pitchP + (y & 1) * 2;
   3493 
   3494 				if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF)   // Fill quad line at once
   3495 				{
   3496 					if((x0 & 1) != 0)
   3497 					{
   3498 						target[oddX0 + 0] = fill;
   3499 						target[oddX0 + 2] = fill;
   3500 					}
   3501 
   3502 					memfill4(&target[evenX0], fill, evenBytes);
   3503 
   3504 					if((x1 & 1) != 0)
   3505 					{
   3506 						target[oddX1 + 0] = fill;
   3507 						target[oddX1 + 2] = fill;
   3508 					}
   3509 
   3510 					y++;
   3511 				}
   3512 				else
   3513 				{
   3514 					for(int x = x0; x < x1; x++)
   3515 					{
   3516 						int i = (x & ~1) * 2 + (x & 1);
   3517 						target[i] = maskedS | (target[i] & invMask);
   3518 					}
   3519 				}
   3520 			}
   3521 
   3522 			buffer += stencil.sliceP;
   3523 		}
   3524 
   3525 		unlockStencil();
   3526 	}
   3527 
   3528 	void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
   3529 	{
   3530 		unsigned char *row;
   3531 		Buffer *buffer;
   3532 
   3533 		if(internal.dirty)
   3534 		{
   3535 			row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
   3536 			buffer = &internal;
   3537 		}
   3538 		else
   3539 		{
   3540 			row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
   3541 			buffer = &external;
   3542 		}
   3543 
   3544 		if(buffer->bytes <= 4)
   3545 		{
   3546 			int c;
   3547 			buffer->write(&c, color);
   3548 
   3549 			if(buffer->bytes <= 1) c = (c << 8)  | c;
   3550 			if(buffer->bytes <= 2) c = (c << 16) | c;
   3551 
   3552 			for(int y = 0; y < height; y++)
   3553 			{
   3554 				memfill4(row, c, width * buffer->bytes);
   3555 
   3556 				row += buffer->pitchB;
   3557 			}
   3558 		}
   3559 		else   // Generic
   3560 		{
   3561 			for(int y = 0; y < height; y++)
   3562 			{
   3563 				unsigned char *element = row;
   3564 
   3565 				for(int x = 0; x < width; x++)
   3566 				{
   3567 					buffer->write(element, color);
   3568 
   3569 					element += buffer->bytes;
   3570 				}
   3571 
   3572 				row += buffer->pitchB;
   3573 			}
   3574 		}
   3575 
   3576 		if(buffer == &internal)
   3577 		{
   3578 			unlockInternal();
   3579 		}
   3580 		else
   3581 		{
   3582 			unlockExternal();
   3583 		}
   3584 	}
   3585 
   3586 	void Surface::copyInternal(const Surface *source, int x, int y, float srcX, float srcY, bool filter)
   3587 	{
   3588 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
   3589 
   3590 		sw::Color<float> color;
   3591 
   3592 		if(!filter)
   3593 		{
   3594 			color = source->internal.read((int)srcX, (int)srcY, 0);
   3595 		}
   3596 		else   // Bilinear filtering
   3597 		{
   3598 			color = source->internal.sample(srcX, srcY, 0);
   3599 		}
   3600 
   3601 		internal.write(x, y, color);
   3602 	}
   3603 
   3604 	void Surface::copyInternal(const Surface *source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
   3605 	{
   3606 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
   3607 
   3608 		sw::Color<float> color;
   3609 
   3610 		if(!filter)
   3611 		{
   3612 			color = source->internal.read((int)srcX, (int)srcY, int(srcZ));
   3613 		}
   3614 		else   // Bilinear filtering
   3615 		{
   3616 			color = source->internal.sample(srcX, srcY, srcZ);
   3617 		}
   3618 
   3619 		internal.write(x, y, z, color);
   3620 	}
   3621 
   3622 	void Surface::copyCubeEdge(Edge dstEdge, Surface *src, Edge srcEdge)
   3623 	{
   3624 		Surface *dst = this;
   3625 
   3626 		// Figure out if the edges to be copied in reverse order respectively from one another
   3627 		// The copy should be reversed whenever the same edges are contiguous or if we're
   3628 		// copying top <-> right or bottom <-> left. This is explained by the layout, which is:
   3629 		//
   3630 		//      | +y |
   3631 		// | -x | +z | +x | -z |
   3632 		//      | -y |
   3633 
   3634 		bool reverse = (srcEdge == dstEdge) ||
   3635 		               ((srcEdge == TOP) && (dstEdge == RIGHT)) ||
   3636 		               ((srcEdge == RIGHT) && (dstEdge == TOP)) ||
   3637 		               ((srcEdge == BOTTOM) && (dstEdge == LEFT)) ||
   3638 		               ((srcEdge == LEFT) && (dstEdge == BOTTOM));
   3639 
   3640 		int srcBytes = src->bytes(src->Surface::getInternalFormat());
   3641 		int srcPitch = src->getInternalPitchB();
   3642 		int dstBytes = dst->bytes(dst->Surface::getInternalFormat());
   3643 		int dstPitch = dst->getInternalPitchB();
   3644 
   3645 		int srcW = src->getWidth();
   3646 		int srcH = src->getHeight();
   3647 		int dstW = dst->getWidth();
   3648 		int dstH = dst->getHeight();
   3649 
   3650 		ASSERT(srcW == srcH && dstW == dstH && srcW == dstW && srcBytes == dstBytes);
   3651 
   3652 		// Src is expressed in the regular [0, width-1], [0, height-1] space
   3653 		int srcDelta = ((srcEdge == TOP) || (srcEdge == BOTTOM)) ? srcBytes : srcPitch;
   3654 		int srcStart = ((srcEdge == BOTTOM) ? srcPitch * (srcH - 1) : ((srcEdge == RIGHT) ? srcBytes * (srcW - 1) : 0));
   3655 
   3656 		// Dst contains borders, so it is expressed in the [-1, width+1], [-1, height+1] space
   3657 		int dstDelta = (((dstEdge == TOP) || (dstEdge == BOTTOM)) ? dstBytes : dstPitch) * (reverse ? -1 : 1);
   3658 		int dstStart = ((dstEdge == BOTTOM) ? dstPitch * (dstH + 1) : ((dstEdge == RIGHT) ? dstBytes * (dstW + 1) : 0)) + (reverse ? dstW * -dstDelta : dstDelta);
   3659 
   3660 		char *srcBuf = (char*)src->lockInternal(0, 0, 0, sw::LOCK_READONLY, sw::PRIVATE) + srcStart;
   3661 		char *dstBuf = (char*)dst->lockInternal(-1, -1, 0, sw::LOCK_READWRITE, sw::PRIVATE) + dstStart;
   3662 
   3663 		for(int i = 0; i < srcW; ++i, dstBuf += dstDelta, srcBuf += srcDelta)
   3664 		{
   3665 			memcpy(dstBuf, srcBuf, srcBytes);
   3666 		}
   3667 
   3668 		if(dstEdge == LEFT || dstEdge == RIGHT)
   3669 		{
   3670 			// TOP and BOTTOM are already set, let's average out the corners
   3671 			int x0 = (dstEdge == RIGHT) ? dstW : -1;
   3672 			int y0 = -1;
   3673 			int x1 = (dstEdge == RIGHT) ? dstW - 1 : 0;
   3674 			int y1 = 0;
   3675 			dst->computeCubeCorner(x0, y0, x1, y1);
   3676 			y0 = dstH;
   3677 			y1 = dstH - 1;
   3678 			dst->computeCubeCorner(x0, y0, x1, y1);
   3679 		}
   3680 
   3681 		src->unlockInternal();
   3682 		dst->unlockInternal();
   3683 	}
   3684 
   3685 	void Surface::computeCubeCorner(int x0, int y0, int x1, int y1)
   3686 	{
   3687 		ASSERT(internal.lock != LOCK_UNLOCKED);
   3688 
   3689 		sw::Color<float> color = internal.read(x0, y1);
   3690 		color += internal.read(x1, y0);
   3691 		color += internal.read(x1, y1);
   3692 		color *= (1.0f / 3.0f);
   3693 
   3694 		internal.write(x0, y0, color);
   3695 	}
   3696 
   3697 	bool Surface::hasStencil() const
   3698 	{
   3699 		return isStencil(external.format);
   3700 	}
   3701 
   3702 	bool Surface::hasDepth() const
   3703 	{
   3704 		return isDepth(external.format);
   3705 	}
   3706 
   3707 	bool Surface::hasPalette() const
   3708 	{
   3709 		return isPalette(external.format);
   3710 	}
   3711 
   3712 	bool Surface::isRenderTarget() const
   3713 	{
   3714 		return renderTarget;
   3715 	}
   3716 
   3717 	bool Surface::hasDirtyContents() const
   3718 	{
   3719 		return dirtyContents;
   3720 	}
   3721 
   3722 	void Surface::markContentsClean()
   3723 	{
   3724 		dirtyContents = false;
   3725 	}
   3726 
   3727 	Resource *Surface::getResource()
   3728 	{
   3729 		return resource;
   3730 	}
   3731 
   3732 	bool Surface::identicalBuffers() const
   3733 	{
   3734 		return external.format == internal.format &&
   3735 		       external.width  == internal.width &&
   3736 		       external.height == internal.height &&
   3737 		       external.depth  == internal.depth &&
   3738 		       external.pitchB == internal.pitchB &&
   3739 		       external.sliceB == internal.sliceB &&
   3740 		       external.border == internal.border &&
   3741 		       external.samples == internal.samples;
   3742 	}
   3743 
   3744 	Format Surface::selectInternalFormat(Format format) const
   3745 	{
   3746 		switch(format)
   3747 		{
   3748 		case FORMAT_NULL:
   3749 			return FORMAT_NULL;
   3750 		case FORMAT_P8:
   3751 		case FORMAT_A8P8:
   3752 		case FORMAT_A4R4G4B4:
   3753 		case FORMAT_A1R5G5B5:
   3754 		case FORMAT_A8R3G3B2:
   3755 			return FORMAT_A8R8G8B8;
   3756 		case FORMAT_A8:
   3757 			return FORMAT_A8;
   3758 		case FORMAT_R8I:
   3759 			return FORMAT_R8I;
   3760 		case FORMAT_R8UI:
   3761 			return FORMAT_R8UI;
   3762 		case FORMAT_R8_SNORM:
   3763 			return FORMAT_R8_SNORM;
   3764 		case FORMAT_R8:
   3765 			return FORMAT_R8;
   3766 		case FORMAT_R16I:
   3767 			return FORMAT_R16I;
   3768 		case FORMAT_R16UI:
   3769 			return FORMAT_R16UI;
   3770 		case FORMAT_R32I:
   3771 			return FORMAT_R32I;
   3772 		case FORMAT_R32UI:
   3773 			return FORMAT_R32UI;
   3774 		case FORMAT_X16B16G16R16I:
   3775 			return FORMAT_X16B16G16R16I;
   3776 		case FORMAT_A16B16G16R16I:
   3777 			return FORMAT_A16B16G16R16I;
   3778 		case FORMAT_X16B16G16R16UI:
   3779 			return FORMAT_X16B16G16R16UI;
   3780 		case FORMAT_A16B16G16R16UI:
   3781 			return FORMAT_A16B16G16R16UI;
   3782 		case FORMAT_A2R10G10B10:
   3783 		case FORMAT_A2B10G10R10:
   3784 		case FORMAT_A16B16G16R16:
   3785 			return FORMAT_A16B16G16R16;
   3786 		case FORMAT_A2B10G10R10UI:
   3787 			return FORMAT_A16B16G16R16UI;
   3788 		case FORMAT_X32B32G32R32I:
   3789 			return FORMAT_X32B32G32R32I;
   3790 		case FORMAT_A32B32G32R32I:
   3791 			return FORMAT_A32B32G32R32I;
   3792 		case FORMAT_X32B32G32R32UI:
   3793 			return FORMAT_X32B32G32R32UI;
   3794 		case FORMAT_A32B32G32R32UI:
   3795 			return FORMAT_A32B32G32R32UI;
   3796 		case FORMAT_G8R8I:
   3797 			return FORMAT_G8R8I;
   3798 		case FORMAT_G8R8UI:
   3799 			return FORMAT_G8R8UI;
   3800 		case FORMAT_G8R8_SNORM:
   3801 			return FORMAT_G8R8_SNORM;
   3802 		case FORMAT_G8R8:
   3803 			return FORMAT_G8R8;
   3804 		case FORMAT_G16R16I:
   3805 			return FORMAT_G16R16I;
   3806 		case FORMAT_G16R16UI:
   3807 			return FORMAT_G16R16UI;
   3808 		case FORMAT_G16R16:
   3809 			return FORMAT_G16R16;
   3810 		case FORMAT_G32R32I:
   3811 			return FORMAT_G32R32I;
   3812 		case FORMAT_G32R32UI:
   3813 			return FORMAT_G32R32UI;
   3814 		case FORMAT_A8R8G8B8:
   3815 			if(lockable || !quadLayoutEnabled)
   3816 			{
   3817 				return FORMAT_A8R8G8B8;
   3818 			}
   3819 			else
   3820 			{
   3821 				return FORMAT_A8G8R8B8Q;
   3822 			}
   3823 		case FORMAT_A8B8G8R8I:
   3824 			return FORMAT_A8B8G8R8I;
   3825 		case FORMAT_A8B8G8R8UI:
   3826 			return FORMAT_A8B8G8R8UI;
   3827 		case FORMAT_A8B8G8R8_SNORM:
   3828 			return FORMAT_A8B8G8R8_SNORM;
   3829 		case FORMAT_R5G5B5A1:
   3830 		case FORMAT_R4G4B4A4:
   3831 		case FORMAT_A8B8G8R8:
   3832 			return FORMAT_A8B8G8R8;
   3833 		case FORMAT_R5G6B5:
   3834 			return FORMAT_R5G6B5;
   3835 		case FORMAT_R3G3B2:
   3836 		case FORMAT_R8G8B8:
   3837 		case FORMAT_X4R4G4B4:
   3838 		case FORMAT_X1R5G5B5:
   3839 		case FORMAT_X8R8G8B8:
   3840 			if(lockable || !quadLayoutEnabled)
   3841 			{
   3842 				return FORMAT_X8R8G8B8;
   3843 			}
   3844 			else
   3845 			{
   3846 				return FORMAT_X8G8R8B8Q;
   3847 			}
   3848 		case FORMAT_X8B8G8R8I:
   3849 			return FORMAT_X8B8G8R8I;
   3850 		case FORMAT_X8B8G8R8UI:
   3851 			return FORMAT_X8B8G8R8UI;
   3852 		case FORMAT_X8B8G8R8_SNORM:
   3853 			return FORMAT_X8B8G8R8_SNORM;
   3854 		case FORMAT_B8G8R8:
   3855 		case FORMAT_X8B8G8R8:
   3856 			return FORMAT_X8B8G8R8;
   3857 		case FORMAT_SRGB8_X8:
   3858 			return FORMAT_SRGB8_X8;
   3859 		case FORMAT_SRGB8_A8:
   3860 			return FORMAT_SRGB8_A8;
   3861 		// Compressed formats
   3862 		case FORMAT_DXT1:
   3863 		case FORMAT_DXT3:
   3864 		case FORMAT_DXT5:
   3865 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   3866 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   3867 		case FORMAT_RGBA8_ETC2_EAC:
   3868 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   3869 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   3870 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   3871 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   3872 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   3873 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   3874 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   3875 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   3876 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   3877 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   3878 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   3879 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   3880 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   3881 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   3882 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   3883 			return FORMAT_A8R8G8B8;
   3884 		case FORMAT_RGBA_ASTC_4x4_KHR:
   3885 		case FORMAT_RGBA_ASTC_5x4_KHR:
   3886 		case FORMAT_RGBA_ASTC_5x5_KHR:
   3887 		case FORMAT_RGBA_ASTC_6x5_KHR:
   3888 		case FORMAT_RGBA_ASTC_6x6_KHR:
   3889 		case FORMAT_RGBA_ASTC_8x5_KHR:
   3890 		case FORMAT_RGBA_ASTC_8x6_KHR:
   3891 		case FORMAT_RGBA_ASTC_8x8_KHR:
   3892 		case FORMAT_RGBA_ASTC_10x5_KHR:
   3893 		case FORMAT_RGBA_ASTC_10x6_KHR:
   3894 		case FORMAT_RGBA_ASTC_10x8_KHR:
   3895 		case FORMAT_RGBA_ASTC_10x10_KHR:
   3896 		case FORMAT_RGBA_ASTC_12x10_KHR:
   3897 		case FORMAT_RGBA_ASTC_12x12_KHR:
   3898 			// ASTC supports HDR, so a floating point format is required to represent it properly
   3899 			return FORMAT_A32B32G32R32F; // FIXME: 16FP is probably sufficient, but it's currently unsupported
   3900 		case FORMAT_ATI1:
   3901 			return FORMAT_R8;
   3902 		case FORMAT_R11_EAC:
   3903 		case FORMAT_SIGNED_R11_EAC:
   3904 			return FORMAT_R32F; // FIXME: Signed 8bit format would be sufficient
   3905 		case FORMAT_ATI2:
   3906 			return FORMAT_G8R8;
   3907 		case FORMAT_RG11_EAC:
   3908 		case FORMAT_SIGNED_RG11_EAC:
   3909 			return FORMAT_G32R32F; // FIXME: Signed 8bit format would be sufficient
   3910 		case FORMAT_ETC1:
   3911 		case FORMAT_RGB8_ETC2:
   3912 		case FORMAT_SRGB8_ETC2:
   3913 			return FORMAT_X8R8G8B8;
   3914 		// Bumpmap formats
   3915 		case FORMAT_V8U8:			return FORMAT_V8U8;
   3916 		case FORMAT_L6V5U5:			return FORMAT_X8L8V8U8;
   3917 		case FORMAT_Q8W8V8U8:		return FORMAT_Q8W8V8U8;
   3918 		case FORMAT_X8L8V8U8:		return FORMAT_X8L8V8U8;
   3919 		case FORMAT_V16U16:			return FORMAT_V16U16;
   3920 		case FORMAT_A2W10V10U10:	return FORMAT_A16W16V16U16;
   3921 		case FORMAT_Q16W16V16U16:	return FORMAT_Q16W16V16U16;
   3922 		// Floating-point formats
   3923 		case FORMAT_A16F:			return FORMAT_A32B32G32R32F;
   3924 		case FORMAT_R16F:			return FORMAT_R32F;
   3925 		case FORMAT_G16R16F:		return FORMAT_G32R32F;
   3926 		case FORMAT_B16G16R16F:     return FORMAT_X32B32G32R32F;
   3927 		case FORMAT_X16B16G16R16F:	return FORMAT_X32B32G32R32F;
   3928 		case FORMAT_A16B16G16R16F:	return FORMAT_A32B32G32R32F;
   3929 		case FORMAT_X16B16G16R16F_UNSIGNED: return FORMAT_X32B32G32R32F_UNSIGNED;
   3930 		case FORMAT_A32F:			return FORMAT_A32B32G32R32F;
   3931 		case FORMAT_R32F:			return FORMAT_R32F;
   3932 		case FORMAT_G32R32F:		return FORMAT_G32R32F;
   3933 		case FORMAT_B32G32R32F:     return FORMAT_X32B32G32R32F;
   3934 		case FORMAT_X32B32G32R32F:  return FORMAT_X32B32G32R32F;
   3935 		case FORMAT_A32B32G32R32F:	return FORMAT_A32B32G32R32F;
   3936 		case FORMAT_X32B32G32R32F_UNSIGNED: return FORMAT_X32B32G32R32F_UNSIGNED;
   3937 		// Luminance formats
   3938 		case FORMAT_L8:				return FORMAT_L8;
   3939 		case FORMAT_A4L4:			return FORMAT_A8L8;
   3940 		case FORMAT_L16:			return FORMAT_L16;
   3941 		case FORMAT_A8L8:			return FORMAT_A8L8;
   3942 		case FORMAT_L16F:           return FORMAT_X32B32G32R32F;
   3943 		case FORMAT_A16L16F:        return FORMAT_A32B32G32R32F;
   3944 		case FORMAT_L32F:           return FORMAT_X32B32G32R32F;
   3945 		case FORMAT_A32L32F:        return FORMAT_A32B32G32R32F;
   3946 		// Depth/stencil formats
   3947 		case FORMAT_D16:
   3948 		case FORMAT_D32:
   3949 		case FORMAT_D24X8:
   3950 			if(hasParent)   // Texture
   3951 			{
   3952 				return FORMAT_D32F_SHADOW;
   3953 			}
   3954 			else if(complementaryDepthBuffer)
   3955 			{
   3956 				return FORMAT_D32F_COMPLEMENTARY;
   3957 			}
   3958 			else
   3959 			{
   3960 				return FORMAT_D32F;
   3961 			}
   3962 		case FORMAT_D24S8:
   3963 		case FORMAT_D24FS8:
   3964 			if(hasParent)   // Texture
   3965 			{
   3966 				return FORMAT_D32FS8_SHADOW;
   3967 			}
   3968 			else if(complementaryDepthBuffer)
   3969 			{
   3970 				return FORMAT_D32FS8_COMPLEMENTARY;
   3971 			}
   3972 			else
   3973 			{
   3974 				return FORMAT_D32FS8;
   3975 			}
   3976 		case FORMAT_D32F:           return FORMAT_D32F;
   3977 		case FORMAT_D32FS8:         return FORMAT_D32FS8;
   3978 		case FORMAT_D32F_LOCKABLE:  return FORMAT_D32F_LOCKABLE;
   3979 		case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
   3980 		case FORMAT_INTZ:           return FORMAT_D32FS8_TEXTURE;
   3981 		case FORMAT_DF24S8:         return FORMAT_D32FS8_SHADOW;
   3982 		case FORMAT_DF16S8:         return FORMAT_D32FS8_SHADOW;
   3983 		case FORMAT_S8:             return FORMAT_S8;
   3984 		// YUV formats
   3985 		case FORMAT_YV12_BT601:     return FORMAT_YV12_BT601;
   3986 		case FORMAT_YV12_BT709:     return FORMAT_YV12_BT709;
   3987 		case FORMAT_YV12_JFIF:      return FORMAT_YV12_JFIF;
   3988 		default:
   3989 			ASSERT(false);
   3990 		}
   3991 
   3992 		return FORMAT_NULL;
   3993 	}
   3994 
   3995 	void Surface::setTexturePalette(unsigned int *palette)
   3996 	{
   3997 		Surface::palette = palette;
   3998 		Surface::paletteID++;
   3999 	}
   4000 
   4001 	void Surface::resolve()
   4002 	{
   4003 		if(internal.samples <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
   4004 		{
   4005 			return;
   4006 		}
   4007 
   4008 		ASSERT(internal.depth == 1);  // Unimplemented
   4009 
   4010 		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
   4011 
   4012 		int width = internal.width;
   4013 		int height = internal.height;
   4014 		int pitch = internal.pitchB;
   4015 		int slice = internal.sliceB;
   4016 
   4017 		unsigned char *source0 = (unsigned char*)source;
   4018 		unsigned char *source1 = source0 + slice;
   4019 		unsigned char *source2 = source1 + slice;
   4020 		unsigned char *source3 = source2 + slice;
   4021 		unsigned char *source4 = source3 + slice;
   4022 		unsigned char *source5 = source4 + slice;
   4023 		unsigned char *source6 = source5 + slice;
   4024 		unsigned char *source7 = source6 + slice;
   4025 		unsigned char *source8 = source7 + slice;
   4026 		unsigned char *source9 = source8 + slice;
   4027 		unsigned char *sourceA = source9 + slice;
   4028 		unsigned char *sourceB = sourceA + slice;
   4029 		unsigned char *sourceC = sourceB + slice;
   4030 		unsigned char *sourceD = sourceC + slice;
   4031 		unsigned char *sourceE = sourceD + slice;
   4032 		unsigned char *sourceF = sourceE + slice;
   4033 
   4034 		if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 ||
   4035 		   internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8 ||
   4036 		   internal.format == FORMAT_SRGB8_X8 || internal.format == FORMAT_SRGB8_A8)
   4037 		{
   4038 			#if defined(__i386__) || defined(__x86_64__)
   4039 				if(CPUID::supportsSSE2() && (width % 4) == 0)
   4040 				{
   4041 					if(internal.samples == 2)
   4042 					{
   4043 						for(int y = 0; y < height; y++)
   4044 						{
   4045 							for(int x = 0; x < width; x += 4)
   4046 							{
   4047 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4048 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4049 
   4050 								c0 = _mm_avg_epu8(c0, c1);
   4051 
   4052 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4053 							}
   4054 
   4055 							source0 += pitch;
   4056 							source1 += pitch;
   4057 						}
   4058 					}
   4059 					else if(internal.samples == 4)
   4060 					{
   4061 						for(int y = 0; y < height; y++)
   4062 						{
   4063 							for(int x = 0; x < width; x += 4)
   4064 							{
   4065 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4066 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4067 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   4068 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   4069 
   4070 								c0 = _mm_avg_epu8(c0, c1);
   4071 								c2 = _mm_avg_epu8(c2, c3);
   4072 								c0 = _mm_avg_epu8(c0, c2);
   4073 
   4074 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4075 							}
   4076 
   4077 							source0 += pitch;
   4078 							source1 += pitch;
   4079 							source2 += pitch;
   4080 							source3 += pitch;
   4081 						}
   4082 					}
   4083 					else if(internal.samples == 8)
   4084 					{
   4085 						for(int y = 0; y < height; y++)
   4086 						{
   4087 							for(int x = 0; x < width; x += 4)
   4088 							{
   4089 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4090 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4091 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   4092 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   4093 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
   4094 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
   4095 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
   4096 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
   4097 
   4098 								c0 = _mm_avg_epu8(c0, c1);
   4099 								c2 = _mm_avg_epu8(c2, c3);
   4100 								c4 = _mm_avg_epu8(c4, c5);
   4101 								c6 = _mm_avg_epu8(c6, c7);
   4102 								c0 = _mm_avg_epu8(c0, c2);
   4103 								c4 = _mm_avg_epu8(c4, c6);
   4104 								c0 = _mm_avg_epu8(c0, c4);
   4105 
   4106 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4107 							}
   4108 
   4109 							source0 += pitch;
   4110 							source1 += pitch;
   4111 							source2 += pitch;
   4112 							source3 += pitch;
   4113 							source4 += pitch;
   4114 							source5 += pitch;
   4115 							source6 += pitch;
   4116 							source7 += pitch;
   4117 						}
   4118 					}
   4119 					else if(internal.samples == 16)
   4120 					{
   4121 						for(int y = 0; y < height; y++)
   4122 						{
   4123 							for(int x = 0; x < width; x += 4)
   4124 							{
   4125 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4126 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4127 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   4128 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   4129 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
   4130 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
   4131 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
   4132 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
   4133 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
   4134 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
   4135 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
   4136 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
   4137 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
   4138 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
   4139 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
   4140 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
   4141 
   4142 								c0 = _mm_avg_epu8(c0, c1);
   4143 								c2 = _mm_avg_epu8(c2, c3);
   4144 								c4 = _mm_avg_epu8(c4, c5);
   4145 								c6 = _mm_avg_epu8(c6, c7);
   4146 								c8 = _mm_avg_epu8(c8, c9);
   4147 								cA = _mm_avg_epu8(cA, cB);
   4148 								cC = _mm_avg_epu8(cC, cD);
   4149 								cE = _mm_avg_epu8(cE, cF);
   4150 								c0 = _mm_avg_epu8(c0, c2);
   4151 								c4 = _mm_avg_epu8(c4, c6);
   4152 								c8 = _mm_avg_epu8(c8, cA);
   4153 								cC = _mm_avg_epu8(cC, cE);
   4154 								c0 = _mm_avg_epu8(c0, c4);
   4155 								c8 = _mm_avg_epu8(c8, cC);
   4156 								c0 = _mm_avg_epu8(c0, c8);
   4157 
   4158 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4159 							}
   4160 
   4161 							source0 += pitch;
   4162 							source1 += pitch;
   4163 							source2 += pitch;
   4164 							source3 += pitch;
   4165 							source4 += pitch;
   4166 							source5 += pitch;
   4167 							source6 += pitch;
   4168 							source7 += pitch;
   4169 							source8 += pitch;
   4170 							source9 += pitch;
   4171 							sourceA += pitch;
   4172 							sourceB += pitch;
   4173 							sourceC += pitch;
   4174 							sourceD += pitch;
   4175 							sourceE += pitch;
   4176 							sourceF += pitch;
   4177 						}
   4178 					}
   4179 					else ASSERT(false);
   4180 				}
   4181 				else
   4182 			#endif
   4183 			{
   4184 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
   4185 
   4186 				if(internal.samples == 2)
   4187 				{
   4188 					for(int y = 0; y < height; y++)
   4189 					{
   4190 						for(int x = 0; x < width; x++)
   4191 						{
   4192 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4193 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4194 
   4195 							c0 = AVERAGE(c0, c1);
   4196 
   4197 							*(unsigned int*)(source0 + 4 * x) = c0;
   4198 						}
   4199 
   4200 						source0 += pitch;
   4201 						source1 += pitch;
   4202 					}
   4203 				}
   4204 				else if(internal.samples == 4)
   4205 				{
   4206 					for(int y = 0; y < height; y++)
   4207 					{
   4208 						for(int x = 0; x < width; x++)
   4209 						{
   4210 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4211 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4212 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4213 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4214 
   4215 							c0 = AVERAGE(c0, c1);
   4216 							c2 = AVERAGE(c2, c3);
   4217 							c0 = AVERAGE(c0, c2);
   4218 
   4219 							*(unsigned int*)(source0 + 4 * x) = c0;
   4220 						}
   4221 
   4222 						source0 += pitch;
   4223 						source1 += pitch;
   4224 						source2 += pitch;
   4225 						source3 += pitch;
   4226 					}
   4227 				}
   4228 				else if(internal.samples == 8)
   4229 				{
   4230 					for(int y = 0; y < height; y++)
   4231 					{
   4232 						for(int x = 0; x < width; x++)
   4233 						{
   4234 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4235 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4236 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4237 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4238 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4239 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4240 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4241 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4242 
   4243 							c0 = AVERAGE(c0, c1);
   4244 							c2 = AVERAGE(c2, c3);
   4245 							c4 = AVERAGE(c4, c5);
   4246 							c6 = AVERAGE(c6, c7);
   4247 							c0 = AVERAGE(c0, c2);
   4248 							c4 = AVERAGE(c4, c6);
   4249 							c0 = AVERAGE(c0, c4);
   4250 
   4251 							*(unsigned int*)(source0 + 4 * x) = c0;
   4252 						}
   4253 
   4254 						source0 += pitch;
   4255 						source1 += pitch;
   4256 						source2 += pitch;
   4257 						source3 += pitch;
   4258 						source4 += pitch;
   4259 						source5 += pitch;
   4260 						source6 += pitch;
   4261 						source7 += pitch;
   4262 					}
   4263 				}
   4264 				else if(internal.samples == 16)
   4265 				{
   4266 					for(int y = 0; y < height; y++)
   4267 					{
   4268 						for(int x = 0; x < width; x++)
   4269 						{
   4270 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4271 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4272 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4273 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4274 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4275 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4276 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4277 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4278 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
   4279 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
   4280 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
   4281 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
   4282 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
   4283 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
   4284 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
   4285 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
   4286 
   4287 							c0 = AVERAGE(c0, c1);
   4288 							c2 = AVERAGE(c2, c3);
   4289 							c4 = AVERAGE(c4, c5);
   4290 							c6 = AVERAGE(c6, c7);
   4291 							c8 = AVERAGE(c8, c9);
   4292 							cA = AVERAGE(cA, cB);
   4293 							cC = AVERAGE(cC, cD);
   4294 							cE = AVERAGE(cE, cF);
   4295 							c0 = AVERAGE(c0, c2);
   4296 							c4 = AVERAGE(c4, c6);
   4297 							c8 = AVERAGE(c8, cA);
   4298 							cC = AVERAGE(cC, cE);
   4299 							c0 = AVERAGE(c0, c4);
   4300 							c8 = AVERAGE(c8, cC);
   4301 							c0 = AVERAGE(c0, c8);
   4302 
   4303 							*(unsigned int*)(source0 + 4 * x) = c0;
   4304 						}
   4305 
   4306 						source0 += pitch;
   4307 						source1 += pitch;
   4308 						source2 += pitch;
   4309 						source3 += pitch;
   4310 						source4 += pitch;
   4311 						source5 += pitch;
   4312 						source6 += pitch;
   4313 						source7 += pitch;
   4314 						source8 += pitch;
   4315 						source9 += pitch;
   4316 						sourceA += pitch;
   4317 						sourceB += pitch;
   4318 						sourceC += pitch;
   4319 						sourceD += pitch;
   4320 						sourceE += pitch;
   4321 						sourceF += pitch;
   4322 					}
   4323 				}
   4324 				else ASSERT(false);
   4325 
   4326 				#undef AVERAGE
   4327 			}
   4328 		}
   4329 		else if(internal.format == FORMAT_G16R16)
   4330 		{
   4331 
   4332 			#if defined(__i386__) || defined(__x86_64__)
   4333 				if(CPUID::supportsSSE2() && (width % 4) == 0)
   4334 				{
   4335 					if(internal.samples == 2)
   4336 					{
   4337 						for(int y = 0; y < height; y++)
   4338 						{
   4339 							for(int x = 0; x < width; x += 4)
   4340 							{
   4341 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4342 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4343 
   4344 								c0 = _mm_avg_epu16(c0, c1);
   4345 
   4346 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4347 							}
   4348 
   4349 							source0 += pitch;
   4350 							source1 += pitch;
   4351 						}
   4352 					}
   4353 					else if(internal.samples == 4)
   4354 					{
   4355 						for(int y = 0; y < height; y++)
   4356 						{
   4357 							for(int x = 0; x < width; x += 4)
   4358 							{
   4359 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4360 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4361 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   4362 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   4363 
   4364 								c0 = _mm_avg_epu16(c0, c1);
   4365 								c2 = _mm_avg_epu16(c2, c3);
   4366 								c0 = _mm_avg_epu16(c0, c2);
   4367 
   4368 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4369 							}
   4370 
   4371 							source0 += pitch;
   4372 							source1 += pitch;
   4373 							source2 += pitch;
   4374 							source3 += pitch;
   4375 						}
   4376 					}
   4377 					else if(internal.samples == 8)
   4378 					{
   4379 						for(int y = 0; y < height; y++)
   4380 						{
   4381 							for(int x = 0; x < width; x += 4)
   4382 							{
   4383 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4384 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4385 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   4386 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   4387 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
   4388 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
   4389 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
   4390 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
   4391 
   4392 								c0 = _mm_avg_epu16(c0, c1);
   4393 								c2 = _mm_avg_epu16(c2, c3);
   4394 								c4 = _mm_avg_epu16(c4, c5);
   4395 								c6 = _mm_avg_epu16(c6, c7);
   4396 								c0 = _mm_avg_epu16(c0, c2);
   4397 								c4 = _mm_avg_epu16(c4, c6);
   4398 								c0 = _mm_avg_epu16(c0, c4);
   4399 
   4400 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4401 							}
   4402 
   4403 							source0 += pitch;
   4404 							source1 += pitch;
   4405 							source2 += pitch;
   4406 							source3 += pitch;
   4407 							source4 += pitch;
   4408 							source5 += pitch;
   4409 							source6 += pitch;
   4410 							source7 += pitch;
   4411 						}
   4412 					}
   4413 					else if(internal.samples == 16)
   4414 					{
   4415 						for(int y = 0; y < height; y++)
   4416 						{
   4417 							for(int x = 0; x < width; x += 4)
   4418 							{
   4419 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4420 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4421 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   4422 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   4423 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
   4424 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
   4425 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
   4426 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
   4427 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
   4428 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
   4429 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
   4430 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
   4431 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
   4432 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
   4433 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
   4434 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
   4435 
   4436 								c0 = _mm_avg_epu16(c0, c1);
   4437 								c2 = _mm_avg_epu16(c2, c3);
   4438 								c4 = _mm_avg_epu16(c4, c5);
   4439 								c6 = _mm_avg_epu16(c6, c7);
   4440 								c8 = _mm_avg_epu16(c8, c9);
   4441 								cA = _mm_avg_epu16(cA, cB);
   4442 								cC = _mm_avg_epu16(cC, cD);
   4443 								cE = _mm_avg_epu16(cE, cF);
   4444 								c0 = _mm_avg_epu16(c0, c2);
   4445 								c4 = _mm_avg_epu16(c4, c6);
   4446 								c8 = _mm_avg_epu16(c8, cA);
   4447 								cC = _mm_avg_epu16(cC, cE);
   4448 								c0 = _mm_avg_epu16(c0, c4);
   4449 								c8 = _mm_avg_epu16(c8, cC);
   4450 								c0 = _mm_avg_epu16(c0, c8);
   4451 
   4452 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4453 							}
   4454 
   4455 							source0 += pitch;
   4456 							source1 += pitch;
   4457 							source2 += pitch;
   4458 							source3 += pitch;
   4459 							source4 += pitch;
   4460 							source5 += pitch;
   4461 							source6 += pitch;
   4462 							source7 += pitch;
   4463 							source8 += pitch;
   4464 							source9 += pitch;
   4465 							sourceA += pitch;
   4466 							sourceB += pitch;
   4467 							sourceC += pitch;
   4468 							sourceD += pitch;
   4469 							sourceE += pitch;
   4470 							sourceF += pitch;
   4471 						}
   4472 					}
   4473 					else ASSERT(false);
   4474 				}
   4475 				else
   4476 			#endif
   4477 			{
   4478 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
   4479 
   4480 				if(internal.samples == 2)
   4481 				{
   4482 					for(int y = 0; y < height; y++)
   4483 					{
   4484 						for(int x = 0; x < width; x++)
   4485 						{
   4486 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4487 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4488 
   4489 							c0 = AVERAGE(c0, c1);
   4490 
   4491 							*(unsigned int*)(source0 + 4 * x) = c0;
   4492 						}
   4493 
   4494 						source0 += pitch;
   4495 						source1 += pitch;
   4496 					}
   4497 				}
   4498 				else if(internal.samples == 4)
   4499 				{
   4500 					for(int y = 0; y < height; y++)
   4501 					{
   4502 						for(int x = 0; x < width; x++)
   4503 						{
   4504 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4505 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4506 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4507 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4508 
   4509 							c0 = AVERAGE(c0, c1);
   4510 							c2 = AVERAGE(c2, c3);
   4511 							c0 = AVERAGE(c0, c2);
   4512 
   4513 							*(unsigned int*)(source0 + 4 * x) = c0;
   4514 						}
   4515 
   4516 						source0 += pitch;
   4517 						source1 += pitch;
   4518 						source2 += pitch;
   4519 						source3 += pitch;
   4520 					}
   4521 				}
   4522 				else if(internal.samples == 8)
   4523 				{
   4524 					for(int y = 0; y < height; y++)
   4525 					{
   4526 						for(int x = 0; x < width; x++)
   4527 						{
   4528 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4529 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4530 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4531 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4532 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4533 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4534 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4535 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4536 
   4537 							c0 = AVERAGE(c0, c1);
   4538 							c2 = AVERAGE(c2, c3);
   4539 							c4 = AVERAGE(c4, c5);
   4540 							c6 = AVERAGE(c6, c7);
   4541 							c0 = AVERAGE(c0, c2);
   4542 							c4 = AVERAGE(c4, c6);
   4543 							c0 = AVERAGE(c0, c4);
   4544 
   4545 							*(unsigned int*)(source0 + 4 * x) = c0;
   4546 						}
   4547 
   4548 						source0 += pitch;
   4549 						source1 += pitch;
   4550 						source2 += pitch;
   4551 						source3 += pitch;
   4552 						source4 += pitch;
   4553 						source5 += pitch;
   4554 						source6 += pitch;
   4555 						source7 += pitch;
   4556 					}
   4557 				}
   4558 				else if(internal.samples == 16)
   4559 				{
   4560 					for(int y = 0; y < height; y++)
   4561 					{
   4562 						for(int x = 0; x < width; x++)
   4563 						{
   4564 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4565 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4566 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4567 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4568 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4569 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4570 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4571 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4572 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
   4573 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
   4574 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
   4575 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
   4576 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
   4577 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
   4578 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
   4579 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
   4580 
   4581 							c0 = AVERAGE(c0, c1);
   4582 							c2 = AVERAGE(c2, c3);
   4583 							c4 = AVERAGE(c4, c5);
   4584 							c6 = AVERAGE(c6, c7);
   4585 							c8 = AVERAGE(c8, c9);
   4586 							cA = AVERAGE(cA, cB);
   4587 							cC = AVERAGE(cC, cD);
   4588 							cE = AVERAGE(cE, cF);
   4589 							c0 = AVERAGE(c0, c2);
   4590 							c4 = AVERAGE(c4, c6);
   4591 							c8 = AVERAGE(c8, cA);
   4592 							cC = AVERAGE(cC, cE);
   4593 							c0 = AVERAGE(c0, c4);
   4594 							c8 = AVERAGE(c8, cC);
   4595 							c0 = AVERAGE(c0, c8);
   4596 
   4597 							*(unsigned int*)(source0 + 4 * x) = c0;
   4598 						}
   4599 
   4600 						source0 += pitch;
   4601 						source1 += pitch;
   4602 						source2 += pitch;
   4603 						source3 += pitch;
   4604 						source4 += pitch;
   4605 						source5 += pitch;
   4606 						source6 += pitch;
   4607 						source7 += pitch;
   4608 						source8 += pitch;
   4609 						source9 += pitch;
   4610 						sourceA += pitch;
   4611 						sourceB += pitch;
   4612 						sourceC += pitch;
   4613 						sourceD += pitch;
   4614 						sourceE += pitch;
   4615 						sourceF += pitch;
   4616 					}
   4617 				}
   4618 				else ASSERT(false);
   4619 
   4620 				#undef AVERAGE
   4621 			}
   4622 		}
   4623 		else if(internal.format == FORMAT_A16B16G16R16)
   4624 		{
   4625 			#if defined(__i386__) || defined(__x86_64__)
   4626 				if(CPUID::supportsSSE2() && (width % 2) == 0)
   4627 				{
   4628 					if(internal.samples == 2)
   4629 					{
   4630 						for(int y = 0; y < height; y++)
   4631 						{
   4632 							for(int x = 0; x < width; x += 2)
   4633 							{
   4634 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
   4635 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
   4636 
   4637 								c0 = _mm_avg_epu16(c0, c1);
   4638 
   4639 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
   4640 							}
   4641 
   4642 							source0 += pitch;
   4643 							source1 += pitch;
   4644 						}
   4645 					}
   4646 					else if(internal.samples == 4)
   4647 					{
   4648 						for(int y = 0; y < height; y++)
   4649 						{
   4650 							for(int x = 0; x < width; x += 2)
   4651 							{
   4652 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
   4653 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
   4654 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
   4655 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
   4656 
   4657 								c0 = _mm_avg_epu16(c0, c1);
   4658 								c2 = _mm_avg_epu16(c2, c3);
   4659 								c0 = _mm_avg_epu16(c0, c2);
   4660 
   4661 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
   4662 							}
   4663 
   4664 							source0 += pitch;
   4665 							source1 += pitch;
   4666 							source2 += pitch;
   4667 							source3 += pitch;
   4668 						}
   4669 					}
   4670 					else if(internal.samples == 8)
   4671 					{
   4672 						for(int y = 0; y < height; y++)
   4673 						{
   4674 							for(int x = 0; x < width; x += 2)
   4675 							{
   4676 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
   4677 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
   4678 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
   4679 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
   4680 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
   4681 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
   4682 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
   4683 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
   4684 
   4685 								c0 = _mm_avg_epu16(c0, c1);
   4686 								c2 = _mm_avg_epu16(c2, c3);
   4687 								c4 = _mm_avg_epu16(c4, c5);
   4688 								c6 = _mm_avg_epu16(c6, c7);
   4689 								c0 = _mm_avg_epu16(c0, c2);
   4690 								c4 = _mm_avg_epu16(c4, c6);
   4691 								c0 = _mm_avg_epu16(c0, c4);
   4692 
   4693 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
   4694 							}
   4695 
   4696 							source0 += pitch;
   4697 							source1 += pitch;
   4698 							source2 += pitch;
   4699 							source3 += pitch;
   4700 							source4 += pitch;
   4701 							source5 += pitch;
   4702 							source6 += pitch;
   4703 							source7 += pitch;
   4704 						}
   4705 					}
   4706 					else if(internal.samples == 16)
   4707 					{
   4708 						for(int y = 0; y < height; y++)
   4709 						{
   4710 							for(int x = 0; x < width; x += 2)
   4711 							{
   4712 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
   4713 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
   4714 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
   4715 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
   4716 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
   4717 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
   4718 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
   4719 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
   4720 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
   4721 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
   4722 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
   4723 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
   4724 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
   4725 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
   4726 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
   4727 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
   4728 
   4729 								c0 = _mm_avg_epu16(c0, c1);
   4730 								c2 = _mm_avg_epu16(c2, c3);
   4731 								c4 = _mm_avg_epu16(c4, c5);
   4732 								c6 = _mm_avg_epu16(c6, c7);
   4733 								c8 = _mm_avg_epu16(c8, c9);
   4734 								cA = _mm_avg_epu16(cA, cB);
   4735 								cC = _mm_avg_epu16(cC, cD);
   4736 								cE = _mm_avg_epu16(cE, cF);
   4737 								c0 = _mm_avg_epu16(c0, c2);
   4738 								c4 = _mm_avg_epu16(c4, c6);
   4739 								c8 = _mm_avg_epu16(c8, cA);
   4740 								cC = _mm_avg_epu16(cC, cE);
   4741 								c0 = _mm_avg_epu16(c0, c4);
   4742 								c8 = _mm_avg_epu16(c8, cC);
   4743 								c0 = _mm_avg_epu16(c0, c8);
   4744 
   4745 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
   4746 							}
   4747 
   4748 							source0 += pitch;
   4749 							source1 += pitch;
   4750 							source2 += pitch;
   4751 							source3 += pitch;
   4752 							source4 += pitch;
   4753 							source5 += pitch;
   4754 							source6 += pitch;
   4755 							source7 += pitch;
   4756 							source8 += pitch;
   4757 							source9 += pitch;
   4758 							sourceA += pitch;
   4759 							sourceB += pitch;
   4760 							sourceC += pitch;
   4761 							sourceD += pitch;
   4762 							sourceE += pitch;
   4763 							sourceF += pitch;
   4764 						}
   4765 					}
   4766 					else ASSERT(false);
   4767 				}
   4768 				else
   4769 			#endif
   4770 			{
   4771 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
   4772 
   4773 				if(internal.samples == 2)
   4774 				{
   4775 					for(int y = 0; y < height; y++)
   4776 					{
   4777 						for(int x = 0; x < 2 * width; x++)
   4778 						{
   4779 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4780 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4781 
   4782 							c0 = AVERAGE(c0, c1);
   4783 
   4784 							*(unsigned int*)(source0 + 4 * x) = c0;
   4785 						}
   4786 
   4787 						source0 += pitch;
   4788 						source1 += pitch;
   4789 					}
   4790 				}
   4791 				else if(internal.samples == 4)
   4792 				{
   4793 					for(int y = 0; y < height; y++)
   4794 					{
   4795 						for(int x = 0; x < 2 * width; x++)
   4796 						{
   4797 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4798 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4799 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4800 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4801 
   4802 							c0 = AVERAGE(c0, c1);
   4803 							c2 = AVERAGE(c2, c3);
   4804 							c0 = AVERAGE(c0, c2);
   4805 
   4806 							*(unsigned int*)(source0 + 4 * x) = c0;
   4807 						}
   4808 
   4809 						source0 += pitch;
   4810 						source1 += pitch;
   4811 						source2 += pitch;
   4812 						source3 += pitch;
   4813 					}
   4814 				}
   4815 				else if(internal.samples == 8)
   4816 				{
   4817 					for(int y = 0; y < height; y++)
   4818 					{
   4819 						for(int x = 0; x < 2 * width; x++)
   4820 						{
   4821 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4822 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4823 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4824 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4825 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4826 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4827 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4828 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4829 
   4830 							c0 = AVERAGE(c0, c1);
   4831 							c2 = AVERAGE(c2, c3);
   4832 							c4 = AVERAGE(c4, c5);
   4833 							c6 = AVERAGE(c6, c7);
   4834 							c0 = AVERAGE(c0, c2);
   4835 							c4 = AVERAGE(c4, c6);
   4836 							c0 = AVERAGE(c0, c4);
   4837 
   4838 							*(unsigned int*)(source0 + 4 * x) = c0;
   4839 						}
   4840 
   4841 						source0 += pitch;
   4842 						source1 += pitch;
   4843 						source2 += pitch;
   4844 						source3 += pitch;
   4845 						source4 += pitch;
   4846 						source5 += pitch;
   4847 						source6 += pitch;
   4848 						source7 += pitch;
   4849 					}
   4850 				}
   4851 				else if(internal.samples == 16)
   4852 				{
   4853 					for(int y = 0; y < height; y++)
   4854 					{
   4855 						for(int x = 0; x < 2 * width; x++)
   4856 						{
   4857 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4858 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4859 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4860 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4861 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4862 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4863 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4864 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4865 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
   4866 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
   4867 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
   4868 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
   4869 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
   4870 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
   4871 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
   4872 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
   4873 
   4874 							c0 = AVERAGE(c0, c1);
   4875 							c2 = AVERAGE(c2, c3);
   4876 							c4 = AVERAGE(c4, c5);
   4877 							c6 = AVERAGE(c6, c7);
   4878 							c8 = AVERAGE(c8, c9);
   4879 							cA = AVERAGE(cA, cB);
   4880 							cC = AVERAGE(cC, cD);
   4881 							cE = AVERAGE(cE, cF);
   4882 							c0 = AVERAGE(c0, c2);
   4883 							c4 = AVERAGE(c4, c6);
   4884 							c8 = AVERAGE(c8, cA);
   4885 							cC = AVERAGE(cC, cE);
   4886 							c0 = AVERAGE(c0, c4);
   4887 							c8 = AVERAGE(c8, cC);
   4888 							c0 = AVERAGE(c0, c8);
   4889 
   4890 							*(unsigned int*)(source0 + 4 * x) = c0;
   4891 						}
   4892 
   4893 						source0 += pitch;
   4894 						source1 += pitch;
   4895 						source2 += pitch;
   4896 						source3 += pitch;
   4897 						source4 += pitch;
   4898 						source5 += pitch;
   4899 						source6 += pitch;
   4900 						source7 += pitch;
   4901 						source8 += pitch;
   4902 						source9 += pitch;
   4903 						sourceA += pitch;
   4904 						sourceB += pitch;
   4905 						sourceC += pitch;
   4906 						sourceD += pitch;
   4907 						sourceE += pitch;
   4908 						sourceF += pitch;
   4909 					}
   4910 				}
   4911 				else ASSERT(false);
   4912 
   4913 				#undef AVERAGE
   4914 			}
   4915 		}
   4916 		else if(internal.format == FORMAT_R32F)
   4917 		{
   4918 			#if defined(__i386__) || defined(__x86_64__)
   4919 				if(CPUID::supportsSSE() && (width % 4) == 0)
   4920 				{
   4921 					if(internal.samples == 2)
   4922 					{
   4923 						for(int y = 0; y < height; y++)
   4924 						{
   4925 							for(int x = 0; x < width; x += 4)
   4926 							{
   4927 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
   4928 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
   4929 
   4930 								c0 = _mm_add_ps(c0, c1);
   4931 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
   4932 
   4933 								_mm_store_ps((float*)(source0 + 4 * x), c0);
   4934 							}
   4935 
   4936 							source0 += pitch;
   4937 							source1 += pitch;
   4938 						}
   4939 					}
   4940 					else if(internal.samples == 4)
   4941 					{
   4942 						for(int y = 0; y < height; y++)
   4943 						{
   4944 							for(int x = 0; x < width; x += 4)
   4945 							{
   4946 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
   4947 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
   4948 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
   4949 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
   4950 
   4951 								c0 = _mm_add_ps(c0, c1);
   4952 								c2 = _mm_add_ps(c2, c3);
   4953 								c0 = _mm_add_ps(c0, c2);
   4954 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
   4955 
   4956 								_mm_store_ps((float*)(source0 + 4 * x), c0);
   4957 							}
   4958 
   4959 							source0 += pitch;
   4960 							source1 += pitch;
   4961 							source2 += pitch;
   4962 							source3 += pitch;
   4963 						}
   4964 					}
   4965 					else if(internal.samples == 8)
   4966 					{
   4967 						for(int y = 0; y < height; y++)
   4968 						{
   4969 							for(int x = 0; x < width; x += 4)
   4970 							{
   4971 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
   4972 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
   4973 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
   4974 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
   4975 								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
   4976 								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
   4977 								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
   4978 								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
   4979 
   4980 								c0 = _mm_add_ps(c0, c1);
   4981 								c2 = _mm_add_ps(c2, c3);
   4982 								c4 = _mm_add_ps(c4, c5);
   4983 								c6 = _mm_add_ps(c6, c7);
   4984 								c0 = _mm_add_ps(c0, c2);
   4985 								c4 = _mm_add_ps(c4, c6);
   4986 								c0 = _mm_add_ps(c0, c4);
   4987 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
   4988 
   4989 								_mm_store_ps((float*)(source0 + 4 * x), c0);
   4990 							}
   4991 
   4992 							source0 += pitch;
   4993 							source1 += pitch;
   4994 							source2 += pitch;
   4995 							source3 += pitch;
   4996 							source4 += pitch;
   4997 							source5 += pitch;
   4998 							source6 += pitch;
   4999 							source7 += pitch;
   5000 						}
   5001 					}
   5002 					else if(internal.samples == 16)
   5003 					{
   5004 						for(int y = 0; y < height; y++)
   5005 						{
   5006 							for(int x = 0; x < width; x += 4)
   5007 							{
   5008 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
   5009 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
   5010 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
   5011 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
   5012 								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
   5013 								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
   5014 								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
   5015 								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
   5016 								__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
   5017 								__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
   5018 								__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
   5019 								__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
   5020 								__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
   5021 								__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
   5022 								__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
   5023 								__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
   5024 
   5025 								c0 = _mm_add_ps(c0, c1);
   5026 								c2 = _mm_add_ps(c2, c3);
   5027 								c4 = _mm_add_ps(c4, c5);
   5028 								c6 = _mm_add_ps(c6, c7);
   5029 								c8 = _mm_add_ps(c8, c9);
   5030 								cA = _mm_add_ps(cA, cB);
   5031 								cC = _mm_add_ps(cC, cD);
   5032 								cE = _mm_add_ps(cE, cF);
   5033 								c0 = _mm_add_ps(c0, c2);
   5034 								c4 = _mm_add_ps(c4, c6);
   5035 								c8 = _mm_add_ps(c8, cA);
   5036 								cC = _mm_add_ps(cC, cE);
   5037 								c0 = _mm_add_ps(c0, c4);
   5038 								c8 = _mm_add_ps(c8, cC);
   5039 								c0 = _mm_add_ps(c0, c8);
   5040 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
   5041 
   5042 								_mm_store_ps((float*)(source0 + 4 * x), c0);
   5043 							}
   5044 
   5045 							source0 += pitch;
   5046 							source1 += pitch;
   5047 							source2 += pitch;
   5048 							source3 += pitch;
   5049 							source4 += pitch;
   5050 							source5 += pitch;
   5051 							source6 += pitch;
   5052 							source7 += pitch;
   5053 							source8 += pitch;
   5054 							source9 += pitch;
   5055 							sourceA += pitch;
   5056 							sourceB += pitch;
   5057 							sourceC += pitch;
   5058 							sourceD += pitch;
   5059 							sourceE += pitch;
   5060 							sourceF += pitch;
   5061 						}
   5062 					}
   5063 					else ASSERT(false);
   5064 				}
   5065 				else
   5066 			#endif
   5067 			{
   5068 				if(internal.samples == 2)
   5069 				{
   5070 					for(int y = 0; y < height; y++)
   5071 					{
   5072 						for(int x = 0; x < width; x++)
   5073 						{
   5074 							float c0 = *(float*)(source0 + 4 * x);
   5075 							float c1 = *(float*)(source1 + 4 * x);
   5076 
   5077 							c0 = c0 + c1;
   5078 							c0 *= 1.0f / 2.0f;
   5079 
   5080 							*(float*)(source0 + 4 * x) = c0;
   5081 						}
   5082 
   5083 						source0 += pitch;
   5084 						source1 += pitch;
   5085 					}
   5086 				}
   5087 				else if(internal.samples == 4)
   5088 				{
   5089 					for(int y = 0; y < height; y++)
   5090 					{
   5091 						for(int x = 0; x < width; x++)
   5092 						{
   5093 							float c0 = *(float*)(source0 + 4 * x);
   5094 							float c1 = *(float*)(source1 + 4 * x);
   5095 							float c2 = *(float*)(source2 + 4 * x);
   5096 							float c3 = *(float*)(source3 + 4 * x);
   5097 
   5098 							c0 = c0 + c1;
   5099 							c2 = c2 + c3;
   5100 							c0 = c0 + c2;
   5101 							c0 *= 1.0f / 4.0f;
   5102 
   5103 							*(float*)(source0 + 4 * x) = c0;
   5104 						}
   5105 
   5106 						source0 += pitch;
   5107 						source1 += pitch;
   5108 						source2 += pitch;
   5109 						source3 += pitch;
   5110 					}
   5111 				}
   5112 				else if(internal.samples == 8)
   5113 				{
   5114 					for(int y = 0; y < height; y++)
   5115 					{
   5116 						for(int x = 0; x < width; x++)
   5117 						{
   5118 							float c0 = *(float*)(source0 + 4 * x);
   5119 							float c1 = *(float*)(source1 + 4 * x);
   5120 							float c2 = *(float*)(source2 + 4 * x);
   5121 							float c3 = *(float*)(source3 + 4 * x);
   5122 							float c4 = *(float*)(source4 + 4 * x);
   5123 							float c5 = *(float*)(source5 + 4 * x);
   5124 							float c6 = *(float*)(source6 + 4 * x);
   5125 							float c7 = *(float*)(source7 + 4 * x);
   5126 
   5127 							c0 = c0 + c1;
   5128 							c2 = c2 + c3;
   5129 							c4 = c4 + c5;
   5130 							c6 = c6 + c7;
   5131 							c0 = c0 + c2;
   5132 							c4 = c4 + c6;
   5133 							c0 = c0 + c4;
   5134 							c0 *= 1.0f / 8.0f;
   5135 
   5136 							*(float*)(source0 + 4 * x) = c0;
   5137 						}
   5138 
   5139 						source0 += pitch;
   5140 						source1 += pitch;
   5141 						source2 += pitch;
   5142 						source3 += pitch;
   5143 						source4 += pitch;
   5144 						source5 += pitch;
   5145 						source6 += pitch;
   5146 						source7 += pitch;
   5147 					}
   5148 				}
   5149 				else if(internal.samples == 16)
   5150 				{
   5151 					for(int y = 0; y < height; y++)
   5152 					{
   5153 						for(int x = 0; x < width; x++)
   5154 						{
   5155 							float c0 = *(float*)(source0 + 4 * x);
   5156 							float c1 = *(float*)(source1 + 4 * x);
   5157 							float c2 = *(float*)(source2 + 4 * x);
   5158 							float c3 = *(float*)(source3 + 4 * x);
   5159 							float c4 = *(float*)(source4 + 4 * x);
   5160 							float c5 = *(float*)(source5 + 4 * x);
   5161 							float c6 = *(float*)(source6 + 4 * x);
   5162 							float c7 = *(float*)(source7 + 4 * x);
   5163 							float c8 = *(float*)(source8 + 4 * x);
   5164 							float c9 = *(float*)(source9 + 4 * x);
   5165 							float cA = *(float*)(sourceA + 4 * x);
   5166 							float cB = *(float*)(sourceB + 4 * x);
   5167 							float cC = *(float*)(sourceC + 4 * x);
   5168 							float cD = *(float*)(sourceD + 4 * x);
   5169 							float cE = *(float*)(sourceE + 4 * x);
   5170 							float cF = *(float*)(sourceF + 4 * x);
   5171 
   5172 							c0 = c0 + c1;
   5173 							c2 = c2 + c3;
   5174 							c4 = c4 + c5;
   5175 							c6 = c6 + c7;
   5176 							c8 = c8 + c9;
   5177 							cA = cA + cB;
   5178 							cC = cC + cD;
   5179 							cE = cE + cF;
   5180 							c0 = c0 + c2;
   5181 							c4 = c4 + c6;
   5182 							c8 = c8 + cA;
   5183 							cC = cC + cE;
   5184 							c0 = c0 + c4;
   5185 							c8 = c8 + cC;
   5186 							c0 = c0 + c8;
   5187 							c0 *= 1.0f / 16.0f;
   5188 
   5189 							*(float*)(source0 + 4 * x) = c0;
   5190 						}
   5191 
   5192 						source0 += pitch;
   5193 						source1 += pitch;
   5194 						source2 += pitch;
   5195 						source3 += pitch;
   5196 						source4 += pitch;
   5197 						source5 += pitch;
   5198 						source6 += pitch;
   5199 						source7 += pitch;
   5200 						source8 += pitch;
   5201 						source9 += pitch;
   5202 						sourceA += pitch;
   5203 						sourceB += pitch;
   5204 						sourceC += pitch;
   5205 						sourceD += pitch;
   5206 						sourceE += pitch;
   5207 						sourceF += pitch;
   5208 					}
   5209 				}
   5210 				else ASSERT(false);
   5211 			}
   5212 		}
   5213 		else if(internal.format == FORMAT_G32R32F)
   5214 		{
   5215 			#if defined(__i386__) || defined(__x86_64__)
   5216 				if(CPUID::supportsSSE() && (width % 2) == 0)
   5217 				{
   5218 					if(internal.samples == 2)
   5219 					{
   5220 						for(int y = 0; y < height; y++)
   5221 						{
   5222 							for(int x = 0; x < width; x += 2)
   5223 							{
   5224 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
   5225 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
   5226 
   5227 								c0 = _mm_add_ps(c0, c1);
   5228 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
   5229 
   5230 								_mm_store_ps((float*)(source0 + 8 * x), c0);
   5231 							}
   5232 
   5233 							source0 += pitch;
   5234 							source1 += pitch;
   5235 						}
   5236 					}
   5237 					else if(internal.samples == 4)
   5238 					{
   5239 						for(int y = 0; y < height; y++)
   5240 						{
   5241 							for(int x = 0; x < width; x += 2)
   5242 							{
   5243 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
   5244 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
   5245 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
   5246 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
   5247 
   5248 								c0 = _mm_add_ps(c0, c1);
   5249 								c2 = _mm_add_ps(c2, c3);
   5250 								c0 = _mm_add_ps(c0, c2);
   5251 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
   5252 
   5253 								_mm_store_ps((float*)(source0 + 8 * x), c0);
   5254 							}
   5255 
   5256 							source0 += pitch;
   5257 							source1 += pitch;
   5258 							source2 += pitch;
   5259 							source3 += pitch;
   5260 						}
   5261 					}
   5262 					else if(internal.samples == 8)
   5263 					{
   5264 						for(int y = 0; y < height; y++)
   5265 						{
   5266 							for(int x = 0; x < width; x += 2)
   5267 							{
   5268 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
   5269 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
   5270 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
   5271 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
   5272 								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
   5273 								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
   5274 								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
   5275 								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
   5276 
   5277 								c0 = _mm_add_ps(c0, c1);
   5278 								c2 = _mm_add_ps(c2, c3);
   5279 								c4 = _mm_add_ps(c4, c5);
   5280 								c6 = _mm_add_ps(c6, c7);
   5281 								c0 = _mm_add_ps(c0, c2);
   5282 								c4 = _mm_add_ps(c4, c6);
   5283 								c0 = _mm_add_ps(c0, c4);
   5284 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
   5285 
   5286 								_mm_store_ps((float*)(source0 + 8 * x), c0);
   5287 							}
   5288 
   5289 							source0 += pitch;
   5290 							source1 += pitch;
   5291 							source2 += pitch;
   5292 							source3 += pitch;
   5293 							source4 += pitch;
   5294 							source5 += pitch;
   5295 							source6 += pitch;
   5296 							source7 += pitch;
   5297 						}
   5298 					}
   5299 					else if(internal.samples == 16)
   5300 					{
   5301 						for(int y = 0; y < height; y++)
   5302 						{
   5303 							for(int x = 0; x < width; x += 2)
   5304 							{
   5305 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
   5306 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
   5307 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
   5308 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
   5309 								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
   5310 								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
   5311 								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
   5312 								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
   5313 								__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
   5314 								__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
   5315 								__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
   5316 								__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
   5317 								__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
   5318 								__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
   5319 								__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
   5320 								__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
   5321 
   5322 								c0 = _mm_add_ps(c0, c1);
   5323 								c2 = _mm_add_ps(c2, c3);
   5324 								c4 = _mm_add_ps(c4, c5);
   5325 								c6 = _mm_add_ps(c6, c7);
   5326 								c8 = _mm_add_ps(c8, c9);
   5327 								cA = _mm_add_ps(cA, cB);
   5328 								cC = _mm_add_ps(cC, cD);
   5329 								cE = _mm_add_ps(cE, cF);
   5330 								c0 = _mm_add_ps(c0, c2);
   5331 								c4 = _mm_add_ps(c4, c6);
   5332 								c8 = _mm_add_ps(c8, cA);
   5333 								cC = _mm_add_ps(cC, cE);
   5334 								c0 = _mm_add_ps(c0, c4);
   5335 								c8 = _mm_add_ps(c8, cC);
   5336 								c0 = _mm_add_ps(c0, c8);
   5337 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
   5338 
   5339 								_mm_store_ps((float*)(source0 + 8 * x), c0);
   5340 							}
   5341 
   5342 							source0 += pitch;
   5343 							source1 += pitch;
   5344 							source2 += pitch;
   5345 							source3 += pitch;
   5346 							source4 += pitch;
   5347 							source5 += pitch;
   5348 							source6 += pitch;
   5349 							source7 += pitch;
   5350 							source8 += pitch;
   5351 							source9 += pitch;
   5352 							sourceA += pitch;
   5353 							sourceB += pitch;
   5354 							sourceC += pitch;
   5355 							sourceD += pitch;
   5356 							sourceE += pitch;
   5357 							sourceF += pitch;
   5358 						}
   5359 					}
   5360 					else ASSERT(false);
   5361 				}
   5362 				else
   5363 			#endif
   5364 			{
   5365 				if(internal.samples == 2)
   5366 				{
   5367 					for(int y = 0; y < height; y++)
   5368 					{
   5369 						for(int x = 0; x < 2 * width; x++)
   5370 						{
   5371 							float c0 = *(float*)(source0 + 4 * x);
   5372 							float c1 = *(float*)(source1 + 4 * x);
   5373 
   5374 							c0 = c0 + c1;
   5375 							c0 *= 1.0f / 2.0f;
   5376 
   5377 							*(float*)(source0 + 4 * x) = c0;
   5378 						}
   5379 
   5380 						source0 += pitch;
   5381 						source1 += pitch;
   5382 					}
   5383 				}
   5384 				else if(internal.samples == 4)
   5385 				{
   5386 					for(int y = 0; y < height; y++)
   5387 					{
   5388 						for(int x = 0; x < 2 * width; x++)
   5389 						{
   5390 							float c0 = *(float*)(source0 + 4 * x);
   5391 							float c1 = *(float*)(source1 + 4 * x);
   5392 							float c2 = *(float*)(source2 + 4 * x);
   5393 							float c3 = *(float*)(source3 + 4 * x);
   5394 
   5395 							c0 = c0 + c1;
   5396 							c2 = c2 + c3;
   5397 							c0 = c0 + c2;
   5398 							c0 *= 1.0f / 4.0f;
   5399 
   5400 							*(float*)(source0 + 4 * x) = c0;
   5401 						}
   5402 
   5403 						source0 += pitch;
   5404 						source1 += pitch;
   5405 						source2 += pitch;
   5406 						source3 += pitch;
   5407 					}
   5408 				}
   5409 				else if(internal.samples == 8)
   5410 				{
   5411 					for(int y = 0; y < height; y++)
   5412 					{
   5413 						for(int x = 0; x < 2 * width; x++)
   5414 						{
   5415 							float c0 = *(float*)(source0 + 4 * x);
   5416 							float c1 = *(float*)(source1 + 4 * x);
   5417 							float c2 = *(float*)(source2 + 4 * x);
   5418 							float c3 = *(float*)(source3 + 4 * x);
   5419 							float c4 = *(float*)(source4 + 4 * x);
   5420 							float c5 = *(float*)(source5 + 4 * x);
   5421 							float c6 = *(float*)(source6 + 4 * x);
   5422 							float c7 = *(float*)(source7 + 4 * x);
   5423 
   5424 							c0 = c0 + c1;
   5425 							c2 = c2 + c3;
   5426 							c4 = c4 + c5;
   5427 							c6 = c6 + c7;
   5428 							c0 = c0 + c2;
   5429 							c4 = c4 + c6;
   5430 							c0 = c0 + c4;
   5431 							c0 *= 1.0f / 8.0f;
   5432 
   5433 							*(float*)(source0 + 4 * x) = c0;
   5434 						}
   5435 
   5436 						source0 += pitch;
   5437 						source1 += pitch;
   5438 						source2 += pitch;
   5439 						source3 += pitch;
   5440 						source4 += pitch;
   5441 						source5 += pitch;
   5442 						source6 += pitch;
   5443 						source7 += pitch;
   5444 					}
   5445 				}
   5446 				else if(internal.samples == 16)
   5447 				{
   5448 					for(int y = 0; y < height; y++)
   5449 					{
   5450 						for(int x = 0; x < 2 * width; x++)
   5451 						{
   5452 							float c0 = *(float*)(source0 + 4 * x);
   5453 							float c1 = *(float*)(source1 + 4 * x);
   5454 							float c2 = *(float*)(source2 + 4 * x);
   5455 							float c3 = *(float*)(source3 + 4 * x);
   5456 							float c4 = *(float*)(source4 + 4 * x);
   5457 							float c5 = *(float*)(source5 + 4 * x);
   5458 							float c6 = *(float*)(source6 + 4 * x);
   5459 							float c7 = *(float*)(source7 + 4 * x);
   5460 							float c8 = *(float*)(source8 + 4 * x);
   5461 							float c9 = *(float*)(source9 + 4 * x);
   5462 							float cA = *(float*)(sourceA + 4 * x);
   5463 							float cB = *(float*)(sourceB + 4 * x);
   5464 							float cC = *(float*)(sourceC + 4 * x);
   5465 							float cD = *(float*)(sourceD + 4 * x);
   5466 							float cE = *(float*)(sourceE + 4 * x);
   5467 							float cF = *(float*)(sourceF + 4 * x);
   5468 
   5469 							c0 = c0 + c1;
   5470 							c2 = c2 + c3;
   5471 							c4 = c4 + c5;
   5472 							c6 = c6 + c7;
   5473 							c8 = c8 + c9;
   5474 							cA = cA + cB;
   5475 							cC = cC + cD;
   5476 							cE = cE + cF;
   5477 							c0 = c0 + c2;
   5478 							c4 = c4 + c6;
   5479 							c8 = c8 + cA;
   5480 							cC = cC + cE;
   5481 							c0 = c0 + c4;
   5482 							c8 = c8 + cC;
   5483 							c0 = c0 + c8;
   5484 							c0 *= 1.0f / 16.0f;
   5485 
   5486 							*(float*)(source0 + 4 * x) = c0;
   5487 						}
   5488 
   5489 						source0 += pitch;
   5490 						source1 += pitch;
   5491 						source2 += pitch;
   5492 						source3 += pitch;
   5493 						source4 += pitch;
   5494 						source5 += pitch;
   5495 						source6 += pitch;
   5496 						source7 += pitch;
   5497 						source8 += pitch;
   5498 						source9 += pitch;
   5499 						sourceA += pitch;
   5500 						sourceB += pitch;
   5501 						sourceC += pitch;
   5502 						sourceD += pitch;
   5503 						sourceE += pitch;
   5504 						sourceF += pitch;
   5505 					}
   5506 				}
   5507 				else ASSERT(false);
   5508 			}
   5509 		}
   5510 		else if(internal.format == FORMAT_A32B32G32R32F ||
   5511 		        internal.format == FORMAT_X32B32G32R32F ||
   5512 		        internal.format == FORMAT_X32B32G32R32F_UNSIGNED)
   5513 		{
   5514 			#if defined(__i386__) || defined(__x86_64__)
   5515 				if(CPUID::supportsSSE())
   5516 				{
   5517 					if(internal.samples == 2)
   5518 					{
   5519 						for(int y = 0; y < height; y++)
   5520 						{
   5521 							for(int x = 0; x < width; x++)
   5522 							{
   5523 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
   5524 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
   5525 
   5526 								c0 = _mm_add_ps(c0, c1);
   5527 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
   5528 
   5529 								_mm_store_ps((float*)(source0 + 16 * x), c0);
   5530 							}
   5531 
   5532 							source0 += pitch;
   5533 							source1 += pitch;
   5534 						}
   5535 					}
   5536 					else if(internal.samples == 4)
   5537 					{
   5538 						for(int y = 0; y < height; y++)
   5539 						{
   5540 							for(int x = 0; x < width; x++)
   5541 							{
   5542 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
   5543 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
   5544 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
   5545 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
   5546 
   5547 								c0 = _mm_add_ps(c0, c1);
   5548 								c2 = _mm_add_ps(c2, c3);
   5549 								c0 = _mm_add_ps(c0, c2);
   5550 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
   5551 
   5552 								_mm_store_ps((float*)(source0 + 16 * x), c0);
   5553 							}
   5554 
   5555 							source0 += pitch;
   5556 							source1 += pitch;
   5557 							source2 += pitch;
   5558 							source3 += pitch;
   5559 						}
   5560 					}
   5561 					else if(internal.samples == 8)
   5562 					{
   5563 						for(int y = 0; y < height; y++)
   5564 						{
   5565 							for(int x = 0; x < width; x++)
   5566 							{
   5567 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
   5568 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
   5569 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
   5570 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
   5571 								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
   5572 								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
   5573 								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
   5574 								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
   5575 
   5576 								c0 = _mm_add_ps(c0, c1);
   5577 								c2 = _mm_add_ps(c2, c3);
   5578 								c4 = _mm_add_ps(c4, c5);
   5579 								c6 = _mm_add_ps(c6, c7);
   5580 								c0 = _mm_add_ps(c0, c2);
   5581 								c4 = _mm_add_ps(c4, c6);
   5582 								c0 = _mm_add_ps(c0, c4);
   5583 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
   5584 
   5585 								_mm_store_ps((float*)(source0 + 16 * x), c0);
   5586 							}
   5587 
   5588 							source0 += pitch;
   5589 							source1 += pitch;
   5590 							source2 += pitch;
   5591 							source3 += pitch;
   5592 							source4 += pitch;
   5593 							source5 += pitch;
   5594 							source6 += pitch;
   5595 							source7 += pitch;
   5596 						}
   5597 					}
   5598 					else if(internal.samples == 16)
   5599 					{
   5600 						for(int y = 0; y < height; y++)
   5601 						{
   5602 							for(int x = 0; x < width; x++)
   5603 							{
   5604 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
   5605 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
   5606 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
   5607 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
   5608 								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
   5609 								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
   5610 								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
   5611 								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
   5612 								__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
   5613 								__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
   5614 								__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
   5615 								__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
   5616 								__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
   5617 								__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
   5618 								__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
   5619 								__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
   5620 
   5621 								c0 = _mm_add_ps(c0, c1);
   5622 								c2 = _mm_add_ps(c2, c3);
   5623 								c4 = _mm_add_ps(c4, c5);
   5624 								c6 = _mm_add_ps(c6, c7);
   5625 								c8 = _mm_add_ps(c8, c9);
   5626 								cA = _mm_add_ps(cA, cB);
   5627 								cC = _mm_add_ps(cC, cD);
   5628 								cE = _mm_add_ps(cE, cF);
   5629 								c0 = _mm_add_ps(c0, c2);
   5630 								c4 = _mm_add_ps(c4, c6);
   5631 								c8 = _mm_add_ps(c8, cA);
   5632 								cC = _mm_add_ps(cC, cE);
   5633 								c0 = _mm_add_ps(c0, c4);
   5634 								c8 = _mm_add_ps(c8, cC);
   5635 								c0 = _mm_add_ps(c0, c8);
   5636 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
   5637 
   5638 								_mm_store_ps((float*)(source0 + 16 * x), c0);
   5639 							}
   5640 
   5641 							source0 += pitch;
   5642 							source1 += pitch;
   5643 							source2 += pitch;
   5644 							source3 += pitch;
   5645 							source4 += pitch;
   5646 							source5 += pitch;
   5647 							source6 += pitch;
   5648 							source7 += pitch;
   5649 							source8 += pitch;
   5650 							source9 += pitch;
   5651 							sourceA += pitch;
   5652 							sourceB += pitch;
   5653 							sourceC += pitch;
   5654 							sourceD += pitch;
   5655 							sourceE += pitch;
   5656 							sourceF += pitch;
   5657 						}
   5658 					}
   5659 					else ASSERT(false);
   5660 				}
   5661 				else
   5662 			#endif
   5663 			{
   5664 				if(internal.samples == 2)
   5665 				{
   5666 					for(int y = 0; y < height; y++)
   5667 					{
   5668 						for(int x = 0; x < 4 * width; x++)
   5669 						{
   5670 							float c0 = *(float*)(source0 + 4 * x);
   5671 							float c1 = *(float*)(source1 + 4 * x);
   5672 
   5673 							c0 = c0 + c1;
   5674 							c0 *= 1.0f / 2.0f;
   5675 
   5676 							*(float*)(source0 + 4 * x) = c0;
   5677 						}
   5678 
   5679 						source0 += pitch;
   5680 						source1 += pitch;
   5681 					}
   5682 				}
   5683 				else if(internal.samples == 4)
   5684 				{
   5685 					for(int y = 0; y < height; y++)
   5686 					{
   5687 						for(int x = 0; x < 4 * width; x++)
   5688 						{
   5689 							float c0 = *(float*)(source0 + 4 * x);
   5690 							float c1 = *(float*)(source1 + 4 * x);
   5691 							float c2 = *(float*)(source2 + 4 * x);
   5692 							float c3 = *(float*)(source3 + 4 * x);
   5693 
   5694 							c0 = c0 + c1;
   5695 							c2 = c2 + c3;
   5696 							c0 = c0 + c2;
   5697 							c0 *= 1.0f / 4.0f;
   5698 
   5699 							*(float*)(source0 + 4 * x) = c0;
   5700 						}
   5701 
   5702 						source0 += pitch;
   5703 						source1 += pitch;
   5704 						source2 += pitch;
   5705 						source3 += pitch;
   5706 					}
   5707 				}
   5708 				else if(internal.samples == 8)
   5709 				{
   5710 					for(int y = 0; y < height; y++)
   5711 					{
   5712 						for(int x = 0; x < 4 * width; x++)
   5713 						{
   5714 							float c0 = *(float*)(source0 + 4 * x);
   5715 							float c1 = *(float*)(source1 + 4 * x);
   5716 							float c2 = *(float*)(source2 + 4 * x);
   5717 							float c3 = *(float*)(source3 + 4 * x);
   5718 							float c4 = *(float*)(source4 + 4 * x);
   5719 							float c5 = *(float*)(source5 + 4 * x);
   5720 							float c6 = *(float*)(source6 + 4 * x);
   5721 							float c7 = *(float*)(source7 + 4 * x);
   5722 
   5723 							c0 = c0 + c1;
   5724 							c2 = c2 + c3;
   5725 							c4 = c4 + c5;
   5726 							c6 = c6 + c7;
   5727 							c0 = c0 + c2;
   5728 							c4 = c4 + c6;
   5729 							c0 = c0 + c4;
   5730 							c0 *= 1.0f / 8.0f;
   5731 
   5732 							*(float*)(source0 + 4 * x) = c0;
   5733 						}
   5734 
   5735 						source0 += pitch;
   5736 						source1 += pitch;
   5737 						source2 += pitch;
   5738 						source3 += pitch;
   5739 						source4 += pitch;
   5740 						source5 += pitch;
   5741 						source6 += pitch;
   5742 						source7 += pitch;
   5743 					}
   5744 				}
   5745 				else if(internal.samples == 16)
   5746 				{
   5747 					for(int y = 0; y < height; y++)
   5748 					{
   5749 						for(int x = 0; x < 4 * width; x++)
   5750 						{
   5751 							float c0 = *(float*)(source0 + 4 * x);
   5752 							float c1 = *(float*)(source1 + 4 * x);
   5753 							float c2 = *(float*)(source2 + 4 * x);
   5754 							float c3 = *(float*)(source3 + 4 * x);
   5755 							float c4 = *(float*)(source4 + 4 * x);
   5756 							float c5 = *(float*)(source5 + 4 * x);
   5757 							float c6 = *(float*)(source6 + 4 * x);
   5758 							float c7 = *(float*)(source7 + 4 * x);
   5759 							float c8 = *(float*)(source8 + 4 * x);
   5760 							float c9 = *(float*)(source9 + 4 * x);
   5761 							float cA = *(float*)(sourceA + 4 * x);
   5762 							float cB = *(float*)(sourceB + 4 * x);
   5763 							float cC = *(float*)(sourceC + 4 * x);
   5764 							float cD = *(float*)(sourceD + 4 * x);
   5765 							float cE = *(float*)(sourceE + 4 * x);
   5766 							float cF = *(float*)(sourceF + 4 * x);
   5767 
   5768 							c0 = c0 + c1;
   5769 							c2 = c2 + c3;
   5770 							c4 = c4 + c5;
   5771 							c6 = c6 + c7;
   5772 							c8 = c8 + c9;
   5773 							cA = cA + cB;
   5774 							cC = cC + cD;
   5775 							cE = cE + cF;
   5776 							c0 = c0 + c2;
   5777 							c4 = c4 + c6;
   5778 							c8 = c8 + cA;
   5779 							cC = cC + cE;
   5780 							c0 = c0 + c4;
   5781 							c8 = c8 + cC;
   5782 							c0 = c0 + c8;
   5783 							c0 *= 1.0f / 16.0f;
   5784 
   5785 							*(float*)(source0 + 4 * x) = c0;
   5786 						}
   5787 
   5788 						source0 += pitch;
   5789 						source1 += pitch;
   5790 						source2 += pitch;
   5791 						source3 += pitch;
   5792 						source4 += pitch;
   5793 						source5 += pitch;
   5794 						source6 += pitch;
   5795 						source7 += pitch;
   5796 						source8 += pitch;
   5797 						source9 += pitch;
   5798 						sourceA += pitch;
   5799 						sourceB += pitch;
   5800 						sourceC += pitch;
   5801 						sourceD += pitch;
   5802 						sourceE += pitch;
   5803 						sourceF += pitch;
   5804 					}
   5805 				}
   5806 				else ASSERT(false);
   5807 			}
   5808 		}
   5809 		else if(internal.format == FORMAT_R5G6B5)
   5810 		{
   5811 			#if defined(__i386__) || defined(__x86_64__)
   5812 				if(CPUID::supportsSSE2() && (width % 8) == 0)
   5813 				{
   5814 					if(internal.samples == 2)
   5815 					{
   5816 						for(int y = 0; y < height; y++)
   5817 						{
   5818 							for(int x = 0; x < width; x += 8)
   5819 							{
   5820 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
   5821 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
   5822 
   5823 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
   5824 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
   5825 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5826 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
   5827 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
   5828 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5829 
   5830 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
   5831 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5832 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
   5833 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5834 								c0 = _mm_or_si128(c0, c1);
   5835 
   5836 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
   5837 							}
   5838 
   5839 							source0 += pitch;
   5840 							source1 += pitch;
   5841 						}
   5842 					}
   5843 					else if(internal.samples == 4)
   5844 					{
   5845 						for(int y = 0; y < height; y++)
   5846 						{
   5847 							for(int x = 0; x < width; x += 8)
   5848 							{
   5849 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
   5850 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
   5851 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
   5852 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
   5853 
   5854 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
   5855 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
   5856 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5857 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
   5858 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
   5859 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5860 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
   5861 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
   5862 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
   5863 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
   5864 
   5865 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
   5866 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
   5867 								c0 = _mm_avg_epu8(c0, c2);
   5868 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5869 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
   5870 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
   5871 								c1 = _mm_avg_epu16(c1, c3);
   5872 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5873 								c0 = _mm_or_si128(c0, c1);
   5874 
   5875 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
   5876 							}
   5877 
   5878 							source0 += pitch;
   5879 							source1 += pitch;
   5880 							source2 += pitch;
   5881 							source3 += pitch;
   5882 						}
   5883 					}
   5884 					else if(internal.samples == 8)
   5885 					{
   5886 						for(int y = 0; y < height; y++)
   5887 						{
   5888 							for(int x = 0; x < width; x += 8)
   5889 							{
   5890 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
   5891 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
   5892 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
   5893 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
   5894 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
   5895 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
   5896 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
   5897 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
   5898 
   5899 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
   5900 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
   5901 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5902 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
   5903 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
   5904 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5905 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
   5906 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
   5907 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
   5908 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
   5909 								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
   5910 								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
   5911 								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
   5912 								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
   5913 								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
   5914 								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
   5915 								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
   5916 								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
   5917 
   5918 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
   5919 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
   5920 								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
   5921 								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
   5922 								c0 = _mm_avg_epu8(c0, c2);
   5923 								c4 = _mm_avg_epu8(c4, c6);
   5924 								c0 = _mm_avg_epu8(c0, c4);
   5925 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5926 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
   5927 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
   5928 								c5 = _mm_avg_epu16(c4__g_, c5__g_);
   5929 								c7 = _mm_avg_epu16(c6__g_, c7__g_);
   5930 								c1 = _mm_avg_epu16(c1, c3);
   5931 								c5 = _mm_avg_epu16(c5, c7);
   5932 								c1 = _mm_avg_epu16(c1, c5);
   5933 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5934 								c0 = _mm_or_si128(c0, c1);
   5935 
   5936 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
   5937 							}
   5938 
   5939 							source0 += pitch;
   5940 							source1 += pitch;
   5941 							source2 += pitch;
   5942 							source3 += pitch;
   5943 							source4 += pitch;
   5944 							source5 += pitch;
   5945 							source6 += pitch;
   5946 							source7 += pitch;
   5947 						}
   5948 					}
   5949 					else if(internal.samples == 16)
   5950 					{
   5951 						for(int y = 0; y < height; y++)
   5952 						{
   5953 							for(int x = 0; x < width; x += 8)
   5954 							{
   5955 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
   5956 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
   5957 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
   5958 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
   5959 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
   5960 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
   5961 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
   5962 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
   5963 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
   5964 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
   5965 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
   5966 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
   5967 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
   5968 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
   5969 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
   5970 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
   5971 
   5972 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
   5973 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
   5974 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5975 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
   5976 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
   5977 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5978 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
   5979 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
   5980 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
   5981 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
   5982 								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
   5983 								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
   5984 								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
   5985 								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
   5986 								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
   5987 								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
   5988 								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
   5989 								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
   5990 								__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
   5991 								__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
   5992 								__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
   5993 								__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
   5994 								__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
   5995 								__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
   5996 								__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
   5997 								__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
   5998 								__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
   5999 								__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
   6000 								__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
   6001 								__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
   6002 								__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
   6003 								__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
   6004 								__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
   6005 								__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
   6006 
   6007 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
   6008 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
   6009 								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
   6010 								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
   6011 								c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
   6012 								cA = _mm_avg_epu8(cA_r_b, cB_r_b);
   6013 								cC = _mm_avg_epu8(cC_r_b, cD_r_b);
   6014 								cE = _mm_avg_epu8(cE_r_b, cF_r_b);
   6015 								c0 = _mm_avg_epu8(c0, c2);
   6016 								c4 = _mm_avg_epu8(c4, c6);
   6017 								c8 = _mm_avg_epu8(c8, cA);
   6018 								cC = _mm_avg_epu8(cC, cE);
   6019 								c0 = _mm_avg_epu8(c0, c4);
   6020 								c8 = _mm_avg_epu8(c8, cC);
   6021 								c0 = _mm_avg_epu8(c0, c8);
   6022 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   6023 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
   6024 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
   6025 								c5 = _mm_avg_epu16(c4__g_, c5__g_);
   6026 								c7 = _mm_avg_epu16(c6__g_, c7__g_);
   6027 								c9 = _mm_avg_epu16(c8__g_, c9__g_);
   6028 								cB = _mm_avg_epu16(cA__g_, cB__g_);
   6029 								cD = _mm_avg_epu16(cC__g_, cD__g_);
   6030 								cF = _mm_avg_epu16(cE__g_, cF__g_);
   6031 								c1 = _mm_avg_epu8(c1, c3);
   6032 								c5 = _mm_avg_epu8(c5, c7);
   6033 								c9 = _mm_avg_epu8(c9, cB);
   6034 								cD = _mm_avg_epu8(cD, cF);
   6035 								c1 = _mm_avg_epu8(c1, c5);
   6036 								c9 = _mm_avg_epu8(c9, cD);
   6037 								c1 = _mm_avg_epu8(c1, c9);
   6038 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   6039 								c0 = _mm_or_si128(c0, c1);
   6040 
   6041 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
   6042 							}
   6043 
   6044 							source0 += pitch;
   6045 							source1 += pitch;
   6046 							source2 += pitch;
   6047 							source3 += pitch;
   6048 							source4 += pitch;
   6049 							source5 += pitch;
   6050 							source6 += pitch;
   6051 							source7 += pitch;
   6052 							source8 += pitch;
   6053 							source9 += pitch;
   6054 							sourceA += pitch;
   6055 							sourceB += pitch;
   6056 							sourceC += pitch;
   6057 							sourceD += pitch;
   6058 							sourceE += pitch;
   6059 							sourceF += pitch;
   6060 						}
   6061 					}
   6062 					else ASSERT(false);
   6063 				}
   6064 				else
   6065 			#endif
   6066 			{
   6067 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
   6068 
   6069 				if(internal.samples == 2)
   6070 				{
   6071 					for(int y = 0; y < height; y++)
   6072 					{
   6073 						for(int x = 0; x < width; x++)
   6074 						{
   6075 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
   6076 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
   6077 
   6078 							c0 = AVERAGE(c0, c1);
   6079 
   6080 							*(unsigned short*)(source0 + 2 * x) = c0;
   6081 						}
   6082 
   6083 						source0 += pitch;
   6084 						source1 += pitch;
   6085 					}
   6086 				}
   6087 				else if(internal.samples == 4)
   6088 				{
   6089 					for(int y = 0; y < height; y++)
   6090 					{
   6091 						for(int x = 0; x < width; x++)
   6092 						{
   6093 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
   6094 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
   6095 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
   6096 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
   6097 
   6098 							c0 = AVERAGE(c0, c1);
   6099 							c2 = AVERAGE(c2, c3);
   6100 							c0 = AVERAGE(c0, c2);
   6101 
   6102 							*(unsigned short*)(source0 + 2 * x) = c0;
   6103 						}
   6104 
   6105 						source0 += pitch;
   6106 						source1 += pitch;
   6107 						source2 += pitch;
   6108 						source3 += pitch;
   6109 					}
   6110 				}
   6111 				else if(internal.samples == 8)
   6112 				{
   6113 					for(int y = 0; y < height; y++)
   6114 					{
   6115 						for(int x = 0; x < width; x++)
   6116 						{
   6117 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
   6118 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
   6119 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
   6120 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
   6121 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
   6122 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
   6123 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
   6124 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
   6125 
   6126 							c0 = AVERAGE(c0, c1);
   6127 							c2 = AVERAGE(c2, c3);
   6128 							c4 = AVERAGE(c4, c5);
   6129 							c6 = AVERAGE(c6, c7);
   6130 							c0 = AVERAGE(c0, c2);
   6131 							c4 = AVERAGE(c4, c6);
   6132 							c0 = AVERAGE(c0, c4);
   6133 
   6134 							*(unsigned short*)(source0 + 2 * x) = c0;
   6135 						}
   6136 
   6137 						source0 += pitch;
   6138 						source1 += pitch;
   6139 						source2 += pitch;
   6140 						source3 += pitch;
   6141 						source4 += pitch;
   6142 						source5 += pitch;
   6143 						source6 += pitch;
   6144 						source7 += pitch;
   6145 					}
   6146 				}
   6147 				else if(internal.samples == 16)
   6148 				{
   6149 					for(int y = 0; y < height; y++)
   6150 					{
   6151 						for(int x = 0; x < width; x++)
   6152 						{
   6153 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
   6154 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
   6155 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
   6156 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
   6157 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
   6158 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
   6159 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
   6160 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
   6161 							unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
   6162 							unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
   6163 							unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
   6164 							unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
   6165 							unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
   6166 							unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
   6167 							unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
   6168 							unsigned short cF = *(unsigned short*)(sourceF + 2 * x);
   6169 
   6170 							c0 = AVERAGE(c0, c1);
   6171 							c2 = AVERAGE(c2, c3);
   6172 							c4 = AVERAGE(c4, c5);
   6173 							c6 = AVERAGE(c6, c7);
   6174 							c8 = AVERAGE(c8, c9);
   6175 							cA = AVERAGE(cA, cB);
   6176 							cC = AVERAGE(cC, cD);
   6177 							cE = AVERAGE(cE, cF);
   6178 							c0 = AVERAGE(c0, c2);
   6179 							c4 = AVERAGE(c4, c6);
   6180 							c8 = AVERAGE(c8, cA);
   6181 							cC = AVERAGE(cC, cE);
   6182 							c0 = AVERAGE(c0, c4);
   6183 							c8 = AVERAGE(c8, cC);
   6184 							c0 = AVERAGE(c0, c8);
   6185 
   6186 							*(unsigned short*)(source0 + 2 * x) = c0;
   6187 						}
   6188 
   6189 						source0 += pitch;
   6190 						source1 += pitch;
   6191 						source2 += pitch;
   6192 						source3 += pitch;
   6193 						source4 += pitch;
   6194 						source5 += pitch;
   6195 						source6 += pitch;
   6196 						source7 += pitch;
   6197 						source8 += pitch;
   6198 						source9 += pitch;
   6199 						sourceA += pitch;
   6200 						sourceB += pitch;
   6201 						sourceC += pitch;
   6202 						sourceD += pitch;
   6203 						sourceE += pitch;
   6204 						sourceF += pitch;
   6205 					}
   6206 				}
   6207 				else ASSERT(false);
   6208 
   6209 				#undef AVERAGE
   6210 			}
   6211 		}
   6212 		else
   6213 		{
   6214 		//	UNIMPLEMENTED();
   6215 		}
   6216 	}
   6217 }
   6218