      1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //    http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #include "Surface.hpp"
     16 
     17 #include "Color.hpp"
     18 #include "Context.hpp"
     19 #include "ETC_Decoder.hpp"
     20 #include "Renderer.hpp"
     21 #include "Common/Half.hpp"
     22 #include "Common/Memory.hpp"
     23 #include "Common/CPUID.hpp"
     24 #include "Common/Resource.hpp"
     25 #include "Common/Debug.hpp"
     26 #include "Reactor/Reactor.hpp"
     27 
     28 #if defined(__i386__) || defined(__x86_64__)
     29 	#include <xmmintrin.h>
     30 	#include <emmintrin.h>
     31 #endif
     32 
     33 #undef min
     34 #undef max
     35 
     36 namespace sw
     37 {
     38 	extern bool quadLayoutEnabled;
     39 	extern bool complementaryDepthBuffer;
     40 	extern TranscendentalPrecision logPrecision;
     41 
     42 	unsigned int *Surface::palette = 0;
     43 	unsigned int Surface::paletteID = 0;
     44 
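         	// Write a texel at integer coordinates, converting the color to this
         	// buffer's format and replicating it to every sample of a multisampled
         	// buffer. Coordinates are offset by the border; z selects a depth slice.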
     45 	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
     46 	{
     47 		byte *element = (byte*)buffer + (x + border) * bytes + (y + border) * pitchB + z * samples * sliceB;
     48 
     49 		for(int i = 0; i < samples; i++)
     50 		{
     51 			write(element, color);
     52 			element += sliceB;
     53 		}
     54 	}
     55 
     56 	void Surface::Buffer::write(int x, int y, const Color<float> &color)
     57 	{
     58 		byte *element = (byte*)buffer + (x + border) * bytes + (y + border) * pitchB;
     59 
     60 		for(int i = 0; i < samples; i++)
     61 		{
     62 			write(element, color);
     63 			element += sliceB;
     64 		}
     65 	}
     66 
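         	// Pack a floating-point color into the buffer's format at 'element'.
         	// unorm<n>/snorm<n> produce n-bit normalized fields, while scast<n>/ucast<n>
         	// produce the signed/unsigned fields of the integer (I/UI) formats, as used
         	// below. sRGB formats are gamma-encoded before packing.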
     67 	inline void Surface::Buffer::write(void *element, const Color<float> &color)
     68 	{
     69 		float r = color.r;
     70 		float g = color.g;
     71 		float b = color.b;
     72 		float a = color.a;
     73 
     74 		if(isSRGBformat(format))
     75 		{
     76 			r = linearToSRGB(r);
     77 			g = linearToSRGB(g);
     78 			b = linearToSRGB(b);
     79 		}
     80 
     81 		switch(format)
     82 		{
     83 		case FORMAT_A8:
     84 			*(unsigned char*)element = unorm<8>(a);
     85 			break;
     86 		case FORMAT_R8_SNORM:
     87 			*(char*)element = snorm<8>(r);
     88 			break;
     89 		case FORMAT_R8:
     90 			*(unsigned char*)element = unorm<8>(r);
     91 			break;
     92 		case FORMAT_R8I:
     93 			*(char*)element = scast<8>(r);
     94 			break;
     95 		case FORMAT_R8UI:
     96 			*(unsigned char*)element = ucast<8>(r);
     97 			break;
     98 		case FORMAT_R16I:
     99 			*(short*)element = scast<16>(r);
    100 			break;
    101 		case FORMAT_R16UI:
    102 			*(unsigned short*)element = ucast<16>(r);
    103 			break;
    104 		case FORMAT_R32I:
    105 			*(int*)element = static_cast<int>(r);
    106 			break;
    107 		case FORMAT_R32UI:
    108 			*(unsigned int*)element = static_cast<unsigned int>(r);
    109 			break;
    110 		case FORMAT_R3G3B2:
    111 			*(unsigned char*)element = (unorm<3>(r) << 5) | (unorm<3>(g) << 2) | (unorm<2>(b) << 0);
    112 			break;
    113 		case FORMAT_A8R3G3B2:
    114 			*(unsigned short*)element = (unorm<8>(a) << 8) | (unorm<3>(r) << 5) | (unorm<3>(g) << 2) | (unorm<2>(b) << 0);
    115 			break;
    116 		case FORMAT_X4R4G4B4:
    117 			*(unsigned short*)element = 0xF000 | (unorm<4>(r) << 8) | (unorm<4>(g) << 4) | (unorm<4>(b) << 0);
    118 			break;
    119 		case FORMAT_A4R4G4B4:
    120 			*(unsigned short*)element = (unorm<4>(a) << 12) | (unorm<4>(r) << 8) | (unorm<4>(g) << 4) | (unorm<4>(b) << 0);
    121 			break;
    122 		case FORMAT_R4G4B4A4:
    123 			*(unsigned short*)element = (unorm<4>(r) << 12) | (unorm<4>(g) << 8) | (unorm<4>(b) << 4) | (unorm<4>(a) << 0);
    124 			break;
    125 		case FORMAT_R5G6B5:
    126 			*(unsigned short*)element = (unorm<5>(r) << 11) | (unorm<6>(g) << 5) | (unorm<5>(b) << 0);
    127 			break;
    128 		case FORMAT_A1R5G5B5:
    129 			*(unsigned short*)element = (unorm<1>(a) << 15) | (unorm<5>(r) << 10) | (unorm<5>(g) << 5) | (unorm<5>(b) << 0);
    130 			break;
    131 		case FORMAT_R5G5B5A1:
     132 			*(unsigned short*)element = (unorm<5>(r) << 11) | (unorm<5>(g) << 6) | (unorm<5>(b) << 1) | (unorm<1>(a) << 0);
    133 			break;
    134 		case FORMAT_X1R5G5B5:
    135 			*(unsigned short*)element = 0x8000 | (unorm<5>(r) << 10) | (unorm<5>(g) << 5) | (unorm<5>(b) << 0);
    136 			break;
    137 		case FORMAT_A8R8G8B8:
    138 			*(unsigned int*)element = (unorm<8>(a) << 24) | (unorm<8>(r) << 16) | (unorm<8>(g) << 8) | (unorm<8>(b) << 0);
    139 			break;
    140 		case FORMAT_X8R8G8B8:
    141 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(r) << 16) | (unorm<8>(g) << 8) | (unorm<8>(b) << 0);
    142 			break;
    143 		case FORMAT_A8B8G8R8_SNORM:
    144 			*(unsigned int*)element = (static_cast<unsigned int>(snorm<8>(a)) << 24) |
    145 			                          (static_cast<unsigned int>(snorm<8>(b)) << 16) |
    146 			                          (static_cast<unsigned int>(snorm<8>(g)) << 8) |
    147 			                          (static_cast<unsigned int>(snorm<8>(r)) << 0);
    148 			break;
    149 		case FORMAT_A8B8G8R8:
    150 		case FORMAT_SRGB8_A8:
    151 			*(unsigned int*)element = (unorm<8>(a) << 24) | (unorm<8>(b) << 16) | (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
    152 			break;
    153 		case FORMAT_A8B8G8R8I:
    154 			*(unsigned int*)element = (static_cast<unsigned int>(scast<8>(a)) << 24) |
    155 			                          (static_cast<unsigned int>(scast<8>(b)) << 16) |
    156 			                          (static_cast<unsigned int>(scast<8>(g)) << 8) |
    157 			                          (static_cast<unsigned int>(scast<8>(r)) << 0);
    158 			break;
    159 		case FORMAT_A8B8G8R8UI:
    160 			*(unsigned int*)element = (ucast<8>(a) << 24) | (ucast<8>(b) << 16) | (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
    161 			break;
    162 		case FORMAT_X8B8G8R8_SNORM:
    163 			*(unsigned int*)element = 0x7F000000 |
    164 			                          (static_cast<unsigned int>(snorm<8>(b)) << 16) |
    165 			                          (static_cast<unsigned int>(snorm<8>(g)) << 8) |
    166 			                          (static_cast<unsigned int>(snorm<8>(r)) << 0);
    167 			break;
    168 		case FORMAT_X8B8G8R8:
    169 		case FORMAT_SRGB8_X8:
    170 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(b) << 16) | (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
    171 			break;
    172 		case FORMAT_X8B8G8R8I:
    173 			*(unsigned int*)element = 0x7F000000 |
    174 			                          (static_cast<unsigned int>(scast<8>(b)) << 16) |
    175 			                          (static_cast<unsigned int>(scast<8>(g)) << 8) |
     176 			                          (static_cast<unsigned int>(scast<8>(r)) << 0);
         			break;
    177 		case FORMAT_X8B8G8R8UI:
    178 			*(unsigned int*)element = 0xFF000000 | (ucast<8>(b) << 16) | (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
    179 			break;
    180 		case FORMAT_A2R10G10B10:
    181 			*(unsigned int*)element = (unorm<2>(a) << 30) | (unorm<10>(r) << 20) | (unorm<10>(g) << 10) | (unorm<10>(b) << 0);
    182 			break;
    183 		case FORMAT_A2B10G10R10:
    184 		case FORMAT_A2B10G10R10UI:
    185 			*(unsigned int*)element = (unorm<2>(a) << 30) | (unorm<10>(b) << 20) | (unorm<10>(g) << 10) | (unorm<10>(r) << 0);
    186 			break;
    187 		case FORMAT_G8R8_SNORM:
    188 			*(unsigned short*)element = (static_cast<unsigned short>(snorm<8>(g)) << 8) |
    189 			                            (static_cast<unsigned short>(snorm<8>(r)) << 0);
    190 			break;
    191 		case FORMAT_G8R8:
    192 			*(unsigned short*)element = (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
    193 			break;
    194 		case FORMAT_G8R8I:
    195 			*(unsigned short*)element = (static_cast<unsigned short>(scast<8>(g)) << 8) |
    196 			                            (static_cast<unsigned short>(scast<8>(r)) << 0);
    197 			break;
    198 		case FORMAT_G8R8UI:
    199 			*(unsigned short*)element = (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
    200 			break;
    201 		case FORMAT_G16R16:
    202 			*(unsigned int*)element = (unorm<16>(g) << 16) | (unorm<16>(r) << 0);
    203 			break;
    204 		case FORMAT_G16R16I:
    205 			*(unsigned int*)element = (static_cast<unsigned int>(scast<16>(g)) << 16) |
    206 			                          (static_cast<unsigned int>(scast<16>(r)) << 0);
    207 			break;
    208 		case FORMAT_G16R16UI:
    209 			*(unsigned int*)element = (ucast<16>(g) << 16) | (ucast<16>(r) << 0);
    210 			break;
    211 		case FORMAT_G32R32I:
    212 		case FORMAT_G32R32UI:
    213 			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
    214 			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
    215 			break;
    216 		case FORMAT_A16B16G16R16:
    217 			((unsigned short*)element)[0] = unorm<16>(r);
    218 			((unsigned short*)element)[1] = unorm<16>(g);
    219 			((unsigned short*)element)[2] = unorm<16>(b);
    220 			((unsigned short*)element)[3] = unorm<16>(a);
    221 			break;
    222 		case FORMAT_A16B16G16R16I:
    223 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(r));
    224 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(g));
    225 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(b));
    226 			((unsigned short*)element)[3] = static_cast<unsigned short>(scast<16>(a));
    227 			break;
    228 		case FORMAT_A16B16G16R16UI:
    229 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(r));
    230 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(g));
    231 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(b));
    232 			((unsigned short*)element)[3] = static_cast<unsigned short>(ucast<16>(a));
    233 			break;
    234 		case FORMAT_X16B16G16R16I:
    235 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(r));
    236 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(g));
    237 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(b));
    238 			break;
    239 		case FORMAT_X16B16G16R16UI:
    240 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(r));
    241 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(g));
    242 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(b));
    243 			break;
    244 		case FORMAT_A32B32G32R32I:
    245 		case FORMAT_A32B32G32R32UI:
    246 			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
    247 			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
    248 			((unsigned int*)element)[2] = static_cast<unsigned int>(b);
    249 			((unsigned int*)element)[3] = static_cast<unsigned int>(a);
    250 			break;
    251 		case FORMAT_X32B32G32R32I:
    252 		case FORMAT_X32B32G32R32UI:
    253 			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
    254 			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
    255 			((unsigned int*)element)[2] = static_cast<unsigned int>(b);
    256 			break;
    257 		case FORMAT_V8U8:
    258 			*(unsigned short*)element = (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
    259 			break;
    260 		case FORMAT_L6V5U5:
    261 			*(unsigned short*)element = (unorm<6>(b) << 10) | (snorm<5>(g) << 5) | (snorm<5>(r) << 0);
    262 			break;
    263 		case FORMAT_Q8W8V8U8:
    264 			*(unsigned int*)element = (snorm<8>(a) << 24) | (snorm<8>(b) << 16) | (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
    265 			break;
    266 		case FORMAT_X8L8V8U8:
    267 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(b) << 16) | (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
    268 			break;
    269 		case FORMAT_V16U16:
    270 			*(unsigned int*)element = (snorm<16>(g) << 16) | (snorm<16>(r) << 0);
    271 			break;
    272 		case FORMAT_A2W10V10U10:
    273 			*(unsigned int*)element = (unorm<2>(a) << 30) | (snorm<10>(b) << 20) | (snorm<10>(g) << 10) | (snorm<10>(r) << 0);
    274 			break;
    275 		case FORMAT_A16W16V16U16:
    276 			((unsigned short*)element)[0] = snorm<16>(r);
    277 			((unsigned short*)element)[1] = snorm<16>(g);
    278 			((unsigned short*)element)[2] = snorm<16>(b);
    279 			((unsigned short*)element)[3] = unorm<16>(a);
    280 			break;
    281 		case FORMAT_Q16W16V16U16:
    282 			((unsigned short*)element)[0] = snorm<16>(r);
    283 			((unsigned short*)element)[1] = snorm<16>(g);
    284 			((unsigned short*)element)[2] = snorm<16>(b);
    285 			((unsigned short*)element)[3] = snorm<16>(a);
    286 			break;
    287 		case FORMAT_R8G8B8:
    288 			((unsigned char*)element)[0] = unorm<8>(b);
    289 			((unsigned char*)element)[1] = unorm<8>(g);
    290 			((unsigned char*)element)[2] = unorm<8>(r);
    291 			break;
    292 		case FORMAT_B8G8R8:
    293 			((unsigned char*)element)[0] = unorm<8>(r);
    294 			((unsigned char*)element)[1] = unorm<8>(g);
    295 			((unsigned char*)element)[2] = unorm<8>(b);
    296 			break;
    297 		case FORMAT_R16F:
    298 			*(half*)element = (half)r;
    299 			break;
    300 		case FORMAT_A16F:
    301 			*(half*)element = (half)a;
    302 			break;
    303 		case FORMAT_G16R16F:
    304 			((half*)element)[0] = (half)r;
    305 			((half*)element)[1] = (half)g;
    306 			break;
    307 		case FORMAT_X16B16G16R16F_UNSIGNED:
    308 			r = max(r, 0.0f); g = max(g, 0.0f); b = max(b, 0.0f);
    309 			// Fall through to FORMAT_X16B16G16R16F.
    310 		case FORMAT_X16B16G16R16F:
    311 			((half*)element)[3] = 1.0f;
    312 			// Fall through to FORMAT_B16G16R16F.
    313 		case FORMAT_B16G16R16F:
    314 			((half*)element)[0] = (half)r;
    315 			((half*)element)[1] = (half)g;
    316 			((half*)element)[2] = (half)b;
    317 			break;
    318 		case FORMAT_A16B16G16R16F:
    319 			((half*)element)[0] = (half)r;
    320 			((half*)element)[1] = (half)g;
    321 			((half*)element)[2] = (half)b;
    322 			((half*)element)[3] = (half)a;
    323 			break;
    324 		case FORMAT_A32F:
    325 			*(float*)element = a;
    326 			break;
    327 		case FORMAT_R32F:
    328 			*(float*)element = r;
    329 			break;
    330 		case FORMAT_G32R32F:
    331 			((float*)element)[0] = r;
    332 			((float*)element)[1] = g;
    333 			break;
    334 		case FORMAT_X32B32G32R32F_UNSIGNED:
    335 			r = max(r, 0.0f); g = max(g, 0.0f); b = max(b, 0.0f);
    336 			// Fall through to FORMAT_X32B32G32R32F.
    337 		case FORMAT_X32B32G32R32F:
    338 			((float*)element)[3] = 1.0f;
    339 			// Fall through to FORMAT_B32G32R32F.
    340 		case FORMAT_B32G32R32F:
    341 			((float*)element)[0] = r;
    342 			((float*)element)[1] = g;
    343 			((float*)element)[2] = b;
    344 			break;
    345 		case FORMAT_A32B32G32R32F:
    346 			((float*)element)[0] = r;
    347 			((float*)element)[1] = g;
    348 			((float*)element)[2] = b;
    349 			((float*)element)[3] = a;
    350 			break;
    351 		case FORMAT_D32F:
    352 		case FORMAT_D32FS8:
    353 		case FORMAT_D32F_LOCKABLE:
    354 		case FORMAT_D32FS8_TEXTURE:
    355 		case FORMAT_D32F_SHADOW:
    356 		case FORMAT_D32FS8_SHADOW:
    357 			*((float*)element) = r;
    358 			break;
    359 		case FORMAT_D32F_COMPLEMENTARY:
    360 		case FORMAT_D32FS8_COMPLEMENTARY:
    361 			*((float*)element) = 1 - r;
    362 			break;
    363 		case FORMAT_S8:
    364 			*((unsigned char*)element) = unorm<8>(r);
    365 			break;
    366 		case FORMAT_L8:
    367 			*(unsigned char*)element = unorm<8>(r);
    368 			break;
    369 		case FORMAT_A4L4:
    370 			*(unsigned char*)element = (unorm<4>(a) << 4) | (unorm<4>(r) << 0);
    371 			break;
    372 		case FORMAT_L16:
    373 			*(unsigned short*)element = unorm<16>(r);
    374 			break;
    375 		case FORMAT_A8L8:
    376 			*(unsigned short*)element = (unorm<8>(a) << 8) | (unorm<8>(r) << 0);
    377 			break;
    378 		case FORMAT_L16F:
    379 			*(half*)element = (half)r;
    380 			break;
    381 		case FORMAT_A16L16F:
    382 			((half*)element)[0] = (half)r;
    383 			((half*)element)[1] = (half)a;
    384 			break;
    385 		case FORMAT_L32F:
    386 			*(float*)element = r;
    387 			break;
    388 		case FORMAT_A32L32F:
    389 			((float*)element)[0] = r;
    390 			((float*)element)[1] = a;
    391 			break;
    392 		default:
    393 			ASSERT(false);
    394 		}
    395 	}
    396 
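         	// Read a texel back as a floating-point color. Components absent from the
         	// format keep their defaults (r = g = b = 0, a = 1), and sRGB formats are
         	// decoded back to linear space after unpacking.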
    397 	Color<float> Surface::Buffer::read(int x, int y, int z) const
    398 	{
    399 		void *element = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB + z * samples * sliceB;
    400 
    401 		return read(element);
    402 	}
    403 
    404 	Color<float> Surface::Buffer::read(int x, int y) const
    405 	{
    406 		void *element = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB;
    407 
    408 		return read(element);
    409 	}
    410 
    411 	inline Color<float> Surface::Buffer::read(void *element) const
    412 	{
    413 		float r = 0.0f;
    414 		float g = 0.0f;
    415 		float b = 0.0f;
    416 		float a = 1.0f;
    417 
    418 		switch(format)
    419 		{
    420 		case FORMAT_P8:
    421 			{
    422 				ASSERT(palette);
    423 
    424 				unsigned int abgr = palette[*(unsigned char*)element];
    425 
    426 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
    427 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
    428 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
    429 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
    430 			}
    431 			break;
    432 		case FORMAT_A8P8:
    433 			{
    434 				ASSERT(palette);
    435 
    436 				unsigned int bgr = palette[((unsigned char*)element)[0]];
    437 
    438 				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
    439 				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
    440 				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
    441 				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
    442 			}
    443 			break;
    444 		case FORMAT_A8:
    445 			r = 0;
    446 			g = 0;
    447 			b = 0;
    448 			a = *(unsigned char*)element * (1.0f / 0xFF);
    449 			break;
    450 		case FORMAT_R8_SNORM:
    451 			r = max((*(signed char*)element) * (1.0f / 0x7F), -1.0f);
    452 			break;
    453 		case FORMAT_R8:
    454 			r = *(unsigned char*)element * (1.0f / 0xFF);
    455 			break;
    456 		case FORMAT_R8I:
    457 			r = *(signed char*)element;
    458 			break;
    459 		case FORMAT_R8UI:
    460 			r = *(unsigned char*)element;
    461 			break;
    462 		case FORMAT_R3G3B2:
    463 			{
    464 				unsigned char rgb = *(unsigned char*)element;
    465 
    466 				r = (rgb & 0xE0) * (1.0f / 0xE0);
    467 				g = (rgb & 0x1C) * (1.0f / 0x1C);
    468 				b = (rgb & 0x03) * (1.0f / 0x03);
    469 			}
    470 			break;
    471 		case FORMAT_A8R3G3B2:
    472 			{
    473 				unsigned short argb = *(unsigned short*)element;
    474 
    475 				a = (argb & 0xFF00) * (1.0f / 0xFF00);
    476 				r = (argb & 0x00E0) * (1.0f / 0x00E0);
    477 				g = (argb & 0x001C) * (1.0f / 0x001C);
    478 				b = (argb & 0x0003) * (1.0f / 0x0003);
    479 			}
    480 			break;
    481 		case FORMAT_X4R4G4B4:
    482 			{
    483 				unsigned short rgb = *(unsigned short*)element;
    484 
    485 				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
    486 				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
    487 				b = (rgb & 0x000F) * (1.0f / 0x000F);
    488 			}
    489 			break;
    490 		case FORMAT_A4R4G4B4:
    491 			{
    492 				unsigned short argb = *(unsigned short*)element;
    493 
    494 				a = (argb & 0xF000) * (1.0f / 0xF000);
    495 				r = (argb & 0x0F00) * (1.0f / 0x0F00);
    496 				g = (argb & 0x00F0) * (1.0f / 0x00F0);
    497 				b = (argb & 0x000F) * (1.0f / 0x000F);
    498 			}
    499 			break;
    500 		case FORMAT_R4G4B4A4:
    501 			{
    502 				unsigned short rgba = *(unsigned short*)element;
    503 
    504 				r = (rgba & 0xF000) * (1.0f / 0xF000);
    505 				g = (rgba & 0x0F00) * (1.0f / 0x0F00);
    506 				b = (rgba & 0x00F0) * (1.0f / 0x00F0);
    507 				a = (rgba & 0x000F) * (1.0f / 0x000F);
    508 			}
    509 			break;
    510 		case FORMAT_R5G6B5:
    511 			{
    512 				unsigned short rgb = *(unsigned short*)element;
    513 
    514 				r = (rgb & 0xF800) * (1.0f / 0xF800);
    515 				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
    516 				b = (rgb & 0x001F) * (1.0f / 0x001F);
    517 			}
    518 			break;
    519 		case FORMAT_A1R5G5B5:
    520 			{
    521 				unsigned short argb = *(unsigned short*)element;
    522 
    523 				a = (argb & 0x8000) * (1.0f / 0x8000);
    524 				r = (argb & 0x7C00) * (1.0f / 0x7C00);
    525 				g = (argb & 0x03E0) * (1.0f / 0x03E0);
    526 				b = (argb & 0x001F) * (1.0f / 0x001F);
    527 			}
    528 			break;
    529 		case FORMAT_R5G5B5A1:
    530 			{
    531 				unsigned short rgba = *(unsigned short*)element;
    532 
    533 				r = (rgba & 0xF800) * (1.0f / 0xF800);
    534 				g = (rgba & 0x07C0) * (1.0f / 0x07C0);
    535 				b = (rgba & 0x003E) * (1.0f / 0x003E);
    536 				a = (rgba & 0x0001) * (1.0f / 0x0001);
    537 			}
    538 			break;
    539 		case FORMAT_X1R5G5B5:
    540 			{
    541 				unsigned short xrgb = *(unsigned short*)element;
    542 
    543 				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
    544 				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
    545 				b = (xrgb & 0x001F) * (1.0f / 0x001F);
    546 			}
    547 			break;
    548 		case FORMAT_A8R8G8B8:
    549 			{
    550 				unsigned int argb = *(unsigned int*)element;
    551 
    552 				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
    553 				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
    554 				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
    555 				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
    556 			}
    557 			break;
    558 		case FORMAT_X8R8G8B8:
    559 			{
    560 				unsigned int xrgb = *(unsigned int*)element;
    561 
    562 				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
    563 				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
    564 				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
    565 			}
    566 			break;
    567 		case FORMAT_A8B8G8R8_SNORM:
    568 			{
    569 				signed char* abgr = (signed char*)element;
    570 
    571 				r = max(abgr[0] * (1.0f / 0x7F), -1.0f);
    572 				g = max(abgr[1] * (1.0f / 0x7F), -1.0f);
    573 				b = max(abgr[2] * (1.0f / 0x7F), -1.0f);
    574 				a = max(abgr[3] * (1.0f / 0x7F), -1.0f);
    575 			}
    576 			break;
    577 		case FORMAT_A8B8G8R8:
    578 		case FORMAT_SRGB8_A8:
    579 			{
    580 				unsigned int abgr = *(unsigned int*)element;
    581 
    582 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
    583 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
    584 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
    585 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
    586 			}
    587 			break;
    588 		case FORMAT_A8B8G8R8I:
    589 			{
    590 				signed char* abgr = (signed char*)element;
    591 
    592 				r = abgr[0];
    593 				g = abgr[1];
    594 				b = abgr[2];
    595 				a = abgr[3];
    596 			}
    597 			break;
    598 		case FORMAT_A8B8G8R8UI:
    599 			{
    600 				unsigned char* abgr = (unsigned char*)element;
    601 
    602 				r = abgr[0];
    603 				g = abgr[1];
    604 				b = abgr[2];
    605 				a = abgr[3];
    606 			}
    607 			break;
    608 		case FORMAT_X8B8G8R8_SNORM:
    609 			{
    610 				signed char* bgr = (signed char*)element;
    611 
    612 				r = max(bgr[0] * (1.0f / 0x7F), -1.0f);
    613 				g = max(bgr[1] * (1.0f / 0x7F), -1.0f);
    614 				b = max(bgr[2] * (1.0f / 0x7F), -1.0f);
    615 			}
    616 			break;
    617 		case FORMAT_X8B8G8R8:
    618 		case FORMAT_SRGB8_X8:
    619 			{
    620 				unsigned int xbgr = *(unsigned int*)element;
    621 
    622 				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
    623 				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
    624 				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
    625 			}
    626 			break;
    627 		case FORMAT_X8B8G8R8I:
    628 			{
    629 				signed char* bgr = (signed char*)element;
    630 
    631 				r = bgr[0];
    632 				g = bgr[1];
    633 				b = bgr[2];
    634 			}
    635 			break;
    636 		case FORMAT_X8B8G8R8UI:
    637 			{
    638 				unsigned char* bgr = (unsigned char*)element;
    639 
    640 				r = bgr[0];
    641 				g = bgr[1];
    642 				b = bgr[2];
    643 			}
    644 			break;
    645 		case FORMAT_G8R8_SNORM:
    646 			{
    647 				signed char* gr = (signed char*)element;
    648 
     649 				r = max(gr[0] * (1.0f / 0x7F), -1.0f);
     650 				g = max(gr[1] * (1.0f / 0x7F), -1.0f);
    651 			}
    652 			break;
    653 		case FORMAT_G8R8:
    654 			{
    655 				unsigned short gr = *(unsigned short*)element;
    656 
    657 				g = (gr & 0xFF00) * (1.0f / 0xFF00);
    658 				r = (gr & 0x00FF) * (1.0f / 0x00FF);
    659 			}
    660 			break;
    661 		case FORMAT_G8R8I:
    662 			{
    663 				signed char* gr = (signed char*)element;
    664 
    665 				r = gr[0];
    666 				g = gr[1];
    667 			}
    668 			break;
    669 		case FORMAT_G8R8UI:
    670 			{
    671 				unsigned char* gr = (unsigned char*)element;
    672 
    673 				r = gr[0];
    674 				g = gr[1];
    675 			}
    676 			break;
    677 		case FORMAT_R16I:
    678 			r = *((short*)element);
    679 			break;
    680 		case FORMAT_R16UI:
    681 			r = *((unsigned short*)element);
    682 			break;
    683 		case FORMAT_G16R16I:
    684 			{
    685 				short* gr = (short*)element;
    686 
    687 				r = gr[0];
    688 				g = gr[1];
    689 			}
    690 			break;
    691 		case FORMAT_G16R16:
    692 			{
    693 				unsigned int gr = *(unsigned int*)element;
    694 
    695 				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
    696 				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
    697 			}
    698 			break;
    699 		case FORMAT_G16R16UI:
    700 			{
    701 				unsigned short* gr = (unsigned short*)element;
    702 
    703 				r = gr[0];
    704 				g = gr[1];
    705 			}
    706 			break;
    707 		case FORMAT_A2R10G10B10:
    708 			{
    709 				unsigned int argb = *(unsigned int*)element;
    710 
    711 				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
    712 				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
    713 				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
    714 				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
    715 			}
    716 			break;
    717 		case FORMAT_A2B10G10R10:
    718 			{
    719 				unsigned int abgr = *(unsigned int*)element;
    720 
    721 				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
    722 				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
    723 				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
    724 				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
    725 			}
    726 			break;
    727 		case FORMAT_A2B10G10R10UI:
    728 			{
    729 				unsigned int abgr = *(unsigned int*)element;
    730 
    731 				a = static_cast<float>((abgr & 0xC0000000) >> 30);
    732 				b = static_cast<float>((abgr & 0x3FF00000) >> 20);
    733 				g = static_cast<float>((abgr & 0x000FFC00) >> 10);
    734 				r = static_cast<float>(abgr & 0x000003FF);
    735 			}
    736 			break;
    737 		case FORMAT_A16B16G16R16I:
    738 			{
    739 				short* abgr = (short*)element;
    740 
    741 				r = abgr[0];
    742 				g = abgr[1];
    743 				b = abgr[2];
    744 				a = abgr[3];
    745 			}
    746 			break;
    747 		case FORMAT_A16B16G16R16:
    748 			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
    749 			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
    750 			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
    751 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
    752 			break;
    753 		case FORMAT_A16B16G16R16UI:
    754 			{
    755 				unsigned short* abgr = (unsigned short*)element;
    756 
    757 				r = abgr[0];
    758 				g = abgr[1];
    759 				b = abgr[2];
    760 				a = abgr[3];
    761 			}
    762 			break;
    763 		case FORMAT_X16B16G16R16I:
    764 			{
    765 				short* bgr = (short*)element;
    766 
    767 				r = bgr[0];
    768 				g = bgr[1];
    769 				b = bgr[2];
    770 			}
    771 			break;
    772 		case FORMAT_X16B16G16R16UI:
    773 			{
    774 				unsigned short* bgr = (unsigned short*)element;
    775 
    776 				r = bgr[0];
    777 				g = bgr[1];
    778 				b = bgr[2];
    779 			}
    780 			break;
    781 		case FORMAT_A32B32G32R32I:
    782 			{
    783 				int* abgr = (int*)element;
    784 
    785 				r = static_cast<float>(abgr[0]);
    786 				g = static_cast<float>(abgr[1]);
    787 				b = static_cast<float>(abgr[2]);
    788 				a = static_cast<float>(abgr[3]);
    789 			}
    790 			break;
    791 		case FORMAT_A32B32G32R32UI:
    792 			{
    793 				unsigned int* abgr = (unsigned int*)element;
    794 
    795 				r = static_cast<float>(abgr[0]);
    796 				g = static_cast<float>(abgr[1]);
    797 				b = static_cast<float>(abgr[2]);
    798 				a = static_cast<float>(abgr[3]);
    799 			}
    800 			break;
    801 		case FORMAT_X32B32G32R32I:
    802 			{
    803 				int* bgr = (int*)element;
    804 
    805 				r = static_cast<float>(bgr[0]);
    806 				g = static_cast<float>(bgr[1]);
    807 				b = static_cast<float>(bgr[2]);
    808 			}
    809 			break;
    810 		case FORMAT_X32B32G32R32UI:
    811 			{
    812 				unsigned int* bgr = (unsigned int*)element;
    813 
    814 				r = static_cast<float>(bgr[0]);
    815 				g = static_cast<float>(bgr[1]);
    816 				b = static_cast<float>(bgr[2]);
    817 			}
    818 			break;
    819 		case FORMAT_G32R32I:
    820 			{
    821 				int* gr = (int*)element;
    822 
    823 				r = static_cast<float>(gr[0]);
    824 				g = static_cast<float>(gr[1]);
    825 			}
    826 			break;
    827 		case FORMAT_G32R32UI:
    828 			{
    829 				unsigned int* gr = (unsigned int*)element;
    830 
    831 				r = static_cast<float>(gr[0]);
    832 				g = static_cast<float>(gr[1]);
    833 			}
    834 			break;
    835 		case FORMAT_R32I:
    836 			r = static_cast<float>(*((int*)element));
    837 			break;
    838 		case FORMAT_R32UI:
    839 			r = static_cast<float>(*((unsigned int*)element));
    840 			break;
    841 		case FORMAT_V8U8:
    842 			{
    843 				unsigned short vu = *(unsigned short*)element;
    844 
    845 				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
    846 				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
    847 			}
    848 			break;
    849 		case FORMAT_L6V5U5:
    850 			{
    851 				unsigned short lvu = *(unsigned short*)element;
    852 
    853 				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
    854 				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
    855 				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
    856 			}
    857 			break;
    858 		case FORMAT_Q8W8V8U8:
    859 			{
    860 				unsigned int qwvu = *(unsigned int*)element;
    861 
    862 				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
    863 				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
    864 				b = ((int)(qwvu & 0x00FF0000) << 8)  * (1.0f / 0x7F000000);
    865 				a = ((int)(qwvu & 0xFF000000) << 0)  * (1.0f / 0x7F000000);
    866 			}
    867 			break;
    868 		case FORMAT_X8L8V8U8:
    869 			{
    870 				unsigned int xlvu = *(unsigned int*)element;
    871 
    872 				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
    873 				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
    874 				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
    875 			}
    876 			break;
    877 		case FORMAT_R8G8B8:
    878 			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
    879 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
    880 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
    881 			break;
    882 		case FORMAT_B8G8R8:
    883 			r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
    884 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
    885 			b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
    886 			break;
    887 		case FORMAT_V16U16:
    888 			{
    889 				unsigned int vu = *(unsigned int*)element;
    890 
    891 				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
    892 				g = ((int)(vu & 0xFFFF0000) << 0)  * (1.0f / 0x7FFF0000);
    893 			}
    894 			break;
    895 		case FORMAT_A2W10V10U10:
    896 			{
    897 				unsigned int awvu = *(unsigned int*)element;
    898 
    899 				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
    900 				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
    901 				b = ((int)(awvu & 0x3FF00000) << 2)  * (1.0f / 0x7FC00000);
    902 				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
    903 			}
    904 			break;
    905 		case FORMAT_A16W16V16U16:
    906 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
    907 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
    908 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
    909 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
    910 			break;
    911 		case FORMAT_Q16W16V16U16:
    912 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
    913 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
    914 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
    915 			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
    916 			break;
    917 		case FORMAT_L8:
    918 			r =
    919 			g =
    920 			b = *(unsigned char*)element * (1.0f / 0xFF);
    921 			break;
    922 		case FORMAT_A4L4:
    923 			{
    924 				unsigned char al = *(unsigned char*)element;
    925 
    926 				r =
    927 				g =
    928 				b = (al & 0x0F) * (1.0f / 0x0F);
    929 				a = (al & 0xF0) * (1.0f / 0xF0);
    930 			}
    931 			break;
    932 		case FORMAT_L16:
    933 			r =
    934 			g =
    935 			b = *(unsigned short*)element * (1.0f / 0xFFFF);
    936 			break;
    937 		case FORMAT_A8L8:
    938 			r =
    939 			g =
    940 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
    941 			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
    942 			break;
    943 		case FORMAT_L16F:
    944 			r =
    945 			g =
    946 			b = *(half*)element;
    947 			break;
    948 		case FORMAT_A16L16F:
    949 			r =
    950 			g =
    951 			b = ((half*)element)[0];
    952 			a = ((half*)element)[1];
    953 			break;
    954 		case FORMAT_L32F:
    955 			r =
    956 			g =
    957 			b = *(float*)element;
    958 			break;
    959 		case FORMAT_A32L32F:
    960 			r =
    961 			g =
    962 			b = ((float*)element)[0];
    963 			a = ((float*)element)[1];
    964 			break;
    965 		case FORMAT_A16F:
    966 			a = *(half*)element;
    967 			break;
    968 		case FORMAT_R16F:
    969 			r = *(half*)element;
    970 			break;
    971 		case FORMAT_G16R16F:
    972 			r = ((half*)element)[0];
    973 			g = ((half*)element)[1];
    974 			break;
    975 		case FORMAT_X16B16G16R16F:
    976 		case FORMAT_X16B16G16R16F_UNSIGNED:
    977 		case FORMAT_B16G16R16F:
    978 			r = ((half*)element)[0];
    979 			g = ((half*)element)[1];
    980 			b = ((half*)element)[2];
    981 			break;
    982 		case FORMAT_A16B16G16R16F:
    983 			r = ((half*)element)[0];
    984 			g = ((half*)element)[1];
    985 			b = ((half*)element)[2];
    986 			a = ((half*)element)[3];
    987 			break;
    988 		case FORMAT_A32F:
    989 			a = *(float*)element;
    990 			break;
    991 		case FORMAT_R32F:
    992 			r = *(float*)element;
    993 			break;
    994 		case FORMAT_G32R32F:
    995 			r = ((float*)element)[0];
    996 			g = ((float*)element)[1];
    997 			break;
    998 		case FORMAT_X32B32G32R32F:
    999 		case FORMAT_X32B32G32R32F_UNSIGNED:
   1000 		case FORMAT_B32G32R32F:
   1001 			r = ((float*)element)[0];
   1002 			g = ((float*)element)[1];
   1003 			b = ((float*)element)[2];
   1004 			break;
   1005 		case FORMAT_A32B32G32R32F:
   1006 			r = ((float*)element)[0];
   1007 			g = ((float*)element)[1];
   1008 			b = ((float*)element)[2];
   1009 			a = ((float*)element)[3];
   1010 			break;
   1011 		case FORMAT_D32F:
   1012 		case FORMAT_D32FS8:
   1013 		case FORMAT_D32F_LOCKABLE:
   1014 		case FORMAT_D32FS8_TEXTURE:
   1015 		case FORMAT_D32F_SHADOW:
   1016 		case FORMAT_D32FS8_SHADOW:
   1017 			r = *(float*)element;
   1018 			g = r;
   1019 			b = r;
   1020 			a = r;
   1021 			break;
   1022 		case FORMAT_D32F_COMPLEMENTARY:
   1023 		case FORMAT_D32FS8_COMPLEMENTARY:
   1024 			r = 1.0f - *(float*)element;
   1025 			g = r;
   1026 			b = r;
   1027 			a = r;
   1028 			break;
   1029 		case FORMAT_S8:
   1030 			r = *(unsigned char*)element * (1.0f / 0xFF);
   1031 			break;
   1032 		default:
   1033 			ASSERT(false);
   1034 		}
   1035 
   1036 		if(isSRGBformat(format))
   1037 		{
   1038 			r = sRGBtoLinear(r);
   1039 			g = sRGBtoLinear(g);
   1040 			b = sRGBtoLinear(b);
   1041 		}
   1042 
   1043 		return Color<float>(r, g, b, a);
   1044 	}
   1045 
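         	// Trilinear sample: the 0.5 offset shifts from texel-center to integer
         	// coordinates, the neighbours are clamped to the buffer edges, and the
         	// eight reads are blended by the fractional weights.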
   1046 	Color<float> Surface::Buffer::sample(float x, float y, float z) const
   1047 	{
   1048 		x -= 0.5f;
   1049 		y -= 0.5f;
   1050 		z -= 0.5f;
   1051 
   1052 		int x0 = clamp((int)x, 0, width - 1);
   1053 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
   1054 
   1055 		int y0 = clamp((int)y, 0, height - 1);
   1056 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
   1057 
   1058 		int z0 = clamp((int)z, 0, depth - 1);
   1059 		int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
   1060 
   1061 		Color<float> c000 = read(x0, y0, z0);
   1062 		Color<float> c100 = read(x1, y0, z0);
   1063 		Color<float> c010 = read(x0, y1, z0);
   1064 		Color<float> c110 = read(x1, y1, z0);
   1065 		Color<float> c001 = read(x0, y0, z1);
   1066 		Color<float> c101 = read(x1, y0, z1);
   1067 		Color<float> c011 = read(x0, y1, z1);
   1068 		Color<float> c111 = read(x1, y1, z1);
   1069 
   1070 		float fx = x - x0;
   1071 		float fy = y - y0;
   1072 		float fz = z - z0;
   1073 
   1074 		c000 *= (1 - fx) * (1 - fy) * (1 - fz);
   1075 		c100 *= fx * (1 - fy) * (1 - fz);
   1076 		c010 *= (1 - fx) * fy * (1 - fz);
   1077 		c110 *= fx * fy * (1 - fz);
   1078 		c001 *= (1 - fx) * (1 - fy) * fz;
   1079 		c101 *= fx * (1 - fy) * fz;
   1080 		c011 *= (1 - fx) * fy * fz;
   1081 		c111 *= fx * fy * fz;
   1082 
   1083 		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
   1084 	}
   1085 
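         	// Bilinear variant for 2D and array surfaces: four reads from the given
         	// layer, blended by the fractional weights.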
   1086 	Color<float> Surface::Buffer::sample(float x, float y, int layer) const
   1087 	{
   1088 		x -= 0.5f;
   1089 		y -= 0.5f;
   1090 
   1091 		int x0 = clamp((int)x, 0, width - 1);
   1092 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
   1093 
   1094 		int y0 = clamp((int)y, 0, height - 1);
   1095 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
   1096 
   1097 		Color<float> c00 = read(x0, y0, layer);
   1098 		Color<float> c10 = read(x1, y0, layer);
   1099 		Color<float> c01 = read(x0, y1, layer);
   1100 		Color<float> c11 = read(x1, y1, layer);
   1101 
   1102 		float fx = x - x0;
   1103 		float fy = y - y0;
   1104 
   1105 		c00 *= (1 - fx) * (1 - fy);
   1106 		c10 *= fx * (1 - fy);
   1107 		c01 *= (1 - fx) * fy;
   1108 		c11 *= fx * fy;
   1109 
   1110 		return c00 + c10 + c01 + c11;
   1111 	}
   1112 
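         	// Return a pointer to the requested texel and record the lock mode; write
         	// locks mark the buffer dirty. Block-compressed formats are addressed in
         	// block units: each block occupies 8 or 16 bytes and pitchB advances one
         	// row of blocks.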
   1113 	void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
   1114 	{
   1115 		this->lock = lock;
   1116 
   1117 		switch(lock)
   1118 		{
   1119 		case LOCK_UNLOCKED:
   1120 		case LOCK_READONLY:
   1121 		case LOCK_UPDATE:
   1122 			break;
   1123 		case LOCK_WRITEONLY:
   1124 		case LOCK_READWRITE:
   1125 		case LOCK_DISCARD:
   1126 			dirty = true;
   1127 			break;
   1128 		default:
   1129 			ASSERT(false);
   1130 		}
   1131 
   1132 		if(buffer)
   1133 		{
   1134 			x += border;
   1135 			y += border;
   1136 
   1137 			switch(format)
   1138 			{
   1139 			case FORMAT_DXT1:
   1140 			case FORMAT_ATI1:
   1141 			case FORMAT_ETC1:
   1142 			case FORMAT_R11_EAC:
   1143 			case FORMAT_SIGNED_R11_EAC:
   1144 			case FORMAT_RGB8_ETC2:
   1145 			case FORMAT_SRGB8_ETC2:
   1146 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1147 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1148 				return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
   1149 			case FORMAT_RG11_EAC:
   1150 			case FORMAT_SIGNED_RG11_EAC:
   1151 			case FORMAT_RGBA8_ETC2_EAC:
   1152 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   1153 			case FORMAT_RGBA_ASTC_4x4_KHR:
   1154 			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   1155 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
   1156 			case FORMAT_RGBA_ASTC_5x4_KHR:
   1157 			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   1158 				return (unsigned char*)buffer + 16 * (x / 5) + (y / 4) * pitchB + z * sliceB;
   1159 			case FORMAT_RGBA_ASTC_5x5_KHR:
   1160 			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   1161 				return (unsigned char*)buffer + 16 * (x / 5) + (y / 5) * pitchB + z * sliceB;
   1162 			case FORMAT_RGBA_ASTC_6x5_KHR:
   1163 			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   1164 				return (unsigned char*)buffer + 16 * (x / 6) + (y / 5) * pitchB + z * sliceB;
   1165 			case FORMAT_RGBA_ASTC_6x6_KHR:
   1166 			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   1167 				return (unsigned char*)buffer + 16 * (x / 6) + (y / 6) * pitchB + z * sliceB;
   1168 			case FORMAT_RGBA_ASTC_8x5_KHR:
   1169 			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   1170 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 5) * pitchB + z * sliceB;
   1171 			case FORMAT_RGBA_ASTC_8x6_KHR:
   1172 			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   1173 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 6) * pitchB + z * sliceB;
   1174 			case FORMAT_RGBA_ASTC_8x8_KHR:
   1175 			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   1176 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 8) * pitchB + z * sliceB;
   1177 			case FORMAT_RGBA_ASTC_10x5_KHR:
   1178 			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   1179 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 5) * pitchB + z * sliceB;
   1180 			case FORMAT_RGBA_ASTC_10x6_KHR:
   1181 			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   1182 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 6) * pitchB + z * sliceB;
   1183 			case FORMAT_RGBA_ASTC_10x8_KHR:
   1184 			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   1185 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 8) * pitchB + z * sliceB;
   1186 			case FORMAT_RGBA_ASTC_10x10_KHR:
   1187 			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   1188 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 10) * pitchB + z * sliceB;
   1189 			case FORMAT_RGBA_ASTC_12x10_KHR:
   1190 			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   1191 				return (unsigned char*)buffer + 16 * (x / 12) + (y / 10) * pitchB + z * sliceB;
   1192 			case FORMAT_RGBA_ASTC_12x12_KHR:
   1193 			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   1194 				return (unsigned char*)buffer + 16 * (x / 12) + (y / 12) * pitchB + z * sliceB;
   1195 			case FORMAT_DXT3:
   1196 			case FORMAT_DXT5:
   1197 			case FORMAT_ATI2:
   1198 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
   1199 			default:
   1200 				return (unsigned char*)buffer + x * bytes + y * pitchB + z * samples * sliceB;
   1201 			}
   1202 		}
   1203 
   1204 		return nullptr;
   1205 	}
   1206 
   1207 	void Surface::Buffer::unlockRect()
   1208 	{
   1209 		lock = LOCK_UNLOCKED;
   1210 	}
   1211 
   1212 	class SurfaceImplementation : public Surface
   1213 	{
   1214 	public:
   1215 		SurfaceImplementation(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
   1216 			: Surface(width, height, depth, format, pixels, pitch, slice) {}
   1217 		SurfaceImplementation(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchP = 0)
   1218 			: Surface(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchP) {}
    1219 		~SurfaceImplementation() override {}
   1220 
   1221 		void *lockInternal(int x, int y, int z, Lock lock, Accessor client) override
   1222 		{
   1223 			return Surface::lockInternal(x, y, z, lock, client);
   1224 		}
   1225 
   1226 		void unlockInternal() override
   1227 		{
   1228 			Surface::unlockInternal();
   1229 		}
   1230 	};
   1231 
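         	// Factory functions returning the concrete SurfaceImplementation defined
         	// above. Example (sketch): Surface::create(width, height, 1, FORMAT_A8R8G8B8,
         	// pixels, pitch, slice) references the caller's pixel memory rather than
         	// copying it.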
   1232 	Surface *Surface::create(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
   1233 	{
   1234 		return new SurfaceImplementation(width, height, depth, format, pixels, pitch, slice);
   1235 	}
   1236 
   1237 	Surface *Surface::create(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided)
   1238 	{
   1239 		return new SurfaceImplementation(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchPprovided);
   1240 	}
   1241 
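         	// This constructor wraps client-provided pixel memory. Every Surface keeps
         	// three Buffer descriptions: 'external' (the client-visible layout),
         	// 'internal' (the format used for rendering, chosen by selectInternalFormat),
         	// and 'stencil' (a separate S8 plane for stencil-bearing formats). Dirty
         	// flags track which copy holds the latest data.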
   1242 	Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
   1243 	{
   1244 		resource = new Resource(0);
   1245 		hasParent = false;
   1246 		ownExternal = false;
   1247 		depth = max(1, depth);
   1248 
   1249 		external.buffer = pixels;
   1250 		external.width = width;
   1251 		external.height = height;
   1252 		external.depth = depth;
   1253 		external.samples = 1;
   1254 		external.format = format;
   1255 		external.bytes = bytes(external.format);
   1256 		external.pitchB = pitch;
   1257 		external.pitchP = external.bytes ? pitch / external.bytes : 0;
   1258 		external.sliceB = slice;
   1259 		external.sliceP = external.bytes ? slice / external.bytes : 0;
   1260 		external.border = 0;
   1261 		external.lock = LOCK_UNLOCKED;
   1262 		external.dirty = true;
   1263 
   1264 		internal.buffer = nullptr;
   1265 		internal.width = width;
   1266 		internal.height = height;
   1267 		internal.depth = depth;
   1268 		internal.samples = 1;
   1269 		internal.format = selectInternalFormat(format);
   1270 		internal.bytes = bytes(internal.format);
   1271 		internal.pitchB = pitchB(internal.width, 0, internal.format, false);
   1272 		internal.pitchP = pitchP(internal.width, 0, internal.format, false);
   1273 		internal.sliceB = sliceB(internal.width, internal.height, 0, internal.format, false);
   1274 		internal.sliceP = sliceP(internal.width, internal.height, 0, internal.format, false);
   1275 		internal.border = 0;
   1276 		internal.lock = LOCK_UNLOCKED;
   1277 		internal.dirty = false;
   1278 
   1279 		stencil.buffer = nullptr;
   1280 		stencil.width = width;
   1281 		stencil.height = height;
   1282 		stencil.depth = depth;
   1283 		stencil.samples = 1;
   1284 		stencil.format = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
   1285 		stencil.bytes = bytes(stencil.format);
   1286 		stencil.pitchB = pitchB(stencil.width, 0, stencil.format, false);
   1287 		stencil.pitchP = pitchP(stencil.width, 0, stencil.format, false);
   1288 		stencil.sliceB = sliceB(stencil.width, stencil.height, 0, stencil.format, false);
   1289 		stencil.sliceP = sliceP(stencil.width, stencil.height, 0, stencil.format, false);
   1290 		stencil.border = 0;
   1291 		stencil.lock = LOCK_UNLOCKED;
   1292 		stencil.dirty = false;
   1293 
   1294 		dirtyContents = true;
   1295 		paletteUsed = 0;
   1296 	}
   1297 
   1298 	Surface::Surface(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided) : lockable(lockable), renderTarget(renderTarget)
   1299 	{
   1300 		resource = texture ? texture : new Resource(0);
   1301 		hasParent = texture != nullptr;
   1302 		ownExternal = true;
   1303 		depth = max(1, depth);
   1304 		samples = max(1, samples);
   1305 
   1306 		external.buffer = nullptr;
   1307 		external.width = width;
   1308 		external.height = height;
   1309 		external.depth = depth;
   1310 		external.samples = (short)samples;
   1311 		external.format = format;
   1312 		external.bytes = bytes(external.format);
   1313 		external.pitchB = pitchB(external.width, 0, external.format, renderTarget && !texture);
   1314 		external.pitchP = pitchP(external.width, 0, external.format, renderTarget && !texture);
   1315 		external.sliceB = sliceB(external.width, external.height, 0, external.format, renderTarget && !texture);
   1316 		external.sliceP = sliceP(external.width, external.height, 0, external.format, renderTarget && !texture);
   1317 		external.border = 0;
   1318 		external.lock = LOCK_UNLOCKED;
   1319 		external.dirty = false;
   1320 
   1321 		internal.buffer = nullptr;
   1322 		internal.width = width;
   1323 		internal.height = height;
   1324 		internal.depth = depth;
   1325 		internal.samples = (short)samples;
   1326 		internal.format = selectInternalFormat(format);
   1327 		internal.bytes = bytes(internal.format);
   1328 		internal.pitchB = !pitchPprovided ? pitchB(internal.width, border, internal.format, renderTarget) : pitchPprovided * internal.bytes;
   1329 		internal.pitchP = !pitchPprovided ? pitchP(internal.width, border, internal.format, renderTarget) : pitchPprovided;
   1330 		internal.sliceB = sliceB(internal.width, internal.height, border, internal.format, renderTarget);
   1331 		internal.sliceP = sliceP(internal.width, internal.height, border, internal.format, renderTarget);
   1332 		internal.border = (short)border;
   1333 		internal.lock = LOCK_UNLOCKED;
   1334 		internal.dirty = false;
   1335 
   1336 		stencil.buffer = nullptr;
   1337 		stencil.width = width;
   1338 		stencil.height = height;
   1339 		stencil.depth = depth;
   1340 		stencil.samples = (short)samples;
   1341 		stencil.format = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
   1342 		stencil.bytes = bytes(stencil.format);
   1343 		stencil.pitchB = pitchB(stencil.width, 0, stencil.format, renderTarget);
   1344 		stencil.pitchP = pitchP(stencil.width, 0, stencil.format, renderTarget);
   1345 		stencil.sliceB = sliceB(stencil.width, stencil.height, 0, stencil.format, renderTarget);
   1346 		stencil.sliceP = sliceP(stencil.width, stencil.height, 0, stencil.format, renderTarget);
   1347 		stencil.border = 0;
   1348 		stencil.lock = LOCK_UNLOCKED;
   1349 		stencil.dirty = false;
   1350 
   1351 		dirtyContents = true;
   1352 		paletteUsed = 0;
   1353 	}
   1354 
   1355 	Surface::~Surface()
   1356 	{
   1357 		// sync() must be called before this destructor to ensure all locks have been released.
   1358 		// We can't call it here because the parent resource may already have been destroyed.
   1359 		ASSERT(isUnlocked());
   1360 
   1361 		if(!hasParent)
   1362 		{
   1363 			resource->destruct();
   1364 		}
   1365 
   1366 		if(ownExternal)
   1367 		{
   1368 			deallocate(external.buffer);
   1369 		}
   1370 
   1371 		if(internal.buffer != external.buffer)
   1372 		{
   1373 			deallocate(internal.buffer);
   1374 		}
   1375 
   1376 		deallocate(stencil.buffer);
   1377 
   1378 		external.buffer = 0;
   1379 		internal.buffer = 0;
   1380 		stencil.buffer = 0;
   1381 	}
   1382 
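         	// lockExternal/lockInternal acquire the shared Resource, allocate their
         	// buffer lazily (sharing one allocation when the two formats are identical),
         	// and copy from the other buffer when its dirty flag is set, keeping the
         	// external and internal layouts in sync.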
   1383 	void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
   1384 	{
   1385 		resource->lock(client);
   1386 
   1387 		if(!external.buffer)
   1388 		{
   1389 			if(internal.buffer && identicalFormats())
   1390 			{
   1391 				external.buffer = internal.buffer;
   1392 			}
   1393 			else
   1394 			{
   1395 				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.border, external.samples, external.format);
   1396 			}
   1397 		}
   1398 
   1399 		if(internal.dirty)
   1400 		{
   1401 			if(lock != LOCK_DISCARD)
   1402 			{
   1403 				update(external, internal);
   1404 			}
   1405 
   1406 			internal.dirty = false;
   1407 		}
   1408 
   1409 		switch(lock)
   1410 		{
   1411 		case LOCK_READONLY:
   1412 			break;
   1413 		case LOCK_WRITEONLY:
   1414 		case LOCK_READWRITE:
   1415 		case LOCK_DISCARD:
   1416 			dirtyContents = true;
   1417 			break;
   1418 		default:
   1419 			ASSERT(false);
   1420 		}
   1421 
   1422 		return external.lockRect(x, y, z, lock);
   1423 	}
   1424 
   1425 	void Surface::unlockExternal()
   1426 	{
   1427 		external.unlockRect();
   1428 
   1429 		resource->unlock();
   1430 	}
   1431 
   1432 	void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
   1433 	{
   1434 		if(lock != LOCK_UNLOCKED)
   1435 		{
   1436 			resource->lock(client);
   1437 		}
   1438 
   1439 		if(!internal.buffer)
   1440 		{
   1441 			if(external.buffer && identicalFormats())
   1442 			{
   1443 				internal.buffer = external.buffer;
   1444 			}
   1445 			else
   1446 			{
   1447 				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.border, internal.samples, internal.format);
   1448 			}
   1449 		}
   1450 
   1451 		// FIXME: WHQL requires conversion to lower external precision and back
   1452 		if(logPrecision >= WHQL)
   1453 		{
   1454 			if(internal.dirty && renderTarget && internal.format != external.format)
   1455 			{
   1456 				if(lock != LOCK_DISCARD)
   1457 				{
   1458 					switch(external.format)
   1459 					{
   1460 					case FORMAT_R3G3B2:
   1461 					case FORMAT_A8R3G3B2:
   1462 					case FORMAT_A1R5G5B5:
   1463 					case FORMAT_A2R10G10B10:
   1464 					case FORMAT_A2B10G10R10:
   1465 						lockExternal(0, 0, 0, LOCK_READWRITE, client);
   1466 						unlockExternal();
   1467 						break;
   1468 					default:
   1469 						// Difference passes WHQL
   1470 						break;
   1471 					}
   1472 				}
   1473 			}
   1474 		}
   1475 
   1476 		if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
   1477 		{
   1478 			if(lock != LOCK_DISCARD)
   1479 			{
   1480 				update(internal, external);
   1481 			}
   1482 
   1483 			external.dirty = false;
   1484 			paletteUsed = Surface::paletteID;
   1485 		}
   1486 
   1487 		switch(lock)
   1488 		{
   1489 		case LOCK_UNLOCKED:
   1490 		case LOCK_READONLY:
   1491 			break;
   1492 		case LOCK_WRITEONLY:
   1493 		case LOCK_READWRITE:
   1494 		case LOCK_DISCARD:
   1495 			dirtyContents = true;
   1496 			break;
   1497 		default:
   1498 			ASSERT(false);
   1499 		}
   1500 
   1501 		if(lock == LOCK_READONLY && client == PUBLIC)
   1502 		{
   1503 			resolve();
   1504 		}
   1505 
   1506 		return internal.lockRect(x, y, z, lock);
   1507 	}
   1508 
   1509 	void Surface::unlockInternal()
   1510 	{
   1511 		internal.unlockRect();
   1512 
   1513 		resource->unlock();
   1514 	}
   1515 
   1516 	void *Surface::lockStencil(int x, int y, int front, Accessor client)
   1517 	{
   1518 		if(stencil.format == FORMAT_NULL)
   1519 		{
   1520 			return nullptr;
   1521 		}
   1522 
   1523 		resource->lock(client);
   1524 
   1525 		if(!stencil.buffer)
   1526 		{
   1527 			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.border, stencil.samples, stencil.format);
   1528 		}
   1529 
   1530 		return stencil.lockRect(x, y, front, LOCK_READWRITE);   // FIXME
   1531 	}
   1532 
   1533 	void Surface::unlockStencil()
   1534 	{
   1535 		stencil.unlockRect();
   1536 
   1537 		resource->unlock();
   1538 	}
   1539 
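         	// Bytes per pixel for uncompressed formats. For block-compressed formats
         	// the value is the byte size of one column of four pixels (e.g. an 8-byte
         	// 4x4 DXT1 block covers four columns, hence 2), so width * bytes still
         	// gives the byte width of a row of blocks for the 4x4 block formats.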
   1540 	int Surface::bytes(Format format)
   1541 	{
   1542 		switch(format)
   1543 		{
   1544 		case FORMAT_NULL:				return 0;
   1545 		case FORMAT_P8:					return 1;
   1546 		case FORMAT_A8P8:				return 2;
   1547 		case FORMAT_A8:					return 1;
   1548 		case FORMAT_R8I:				return 1;
   1549 		case FORMAT_R8:					return 1;
   1550 		case FORMAT_R3G3B2:				return 1;
   1551 		case FORMAT_R16I:				return 2;
   1552 		case FORMAT_R16UI:				return 2;
   1553 		case FORMAT_A8R3G3B2:			return 2;
   1554 		case FORMAT_R5G6B5:				return 2;
   1555 		case FORMAT_A1R5G5B5:			return 2;
   1556 		case FORMAT_X1R5G5B5:			return 2;
   1557 		case FORMAT_R5G5B5A1:           return 2;
   1558 		case FORMAT_X4R4G4B4:			return 2;
   1559 		case FORMAT_A4R4G4B4:			return 2;
   1560 		case FORMAT_R4G4B4A4:           return 2;
   1561 		case FORMAT_R8G8B8:				return 3;
   1562 		case FORMAT_B8G8R8:             return 3;
   1563 		case FORMAT_R32I:				return 4;
   1564 		case FORMAT_R32UI:				return 4;
   1565 		case FORMAT_X8R8G8B8:			return 4;
   1566 	//	case FORMAT_X8G8R8B8Q:			return 4;
   1567 		case FORMAT_A8R8G8B8:			return 4;
   1568 	//	case FORMAT_A8G8R8B8Q:			return 4;
   1569 		case FORMAT_X8B8G8R8I:			return 4;
   1570 		case FORMAT_X8B8G8R8:			return 4;
   1571 		case FORMAT_SRGB8_X8:			return 4;
   1572 		case FORMAT_SRGB8_A8:			return 4;
   1573 		case FORMAT_A8B8G8R8I:			return 4;
   1574 		case FORMAT_R8UI:				return 1;
   1575 		case FORMAT_G8R8UI:				return 2;
   1576 		case FORMAT_X8B8G8R8UI:			return 4;
   1577 		case FORMAT_A8B8G8R8UI:			return 4;
   1578 		case FORMAT_A8B8G8R8:			return 4;
   1579 		case FORMAT_R8_SNORM:			return 1;
   1580 		case FORMAT_G8R8_SNORM:		return 2;
   1581 		case FORMAT_X8B8G8R8_SNORM:	return 4;
   1582 		case FORMAT_A8B8G8R8_SNORM:	return 4;
   1583 		case FORMAT_A2R10G10B10:		return 4;
   1584 		case FORMAT_A2B10G10R10:		return 4;
   1585 		case FORMAT_A2B10G10R10UI:		return 4;
   1586 		case FORMAT_G8R8I:				return 2;
   1587 		case FORMAT_G8R8:				return 2;
   1588 		case FORMAT_G16R16I:			return 4;
   1589 		case FORMAT_G16R16UI:			return 4;
   1590 		case FORMAT_G16R16:				return 4;
   1591 		case FORMAT_G32R32I:			return 8;
   1592 		case FORMAT_G32R32UI:			return 8;
   1593 		case FORMAT_X16B16G16R16I:		return 8;
   1594 		case FORMAT_X16B16G16R16UI:		return 8;
   1595 		case FORMAT_A16B16G16R16I:		return 8;
   1596 		case FORMAT_A16B16G16R16UI:		return 8;
   1597 		case FORMAT_A16B16G16R16:		return 8;
   1598 		case FORMAT_X32B32G32R32I:		return 16;
   1599 		case FORMAT_X32B32G32R32UI:		return 16;
   1600 		case FORMAT_A32B32G32R32I:		return 16;
   1601 		case FORMAT_A32B32G32R32UI:		return 16;
   1602 		// Compressed formats
   1603 		case FORMAT_DXT1:				return 2;   // Column of four pixels (8 bytes per 4x4 block)
   1604 		case FORMAT_DXT3:				return 4;   // Column of four pixels (16 bytes per 4x4 block)
   1605 		case FORMAT_DXT5:				return 4;   // Column of four pixels (16 bytes per 4x4 block)
   1606 		case FORMAT_ATI1:				return 2;   // Column of four pixels (8 bytes per 4x4 block)
   1607 		case FORMAT_ATI2:				return 4;   // Column of four pixels (16 bytes per 4x4 block)
   1608 		case FORMAT_ETC1:				return 2;   // Column of four pixels (8 bytes per 4x4 block)
   1609 		case FORMAT_R11_EAC:			return 2;
   1610 		case FORMAT_SIGNED_R11_EAC:		return 2;
   1611 		case FORMAT_RG11_EAC:			return 4;
   1612 		case FORMAT_SIGNED_RG11_EAC:	return 4;
   1613 		case FORMAT_RGB8_ETC2:			return 2;
   1614 		case FORMAT_SRGB8_ETC2:			return 2;
   1615 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
   1616 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
   1617 		case FORMAT_RGBA8_ETC2_EAC:			return 4;
   1618 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:	return 4;
   1619 		case FORMAT_RGBA_ASTC_4x4_KHR:
   1620 		case FORMAT_RGBA_ASTC_5x4_KHR:
   1621 		case FORMAT_RGBA_ASTC_5x5_KHR:
   1622 		case FORMAT_RGBA_ASTC_6x5_KHR:
   1623 		case FORMAT_RGBA_ASTC_6x6_KHR:
   1624 		case FORMAT_RGBA_ASTC_8x5_KHR:
   1625 		case FORMAT_RGBA_ASTC_8x6_KHR:
   1626 		case FORMAT_RGBA_ASTC_8x8_KHR:
   1627 		case FORMAT_RGBA_ASTC_10x5_KHR:
   1628 		case FORMAT_RGBA_ASTC_10x6_KHR:
   1629 		case FORMAT_RGBA_ASTC_10x8_KHR:
   1630 		case FORMAT_RGBA_ASTC_10x10_KHR:
   1631 		case FORMAT_RGBA_ASTC_12x10_KHR:
   1632 		case FORMAT_RGBA_ASTC_12x12_KHR:
   1633 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   1634 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   1635 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   1636 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   1637 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   1638 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   1639 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   1640 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   1641 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   1642 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   1643 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   1644 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   1645 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   1646 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: return 0; // FIXME
   1647 		// Bumpmap formats
   1648 		case FORMAT_V8U8:				return 2;
   1649 		case FORMAT_L6V5U5:				return 2;
   1650 		case FORMAT_Q8W8V8U8:			return 4;
   1651 		case FORMAT_X8L8V8U8:			return 4;
   1652 		case FORMAT_A2W10V10U10:		return 4;
   1653 		case FORMAT_V16U16:				return 4;
   1654 		case FORMAT_A16W16V16U16:		return 8;
   1655 		case FORMAT_Q16W16V16U16:		return 8;
   1656 		// Luminance formats
   1657 		case FORMAT_L8:					return 1;
   1658 		case FORMAT_A4L4:				return 1;
   1659 		case FORMAT_L16:				return 2;
   1660 		case FORMAT_A8L8:				return 2;
   1661 		case FORMAT_L16F:               return 2;
   1662 		case FORMAT_A16L16F:            return 4;
   1663 		case FORMAT_L32F:               return 4;
   1664 		case FORMAT_A32L32F:            return 8;
   1665 		// Floating-point formats
   1666 		case FORMAT_A16F:				return 2;
   1667 		case FORMAT_R16F:				return 2;
   1668 		case FORMAT_G16R16F:			return 4;
   1669 		case FORMAT_B16G16R16F:			return 6;
   1670 		case FORMAT_X16B16G16R16F:		return 8;
   1671 		case FORMAT_A16B16G16R16F:		return 8;
   1672 		case FORMAT_X16B16G16R16F_UNSIGNED: return 8;
   1673 		case FORMAT_A32F:				return 4;
   1674 		case FORMAT_R32F:				return 4;
   1675 		case FORMAT_G32R32F:			return 8;
   1676 		case FORMAT_B32G32R32F:			return 12;
   1677 		case FORMAT_X32B32G32R32F:		return 16;
   1678 		case FORMAT_A32B32G32R32F:		return 16;
   1679 		case FORMAT_X32B32G32R32F_UNSIGNED: return 16;
   1680 		// Depth/stencil formats
   1681 		case FORMAT_D16:				return 2;
   1682 		case FORMAT_D32:				return 4;
   1683 		case FORMAT_D24X8:				return 4;
   1684 		case FORMAT_D24S8:				return 4;
   1685 		case FORMAT_D24FS8:				return 4;
   1686 		case FORMAT_D32F:				return 4;
   1687 		case FORMAT_D32FS8:				return 4;
   1688 		case FORMAT_D32F_COMPLEMENTARY:	return 4;
   1689 		case FORMAT_D32FS8_COMPLEMENTARY: return 4;
   1690 		case FORMAT_D32F_LOCKABLE:		return 4;
   1691 		case FORMAT_D32FS8_TEXTURE:		return 4;
   1692 		case FORMAT_D32F_SHADOW:		return 4;
   1693 		case FORMAT_D32FS8_SHADOW:		return 4;
   1694 		case FORMAT_DF24S8:				return 4;
   1695 		case FORMAT_DF16S8:				return 2;
   1696 		case FORMAT_INTZ:				return 4;
   1697 		case FORMAT_S8:					return 1;
   1698 		case FORMAT_YV12_BT601:         return 1;   // Y plane only
   1699 		case FORMAT_YV12_BT709:         return 1;   // Y plane only
   1700 		case FORMAT_YV12_JFIF:          return 1;   // Y plane only
   1701 		default:
   1702 			ASSERT(false);
   1703 		}
   1704 
   1705 		return 0;
   1706 	}
   1707 
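        	// Returns the distance in bytes between two consecutive rows, including the
        	// border. For compressed formats a "row" spans an entire row of blocks. Render
        	// targets and depth/stencil buffers are padded to an even width, presumably so
        	// that 2x2 quad processing never steps outside the row.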
   1708 	int Surface::pitchB(int width, int border, Format format, bool target)
   1709 	{
   1710 		width += 2 * border;
   1711 
   1712 		if(target || isDepth(format) || isStencil(format))
   1713 		{
   1714 			width = align(width, 2);
   1715 		}
   1716 
   1717 		switch(format)
   1718 		{
   1719 		case FORMAT_DXT1:
   1720 		case FORMAT_ETC1:
   1721 		case FORMAT_R11_EAC:
   1722 		case FORMAT_SIGNED_R11_EAC:
   1723 		case FORMAT_RGB8_ETC2:
   1724 		case FORMAT_SRGB8_ETC2:
   1725 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1726 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1727 			return 8 * ((width + 3) / 4);    // 64 bits per 4x4 block; pitch covers 4 rows of pixels
   1728 		case FORMAT_RG11_EAC:
   1729 		case FORMAT_SIGNED_RG11_EAC:
   1730 		case FORMAT_RGBA8_ETC2_EAC:
   1731 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   1732 		case FORMAT_RGBA_ASTC_4x4_KHR:
   1733 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   1734 			return 16 * ((width + 3) / 4);    // 128 bits per 4x4 block; pitch covers 4 rows of pixels
   1735 		case FORMAT_RGBA_ASTC_5x4_KHR:
   1736 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   1737 		case FORMAT_RGBA_ASTC_5x5_KHR:
   1738 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   1739 			return 16 * ((width + 4) / 5);
   1740 		case FORMAT_RGBA_ASTC_6x5_KHR:
   1741 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   1742 		case FORMAT_RGBA_ASTC_6x6_KHR:
   1743 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   1744 			return 16 * ((width + 5) / 6);
   1745 		case FORMAT_RGBA_ASTC_8x5_KHR:
   1746 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   1747 		case FORMAT_RGBA_ASTC_8x6_KHR:
   1748 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   1749 		case FORMAT_RGBA_ASTC_8x8_KHR:
   1750 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   1751 			return 16 * ((width + 7) / 8);
   1752 		case FORMAT_RGBA_ASTC_10x5_KHR:
   1753 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   1754 		case FORMAT_RGBA_ASTC_10x6_KHR:
   1755 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   1756 		case FORMAT_RGBA_ASTC_10x8_KHR:
   1757 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   1758 		case FORMAT_RGBA_ASTC_10x10_KHR:
   1759 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   1760 			return 16 * ((width + 9) / 10);
   1761 		case FORMAT_RGBA_ASTC_12x10_KHR:
   1762 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   1763 		case FORMAT_RGBA_ASTC_12x12_KHR:
   1764 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   1765 			return 16 * ((width + 11) / 12);
   1766 		case FORMAT_DXT3:
   1767 		case FORMAT_DXT5:
   1768 			return 16 * ((width + 3) / 4);   // 128 bits per 4x4 block; pitch covers 4 rows of pixels
   1769 		case FORMAT_ATI1:
   1770 			return 2 * ((width + 3) / 4);    // 64 bits per 4x4 block; pitch covers a single row of pixels
   1771 		case FORMAT_ATI2:
   1772 			return 4 * ((width + 3) / 4);    // 128 bits per 4x4 block; pitch covers a single row of pixels
   1773 		case FORMAT_YV12_BT601:
   1774 		case FORMAT_YV12_BT709:
   1775 		case FORMAT_YV12_JFIF:
   1776 			return align(width, 16);
   1777 		default:
   1778 			return bytes(format) * width;
   1779 		}
   1780 	}
   1781 
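        	// Row pitch expressed in elements rather than bytes: pitchB() divided by the
        	// per-element size (0 when the element size is unknown).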
   1782 	int Surface::pitchP(int width, int border, Format format, bool target)
   1783 	{
   1784 		int B = bytes(format);
   1785 
   1786 		return B > 0 ? pitchB(width, border, format, target) / B : 0;
   1787 	}
   1788 
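        	// Returns the size in bytes of one depth slice: the row pitch times the number
        	// of rows, where block-compressed formats count rows of blocks instead of rows
        	// of pixels.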
   1789 	int Surface::sliceB(int width, int height, int border, Format format, bool target)
   1790 	{
   1791 		height += 2 * border;
   1792 
   1793 		if(target || isDepth(format) || isStencil(format))
   1794 		{
   1795 			height = align(height, 2);
   1796 		}
   1797 
   1798 		switch(format)
   1799 		{
   1800 		case FORMAT_DXT1:
   1801 		case FORMAT_DXT3:
   1802 		case FORMAT_DXT5:
   1803 		case FORMAT_ETC1:
   1804 		case FORMAT_R11_EAC:
   1805 		case FORMAT_SIGNED_R11_EAC:
   1806 		case FORMAT_RG11_EAC:
   1807 		case FORMAT_SIGNED_RG11_EAC:
   1808 		case FORMAT_RGB8_ETC2:
   1809 		case FORMAT_SRGB8_ETC2:
   1810 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1811 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   1812 		case FORMAT_RGBA8_ETC2_EAC:
   1813 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   1814 		case FORMAT_RGBA_ASTC_4x4_KHR:
   1815 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   1816 		case FORMAT_RGBA_ASTC_5x4_KHR:
   1817 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   1818 			return pitchB(width, border, format, target) * ((height + 3) / 4);   // Pitch computed per 4 rows
   1819 		case FORMAT_RGBA_ASTC_5x5_KHR:
   1820 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   1821 		case FORMAT_RGBA_ASTC_6x5_KHR:
   1822 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   1823 		case FORMAT_RGBA_ASTC_8x5_KHR:
   1824 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   1825 		case FORMAT_RGBA_ASTC_10x5_KHR:
   1826 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   1827 			return pitchB(width, border, format, target) * ((height + 4) / 5);   // Pitch computed per 5 rows
   1828 		case FORMAT_RGBA_ASTC_6x6_KHR:
   1829 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   1830 		case FORMAT_RGBA_ASTC_8x6_KHR:
   1831 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   1832 		case FORMAT_RGBA_ASTC_10x6_KHR:
   1833 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   1834 			return pitchB(width, border, format, target) * ((height + 5) / 6);   // Pitch computed per 6 rows
   1835 		case FORMAT_RGBA_ASTC_8x8_KHR:
   1836 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   1837 		case FORMAT_RGBA_ASTC_10x8_KHR:
   1838 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   1839 			return pitchB(width, border, format, target) * ((height + 7) / 8);   // Pitch computed per 8 rows
   1840 		case FORMAT_RGBA_ASTC_10x10_KHR:
   1841 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   1842 		case FORMAT_RGBA_ASTC_12x10_KHR:
   1843 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   1844 			return pitchB(width, border, format, target) * ((height + 9) / 10);   // Pitch computed per 10 rows
   1845 		case FORMAT_RGBA_ASTC_12x12_KHR:
   1846 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   1847 			return pitchB(width, border, format, target) * ((height + 11) / 12);   // Pitch computed per 12 rows
   1848 		case FORMAT_ATI1:
   1849 		case FORMAT_ATI2:
   1850 		default:
   1851 			return pitchB(width, border, format, target) * height;   // Pitch computed per row
   1852 		}
   1853 	}
   1854 
   1855 	int Surface::sliceP(int width, int height, int border, Format format, bool target)
   1856 	{
   1857 		int B = bytes(format);
   1858 
   1859 		return B > 0 ? sliceB(width, height, border, format, target) / B : 0;
   1860 	}
   1861 
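        	// Transfers the external buffer into the internal buffer, decoding compressed,
        	// palettized and low-precision source formats; everything else goes through
        	// genericUpdate().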
   1862 	void Surface::update(Buffer &destination, Buffer &source)
   1863 	{
   1864 	//	ASSERT(source.lock != LOCK_UNLOCKED);
   1865 	//	ASSERT(destination.lock != LOCK_UNLOCKED);
   1866 
   1867 		if(destination.buffer != source.buffer)
   1868 		{
   1869 			ASSERT(source.dirty && !destination.dirty);
   1870 
   1871 			switch(source.format)
   1872 			{
   1873 			case FORMAT_R8G8B8:		decodeR8G8B8(destination, source);		break;   // FIXME: Check destination format
   1874 			case FORMAT_X1R5G5B5:	decodeX1R5G5B5(destination, source);	break;   // FIXME: Check destination format
   1875 			case FORMAT_A1R5G5B5:	decodeA1R5G5B5(destination, source);	break;   // FIXME: Check destination format
   1876 			case FORMAT_X4R4G4B4:	decodeX4R4G4B4(destination, source);	break;   // FIXME: Check destination format
   1877 			case FORMAT_A4R4G4B4:	decodeA4R4G4B4(destination, source);	break;   // FIXME: Check destination format
   1878 			case FORMAT_P8:			decodeP8(destination, source);			break;   // FIXME: Check destination format
   1879 			case FORMAT_DXT1:		decodeDXT1(destination, source);		break;   // FIXME: Check destination format
   1880 			case FORMAT_DXT3:		decodeDXT3(destination, source);		break;   // FIXME: Check destination format
   1881 			case FORMAT_DXT5:		decodeDXT5(destination, source);		break;   // FIXME: Check destination format
   1882 			case FORMAT_ATI1:		decodeATI1(destination, source);		break;   // FIXME: Check destination format
   1883 			case FORMAT_ATI2:		decodeATI2(destination, source);		break;   // FIXME: Check destination format
   1884 			case FORMAT_R11_EAC:         decodeEAC(destination, source, 1, false); break; // FIXME: Check destination format
   1885 			case FORMAT_SIGNED_R11_EAC:  decodeEAC(destination, source, 1, true);  break; // FIXME: Check destination format
   1886 			case FORMAT_RG11_EAC:        decodeEAC(destination, source, 2, false); break; // FIXME: Check destination format
   1887 			case FORMAT_SIGNED_RG11_EAC: decodeEAC(destination, source, 2, true);  break; // FIXME: Check destination format
   1888 			case FORMAT_ETC1:
   1889 			case FORMAT_RGB8_ETC2:                      decodeETC2(destination, source, 0, false); break; // FIXME: Check destination format
   1890 			case FORMAT_SRGB8_ETC2:                     decodeETC2(destination, source, 0, true);  break; // FIXME: Check destination format
   1891 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:  decodeETC2(destination, source, 1, false); break; // FIXME: Check destination format
   1892 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: decodeETC2(destination, source, 1, true);  break; // FIXME: Check destination format
   1893 			case FORMAT_RGBA8_ETC2_EAC:                 decodeETC2(destination, source, 8, false); break; // FIXME: Check destination format
   1894 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:          decodeETC2(destination, source, 8, true);  break; // FIXME: Check destination format
   1895 			case FORMAT_RGBA_ASTC_4x4_KHR:           decodeASTC(destination, source, 4,  4,  1, false); break; // FIXME: Check destination format
   1896 			case FORMAT_RGBA_ASTC_5x4_KHR:           decodeASTC(destination, source, 5,  4,  1, false); break; // FIXME: Check destination format
   1897 			case FORMAT_RGBA_ASTC_5x5_KHR:           decodeASTC(destination, source, 5,  5,  1, false); break; // FIXME: Check destination format
   1898 			case FORMAT_RGBA_ASTC_6x5_KHR:           decodeASTC(destination, source, 6,  5,  1, false); break; // FIXME: Check destination format
   1899 			case FORMAT_RGBA_ASTC_6x6_KHR:           decodeASTC(destination, source, 6,  6,  1, false); break; // FIXME: Check destination format
   1900 			case FORMAT_RGBA_ASTC_8x5_KHR:           decodeASTC(destination, source, 8,  5,  1, false); break; // FIXME: Check destination format
   1901 			case FORMAT_RGBA_ASTC_8x6_KHR:           decodeASTC(destination, source, 8,  6,  1, false); break; // FIXME: Check destination format
   1902 			case FORMAT_RGBA_ASTC_8x8_KHR:           decodeASTC(destination, source, 8,  8,  1, false); break; // FIXME: Check destination format
   1903 			case FORMAT_RGBA_ASTC_10x5_KHR:          decodeASTC(destination, source, 10, 5,  1, false); break; // FIXME: Check destination format
   1904 			case FORMAT_RGBA_ASTC_10x6_KHR:          decodeASTC(destination, source, 10, 6,  1, false); break; // FIXME: Check destination format
   1905 			case FORMAT_RGBA_ASTC_10x8_KHR:          decodeASTC(destination, source, 10, 8,  1, false); break; // FIXME: Check destination format
   1906 			case FORMAT_RGBA_ASTC_10x10_KHR:         decodeASTC(destination, source, 10, 10, 1, false); break; // FIXME: Check destination format
   1907 			case FORMAT_RGBA_ASTC_12x10_KHR:         decodeASTC(destination, source, 12, 10, 1, false); break; // FIXME: Check destination format
   1908 			case FORMAT_RGBA_ASTC_12x12_KHR:         decodeASTC(destination, source, 12, 12, 1, false); break; // FIXME: Check destination format
   1909 			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:   decodeASTC(destination, source, 4,  4,  1, true);  break; // FIXME: Check destination format
   1910 			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:   decodeASTC(destination, source, 5,  4,  1, true);  break; // FIXME: Check destination format
   1911 			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:   decodeASTC(destination, source, 5,  5,  1, true);  break; // FIXME: Check destination format
   1912 			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:   decodeASTC(destination, source, 6,  5,  1, true);  break; // FIXME: Check destination format
   1913 			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:   decodeASTC(destination, source, 6,  6,  1, true);  break; // FIXME: Check destination format
   1914 			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:   decodeASTC(destination, source, 8,  5,  1, true);  break; // FIXME: Check destination format
   1915 			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:   decodeASTC(destination, source, 8,  6,  1, true);  break; // FIXME: Check destination format
   1916 			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:   decodeASTC(destination, source, 8,  8,  1, true);  break; // FIXME: Check destination format
   1917 			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:  decodeASTC(destination, source, 10, 5,  1, true);  break; // FIXME: Check destination format
   1918 			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:  decodeASTC(destination, source, 10, 6,  1, true);  break; // FIXME: Check destination format
   1919 			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:  decodeASTC(destination, source, 10, 8,  1, true);  break; // FIXME: Check destination format
   1920 			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, true);  break; // FIXME: Check destination format
   1921 			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, true);  break; // FIXME: Check destination format
   1922 			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, true);  break; // FIXME: Check destination format
   1923 			default:				genericUpdate(destination, source);		break;
   1924 			}
   1925 		}
   1926 	}
   1927 
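        	// Fallback transfer path: a straight per-row memcpy when both buffers share the
        	// same format, otherwise a per-pixel read()/write() conversion.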
   1928 	void Surface::genericUpdate(Buffer &destination, Buffer &source)
   1929 	{
   1930 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
   1931 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
   1932 
   1933 		int depth = min(destination.depth, source.depth);
   1934 		int height = min(destination.height, source.height);
   1935 		int width = min(destination.width, source.width);
   1936 		int rowBytes = width * source.bytes;
   1937 
   1938 		for(int z = 0; z < depth; z++)
   1939 		{
   1940 			unsigned char *sourceRow = sourceSlice;
   1941 			unsigned char *destinationRow = destinationSlice;
   1942 
   1943 			for(int y = 0; y < height; y++)
   1944 			{
   1945 				if(source.format == destination.format)
   1946 				{
   1947 					memcpy(destinationRow, sourceRow, rowBytes);
   1948 				}
   1949 				else
   1950 				{
   1951 					unsigned char *sourceElement = sourceRow;
   1952 					unsigned char *destinationElement = destinationRow;
   1953 
   1954 					for(int x = 0; x < width; x++)
   1955 					{
   1956 						Color<float> color = source.read(sourceElement);
   1957 						destination.write(destinationElement, color);
   1958 
   1959 						sourceElement += source.bytes;
   1960 						destinationElement += destination.bytes;
   1961 					}
   1962 				}
   1963 
   1964 				sourceRow += source.pitchB;
   1965 				destinationRow += destination.pitchB;
   1966 			}
   1967 
   1968 			sourceSlice += source.sliceB;
   1969 			destinationSlice += destination.sliceB;
   1970 		}
   1971 
   1972 		source.unlockRect();
   1973 		destination.unlockRect();
   1974 	}
   1975 
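        	// Expands 24-bit R8G8B8 source pixels (stored B, G, R in memory) to 32-bit
        	// 0xFFRRGGBB destination words with an opaque alpha channel.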
   1976 	void Surface::decodeR8G8B8(Buffer &destination, Buffer &source)
   1977 	{
   1978 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
   1979 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
   1980 
   1981 		int depth = min(destination.depth, source.depth);
   1982 		int height = min(destination.height, source.height);
   1983 		int width = min(destination.width, source.width);
   1984 
   1985 		for(int z = 0; z < depth; z++)
   1986 		{
   1987 			unsigned char *sourceRow = sourceSlice;
   1988 			unsigned char *destinationRow = destinationSlice;
   1989 
   1990 			for(int y = 0; y < height; y++)
   1991 			{
   1992 				unsigned char *sourceElement = sourceRow;
   1993 				unsigned char *destinationElement = destinationRow;
   1994 
   1995 				for(int x = 0; x < width; x++)
   1996 				{
   1997 					unsigned int b = sourceElement[0];
   1998 					unsigned int g = sourceElement[1];
   1999 					unsigned int r = sourceElement[2];
   2000 
   2001 					*(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
   2002 
   2003 					sourceElement += source.bytes;
   2004 					destinationElement += destination.bytes;
   2005 				}
   2006 
   2007 				sourceRow += source.pitchB;
   2008 				destinationRow += destination.pitchB;
   2009 			}
   2010 
   2011 			sourceSlice += source.sliceB;
   2012 			destinationSlice += destination.sliceB;
   2013 		}
   2014 
   2015 		source.unlockRect();
   2016 		destination.unlockRect();
   2017 	}
   2018 
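        	// Expands X1R5G5B5 to 32-bit 0xFFRRGGBB. The multipliers are fixed-point
        	// approximations of 255/31 (e.g. 134771 ~= (255/31) * 2^14), so each 5-bit
        	// channel is rescaled to the full 0-255 range with rounding.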
   2019 	void Surface::decodeX1R5G5B5(Buffer &destination, Buffer &source)
   2020 	{
   2021 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
   2022 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
   2023 
   2024 		int depth = min(destination.depth, source.depth);
   2025 		int height = min(destination.height, source.height);
   2026 		int width = min(destination.width, source.width);
   2027 
   2028 		for(int z = 0; z < depth; z++)
   2029 		{
   2030 			unsigned char *sourceRow = sourceSlice;
   2031 			unsigned char *destinationRow = destinationSlice;
   2032 
   2033 			for(int y = 0; y < height; y++)
   2034 			{
   2035 				unsigned char *sourceElement = sourceRow;
   2036 				unsigned char *destinationElement = destinationRow;
   2037 
   2038 				for(int x = 0; x < width; x++)
   2039 				{
   2040 					unsigned int xrgb = *(unsigned short*)sourceElement;
   2041 
   2042 					unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
   2043 					unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
   2044 					unsigned int b = (((xrgb & 0x001F) * 2106  + 0x80) >> 8);
   2045 
   2046 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
   2047 
   2048 					sourceElement += source.bytes;
   2049 					destinationElement += destination.bytes;
   2050 				}
   2051 
   2052 				sourceRow += source.pitchB;
   2053 				destinationRow += destination.pitchB;
   2054 			}
   2055 
   2056 			sourceSlice += source.sliceB;
   2057 			destinationSlice += destination.sliceB;
   2058 		}
   2059 
   2060 		source.unlockRect();
   2061 		destination.unlockRect();
   2062 	}
   2063 
   2064 	void Surface::decodeA1R5G5B5(Buffer &destination, Buffer &source)
   2065 	{
   2066 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
   2067 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
   2068 
   2069 		int depth = min(destination.depth, source.depth);
   2070 		int height = min(destination.height, source.height);
   2071 		int width = min(destination.width, source.width);
   2072 
   2073 		for(int z = 0; z < depth; z++)
   2074 		{
   2075 			unsigned char *sourceRow = sourceSlice;
   2076 			unsigned char *destinationRow = destinationSlice;
   2077 
   2078 			for(int y = 0; y < height; y++)
   2079 			{
   2080 				unsigned char *sourceElement = sourceRow;
   2081 				unsigned char *destinationElement = destinationRow;
   2082 
   2083 				for(int x = 0; x < width; x++)
   2084 				{
   2085 					unsigned int argb = *(unsigned short*)sourceElement;
   2086 
   2087 					unsigned int a =   (argb & 0x8000) * 130560;
   2088 					unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
   2089 					unsigned int g = (((argb & 0x03E0) * 16846  + 0x8000) >> 8) & 0x0000FF00;
   2090 					unsigned int b = (((argb & 0x001F) * 2106   + 0x80) >> 8);
   2091 
   2092 					*(unsigned int*)destinationElement = a | r | g | b;
   2093 
   2094 					sourceElement += source.bytes;
   2095 					destinationElement += destination.bytes;
   2096 				}
   2097 
   2098 				sourceRow += source.pitchB;
   2099 				destinationRow += destination.pitchB;
   2100 			}
   2101 
   2102 			sourceSlice += source.sliceB;
   2103 			destinationSlice += destination.sliceB;
   2104 		}
   2105 
   2106 		source.unlockRect();
   2107 		destination.unlockRect();
   2108 	}
   2109 
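        	// Expands X4R4G4B4 to 32-bit 0xFFRRGGBB by multiplying each 4-bit channel by
        	// 0x11 (shifted into position), which replicates the nibble into both halves
        	// of the destination byte.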
   2110 	void Surface::decodeX4R4G4B4(Buffer &destination, Buffer &source)
   2111 	{
   2112 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
   2113 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
   2114 
   2115 		int depth = min(destination.depth, source.depth);
   2116 		int height = min(destination.height, source.height);
   2117 		int width = min(destination.width, source.width);
   2118 
   2119 		for(int z = 0; z < depth; z++)
   2120 		{
   2121 			unsigned char *sourceRow = sourceSlice;
   2122 			unsigned char *destinationRow = destinationSlice;
   2123 
   2124 			for(int y = 0; y < height; y++)
   2125 			{
   2126 				unsigned char *sourceElement = sourceRow;
   2127 				unsigned char *destinationElement = destinationRow;
   2128 
   2129 				for(int x = 0; x < width; x++)
   2130 				{
   2131 					unsigned int xrgb = *(unsigned short*)sourceElement;
   2132 
   2133 					unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000;
   2134 					unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00;
   2135 					unsigned int b =  (xrgb & 0x000F) * 0x00000011;
   2136 
   2137 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
   2138 
   2139 					sourceElement += source.bytes;
   2140 					destinationElement += destination.bytes;
   2141 				}
   2142 
   2143 				sourceRow += source.pitchB;
   2144 				destinationRow += destination.pitchB;
   2145 			}
   2146 
   2147 			sourceSlice += source.sliceB;
   2148 			destinationSlice += destination.sliceB;
   2149 		}
   2150 
   2151 		source.unlockRect();
   2152 		destination.unlockRect();
   2153 	}
   2154 
   2155 	void Surface::decodeA4R4G4B4(Buffer &destination, Buffer &source)
   2156 	{
   2157 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
   2158 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
   2159 
   2160 		int depth = min(destination.depth, source.depth);
   2161 		int height = min(destination.height, source.height);
   2162 		int width = min(destination.width, source.width);
   2163 
   2164 		for(int z = 0; z < depth; z++)
   2165 		{
   2166 			unsigned char *sourceRow = sourceSlice;
   2167 			unsigned char *destinationRow = destinationSlice;
   2168 
   2169 			for(int y = 0; y < height; y++)
   2170 			{
   2171 				unsigned char *sourceElement = sourceRow;
   2172 				unsigned char *destinationElement = destinationRow;
   2173 
   2174 				for(int x = 0; x < width; x++)
   2175 				{
   2176 					unsigned int argb = *(unsigned short*)sourceElement;
   2177 
   2178 					unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000;
   2179 					unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000;
   2180 					unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00;
   2181 					unsigned int b =  (argb & 0x000F) * 0x00000011;
   2182 
   2183 					*(unsigned int*)destinationElement = a | r | g | b;
   2184 
   2185 					sourceElement += source.bytes;
   2186 					destinationElement += destination.bytes;
   2187 				}
   2188 
   2189 				sourceRow += source.pitchB;
   2190 				destinationRow += destination.pitchB;
   2191 			}
   2192 
   2193 			sourceSlice += source.sliceB;
   2194 			destinationSlice += destination.sliceB;
   2195 		}
   2196 
   2197 		source.unlockRect();
   2198 		destination.unlockRect();
   2199 	}
   2200 
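        	// Resolves 8-bit palette indices through the current palette (entries stored as
        	// A8B8G8R8) and swizzles them into 32-bit 0xAARRGGBB destination pixels.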
   2201 	void Surface::decodeP8(Buffer &destination, Buffer &source)
   2202 	{
   2203 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
   2204 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
   2205 
   2206 		int depth = min(destination.depth, source.depth);
   2207 		int height = min(destination.height, source.height);
   2208 		int width = min(destination.width, source.width);
   2209 
   2210 		for(int z = 0; z < depth; z++)
   2211 		{
   2212 			unsigned char *sourceRow = sourceSlice;
   2213 			unsigned char *destinationRow = destinationSlice;
   2214 
   2215 			for(int y = 0; y < height; y++)
   2216 			{
   2217 				unsigned char *sourceElement = sourceRow;
   2218 				unsigned char *destinationElement = destinationRow;
   2219 
   2220 				for(int x = 0; x < width; x++)
   2221 				{
   2222 					unsigned int abgr = palette[*(unsigned char*)sourceElement];
   2223 
   2224 					unsigned int r = (abgr & 0x000000FF) << 16;
   2225 					unsigned int g = (abgr & 0x0000FF00) << 0;
   2226 					unsigned int b = (abgr & 0x00FF0000) >> 16;
   2227 					unsigned int a = (abgr & 0xFF000000) >> 0;
   2228 
   2229 					*(unsigned int*)destinationElement = a | r | g | b;
   2230 
   2231 					sourceElement += source.bytes;
   2232 					destinationElement += destination.bytes;
   2233 				}
   2234 
   2235 				sourceRow += source.pitchB;
   2236 				destinationRow += destination.pitchB;
   2237 			}
   2238 
   2239 			sourceSlice += source.sliceB;
   2240 			destinationSlice += destination.sliceB;
   2241 		}
   2242 
   2243 		source.unlockRect();
   2244 		destination.unlockRect();
   2245 	}
   2246 
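        	// Decodes DXT1 (BC1) blocks: two 16-bit base colors plus sixteen 2-bit indices.
        	// When c0 > c1 the two remaining palette entries are interpolated at 1/3 and
        	// 2/3; otherwise the third entry is the average of c0 and c1 and the fourth is
        	// transparent black.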
   2247 	void Surface::decodeDXT1(Buffer &internal, Buffer &external)
   2248 	{
   2249 		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
   2250 		const DXT1 *source = (const DXT1*)external.lockRect(0, 0, 0, LOCK_READONLY);
   2251 
   2252 		for(int z = 0; z < external.depth; z++)
   2253 		{
   2254 			unsigned int *dest = destSlice;
   2255 
   2256 			for(int y = 0; y < external.height; y += 4)
   2257 			{
   2258 				for(int x = 0; x < external.width; x += 4)
   2259 				{
   2260 					Color<byte> c[4];
   2261 
   2262 					c[0] = source->c0;
   2263 					c[1] = source->c1;
   2264 
   2265 					if(source->c0 > source->c1)   // No transparency
   2266 					{
   2267 						// c2 = 2 / 3 * c0 + 1 / 3 * c1
   2268 						c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
   2269 						c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
   2270 						c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
   2271 						c[2].a = 0xFF;
   2272 
   2273 						// c3 = 1 / 3 * c0 + 2 / 3 * c1
   2274 						c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
   2275 						c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
   2276 						c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
   2277 						c[3].a = 0xFF;
   2278 					}
   2279 					else   // c3 transparent
   2280 					{
   2281 						// c2 = 1 / 2 * c0 + 1 / 2 * c1
   2282 						c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
   2283 						c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
   2284 						c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
   2285 						c[2].a = 0xFF;
   2286 
   2287 						c[3].r = 0;
   2288 						c[3].g = 0;
   2289 						c[3].b = 0;
   2290 						c[3].a = 0;
   2291 					}
   2292 
   2293 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2294 					{
   2295 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2296 						{
   2297 							dest[(x + i) + (y + j) * internal.width] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
   2298 						}
   2299 					}
   2300 
   2301 					source++;
   2302 				}
   2303 			}
   2304 
   2305 			(byte*&)destSlice += internal.sliceB;
   2306 		}
   2307 
   2308 		external.unlockRect();
   2309 		internal.unlockRect();
   2310 	}
   2311 
   2312 	void Surface::decodeDXT3(Buffer &internal, Buffer &external)
   2313 	{
   2314 		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
   2315 		const DXT3 *source = (const DXT3*)external.lockRect(0, 0, 0, LOCK_READONLY);
   2316 
   2317 		for(int z = 0; z < external.depth; z++)
   2318 		{
   2319 			unsigned int *dest = destSlice;
   2320 
   2321 			for(int y = 0; y < external.height; y += 4)
   2322 			{
   2323 				for(int x = 0; x < external.width; x += 4)
   2324 				{
   2325 					Color<byte> c[4];
   2326 
   2327 					c[0] = source->c0;
   2328 					c[1] = source->c1;
   2329 
   2330 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
   2331 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
   2332 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
   2333 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
   2334 
   2335 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
   2336 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
   2337 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
   2338 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
   2339 
   2340 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2341 					{
   2342 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2343 						{
   2344 							unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
   2345 							unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));
   2346 
   2347 							dest[(x + i) + (y + j) * internal.width] = color;
   2348 						}
   2349 					}
   2350 
   2351 					source++;
   2352 				}
   2353 			}
   2354 
   2355 			(byte*&)destSlice += internal.sliceB;
   2356 		}
   2357 
   2358 		external.unlockRect();
   2359 		internal.unlockRect();
   2360 	}
   2361 
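        	// Decodes DXT5 (BC3) blocks: the color palette is built like DXT3, while alpha
        	// comes from two 8-bit endpoints and sixteen 3-bit indices, interpolated in
        	// sevenths when a0 > a1 and in fifths (plus explicit 0 and 255) otherwise.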
   2362 	void Surface::decodeDXT5(Buffer &internal, Buffer &external)
   2363 	{
   2364 		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
   2365 		const DXT5 *source = (const DXT5*)external.lockRect(0, 0, 0, LOCK_READONLY);
   2366 
   2367 		for(int z = 0; z < external.depth; z++)
   2368 		{
   2369 			unsigned int *dest = destSlice;
   2370 
   2371 			for(int y = 0; y < external.height; y += 4)
   2372 			{
   2373 				for(int x = 0; x < external.width; x += 4)
   2374 				{
   2375 					Color<byte> c[4];
   2376 
   2377 					c[0] = source->c0;
   2378 					c[1] = source->c1;
   2379 
   2380 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
   2381 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
   2382 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
   2383 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
   2384 
   2385 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
   2386 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
   2387 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
   2388 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
   2389 
   2390 					byte a[8];
   2391 
   2392 					a[0] = source->a0;
   2393 					a[1] = source->a1;
   2394 
   2395 					if(a[0] > a[1])
   2396 					{
   2397 						a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
   2398 						a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
   2399 						a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
   2400 						a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
   2401 						a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
   2402 						a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
   2403 					}
   2404 					else
   2405 					{
   2406 						a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
   2407 						a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
   2408 						a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
   2409 						a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
   2410 						a[6] = 0;
   2411 						a[7] = 0xFF;
   2412 					}
   2413 
   2414 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2415 					{
   2416 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2417 						{
   2418 							unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
   2419 							unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;
   2420 
   2421 							dest[(x + i) + (y + j) * internal.width] = color;
   2422 						}
   2423 					}
   2424 
   2425 					source++;
   2426 				}
   2427 			}
   2428 
   2429 			(byte*&)destSlice += internal.sliceB;
   2430 		}
   2431 
   2432 		external.unlockRect();
   2433 		internal.unlockRect();
   2434 	}
   2435 
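        	// Decodes ATI1 (BC4) blocks into a single 8-bit channel, using the same
        	// two-endpoint, 3-bit-index interpolation as the DXT5 alpha block.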
   2436 	void Surface::decodeATI1(Buffer &internal, Buffer &external)
   2437 	{
   2438 		byte *destSlice = (byte*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
   2439 		const ATI1 *source = (const ATI1*)external.lockRect(0, 0, 0, LOCK_READONLY);
   2440 
   2441 		for(int z = 0; z < external.depth; z++)
   2442 		{
   2443 			byte *dest = destSlice;
   2444 
   2445 			for(int y = 0; y < external.height; y += 4)
   2446 			{
   2447 				for(int x = 0; x < external.width; x += 4)
   2448 				{
   2449 					byte r[8];
   2450 
   2451 					r[0] = source->r0;
   2452 					r[1] = source->r1;
   2453 
   2454 					if(r[0] > r[1])
   2455 					{
   2456 						r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
   2457 						r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
   2458 						r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
   2459 						r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
   2460 						r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
   2461 						r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
   2462 					}
   2463 					else
   2464 					{
   2465 						r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
   2466 						r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
   2467 						r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
   2468 						r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
   2469 						r[6] = 0;
   2470 						r[7] = 0xFF;
   2471 					}
   2472 
   2473 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2474 					{
   2475 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2476 						{
   2477 							dest[(x + i) + (y + j) * internal.width] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
   2478 						}
   2479 					}
   2480 
   2481 					source++;
   2482 				}
   2483 			}
   2484 
   2485 			destSlice += internal.sliceB;
   2486 		}
   2487 
   2488 		external.unlockRect();
   2489 		internal.unlockRect();
   2490 	}
   2491 
   2492 	void Surface::decodeATI2(Buffer &internal, Buffer &external)
   2493 	{
   2494 		word *destSlice = (word*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
   2495 		const ATI2 *source = (const ATI2*)external.lockRect(0, 0, 0, LOCK_READONLY);
   2496 
   2497 		for(int z = 0; z < external.depth; z++)
   2498 		{
   2499 			word *dest = destSlice;
   2500 
   2501 			for(int y = 0; y < external.height; y += 4)
   2502 			{
   2503 				for(int x = 0; x < external.width; x += 4)
   2504 				{
   2505 					byte X[8];
   2506 
   2507 					X[0] = source->x0;
   2508 					X[1] = source->x1;
   2509 
   2510 					if(X[0] > X[1])
   2511 					{
   2512 						X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
   2513 						X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
   2514 						X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
   2515 						X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
   2516 						X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
   2517 						X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
   2518 					}
   2519 					else
   2520 					{
   2521 						X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
   2522 						X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
   2523 						X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
   2524 						X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
   2525 						X[6] = 0;
   2526 						X[7] = 0xFF;
   2527 					}
   2528 
   2529 					byte Y[8];
   2530 
   2531 					Y[0] = source->y0;
   2532 					Y[1] = source->y1;
   2533 
   2534 					if(Y[0] > Y[1])
   2535 					{
   2536 						Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
   2537 						Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
   2538 						Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
   2539 						Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
   2540 						Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
   2541 						Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
   2542 					}
   2543 					else
   2544 					{
   2545 						Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
   2546 						Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
   2547 						Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
   2548 						Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
   2549 						Y[6] = 0;
   2550 						Y[7] = 0xFF;
   2551 					}
   2552 
   2553 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
   2554 					{
   2555 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
   2556 						{
   2557 							word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
   2558 							word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];
   2559 
   2560 							dest[(x + i) + (y + j) * internal.width] = (g << 8) + r;
   2561 						}
   2562 					}
   2563 
   2564 					source++;
   2565 				}
   2566 			}
   2567 
   2568 			(byte*&)destSlice += internal.sliceB;
   2569 		}
   2570 
   2571 		external.unlockRect();
   2572 		internal.unlockRect();
   2573 	}
   2574 
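        	// Decodes ETC1/ETC2 blocks through ETC_Decoder. For sRGB formats the decoded
        	// RGB channels are then converted to linear space in place, using a lazily
        	// initialized 256-entry lookup table.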
   2575 	void Surface::decodeETC2(Buffer &internal, Buffer &external, int nbAlphaBits, bool isSRGB)
   2576 	{
   2577 		ETC_Decoder::Decode((const byte*)external.lockRect(0, 0, 0, LOCK_READONLY), (byte*)internal.lockRect(0, 0, 0, LOCK_UPDATE), external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
   2578 		                    (nbAlphaBits == 8) ? ETC_Decoder::ETC_RGBA : ((nbAlphaBits == 1) ? ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA : ETC_Decoder::ETC_RGB));
   2579 		external.unlockRect();
   2580 		internal.unlockRect();
   2581 
   2582 		if(isSRGB)
   2583 		{
   2584 			static byte sRGBtoLinearTable[256];
   2585 			static bool sRGBtoLinearTableDirty = true;
   2586 			if(sRGBtoLinearTableDirty)
   2587 			{
   2588 				for(int i = 0; i < 256; i++)
   2589 				{
   2590 					sRGBtoLinearTable[i] = static_cast<byte>(sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
   2591 				}
   2592 				sRGBtoLinearTableDirty = false;
   2593 			}
   2594 
   2595 			// Perform sRGB conversion in place after decoding
   2596 			byte *src = (byte*)internal.lockRect(0, 0, 0, LOCK_READWRITE);
   2597 			for(int y = 0; y < internal.height; y++)
   2598 			{
   2599 				byte *srcRow = src + y * internal.pitchB;
   2600 				for(int x = 0; x < internal.width; x++)
   2601 				{
   2602 					byte *srcPix = srcRow + x * internal.bytes;
   2603 					for(int i = 0; i < 3; i++)
   2604 					{
   2605 						srcPix[i] = sRGBtoLinearTable[srcPix[i]];
   2606 					}
   2607 				}
   2608 			}
   2609 			internal.unlockRect();
   2610 		}
   2611 	}
   2612 
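        	// Decodes EAC R11/RG11 blocks through ETC_Decoder, then converts the integer
        	// results in place to normalized floats clamped to [-1, 1], since signed 16-bit
        	// internal formats are not supported yet (see the FIXME in the function body).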
   2613 	void Surface::decodeEAC(Buffer &internal, Buffer &external, int nbChannels, bool isSigned)
   2614 	{
   2615 		ASSERT(nbChannels == 1 || nbChannels == 2);
   2616 
   2617 		byte *src = (byte*)internal.lockRect(0, 0, 0, LOCK_READWRITE);
   2618 		ETC_Decoder::Decode((const byte*)external.lockRect(0, 0, 0, LOCK_READONLY), src, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
   2619 		                    (nbChannels == 1) ? (isSigned ? ETC_Decoder::ETC_R_SIGNED : ETC_Decoder::ETC_R_UNSIGNED) : (isSigned ? ETC_Decoder::ETC_RG_SIGNED : ETC_Decoder::ETC_RG_UNSIGNED));
   2620 		external.unlockRect();
   2621 
   2622 		// FIXME: We convert EAC data to float, until signed short internal formats are supported
   2623 		//        This code can be removed if ETC2 images are decoded to internal 16 bit signed R/RG formats
   2624 		const float normalization = isSigned ? (1.0f / (8.0f * 127.875f)) : (1.0f / (8.0f * 255.875f));
   2625 		for(int y = 0; y < internal.height; y++)
   2626 		{
   2627 			byte* srcRow = src + y * internal.pitchB;
   2628 			for(int x = internal.width - 1; x >= 0; x--)
   2629 			{
   2630 				int* srcPix = reinterpret_cast<int*>(srcRow + x * internal.bytes);
   2631 				float* dstPix = reinterpret_cast<float*>(srcPix);
   2632 				for(int c = nbChannels - 1; c >= 0; c--)
   2633 				{
   2634 					dstPix[c] = clamp(static_cast<float>(srcPix[c]) * normalization, -1.0f, 1.0f);
   2635 				}
   2636 			}
   2637 		}
   2638 
   2639 		internal.unlockRect();
   2640 	}
   2641 
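        	// ASTC decoding is not implemented; this stub leaves the destination untouched.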
   2642 	void Surface::decodeASTC(Buffer &internal, Buffer &external, int xBlockSize, int yBlockSize, int zBlockSize, bool isSRGB)
   2643 	{
   2644 	}
   2645 
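        	// Returns the total allocation size in bytes for the given dimensions (border
        	// included), rounding block-compressed formats up to whole blocks and sizing
        	// YV12 as a Y plane plus two half-resolution, 16-byte-aligned chroma planes.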
   2646 	unsigned int Surface::size(int width, int height, int depth, int border, int samples, Format format)
   2647 	{
   2648 		width += 2 * border;
   2649 		height += 2 * border;
   2650 
   2651 		// Dimensions rounded up to multiples of 4, used for compressed formats
   2652 		int width4 = align(width, 4);
   2653 		int height4 = align(height, 4);
   2654 
   2655 		switch(format)
   2656 		{
   2657 		case FORMAT_DXT1:
   2658 		case FORMAT_ATI1:
   2659 		case FORMAT_ETC1:
   2660 		case FORMAT_R11_EAC:
   2661 		case FORMAT_SIGNED_R11_EAC:
   2662 		case FORMAT_RGB8_ETC2:
   2663 		case FORMAT_SRGB8_ETC2:
   2664 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   2665 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   2666 			return width4 * height4 * depth / 2;
   2667 		case FORMAT_DXT3:
   2668 		case FORMAT_DXT5:
   2669 		case FORMAT_ATI2:
   2670 		case FORMAT_RG11_EAC:
   2671 		case FORMAT_SIGNED_RG11_EAC:
   2672 		case FORMAT_RGBA8_ETC2_EAC:
   2673 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   2674 		case FORMAT_RGBA_ASTC_4x4_KHR:
   2675 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   2676 			return width4 * height4 * depth;
   2677 		case FORMAT_RGBA_ASTC_5x4_KHR:
   2678 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   2679 			return align(width, 5) * height4 * depth;
   2680 		case FORMAT_RGBA_ASTC_5x5_KHR:
   2681 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   2682 			return align(width, 5) * align(height, 5) * depth;
   2683 		case FORMAT_RGBA_ASTC_6x5_KHR:
   2684 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   2685 			return align(width, 6) * align(height, 5) * depth;
   2686 		case FORMAT_RGBA_ASTC_6x6_KHR:
   2687 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   2688 			return align(width, 6) * align(height, 6) * depth;
   2689 		case FORMAT_RGBA_ASTC_8x5_KHR:
   2690 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   2691 			return align(width, 8) * align(height, 5) * depth;
   2692 		case FORMAT_RGBA_ASTC_8x6_KHR:
   2693 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   2694 			return align(width, 8) * align(height, 6) * depth;
   2695 		case FORMAT_RGBA_ASTC_8x8_KHR:
   2696 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   2697 			return align(width, 8) * align(height, 8) * depth;
   2698 		case FORMAT_RGBA_ASTC_10x5_KHR:
   2699 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   2700 			return align(width, 10) * align(height, 5) * depth;
   2701 		case FORMAT_RGBA_ASTC_10x6_KHR:
   2702 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   2703 			return align(width, 10) * align(height, 6) * depth;
   2704 		case FORMAT_RGBA_ASTC_10x8_KHR:
   2705 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   2706 			return align(width, 10) * align(height, 8) * depth;
   2707 		case FORMAT_RGBA_ASTC_10x10_KHR:
   2708 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   2709 			return align(width, 10) * align(height, 10) * depth;
   2710 		case FORMAT_RGBA_ASTC_12x10_KHR:
   2711 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   2712 			return align(width, 12) * align(height, 10) * depth;
   2713 		case FORMAT_RGBA_ASTC_12x12_KHR:
   2714 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   2715 			return align(width, 12) * align(height, 12) * depth;
   2716 		case FORMAT_YV12_BT601:
   2717 		case FORMAT_YV12_BT709:
   2718 		case FORMAT_YV12_JFIF:
   2719 			{
   2720 				unsigned int YStride = align(width, 16);
   2721 				unsigned int YSize = YStride * height;
   2722 				unsigned int CStride = align(YStride / 2, 16);
   2723 				unsigned int CSize = CStride * height / 2;
   2724 
   2725 				return YSize + 2 * CSize;
   2726 			}
   2727 		default:
   2728 			return bytes(format) * width * height * depth * samples;
   2729 		}
   2730 	}
   2731 
   2732 	bool Surface::isStencil(Format format)
   2733 	{
   2734 		switch(format)
   2735 		{
   2736 		case FORMAT_D32:
   2737 		case FORMAT_D16:
   2738 		case FORMAT_D24X8:
   2739 		case FORMAT_D32F:
   2740 		case FORMAT_D32F_COMPLEMENTARY:
   2741 		case FORMAT_D32F_LOCKABLE:
   2742 		case FORMAT_D32F_SHADOW:
   2743 			return false;
   2744 		case FORMAT_D24S8:
   2745 		case FORMAT_D24FS8:
   2746 		case FORMAT_S8:
   2747 		case FORMAT_DF24S8:
   2748 		case FORMAT_DF16S8:
   2749 		case FORMAT_D32FS8_TEXTURE:
   2750 		case FORMAT_D32FS8_SHADOW:
   2751 		case FORMAT_D32FS8:
   2752 		case FORMAT_D32FS8_COMPLEMENTARY:
   2753 		case FORMAT_INTZ:
   2754 			return true;
   2755 		default:
   2756 			return false;
   2757 		}
   2758 	}
   2759 
   2760 	bool Surface::isDepth(Format format)
   2761 	{
   2762 		switch(format)
   2763 		{
   2764 		case FORMAT_D32:
   2765 		case FORMAT_D16:
   2766 		case FORMAT_D24X8:
   2767 		case FORMAT_D24S8:
   2768 		case FORMAT_D24FS8:
   2769 		case FORMAT_D32F:
   2770 		case FORMAT_D32FS8:
   2771 		case FORMAT_D32F_COMPLEMENTARY:
   2772 		case FORMAT_D32FS8_COMPLEMENTARY:
   2773 		case FORMAT_D32F_LOCKABLE:
   2774 		case FORMAT_DF24S8:
   2775 		case FORMAT_DF16S8:
   2776 		case FORMAT_D32FS8_TEXTURE:
   2777 		case FORMAT_D32F_SHADOW:
   2778 		case FORMAT_D32FS8_SHADOW:
   2779 		case FORMAT_INTZ:
   2780 			return true;
   2781 		case FORMAT_S8:
   2782 			return false;
   2783 		default:
   2784 			return false;
   2785 		}
   2786 	}
   2787 
   2788 	bool Surface::hasQuadLayout(Format format)
   2789 	{
   2790 		switch(format)
   2791 		{
   2792 		case FORMAT_D32:
   2793 		case FORMAT_D16:
   2794 		case FORMAT_D24X8:
   2795 		case FORMAT_D24S8:
   2796 		case FORMAT_D24FS8:
   2797 		case FORMAT_D32F:
   2798 		case FORMAT_D32FS8:
   2799 		case FORMAT_D32F_COMPLEMENTARY:
   2800 		case FORMAT_D32FS8_COMPLEMENTARY:
   2801 		case FORMAT_DF24S8:
   2802 		case FORMAT_DF16S8:
   2803 		case FORMAT_INTZ:
   2804 		case FORMAT_S8:
   2805 		case FORMAT_A8G8R8B8Q:
   2806 		case FORMAT_X8G8R8B8Q:
   2807 			return true;
   2808 		case FORMAT_D32F_LOCKABLE:
   2809 		case FORMAT_D32FS8_TEXTURE:
   2810 		case FORMAT_D32F_SHADOW:
   2811 		case FORMAT_D32FS8_SHADOW:
   2812 		default:
   2813 			break;
   2814 		}
   2815 
   2816 		return false;
   2817 	}
   2818 
   2819 	bool Surface::isPalette(Format format)
   2820 	{
   2821 		switch(format)
   2822 		{
   2823 		case FORMAT_P8:
   2824 		case FORMAT_A8P8:
   2825 			return true;
   2826 		default:
   2827 			return false;
   2828 		}
   2829 	}
   2830 
   2831 	bool Surface::isFloatFormat(Format format)
   2832 	{
   2833 		switch(format)
   2834 		{
   2835 		case FORMAT_R5G6B5:
   2836 		case FORMAT_R8G8B8:
   2837 		case FORMAT_B8G8R8:
   2838 		case FORMAT_X8R8G8B8:
   2839 		case FORMAT_X8B8G8R8I:
   2840 		case FORMAT_X8B8G8R8:
   2841 		case FORMAT_A8R8G8B8:
   2842 		case FORMAT_SRGB8_X8:
   2843 		case FORMAT_SRGB8_A8:
   2844 		case FORMAT_A8B8G8R8I:
   2845 		case FORMAT_R8UI:
   2846 		case FORMAT_G8R8UI:
   2847 		case FORMAT_X8B8G8R8UI:
   2848 		case FORMAT_A8B8G8R8UI:
   2849 		case FORMAT_A8B8G8R8:
   2850 		case FORMAT_G8R8I:
   2851 		case FORMAT_G8R8:
   2852 		case FORMAT_A2B10G10R10:
   2853 		case FORMAT_A2B10G10R10UI:
   2854 		case FORMAT_R8_SNORM:
   2855 		case FORMAT_G8R8_SNORM:
   2856 		case FORMAT_X8B8G8R8_SNORM:
   2857 		case FORMAT_A8B8G8R8_SNORM:
   2858 		case FORMAT_R16I:
   2859 		case FORMAT_R16UI:
   2860 		case FORMAT_G16R16I:
   2861 		case FORMAT_G16R16UI:
   2862 		case FORMAT_G16R16:
   2863 		case FORMAT_X16B16G16R16I:
   2864 		case FORMAT_X16B16G16R16UI:
   2865 		case FORMAT_A16B16G16R16I:
   2866 		case FORMAT_A16B16G16R16UI:
   2867 		case FORMAT_A16B16G16R16:
   2868 		case FORMAT_V8U8:
   2869 		case FORMAT_Q8W8V8U8:
   2870 		case FORMAT_X8L8V8U8:
   2871 		case FORMAT_V16U16:
   2872 		case FORMAT_A16W16V16U16:
   2873 		case FORMAT_Q16W16V16U16:
   2874 		case FORMAT_A8:
   2875 		case FORMAT_R8I:
   2876 		case FORMAT_R8:
   2877 		case FORMAT_S8:
   2878 		case FORMAT_L8:
   2879 		case FORMAT_L16:
   2880 		case FORMAT_A8L8:
   2881 		case FORMAT_YV12_BT601:
   2882 		case FORMAT_YV12_BT709:
   2883 		case FORMAT_YV12_JFIF:
   2884 		case FORMAT_R32I:
   2885 		case FORMAT_R32UI:
   2886 		case FORMAT_G32R32I:
   2887 		case FORMAT_G32R32UI:
   2888 		case FORMAT_X32B32G32R32I:
   2889 		case FORMAT_X32B32G32R32UI:
   2890 		case FORMAT_A32B32G32R32I:
   2891 		case FORMAT_A32B32G32R32UI:
   2892 			return false;
   2893 		case FORMAT_R16F:
   2894 		case FORMAT_G16R16F:
   2895 		case FORMAT_B16G16R16F:
   2896 		case FORMAT_X16B16G16R16F:
   2897 		case FORMAT_A16B16G16R16F:
   2898 		case FORMAT_X16B16G16R16F_UNSIGNED:
   2899 		case FORMAT_R32F:
   2900 		case FORMAT_G32R32F:
   2901 		case FORMAT_B32G32R32F:
   2902 		case FORMAT_X32B32G32R32F:
   2903 		case FORMAT_A32B32G32R32F:
   2904 		case FORMAT_X32B32G32R32F_UNSIGNED:
   2905 		case FORMAT_D32F:
   2906 		case FORMAT_D32FS8:
   2907 		case FORMAT_D32F_COMPLEMENTARY:
   2908 		case FORMAT_D32FS8_COMPLEMENTARY:
   2909 		case FORMAT_D32F_LOCKABLE:
   2910 		case FORMAT_D32FS8_TEXTURE:
   2911 		case FORMAT_D32F_SHADOW:
   2912 		case FORMAT_D32FS8_SHADOW:
   2913 		case FORMAT_L16F:
   2914 		case FORMAT_A16L16F:
   2915 		case FORMAT_L32F:
   2916 		case FORMAT_A32L32F:
   2917 			return true;
   2918 		default:
   2919 			ASSERT(false);
   2920 		}
   2921 
   2922 		return false;
   2923 	}
   2924 
   2925 	bool Surface::isUnsignedComponent(Format format, int component)
   2926 	{
   2927 		switch(format)
   2928 		{
   2929 		case FORMAT_NULL:
   2930 		case FORMAT_R5G6B5:
   2931 		case FORMAT_R8G8B8:
   2932 		case FORMAT_B8G8R8:
   2933 		case FORMAT_X8R8G8B8:
   2934 		case FORMAT_X8B8G8R8:
   2935 		case FORMAT_A8R8G8B8:
   2936 		case FORMAT_A8B8G8R8:
   2937 		case FORMAT_SRGB8_X8:
   2938 		case FORMAT_SRGB8_A8:
   2939 		case FORMAT_G8R8:
   2940 		case FORMAT_A2B10G10R10:
   2941 		case FORMAT_A2B10G10R10UI:
   2942 		case FORMAT_R16UI:
   2943 		case FORMAT_G16R16:
   2944 		case FORMAT_G16R16UI:
   2945 		case FORMAT_X16B16G16R16UI:
   2946 		case FORMAT_A16B16G16R16:
   2947 		case FORMAT_A16B16G16R16UI:
   2948 		case FORMAT_R32UI:
   2949 		case FORMAT_G32R32UI:
   2950 		case FORMAT_X32B32G32R32UI:
   2951 		case FORMAT_A32B32G32R32UI:
   2952 		case FORMAT_X32B32G32R32F_UNSIGNED:
   2953 		case FORMAT_R8UI:
   2954 		case FORMAT_G8R8UI:
   2955 		case FORMAT_X8B8G8R8UI:
   2956 		case FORMAT_A8B8G8R8UI:
   2957 		case FORMAT_D32F:
   2958 		case FORMAT_D32FS8:
   2959 		case FORMAT_D32F_COMPLEMENTARY:
   2960 		case FORMAT_D32FS8_COMPLEMENTARY:
   2961 		case FORMAT_D32F_LOCKABLE:
   2962 		case FORMAT_D32FS8_TEXTURE:
   2963 		case FORMAT_D32F_SHADOW:
   2964 		case FORMAT_D32FS8_SHADOW:
   2965 		case FORMAT_A8:
   2966 		case FORMAT_R8:
   2967 		case FORMAT_L8:
   2968 		case FORMAT_L16:
   2969 		case FORMAT_A8L8:
   2970 		case FORMAT_YV12_BT601:
   2971 		case FORMAT_YV12_BT709:
   2972 		case FORMAT_YV12_JFIF:
   2973 			return true;
   2974 		case FORMAT_A8B8G8R8I:
   2975 		case FORMAT_A16B16G16R16I:
   2976 		case FORMAT_A32B32G32R32I:
   2977 		case FORMAT_A8B8G8R8_SNORM:
   2978 		case FORMAT_Q8W8V8U8:
   2979 		case FORMAT_Q16W16V16U16:
   2980 		case FORMAT_A32B32G32R32F:
   2981 			return false;
   2982 		case FORMAT_R32F:
   2983 		case FORMAT_R8I:
   2984 		case FORMAT_R16I:
   2985 		case FORMAT_R32I:
   2986 		case FORMAT_R8_SNORM:
   2987 			return component >= 1;
   2988 		case FORMAT_V8U8:
   2989 		case FORMAT_X8L8V8U8:
   2990 		case FORMAT_V16U16:
   2991 		case FORMAT_G32R32F:
   2992 		case FORMAT_G8R8I:
   2993 		case FORMAT_G16R16I:
   2994 		case FORMAT_G32R32I:
   2995 		case FORMAT_G8R8_SNORM:
   2996 			return component >= 2;
   2997 		case FORMAT_A16W16V16U16:
   2998 		case FORMAT_B32G32R32F:
   2999 		case FORMAT_X32B32G32R32F:
   3000 		case FORMAT_X8B8G8R8I:
   3001 		case FORMAT_X16B16G16R16I:
   3002 		case FORMAT_X32B32G32R32I:
   3003 		case FORMAT_X8B8G8R8_SNORM:
   3004 			return component >= 3;
   3005 		default:
   3006 			ASSERT(false);
   3007 		}
   3008 
   3009 		return false;
   3010 	}
   3011 
   3012 	bool Surface::isSRGBreadable(Format format)
   3013 	{
   3014 		// Keep in sync with Capabilities::isSRGBreadable
   3015 		switch(format)
   3016 		{
   3017 		case FORMAT_L8:
   3018 		case FORMAT_A8L8:
   3019 		case FORMAT_R8G8B8:
   3020 		case FORMAT_A8R8G8B8:
   3021 		case FORMAT_X8R8G8B8:
   3022 		case FORMAT_A8B8G8R8:
   3023 		case FORMAT_X8B8G8R8:
   3024 		case FORMAT_SRGB8_X8:
   3025 		case FORMAT_SRGB8_A8:
   3026 		case FORMAT_R5G6B5:
   3027 		case FORMAT_X1R5G5B5:
   3028 		case FORMAT_A1R5G5B5:
   3029 		case FORMAT_A4R4G4B4:
   3030 		case FORMAT_DXT1:
   3031 		case FORMAT_DXT3:
   3032 		case FORMAT_DXT5:
   3033 		case FORMAT_ATI1:
   3034 		case FORMAT_ATI2:
   3035 			return true;
   3036 		default:
   3037 			return false;
   3038 		}
   3039 	}
   3040 
   3041 	bool Surface::isSRGBwritable(Format format)
   3042 	{
   3043 		// Keep in sync with Capabilities::isSRGBwritable
   3044 		switch(format)
   3045 		{
   3046 		case FORMAT_NULL:
   3047 		case FORMAT_A8R8G8B8:
   3048 		case FORMAT_X8R8G8B8:
   3049 		case FORMAT_A8B8G8R8:
   3050 		case FORMAT_X8B8G8R8:
   3051 		case FORMAT_SRGB8_X8:
   3052 		case FORMAT_SRGB8_A8:
   3053 		case FORMAT_R5G6B5:
   3054 			return true;
   3055 		default:
   3056 			return false;
   3057 		}
   3058 	}
   3059 
   3060 	bool Surface::isSRGBformat(Format format)
   3061 	{
   3062 		switch(format)
   3063 		{
   3064 		case FORMAT_SRGB8_X8:
   3065 		case FORMAT_SRGB8_A8:
   3066 			return true;
   3067 		default:
   3068 			return false;
   3069 		}
   3070 	}
   3071 
   3072 	bool Surface::isCompressed(Format format)
   3073 	{
   3074 		switch(format)
   3075 		{
   3076 		case FORMAT_DXT1:
   3077 		case FORMAT_DXT3:
   3078 		case FORMAT_DXT5:
   3079 		case FORMAT_ATI1:
   3080 		case FORMAT_ATI2:
   3081 		case FORMAT_ETC1:
   3082 		case FORMAT_R11_EAC:
   3083 		case FORMAT_SIGNED_R11_EAC:
   3084 		case FORMAT_RG11_EAC:
   3085 		case FORMAT_SIGNED_RG11_EAC:
   3086 		case FORMAT_RGB8_ETC2:
   3087 		case FORMAT_SRGB8_ETC2:
   3088 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   3089 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   3090 		case FORMAT_RGBA8_ETC2_EAC:
   3091 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   3092 		case FORMAT_RGBA_ASTC_4x4_KHR:
   3093 		case FORMAT_RGBA_ASTC_5x4_KHR:
   3094 		case FORMAT_RGBA_ASTC_5x5_KHR:
   3095 		case FORMAT_RGBA_ASTC_6x5_KHR:
   3096 		case FORMAT_RGBA_ASTC_6x6_KHR:
   3097 		case FORMAT_RGBA_ASTC_8x5_KHR:
   3098 		case FORMAT_RGBA_ASTC_8x6_KHR:
   3099 		case FORMAT_RGBA_ASTC_8x8_KHR:
   3100 		case FORMAT_RGBA_ASTC_10x5_KHR:
   3101 		case FORMAT_RGBA_ASTC_10x6_KHR:
   3102 		case FORMAT_RGBA_ASTC_10x8_KHR:
   3103 		case FORMAT_RGBA_ASTC_10x10_KHR:
   3104 		case FORMAT_RGBA_ASTC_12x10_KHR:
   3105 		case FORMAT_RGBA_ASTC_12x12_KHR:
   3106 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   3107 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   3108 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   3109 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   3110 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   3111 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   3112 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   3113 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   3114 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   3115 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   3116 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   3117 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   3118 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   3119 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   3120 			return true;
   3121 		default:
   3122 			return false;
   3123 		}
   3124 	}
   3125 
   3126 	bool Surface::isSignedNonNormalizedInteger(Format format)
   3127 	{
   3128 		switch(format)
   3129 		{
   3130 		case FORMAT_A8B8G8R8I:
   3131 		case FORMAT_X8B8G8R8I:
   3132 		case FORMAT_G8R8I:
   3133 		case FORMAT_R8I:
   3134 		case FORMAT_A16B16G16R16I:
   3135 		case FORMAT_X16B16G16R16I:
   3136 		case FORMAT_G16R16I:
   3137 		case FORMAT_R16I:
   3138 		case FORMAT_A32B32G32R32I:
   3139 		case FORMAT_X32B32G32R32I:
   3140 		case FORMAT_G32R32I:
   3141 		case FORMAT_R32I:
   3142 			return true;
   3143 		default:
   3144 			return false;
   3145 		}
   3146 	}
   3147 
   3148 	bool Surface::isUnsignedNonNormalizedInteger(Format format)
   3149 	{
   3150 		switch(format)
   3151 		{
   3152 		case FORMAT_A8B8G8R8UI:
   3153 		case FORMAT_X8B8G8R8UI:
   3154 		case FORMAT_G8R8UI:
   3155 		case FORMAT_R8UI:
   3156 		case FORMAT_A16B16G16R16UI:
   3157 		case FORMAT_X16B16G16R16UI:
   3158 		case FORMAT_G16R16UI:
   3159 		case FORMAT_R16UI:
   3160 		case FORMAT_A32B32G32R32UI:
   3161 		case FORMAT_X32B32G32R32UI:
   3162 		case FORMAT_G32R32UI:
   3163 		case FORMAT_R32UI:
   3164 			return true;
   3165 		default:
   3166 			return false;
   3167 		}
   3168 	}
   3169 
   3170 	bool Surface::isNonNormalizedInteger(Format format)
   3171 	{
   3172 		return isSignedNonNormalizedInteger(format) ||
   3173 		       isUnsignedNonNormalizedInteger(format);
   3174 	}
   3175 
   3176 	bool Surface::isNormalizedInteger(Format format)
   3177 	{
   3178 		return !isFloatFormat(format) &&
   3179 		       !isNonNormalizedInteger(format) &&
   3180 		       !isCompressed(format) &&
   3181 		       !isDepth(format) &&
   3182 		       !isStencil(format);
   3183 	}
   3184 
   3185 	int Surface::componentCount(Format format)
   3186 	{
   3187 		switch(format)
   3188 		{
   3189 		case FORMAT_R5G6B5:         return 3;
   3190 		case FORMAT_X8R8G8B8:       return 3;
   3191 		case FORMAT_X8B8G8R8I:      return 3;
   3192 		case FORMAT_X8B8G8R8:       return 3;
   3193 		case FORMAT_A8R8G8B8:       return 4;
   3194 		case FORMAT_SRGB8_X8:       return 3;
   3195 		case FORMAT_SRGB8_A8:       return 4;
   3196 		case FORMAT_A8B8G8R8I:      return 4;
   3197 		case FORMAT_A8B8G8R8:       return 4;
   3198 		case FORMAT_G8R8I:          return 2;
   3199 		case FORMAT_G8R8:           return 2;
    3200 		case FORMAT_R8_SNORM:       return 1;
    3201 		case FORMAT_G8R8_SNORM:     return 2;
    3202 		case FORMAT_X8B8G8R8_SNORM: return 3;
    3203 		case FORMAT_A8B8G8R8_SNORM: return 4;
   3204 		case FORMAT_R8UI:           return 1;
   3205 		case FORMAT_G8R8UI:         return 2;
   3206 		case FORMAT_X8B8G8R8UI:     return 3;
   3207 		case FORMAT_A8B8G8R8UI:     return 4;
   3208 		case FORMAT_A2B10G10R10:    return 4;
   3209 		case FORMAT_A2B10G10R10UI:  return 4;
   3210 		case FORMAT_G16R16I:        return 2;
   3211 		case FORMAT_G16R16UI:       return 2;
   3212 		case FORMAT_G16R16:         return 2;
   3213 		case FORMAT_G32R32I:        return 2;
   3214 		case FORMAT_G32R32UI:       return 2;
   3215 		case FORMAT_X16B16G16R16I:  return 3;
   3216 		case FORMAT_X16B16G16R16UI: return 3;
   3217 		case FORMAT_A16B16G16R16I:  return 4;
   3218 		case FORMAT_A16B16G16R16UI: return 4;
   3219 		case FORMAT_A16B16G16R16:   return 4;
   3220 		case FORMAT_X32B32G32R32I:  return 3;
   3221 		case FORMAT_X32B32G32R32UI: return 3;
   3222 		case FORMAT_A32B32G32R32I:  return 4;
   3223 		case FORMAT_A32B32G32R32UI: return 4;
   3224 		case FORMAT_V8U8:           return 2;
   3225 		case FORMAT_Q8W8V8U8:       return 4;
   3226 		case FORMAT_X8L8V8U8:       return 3;
   3227 		case FORMAT_V16U16:         return 2;
   3228 		case FORMAT_A16W16V16U16:   return 4;
   3229 		case FORMAT_Q16W16V16U16:   return 4;
   3230 		case FORMAT_R32F:           return 1;
   3231 		case FORMAT_G32R32F:        return 2;
   3232 		case FORMAT_X32B32G32R32F:  return 3;
   3233 		case FORMAT_A32B32G32R32F:  return 4;
   3234 		case FORMAT_X32B32G32R32F_UNSIGNED: return 3;
   3235 		case FORMAT_D32F:           return 1;
   3236 		case FORMAT_D32FS8:         return 1;
   3237 		case FORMAT_D32F_LOCKABLE:  return 1;
   3238 		case FORMAT_D32FS8_TEXTURE: return 1;
   3239 		case FORMAT_D32F_SHADOW:    return 1;
   3240 		case FORMAT_D32FS8_SHADOW:  return 1;
   3241 		case FORMAT_A8:             return 1;
   3242 		case FORMAT_R8I:            return 1;
   3243 		case FORMAT_R8:             return 1;
   3244 		case FORMAT_R16I:           return 1;
   3245 		case FORMAT_R16UI:          return 1;
   3246 		case FORMAT_R32I:           return 1;
   3247 		case FORMAT_R32UI:          return 1;
   3248 		case FORMAT_L8:             return 1;
   3249 		case FORMAT_L16:            return 1;
   3250 		case FORMAT_A8L8:           return 2;
   3251 		case FORMAT_YV12_BT601:     return 3;
   3252 		case FORMAT_YV12_BT709:     return 3;
   3253 		case FORMAT_YV12_JFIF:      return 3;
   3254 		default:
   3255 			ASSERT(false);
   3256 		}
   3257 
   3258 		return 1;
   3259 	}
   3260 
   3261 	void *Surface::allocateBuffer(int width, int height, int depth, int border, int samples, Format format)
   3262 	{
    3263 		// Render targets are rasterized in 2x2 pixel quads, so round the dimensions up to even
   3264 		int width2 = (width + 1) & ~1;
   3265 		int height2 = (height + 1) & ~1;
   3266 
   3267 		// FIXME: Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
   3268 		// and stencil operations also read 8 bytes per four 8-bit stencil values,
   3269 		// so we have to allocate 4 extra bytes to avoid buffer overruns.
   3270 		return allocate(size(width2, height2, depth, border, samples, format) + 4);
   3271 	}
   3272 
   3273 	void Surface::memfill4(void *buffer, int pattern, int bytes)
   3274 	{
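		// Fill 'bytes' bytes with the 32-bit 'pattern' using progressively wider stores:
		// byte and word stores until the pointer is 4-byte (and, with SSE, 16-byte) aligned,
		// 64-byte blocks of non-temporal SSE stores for the bulk, then 4/2/1-byte stores for
		// the remaining tail. Typical use (as in clearDepth below):
		//   memfill4(row, (int&)depth, width * sizeof(float));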
   3275 		while((size_t)buffer & 0x1 && bytes >= 1)
   3276 		{
   3277 			*(char*)buffer = (char)pattern;
   3278 			(char*&)buffer += 1;
   3279 			bytes -= 1;
   3280 		}
   3281 
   3282 		while((size_t)buffer & 0x3 && bytes >= 2)
   3283 		{
   3284 			*(short*)buffer = (short)pattern;
   3285 			(short*&)buffer += 1;
   3286 			bytes -= 2;
   3287 		}
   3288 
   3289 		#if defined(__i386__) || defined(__x86_64__)
   3290 			if(CPUID::supportsSSE())
   3291 			{
   3292 				while((size_t)buffer & 0xF && bytes >= 4)
   3293 				{
   3294 					*(int*)buffer = pattern;
   3295 					(int*&)buffer += 1;
   3296 					bytes -= 4;
   3297 				}
   3298 
   3299 				__m128 quad = _mm_set_ps1((float&)pattern);
   3300 
   3301 				float *pointer = (float*)buffer;
   3302 				int qxwords = bytes / 64;
   3303 				bytes -= qxwords * 64;
   3304 
   3305 				while(qxwords--)
   3306 				{
   3307 					_mm_stream_ps(pointer + 0, quad);
   3308 					_mm_stream_ps(pointer + 4, quad);
   3309 					_mm_stream_ps(pointer + 8, quad);
   3310 					_mm_stream_ps(pointer + 12, quad);
   3311 
   3312 					pointer += 16;
   3313 				}
   3314 
   3315 				buffer = pointer;
   3316 			}
   3317 		#endif
   3318 
   3319 		while(bytes >= 4)
   3320 		{
   3321 			*(int*)buffer = (int)pattern;
   3322 			(int*&)buffer += 1;
   3323 			bytes -= 4;
   3324 		}
   3325 
   3326 		while(bytes >= 2)
   3327 		{
   3328 			*(short*)buffer = (short)pattern;
   3329 			(short*&)buffer += 1;
   3330 			bytes -= 2;
   3331 		}
   3332 
   3333 		while(bytes >= 1)
   3334 		{
   3335 			*(char*)buffer = (char)pattern;
   3336 			(char*&)buffer += 1;
   3337 			bytes -= 1;
   3338 		}
   3339 	}
   3340 
   3341 	void Surface::sync()
   3342 	{
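		// Briefly acquiring exclusive access blocks until no other accessor (e.g. the
		// renderer) still holds the resource, which effectively waits for any outstanding
		// operations that reference this surface.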
   3343 		resource->lock(EXCLUSIVE);
   3344 		resource->unlock();
   3345 	}
   3346 
   3347 	bool Surface::isEntire(const Rect& rect) const
   3348 	{
   3349 		return (rect.x0 == 0 && rect.y0 == 0 && rect.x1 == internal.width && rect.y1 == internal.height && internal.depth == 1);
   3350 	}
   3351 
   3352 	Rect Surface::getRect() const
   3353 	{
   3354 		return Rect(0, 0, internal.width, internal.height);
   3355 	}
   3356 
   3357 	void Surface::clearDepth(float depth, int x0, int y0, int width, int height)
   3358 	{
   3359 		if(width == 0 || height == 0) return;
   3360 
   3361 		// Not overlapping
   3362 		if(x0 > internal.width) return;
   3363 		if(y0 > internal.height) return;
   3364 		if(x0 + width < 0) return;
   3365 		if(y0 + height < 0) return;
   3366 
   3367 		// Clip against dimensions
   3368 		if(x0 < 0) {width += x0; x0 = 0;}
   3369 		if(x0 + width > internal.width) width = internal.width - x0;
   3370 		if(y0 < 0) {height += y0; y0 = 0;}
   3371 		if(y0 + height > internal.height) height = internal.height - y0;
   3372 
   3373 		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
   3374 		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
   3375 
   3376 		int x1 = x0 + width;
   3377 		int y1 = y0 + height;
   3378 
   3379 		if(!hasQuadLayout(internal.format))
   3380 		{
   3381 			float *target = (float*)lockInternal(x0, y0, 0, lock, PUBLIC);
   3382 
   3383 			for(int z = 0; z < internal.samples; z++)
   3384 			{
   3385 				float *row = target;
   3386 				for(int y = y0; y < y1; y++)
   3387 				{
   3388 					memfill4(row, (int&)depth, width * sizeof(float));
   3389 					row += internal.pitchP;
   3390 				}
   3391 				target += internal.sliceP;
   3392 			}
   3393 
   3394 			unlockInternal();
   3395 		}
   3396 		else   // Quad layout
   3397 		{
   3398 			if(complementaryDepthBuffer)
   3399 			{
   3400 				depth = 1 - depth;
   3401 			}
   3402 
   3403 			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
   3404 
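			// In the quad layout, the pixel at (x, y) lives at float offset
			//   (y & ~1) * pitchP + (y & 1) * 2 + (x & ~1) * 2 + (x & 1),
			// i.e. each 2x2 quad occupies four consecutive floats (top-left, top-right,
			// bottom-left, bottom-right). oddX0/oddX1 address a possibly unaligned left/right
			// column, where target[i + 2] is the pixel directly below in the same quad, and
			// [evenX0, oddX1) is the contiguous span covering both rows of the full quads,
			// which is why memfill4() fills two scanlines at once and y is advanced twice.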
   3405 			int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
   3406 			int oddX1 = (x1 & ~1) * 2;
   3407 			int evenX0 = ((x0 + 1) & ~1) * 2;
   3408 			int evenBytes = (oddX1 - evenX0) * sizeof(float);
   3409 
   3410 			for(int z = 0; z < internal.samples; z++)
   3411 			{
   3412 				for(int y = y0; y < y1; y++)
   3413 				{
   3414 					float *target = buffer + (y & ~1) * internal.pitchP + (y & 1) * 2;
   3415 
   3416 					if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
   3417 					{
   3418 						if((x0 & 1) != 0)
   3419 						{
   3420 							target[oddX0 + 0] = depth;
   3421 							target[oddX0 + 2] = depth;
   3422 						}
   3423 
   3424 					//	for(int x2 = evenX0; x2 < x1 * 2; x2 += 4)
   3425 					//	{
   3426 					//		target[x2 + 0] = depth;
   3427 					//		target[x2 + 1] = depth;
   3428 					//		target[x2 + 2] = depth;
   3429 					//		target[x2 + 3] = depth;
   3430 					//	}
   3431 
   3432 					//	__asm
   3433 					//	{
   3434 					//		movss xmm0, depth
   3435 					//		shufps xmm0, xmm0, 0x00
   3436 					//
   3437 					//		mov eax, x0
   3438 					//		add eax, 1
   3439 					//		and eax, 0xFFFFFFFE
   3440 					//		cmp eax, x1
   3441 					//		jge qEnd
   3442 					//
   3443 					//		mov edi, target
   3444 					//
   3445 					//	qLoop:
   3446 					//		movntps [edi+8*eax], xmm0
   3447 					//
   3448 					//		add eax, 2
   3449 					//		cmp eax, x1
   3450 					//		jl qLoop
   3451 					//	qEnd:
   3452 					//	}
   3453 
   3454 						memfill4(&target[evenX0], (int&)depth, evenBytes);
   3455 
   3456 						if((x1 & 1) != 0)
   3457 						{
   3458 							target[oddX1 + 0] = depth;
   3459 							target[oddX1 + 2] = depth;
   3460 						}
   3461 
   3462 						y++;
   3463 					}
   3464 					else
   3465 					{
   3466 						for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
   3467 						{
   3468 							target[i] = depth;
   3469 						}
   3470 					}
   3471 				}
   3472 
   3473 				buffer += internal.sliceP;
   3474 			}
   3475 
   3476 			unlockInternal();
   3477 		}
   3478 	}
   3479 
   3480 	void Surface::clearStencil(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
   3481 	{
   3482 		if(mask == 0 || width == 0 || height == 0) return;
   3483 
   3484 		// Not overlapping
   3485 		if(x0 > internal.width) return;
   3486 		if(y0 > internal.height) return;
   3487 		if(x0 + width < 0) return;
   3488 		if(y0 + height < 0) return;
   3489 
   3490 		// Clip against dimensions
   3491 		if(x0 < 0) {width += x0; x0 = 0;}
   3492 		if(x0 + width > internal.width) width = internal.width - x0;
   3493 		if(y0 < 0) {height += y0; y0 = 0;}
   3494 		if(y0 + height > internal.height) height = internal.height - y0;
   3495 
   3496 		int x1 = x0 + width;
   3497 		int y1 = y0 + height;
   3498 
   3499 		int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
   3500 		int oddX1 = (x1 & ~1) * 2;
   3501 		int evenX0 = ((x0 + 1) & ~1) * 2;
   3502 		int evenBytes = oddX1 - evenX0;
   3503 
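		// Replicate the masked stencil value into all four bytes so memfill4() can store it a
		// word (or an SSE register) at a time. The quad-line fast path is only taken when
		// mask == 0xFF; with a partial mask every byte needs a read-modify-write.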
   3504 		unsigned char maskedS = s & mask;
   3505 		unsigned char invMask = ~mask;
   3506 		unsigned int fill = maskedS;
   3507 		fill = fill | (fill << 8) | (fill << 16) | (fill << 24);
   3508 
   3509 		char *buffer = (char*)lockStencil(0, 0, 0, PUBLIC);
   3510 
   3511 		// Stencil buffers are assumed to use quad layout
   3512 		for(int z = 0; z < stencil.samples; z++)
   3513 		{
   3514 			for(int y = y0; y < y1; y++)
   3515 			{
   3516 				char *target = buffer + (y & ~1) * stencil.pitchP + (y & 1) * 2;
   3517 
   3518 				if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF)   // Fill quad line at once
   3519 				{
   3520 					if((x0 & 1) != 0)
   3521 					{
   3522 						target[oddX0 + 0] = fill;
   3523 						target[oddX0 + 2] = fill;
   3524 					}
   3525 
   3526 					memfill4(&target[evenX0], fill, evenBytes);
   3527 
   3528 					if((x1 & 1) != 0)
   3529 					{
   3530 						target[oddX1 + 0] = fill;
   3531 						target[oddX1 + 2] = fill;
   3532 					}
   3533 
   3534 					y++;
   3535 				}
   3536 				else
   3537 				{
   3538 					for(int x = x0; x < x1; x++)
   3539 					{
   3540 						int i = (x & ~1) * 2 + (x & 1);
   3541 						target[i] = maskedS | (target[i] & invMask);
   3542 					}
   3543 				}
   3544 			}
   3545 
   3546 			buffer += stencil.sliceP;
   3547 		}
   3548 
   3549 		unlockStencil();
   3550 	}
   3551 
   3552 	void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
   3553 	{
   3554 		unsigned char *row;
   3555 		Buffer *buffer;
   3556 
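		// Write into whichever copy is current: the internal buffer when it holds newer
		// data (dirty), otherwise the external buffer.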
   3557 		if(internal.dirty)
   3558 		{
   3559 			row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
   3560 			buffer = &internal;
   3561 		}
   3562 		else
   3563 		{
   3564 			row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
   3565 			buffer = &external;
   3566 		}
   3567 
   3568 		if(buffer->bytes <= 4)
   3569 		{
   3570 			int c;
   3571 			buffer->write(&c, color);
   3572 
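			// Widen the written pixel into a full 32-bit pattern for memfill4(). This
			// replication tiles correctly for 1-, 2- and 4-byte formats (a 3-byte format
			// would not repeat on a 4-byte period).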
   3573 			if(buffer->bytes <= 1) c = (c << 8)  | c;
   3574 			if(buffer->bytes <= 2) c = (c << 16) | c;
   3575 
   3576 			for(int y = 0; y < height; y++)
   3577 			{
   3578 				memfill4(row, c, width * buffer->bytes);
   3579 
   3580 				row += buffer->pitchB;
   3581 			}
   3582 		}
   3583 		else   // Generic
   3584 		{
   3585 			for(int y = 0; y < height; y++)
   3586 			{
   3587 				unsigned char *element = row;
   3588 
   3589 				for(int x = 0; x < width; x++)
   3590 				{
   3591 					buffer->write(element, color);
   3592 
   3593 					element += buffer->bytes;
   3594 				}
   3595 
   3596 				row += buffer->pitchB;
   3597 			}
   3598 		}
   3599 
   3600 		if(buffer == &internal)
   3601 		{
   3602 			unlockInternal();
   3603 		}
   3604 		else
   3605 		{
   3606 			unlockExternal();
   3607 		}
   3608 	}
   3609 
   3610 	void Surface::copyInternal(const Surface *source, int x, int y, float srcX, float srcY, bool filter)
   3611 	{
   3612 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
   3613 
   3614 		sw::Color<float> color;
   3615 
   3616 		if(!filter)
   3617 		{
   3618 			color = source->internal.read((int)srcX, (int)srcY, 0);
   3619 		}
   3620 		else   // Bilinear filtering
   3621 		{
   3622 			color = source->internal.sample(srcX, srcY, 0);
   3623 		}
   3624 
   3625 		internal.write(x, y, color);
   3626 	}
   3627 
   3628 	void Surface::copyInternal(const Surface *source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
   3629 	{
   3630 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
   3631 
   3632 		sw::Color<float> color;
   3633 
   3634 		if(!filter)
   3635 		{
   3636 			color = source->internal.read((int)srcX, (int)srcY, int(srcZ));
   3637 		}
   3638 		else   // Bilinear filtering
   3639 		{
   3640 			color = source->internal.sample(srcX, srcY, srcZ);
   3641 		}
   3642 
   3643 		internal.write(x, y, z, color);
   3644 	}
   3645 
   3646 	void Surface::copyCubeEdge(Edge dstEdge, Surface *src, Edge srcEdge)
   3647 	{
   3648 		Surface *dst = this;
   3649 
    3650 		// Determine whether the two edges have to be copied in reverse order with respect to one another.
   3651 		// The copy should be reversed whenever the same edges are contiguous or if we're
   3652 		// copying top <-> right or bottom <-> left. This is explained by the layout, which is:
   3653 		//
   3654 		//      | +y |
   3655 		// | -x | +z | +x | -z |
   3656 		//      | -y |
   3657 
   3658 		bool reverse = (srcEdge == dstEdge) ||
   3659 		               ((srcEdge == TOP) && (dstEdge == RIGHT)) ||
   3660 		               ((srcEdge == RIGHT) && (dstEdge == TOP)) ||
   3661 		               ((srcEdge == BOTTOM) && (dstEdge == LEFT)) ||
   3662 		               ((srcEdge == LEFT) && (dstEdge == BOTTOM));
   3663 
   3664 		int srcBytes = src->bytes(src->Surface::getInternalFormat());
   3665 		int srcPitch = src->getInternalPitchB();
   3666 		int dstBytes = dst->bytes(dst->Surface::getInternalFormat());
   3667 		int dstPitch = dst->getInternalPitchB();
   3668 
   3669 		int srcW = src->getWidth();
   3670 		int srcH = src->getHeight();
   3671 		int dstW = dst->getWidth();
   3672 		int dstH = dst->getHeight();
   3673 
   3674 		ASSERT(srcW == srcH && dstW == dstH && srcW == dstW && srcBytes == dstBytes);
   3675 
   3676 		// Src is expressed in the regular [0, width-1], [0, height-1] space
   3677 		int srcDelta = ((srcEdge == TOP) || (srcEdge == BOTTOM)) ? srcBytes : srcPitch;
   3678 		int srcStart = ((srcEdge == BOTTOM) ? srcPitch * (srcH - 1) : ((srcEdge == RIGHT) ? srcBytes * (srcW - 1) : 0));
   3679 
    3680 		// Dst contains borders, so it is expressed in the [-1, width], [-1, height] space (border texels included)
   3681 		int dstDelta = (((dstEdge == TOP) || (dstEdge == BOTTOM)) ? dstBytes : dstPitch) * (reverse ? -1 : 1);
   3682 		int dstStart = ((dstEdge == BOTTOM) ? dstPitch * (dstH + 1) : ((dstEdge == RIGHT) ? dstBytes * (dstW + 1) : 0)) + (reverse ? dstW * -dstDelta : dstDelta);
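		// dst is locked including its 1-texel border, with the returned pointer at texel (-1, -1).
		// dstStart therefore points at the first border texel to write, next to (but not including)
		// a corner; the corner texels are skipped here and averaged afterwards (see below).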
   3683 
   3684 		char *srcBuf = (char*)src->lockInternal(0, 0, 0, sw::LOCK_READONLY, sw::PRIVATE) + srcStart;
   3685 		char *dstBuf = (char*)dst->lockInternal(-1, -1, 0, sw::LOCK_READWRITE, sw::PRIVATE) + dstStart;
   3686 
   3687 		for(int i = 0; i < srcW; ++i, dstBuf += dstDelta, srcBuf += srcDelta)
   3688 		{
   3689 			memcpy(dstBuf, srcBuf, srcBytes);
   3690 		}
   3691 
   3692 		if(dstEdge == LEFT || dstEdge == RIGHT)
   3693 		{
   3694 			// TOP and BOTTOM are already set, let's average out the corners
   3695 			int x0 = (dstEdge == RIGHT) ? dstW : -1;
   3696 			int y0 = -1;
   3697 			int x1 = (dstEdge == RIGHT) ? dstW - 1 : 0;
   3698 			int y1 = 0;
   3699 			dst->computeCubeCorner(x0, y0, x1, y1);
   3700 			y0 = dstH;
   3701 			y1 = dstH - 1;
   3702 			dst->computeCubeCorner(x0, y0, x1, y1);
   3703 		}
   3704 
   3705 		src->unlockInternal();
   3706 		dst->unlockInternal();
   3707 	}
   3708 
   3709 	void Surface::computeCubeCorner(int x0, int y0, int x1, int y1)
   3710 	{
   3711 		ASSERT(internal.lock != LOCK_UNLOCKED);
   3712 
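		// The border corner at (x0, y0) has no direct source texel; approximate it as the
		// average of its three already-written neighbours (x1, y0), (x0, y1) and (x1, y1).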
   3713 		sw::Color<float> color = internal.read(x0, y1);
   3714 		color += internal.read(x1, y0);
   3715 		color += internal.read(x1, y1);
   3716 		color *= (1.0f / 3.0f);
   3717 
   3718 		internal.write(x0, y0, color);
   3719 	}
   3720 
   3721 	bool Surface::hasStencil() const
   3722 	{
   3723 		return isStencil(external.format);
   3724 	}
   3725 
   3726 	bool Surface::hasDepth() const
   3727 	{
   3728 		return isDepth(external.format);
   3729 	}
   3730 
   3731 	bool Surface::hasPalette() const
   3732 	{
   3733 		return isPalette(external.format);
   3734 	}
   3735 
   3736 	bool Surface::isRenderTarget() const
   3737 	{
   3738 		return renderTarget;
   3739 	}
   3740 
   3741 	bool Surface::hasDirtyContents() const
   3742 	{
   3743 		return dirtyContents;
   3744 	}
   3745 
   3746 	void Surface::markContentsClean()
   3747 	{
   3748 		dirtyContents = false;
   3749 	}
   3750 
   3751 	Resource *Surface::getResource()
   3752 	{
   3753 		return resource;
   3754 	}
   3755 
   3756 	bool Surface::identicalFormats() const
   3757 	{
   3758 		return external.format == internal.format &&
   3759 		       external.width  == internal.width &&
   3760 		       external.height == internal.height &&
   3761 		       external.depth  == internal.depth &&
   3762 		       external.pitchB == internal.pitchB &&
   3763 		       external.sliceB == internal.sliceB &&
   3764 		       external.border == internal.border &&
   3765 		       external.samples == internal.samples;
   3766 	}
   3767 
   3768 	Format Surface::selectInternalFormat(Format format) const
   3769 	{
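		// Choose the format used for internal storage of a surface created with the given
		// external format. Formats that cannot be rendered or sampled directly are promoted
		// to a wider supported format (e.g. palettized and low-bit-depth color to 8888,
		// half-float to 32-bit float, compressed formats to an uncompressed equivalent).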
   3770 		switch(format)
   3771 		{
   3772 		case FORMAT_NULL:
   3773 			return FORMAT_NULL;
   3774 		case FORMAT_P8:
   3775 		case FORMAT_A8P8:
   3776 		case FORMAT_A4R4G4B4:
   3777 		case FORMAT_A1R5G5B5:
   3778 		case FORMAT_A8R3G3B2:
   3779 			return FORMAT_A8R8G8B8;
   3780 		case FORMAT_A8:
   3781 			return FORMAT_A8;
   3782 		case FORMAT_R8I:
   3783 			return FORMAT_R8I;
   3784 		case FORMAT_R8UI:
   3785 			return FORMAT_R8UI;
   3786 		case FORMAT_R8_SNORM:
   3787 			return FORMAT_R8_SNORM;
   3788 		case FORMAT_R8:
   3789 			return FORMAT_R8;
   3790 		case FORMAT_R16I:
   3791 			return FORMAT_R16I;
   3792 		case FORMAT_R16UI:
   3793 			return FORMAT_R16UI;
   3794 		case FORMAT_R32I:
   3795 			return FORMAT_R32I;
   3796 		case FORMAT_R32UI:
   3797 			return FORMAT_R32UI;
   3798 		case FORMAT_X16B16G16R16I:
   3799 			return FORMAT_X16B16G16R16I;
   3800 		case FORMAT_A16B16G16R16I:
   3801 			return FORMAT_A16B16G16R16I;
   3802 		case FORMAT_X16B16G16R16UI:
   3803 			return FORMAT_X16B16G16R16UI;
   3804 		case FORMAT_A16B16G16R16UI:
   3805 			return FORMAT_A16B16G16R16UI;
   3806 		case FORMAT_A2R10G10B10:
   3807 		case FORMAT_A2B10G10R10:
   3808 		case FORMAT_A16B16G16R16:
   3809 			return FORMAT_A16B16G16R16;
   3810 		case FORMAT_A2B10G10R10UI:
   3811 			return FORMAT_A16B16G16R16UI;
   3812 		case FORMAT_X32B32G32R32I:
   3813 			return FORMAT_X32B32G32R32I;
   3814 		case FORMAT_A32B32G32R32I:
   3815 			return FORMAT_A32B32G32R32I;
   3816 		case FORMAT_X32B32G32R32UI:
   3817 			return FORMAT_X32B32G32R32UI;
   3818 		case FORMAT_A32B32G32R32UI:
   3819 			return FORMAT_A32B32G32R32UI;
   3820 		case FORMAT_G8R8I:
   3821 			return FORMAT_G8R8I;
   3822 		case FORMAT_G8R8UI:
   3823 			return FORMAT_G8R8UI;
   3824 		case FORMAT_G8R8_SNORM:
   3825 			return FORMAT_G8R8_SNORM;
   3826 		case FORMAT_G8R8:
   3827 			return FORMAT_G8R8;
   3828 		case FORMAT_G16R16I:
   3829 			return FORMAT_G16R16I;
   3830 		case FORMAT_G16R16UI:
   3831 			return FORMAT_G16R16UI;
   3832 		case FORMAT_G16R16:
   3833 			return FORMAT_G16R16;
   3834 		case FORMAT_G32R32I:
   3835 			return FORMAT_G32R32I;
   3836 		case FORMAT_G32R32UI:
   3837 			return FORMAT_G32R32UI;
   3838 		case FORMAT_A8R8G8B8:
   3839 			if(lockable || !quadLayoutEnabled)
   3840 			{
   3841 				return FORMAT_A8R8G8B8;
   3842 			}
   3843 			else
   3844 			{
   3845 				return FORMAT_A8G8R8B8Q;
   3846 			}
   3847 		case FORMAT_A8B8G8R8I:
   3848 			return FORMAT_A8B8G8R8I;
   3849 		case FORMAT_A8B8G8R8UI:
   3850 			return FORMAT_A8B8G8R8UI;
   3851 		case FORMAT_A8B8G8R8_SNORM:
   3852 			return FORMAT_A8B8G8R8_SNORM;
   3853 		case FORMAT_R5G5B5A1:
   3854 		case FORMAT_R4G4B4A4:
   3855 		case FORMAT_A8B8G8R8:
   3856 			return FORMAT_A8B8G8R8;
   3857 		case FORMAT_R5G6B5:
   3858 			return FORMAT_R5G6B5;
   3859 		case FORMAT_R3G3B2:
   3860 		case FORMAT_R8G8B8:
   3861 		case FORMAT_X4R4G4B4:
   3862 		case FORMAT_X1R5G5B5:
   3863 		case FORMAT_X8R8G8B8:
   3864 			if(lockable || !quadLayoutEnabled)
   3865 			{
   3866 				return FORMAT_X8R8G8B8;
   3867 			}
   3868 			else
   3869 			{
   3870 				return FORMAT_X8G8R8B8Q;
   3871 			}
   3872 		case FORMAT_X8B8G8R8I:
   3873 			return FORMAT_X8B8G8R8I;
   3874 		case FORMAT_X8B8G8R8UI:
   3875 			return FORMAT_X8B8G8R8UI;
   3876 		case FORMAT_X8B8G8R8_SNORM:
   3877 			return FORMAT_X8B8G8R8_SNORM;
   3878 		case FORMAT_B8G8R8:
   3879 		case FORMAT_X8B8G8R8:
   3880 			return FORMAT_X8B8G8R8;
   3881 		case FORMAT_SRGB8_X8:
   3882 			return FORMAT_SRGB8_X8;
   3883 		case FORMAT_SRGB8_A8:
   3884 			return FORMAT_SRGB8_A8;
   3885 		// Compressed formats
   3886 		case FORMAT_DXT1:
   3887 		case FORMAT_DXT3:
   3888 		case FORMAT_DXT5:
   3889 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   3890 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
   3891 		case FORMAT_RGBA8_ETC2_EAC:
   3892 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
   3893 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
   3894 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
   3895 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
   3896 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
   3897 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
   3898 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
   3899 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
   3900 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
   3901 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
   3902 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
   3903 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
   3904 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
   3905 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
   3906 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
   3907 			return FORMAT_A8R8G8B8;
   3908 		case FORMAT_RGBA_ASTC_4x4_KHR:
   3909 		case FORMAT_RGBA_ASTC_5x4_KHR:
   3910 		case FORMAT_RGBA_ASTC_5x5_KHR:
   3911 		case FORMAT_RGBA_ASTC_6x5_KHR:
   3912 		case FORMAT_RGBA_ASTC_6x6_KHR:
   3913 		case FORMAT_RGBA_ASTC_8x5_KHR:
   3914 		case FORMAT_RGBA_ASTC_8x6_KHR:
   3915 		case FORMAT_RGBA_ASTC_8x8_KHR:
   3916 		case FORMAT_RGBA_ASTC_10x5_KHR:
   3917 		case FORMAT_RGBA_ASTC_10x6_KHR:
   3918 		case FORMAT_RGBA_ASTC_10x8_KHR:
   3919 		case FORMAT_RGBA_ASTC_10x10_KHR:
   3920 		case FORMAT_RGBA_ASTC_12x10_KHR:
   3921 		case FORMAT_RGBA_ASTC_12x12_KHR:
   3922 			// ASTC supports HDR, so a floating point format is required to represent it properly
   3923 			return FORMAT_A32B32G32R32F; // FIXME: 16FP is probably sufficient, but it's currently unsupported
   3924 		case FORMAT_ATI1:
   3925 			return FORMAT_R8;
   3926 		case FORMAT_R11_EAC:
   3927 		case FORMAT_SIGNED_R11_EAC:
    3928 			return FORMAT_R32F; // FIXME: Signed 8-bit format would be sufficient
   3929 		case FORMAT_ATI2:
   3930 			return FORMAT_G8R8;
   3931 		case FORMAT_RG11_EAC:
   3932 		case FORMAT_SIGNED_RG11_EAC:
    3933 			return FORMAT_G32R32F; // FIXME: Signed 8-bit format would be sufficient
   3934 		case FORMAT_ETC1:
   3935 		case FORMAT_RGB8_ETC2:
   3936 		case FORMAT_SRGB8_ETC2:
   3937 			return FORMAT_X8R8G8B8;
   3938 		// Bumpmap formats
   3939 		case FORMAT_V8U8:			return FORMAT_V8U8;
   3940 		case FORMAT_L6V5U5:			return FORMAT_X8L8V8U8;
   3941 		case FORMAT_Q8W8V8U8:		return FORMAT_Q8W8V8U8;
   3942 		case FORMAT_X8L8V8U8:		return FORMAT_X8L8V8U8;
   3943 		case FORMAT_V16U16:			return FORMAT_V16U16;
   3944 		case FORMAT_A2W10V10U10:	return FORMAT_A16W16V16U16;
   3945 		case FORMAT_Q16W16V16U16:	return FORMAT_Q16W16V16U16;
   3946 		// Floating-point formats
   3947 		case FORMAT_A16F:			return FORMAT_A32B32G32R32F;
   3948 		case FORMAT_R16F:			return FORMAT_R32F;
   3949 		case FORMAT_G16R16F:		return FORMAT_G32R32F;
   3950 		case FORMAT_B16G16R16F:     return FORMAT_X32B32G32R32F;
   3951 		case FORMAT_X16B16G16R16F:	return FORMAT_X32B32G32R32F;
   3952 		case FORMAT_A16B16G16R16F:	return FORMAT_A32B32G32R32F;
   3953 		case FORMAT_X16B16G16R16F_UNSIGNED: return FORMAT_X32B32G32R32F_UNSIGNED;
   3954 		case FORMAT_A32F:			return FORMAT_A32B32G32R32F;
   3955 		case FORMAT_R32F:			return FORMAT_R32F;
   3956 		case FORMAT_G32R32F:		return FORMAT_G32R32F;
   3957 		case FORMAT_B32G32R32F:     return FORMAT_X32B32G32R32F;
   3958 		case FORMAT_X32B32G32R32F:  return FORMAT_X32B32G32R32F;
   3959 		case FORMAT_A32B32G32R32F:	return FORMAT_A32B32G32R32F;
   3960 		case FORMAT_X32B32G32R32F_UNSIGNED: return FORMAT_X32B32G32R32F_UNSIGNED;
   3961 		// Luminance formats
   3962 		case FORMAT_L8:				return FORMAT_L8;
   3963 		case FORMAT_A4L4:			return FORMAT_A8L8;
   3964 		case FORMAT_L16:			return FORMAT_L16;
   3965 		case FORMAT_A8L8:			return FORMAT_A8L8;
   3966 		case FORMAT_L16F:           return FORMAT_X32B32G32R32F;
   3967 		case FORMAT_A16L16F:        return FORMAT_A32B32G32R32F;
   3968 		case FORMAT_L32F:           return FORMAT_X32B32G32R32F;
   3969 		case FORMAT_A32L32F:        return FORMAT_A32B32G32R32F;
   3970 		// Depth/stencil formats
   3971 		case FORMAT_D16:
   3972 		case FORMAT_D32:
   3973 		case FORMAT_D24X8:
   3974 			if(hasParent)   // Texture
   3975 			{
   3976 				return FORMAT_D32F_SHADOW;
   3977 			}
   3978 			else if(complementaryDepthBuffer)
   3979 			{
   3980 				return FORMAT_D32F_COMPLEMENTARY;
   3981 			}
   3982 			else
   3983 			{
   3984 				return FORMAT_D32F;
   3985 			}
   3986 		case FORMAT_D24S8:
   3987 		case FORMAT_D24FS8:
   3988 			if(hasParent)   // Texture
   3989 			{
   3990 				return FORMAT_D32FS8_SHADOW;
   3991 			}
   3992 			else if(complementaryDepthBuffer)
   3993 			{
   3994 				return FORMAT_D32FS8_COMPLEMENTARY;
   3995 			}
   3996 			else
   3997 			{
   3998 				return FORMAT_D32FS8;
   3999 			}
   4000 		case FORMAT_D32F:           return FORMAT_D32F;
   4001 		case FORMAT_D32FS8:         return FORMAT_D32FS8;
   4002 		case FORMAT_D32F_LOCKABLE:  return FORMAT_D32F_LOCKABLE;
   4003 		case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
   4004 		case FORMAT_INTZ:           return FORMAT_D32FS8_TEXTURE;
   4005 		case FORMAT_DF24S8:         return FORMAT_D32FS8_SHADOW;
   4006 		case FORMAT_DF16S8:         return FORMAT_D32FS8_SHADOW;
   4007 		case FORMAT_S8:             return FORMAT_S8;
   4008 		// YUV formats
   4009 		case FORMAT_YV12_BT601:     return FORMAT_YV12_BT601;
   4010 		case FORMAT_YV12_BT709:     return FORMAT_YV12_BT709;
   4011 		case FORMAT_YV12_JFIF:      return FORMAT_YV12_JFIF;
   4012 		default:
   4013 			ASSERT(false);
   4014 		}
   4015 
   4016 		return FORMAT_NULL;
   4017 	}
   4018 
   4019 	void Surface::setTexturePalette(unsigned int *palette)
   4020 	{
   4021 		Surface::palette = palette;
   4022 		Surface::paletteID++;
   4023 	}
   4024 
   4025 	void Surface::resolve()
   4026 	{
   4027 		if(internal.samples <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
   4028 		{
   4029 			return;
   4030 		}
   4031 
   4032 		ASSERT(internal.depth == 1);  // Unimplemented
   4033 
   4034 		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
   4035 
   4036 		int width = internal.width;
   4037 		int height = internal.height;
   4038 		int pitch = internal.pitchB;
   4039 		int slice = internal.sliceB;
   4040 
   4041 		unsigned char *source0 = (unsigned char*)source;
   4042 		unsigned char *source1 = source0 + slice;
   4043 		unsigned char *source2 = source1 + slice;
   4044 		unsigned char *source3 = source2 + slice;
   4045 		unsigned char *source4 = source3 + slice;
   4046 		unsigned char *source5 = source4 + slice;
   4047 		unsigned char *source6 = source5 + slice;
   4048 		unsigned char *source7 = source6 + slice;
   4049 		unsigned char *source8 = source7 + slice;
   4050 		unsigned char *source9 = source8 + slice;
   4051 		unsigned char *sourceA = source9 + slice;
   4052 		unsigned char *sourceB = sourceA + slice;
   4053 		unsigned char *sourceC = sourceB + slice;
   4054 		unsigned char *sourceD = sourceC + slice;
   4055 		unsigned char *sourceE = sourceD + slice;
   4056 		unsigned char *sourceF = sourceE + slice;
   4057 
   4058 		if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 ||
   4059 		   internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8 ||
   4060 		   internal.format == FORMAT_SRGB8_X8 || internal.format == FORMAT_SRGB8_A8)
   4061 		{
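			// Average all samples of each pixel into sample 0 (the sample slices are laid out
			// consecutively, sliceB apart). Samples are combined pairwise in a tree, so every
			// step is a simple average of two values; the SSE2 path does this four pixels at a
			// time with _mm_avg_epu8, and the scalar fallback below mimics it.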
   4062 			#if defined(__i386__) || defined(__x86_64__)
   4063 				if(CPUID::supportsSSE2() && (width % 4) == 0)
   4064 				{
   4065 					if(internal.samples == 2)
   4066 					{
   4067 						for(int y = 0; y < height; y++)
   4068 						{
   4069 							for(int x = 0; x < width; x += 4)
   4070 							{
   4071 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4072 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4073 
   4074 								c0 = _mm_avg_epu8(c0, c1);
   4075 
   4076 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4077 							}
   4078 
   4079 							source0 += pitch;
   4080 							source1 += pitch;
   4081 						}
   4082 					}
   4083 					else if(internal.samples == 4)
   4084 					{
   4085 						for(int y = 0; y < height; y++)
   4086 						{
   4087 							for(int x = 0; x < width; x += 4)
   4088 							{
   4089 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4090 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4091 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   4092 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   4093 
   4094 								c0 = _mm_avg_epu8(c0, c1);
   4095 								c2 = _mm_avg_epu8(c2, c3);
   4096 								c0 = _mm_avg_epu8(c0, c2);
   4097 
   4098 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4099 							}
   4100 
   4101 							source0 += pitch;
   4102 							source1 += pitch;
   4103 							source2 += pitch;
   4104 							source3 += pitch;
   4105 						}
   4106 					}
   4107 					else if(internal.samples == 8)
   4108 					{
   4109 						for(int y = 0; y < height; y++)
   4110 						{
   4111 							for(int x = 0; x < width; x += 4)
   4112 							{
   4113 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4114 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4115 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   4116 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   4117 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
   4118 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
   4119 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
   4120 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
   4121 
   4122 								c0 = _mm_avg_epu8(c0, c1);
   4123 								c2 = _mm_avg_epu8(c2, c3);
   4124 								c4 = _mm_avg_epu8(c4, c5);
   4125 								c6 = _mm_avg_epu8(c6, c7);
   4126 								c0 = _mm_avg_epu8(c0, c2);
   4127 								c4 = _mm_avg_epu8(c4, c6);
   4128 								c0 = _mm_avg_epu8(c0, c4);
   4129 
   4130 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4131 							}
   4132 
   4133 							source0 += pitch;
   4134 							source1 += pitch;
   4135 							source2 += pitch;
   4136 							source3 += pitch;
   4137 							source4 += pitch;
   4138 							source5 += pitch;
   4139 							source6 += pitch;
   4140 							source7 += pitch;
   4141 						}
   4142 					}
   4143 					else if(internal.samples == 16)
   4144 					{
   4145 						for(int y = 0; y < height; y++)
   4146 						{
   4147 							for(int x = 0; x < width; x += 4)
   4148 							{
   4149 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4150 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4151 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   4152 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   4153 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
   4154 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
   4155 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
   4156 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
   4157 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
   4158 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
   4159 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
   4160 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
   4161 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
   4162 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
   4163 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
   4164 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
   4165 
   4166 								c0 = _mm_avg_epu8(c0, c1);
   4167 								c2 = _mm_avg_epu8(c2, c3);
   4168 								c4 = _mm_avg_epu8(c4, c5);
   4169 								c6 = _mm_avg_epu8(c6, c7);
   4170 								c8 = _mm_avg_epu8(c8, c9);
   4171 								cA = _mm_avg_epu8(cA, cB);
   4172 								cC = _mm_avg_epu8(cC, cD);
   4173 								cE = _mm_avg_epu8(cE, cF);
   4174 								c0 = _mm_avg_epu8(c0, c2);
   4175 								c4 = _mm_avg_epu8(c4, c6);
   4176 								c8 = _mm_avg_epu8(c8, cA);
   4177 								cC = _mm_avg_epu8(cC, cE);
   4178 								c0 = _mm_avg_epu8(c0, c4);
   4179 								c8 = _mm_avg_epu8(c8, cC);
   4180 								c0 = _mm_avg_epu8(c0, c8);
   4181 
   4182 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4183 							}
   4184 
   4185 							source0 += pitch;
   4186 							source1 += pitch;
   4187 							source2 += pitch;
   4188 							source3 += pitch;
   4189 							source4 += pitch;
   4190 							source5 += pitch;
   4191 							source6 += pitch;
   4192 							source7 += pitch;
   4193 							source8 += pitch;
   4194 							source9 += pitch;
   4195 							sourceA += pitch;
   4196 							sourceB += pitch;
   4197 							sourceC += pitch;
   4198 							sourceD += pitch;
   4199 							sourceE += pitch;
   4200 							sourceF += pitch;
   4201 						}
   4202 					}
   4203 					else ASSERT(false);
   4204 				}
   4205 				else
   4206 			#endif
   4207 			{
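				// Rounded-up average of two packed pixels, one byte lane at a time, without
				// letting carries cross lanes: x + y == ((x & y) << 1) + (x ^ y), so
				// (x & y) + (((x ^ y) >> 1) & 0x7F7F7F7F) is the per-byte floor average and
				// ((x ^ y) & 0x01010101) adds the rounding bit, matching _mm_avg_epu8.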
   4208 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
   4209 
   4210 				if(internal.samples == 2)
   4211 				{
   4212 					for(int y = 0; y < height; y++)
   4213 					{
   4214 						for(int x = 0; x < width; x++)
   4215 						{
   4216 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4217 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4218 
   4219 							c0 = AVERAGE(c0, c1);
   4220 
   4221 							*(unsigned int*)(source0 + 4 * x) = c0;
   4222 						}
   4223 
   4224 						source0 += pitch;
   4225 						source1 += pitch;
   4226 					}
   4227 				}
   4228 				else if(internal.samples == 4)
   4229 				{
   4230 					for(int y = 0; y < height; y++)
   4231 					{
   4232 						for(int x = 0; x < width; x++)
   4233 						{
   4234 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4235 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4236 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4237 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4238 
   4239 							c0 = AVERAGE(c0, c1);
   4240 							c2 = AVERAGE(c2, c3);
   4241 							c0 = AVERAGE(c0, c2);
   4242 
   4243 							*(unsigned int*)(source0 + 4 * x) = c0;
   4244 						}
   4245 
   4246 						source0 += pitch;
   4247 						source1 += pitch;
   4248 						source2 += pitch;
   4249 						source3 += pitch;
   4250 					}
   4251 				}
   4252 				else if(internal.samples == 8)
   4253 				{
   4254 					for(int y = 0; y < height; y++)
   4255 					{
   4256 						for(int x = 0; x < width; x++)
   4257 						{
   4258 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4259 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4260 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4261 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4262 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4263 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4264 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4265 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4266 
   4267 							c0 = AVERAGE(c0, c1);
   4268 							c2 = AVERAGE(c2, c3);
   4269 							c4 = AVERAGE(c4, c5);
   4270 							c6 = AVERAGE(c6, c7);
   4271 							c0 = AVERAGE(c0, c2);
   4272 							c4 = AVERAGE(c4, c6);
   4273 							c0 = AVERAGE(c0, c4);
   4274 
   4275 							*(unsigned int*)(source0 + 4 * x) = c0;
   4276 						}
   4277 
   4278 						source0 += pitch;
   4279 						source1 += pitch;
   4280 						source2 += pitch;
   4281 						source3 += pitch;
   4282 						source4 += pitch;
   4283 						source5 += pitch;
   4284 						source6 += pitch;
   4285 						source7 += pitch;
   4286 					}
   4287 				}
   4288 				else if(internal.samples == 16)
   4289 				{
   4290 					for(int y = 0; y < height; y++)
   4291 					{
   4292 						for(int x = 0; x < width; x++)
   4293 						{
   4294 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4295 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4296 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4297 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4298 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4299 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4300 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4301 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4302 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
   4303 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
   4304 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
   4305 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
   4306 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
   4307 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
   4308 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
   4309 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
   4310 
   4311 							c0 = AVERAGE(c0, c1);
   4312 							c2 = AVERAGE(c2, c3);
   4313 							c4 = AVERAGE(c4, c5);
   4314 							c6 = AVERAGE(c6, c7);
   4315 							c8 = AVERAGE(c8, c9);
   4316 							cA = AVERAGE(cA, cB);
   4317 							cC = AVERAGE(cC, cD);
   4318 							cE = AVERAGE(cE, cF);
   4319 							c0 = AVERAGE(c0, c2);
   4320 							c4 = AVERAGE(c4, c6);
   4321 							c8 = AVERAGE(c8, cA);
   4322 							cC = AVERAGE(cC, cE);
   4323 							c0 = AVERAGE(c0, c4);
   4324 							c8 = AVERAGE(c8, cC);
   4325 							c0 = AVERAGE(c0, c8);
   4326 
   4327 							*(unsigned int*)(source0 + 4 * x) = c0;
   4328 						}
   4329 
   4330 						source0 += pitch;
   4331 						source1 += pitch;
   4332 						source2 += pitch;
   4333 						source3 += pitch;
   4334 						source4 += pitch;
   4335 						source5 += pitch;
   4336 						source6 += pitch;
   4337 						source7 += pitch;
   4338 						source8 += pitch;
   4339 						source9 += pitch;
   4340 						sourceA += pitch;
   4341 						sourceB += pitch;
   4342 						sourceC += pitch;
   4343 						sourceD += pitch;
   4344 						sourceE += pitch;
   4345 						sourceF += pitch;
   4346 					}
   4347 				}
   4348 				else ASSERT(false);
   4349 
   4350 				#undef AVERAGE
   4351 			}
   4352 		}
   4353 		else if(internal.format == FORMAT_G16R16)
   4354 		{
   4355 
   4356 			#if defined(__i386__) || defined(__x86_64__)
   4357 				if(CPUID::supportsSSE2() && (width % 4) == 0)
   4358 				{
   4359 					if(internal.samples == 2)
   4360 					{
   4361 						for(int y = 0; y < height; y++)
   4362 						{
   4363 							for(int x = 0; x < width; x += 4)
   4364 							{
   4365 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4366 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4367 
   4368 								c0 = _mm_avg_epu16(c0, c1);
   4369 
   4370 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4371 							}
   4372 
   4373 							source0 += pitch;
   4374 							source1 += pitch;
   4375 						}
   4376 					}
   4377 					else if(internal.samples == 4)
   4378 					{
   4379 						for(int y = 0; y < height; y++)
   4380 						{
   4381 							for(int x = 0; x < width; x += 4)
   4382 							{
   4383 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4384 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4385 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   4386 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   4387 
   4388 								c0 = _mm_avg_epu16(c0, c1);
   4389 								c2 = _mm_avg_epu16(c2, c3);
   4390 								c0 = _mm_avg_epu16(c0, c2);
   4391 
   4392 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4393 							}
   4394 
   4395 							source0 += pitch;
   4396 							source1 += pitch;
   4397 							source2 += pitch;
   4398 							source3 += pitch;
   4399 						}
   4400 					}
   4401 					else if(internal.samples == 8)
   4402 					{
   4403 						for(int y = 0; y < height; y++)
   4404 						{
   4405 							for(int x = 0; x < width; x += 4)
   4406 							{
   4407 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4408 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4409 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   4410 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   4411 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
   4412 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
   4413 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
   4414 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
   4415 
   4416 								c0 = _mm_avg_epu16(c0, c1);
   4417 								c2 = _mm_avg_epu16(c2, c3);
   4418 								c4 = _mm_avg_epu16(c4, c5);
   4419 								c6 = _mm_avg_epu16(c6, c7);
   4420 								c0 = _mm_avg_epu16(c0, c2);
   4421 								c4 = _mm_avg_epu16(c4, c6);
   4422 								c0 = _mm_avg_epu16(c0, c4);
   4423 
   4424 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4425 							}
   4426 
   4427 							source0 += pitch;
   4428 							source1 += pitch;
   4429 							source2 += pitch;
   4430 							source3 += pitch;
   4431 							source4 += pitch;
   4432 							source5 += pitch;
   4433 							source6 += pitch;
   4434 							source7 += pitch;
   4435 						}
   4436 					}
   4437 					else if(internal.samples == 16)
   4438 					{
   4439 						for(int y = 0; y < height; y++)
   4440 						{
   4441 							for(int x = 0; x < width; x += 4)
   4442 							{
   4443 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
   4444 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
   4445 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
   4446 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
   4447 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
   4448 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
   4449 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
   4450 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
   4451 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
   4452 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
   4453 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
   4454 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
   4455 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
   4456 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
   4457 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
   4458 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
   4459 
   4460 								c0 = _mm_avg_epu16(c0, c1);
   4461 								c2 = _mm_avg_epu16(c2, c3);
   4462 								c4 = _mm_avg_epu16(c4, c5);
   4463 								c6 = _mm_avg_epu16(c6, c7);
   4464 								c8 = _mm_avg_epu16(c8, c9);
   4465 								cA = _mm_avg_epu16(cA, cB);
   4466 								cC = _mm_avg_epu16(cC, cD);
   4467 								cE = _mm_avg_epu16(cE, cF);
   4468 								c0 = _mm_avg_epu16(c0, c2);
   4469 								c4 = _mm_avg_epu16(c4, c6);
   4470 								c8 = _mm_avg_epu16(c8, cA);
   4471 								cC = _mm_avg_epu16(cC, cE);
   4472 								c0 = _mm_avg_epu16(c0, c4);
   4473 								c8 = _mm_avg_epu16(c8, cC);
   4474 								c0 = _mm_avg_epu16(c0, c8);
   4475 
   4476 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
   4477 							}
   4478 
   4479 							source0 += pitch;
   4480 							source1 += pitch;
   4481 							source2 += pitch;
   4482 							source3 += pitch;
   4483 							source4 += pitch;
   4484 							source5 += pitch;
   4485 							source6 += pitch;
   4486 							source7 += pitch;
   4487 							source8 += pitch;
   4488 							source9 += pitch;
   4489 							sourceA += pitch;
   4490 							sourceB += pitch;
   4491 							sourceC += pitch;
   4492 							sourceD += pitch;
   4493 							sourceE += pitch;
   4494 							sourceF += pitch;
   4495 						}
   4496 					}
   4497 					else ASSERT(false);
   4498 				}
   4499 				else
   4500 			#endif
   4501 			{
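				// Same carry-free averaging trick as above, but per 16-bit channel: the masks
				// 0x7FFF7FFF and 0x00010001 confine the shifted bits and the rounding bit to
				// each G16R16 component, matching _mm_avg_epu16.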
   4502 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
   4503 
   4504 				if(internal.samples == 2)
   4505 				{
   4506 					for(int y = 0; y < height; y++)
   4507 					{
   4508 						for(int x = 0; x < width; x++)
   4509 						{
   4510 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4511 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4512 
   4513 							c0 = AVERAGE(c0, c1);
   4514 
   4515 							*(unsigned int*)(source0 + 4 * x) = c0;
   4516 						}
   4517 
   4518 						source0 += pitch;
   4519 						source1 += pitch;
   4520 					}
   4521 				}
   4522 				else if(internal.samples == 4)
   4523 				{
   4524 					for(int y = 0; y < height; y++)
   4525 					{
   4526 						for(int x = 0; x < width; x++)
   4527 						{
   4528 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4529 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4530 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4531 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4532 
   4533 							c0 = AVERAGE(c0, c1);
   4534 							c2 = AVERAGE(c2, c3);
   4535 							c0 = AVERAGE(c0, c2);
   4536 
   4537 							*(unsigned int*)(source0 + 4 * x) = c0;
   4538 						}
   4539 
   4540 						source0 += pitch;
   4541 						source1 += pitch;
   4542 						source2 += pitch;
   4543 						source3 += pitch;
   4544 					}
   4545 				}
   4546 				else if(internal.samples == 8)
   4547 				{
   4548 					for(int y = 0; y < height; y++)
   4549 					{
   4550 						for(int x = 0; x < width; x++)
   4551 						{
   4552 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4553 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4554 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4555 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4556 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4557 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4558 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4559 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4560 
   4561 							c0 = AVERAGE(c0, c1);
   4562 							c2 = AVERAGE(c2, c3);
   4563 							c4 = AVERAGE(c4, c5);
   4564 							c6 = AVERAGE(c6, c7);
   4565 							c0 = AVERAGE(c0, c2);
   4566 							c4 = AVERAGE(c4, c6);
   4567 							c0 = AVERAGE(c0, c4);
   4568 
   4569 							*(unsigned int*)(source0 + 4 * x) = c0;
   4570 						}
   4571 
   4572 						source0 += pitch;
   4573 						source1 += pitch;
   4574 						source2 += pitch;
   4575 						source3 += pitch;
   4576 						source4 += pitch;
   4577 						source5 += pitch;
   4578 						source6 += pitch;
   4579 						source7 += pitch;
   4580 					}
   4581 				}
   4582 				else if(internal.samples == 16)
   4583 				{
   4584 					for(int y = 0; y < height; y++)
   4585 					{
   4586 						for(int x = 0; x < width; x++)
   4587 						{
   4588 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4589 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4590 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4591 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4592 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4593 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4594 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4595 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4596 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
   4597 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
   4598 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
   4599 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
   4600 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
   4601 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
   4602 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
   4603 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
   4604 
   4605 							c0 = AVERAGE(c0, c1);
   4606 							c2 = AVERAGE(c2, c3);
   4607 							c4 = AVERAGE(c4, c5);
   4608 							c6 = AVERAGE(c6, c7);
   4609 							c8 = AVERAGE(c8, c9);
   4610 							cA = AVERAGE(cA, cB);
   4611 							cC = AVERAGE(cC, cD);
   4612 							cE = AVERAGE(cE, cF);
   4613 							c0 = AVERAGE(c0, c2);
   4614 							c4 = AVERAGE(c4, c6);
   4615 							c8 = AVERAGE(c8, cA);
   4616 							cC = AVERAGE(cC, cE);
   4617 							c0 = AVERAGE(c0, c4);
   4618 							c8 = AVERAGE(c8, cC);
   4619 							c0 = AVERAGE(c0, c8);
   4620 
   4621 							*(unsigned int*)(source0 + 4 * x) = c0;
   4622 						}
   4623 
   4624 						source0 += pitch;
   4625 						source1 += pitch;
   4626 						source2 += pitch;
   4627 						source3 += pitch;
   4628 						source4 += pitch;
   4629 						source5 += pitch;
   4630 						source6 += pitch;
   4631 						source7 += pitch;
   4632 						source8 += pitch;
   4633 						source9 += pitch;
   4634 						sourceA += pitch;
   4635 						sourceB += pitch;
   4636 						sourceC += pitch;
   4637 						sourceD += pitch;
   4638 						sourceE += pitch;
   4639 						sourceF += pitch;
   4640 					}
   4641 				}
   4642 				else ASSERT(false);
   4643 
   4644 				#undef AVERAGE
   4645 			}
   4646 		}
   4647 		else if(internal.format == FORMAT_A16B16G16R16)
   4648 		{
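			// FORMAT_A16B16G16R16: 16 bits per channel, 8 bytes per pixel. Samples are
			// resolved with a tree of rounded pairwise averages; the SSE2 path below
			// processes two pixels (one 128-bit register) per iteration.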
   4649 			#if defined(__i386__) || defined(__x86_64__)
   4650 				if(CPUID::supportsSSE2() && (width % 2) == 0)
   4651 				{
   4652 					if(internal.samples == 2)
   4653 					{
   4654 						for(int y = 0; y < height; y++)
   4655 						{
   4656 							for(int x = 0; x < width; x += 2)
   4657 							{
   4658 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
   4659 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
   4660 
   4661 								c0 = _mm_avg_epu16(c0, c1);
   4662 
   4663 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
   4664 							}
   4665 
   4666 							source0 += pitch;
   4667 							source1 += pitch;
   4668 						}
   4669 					}
   4670 					else if(internal.samples == 4)
   4671 					{
   4672 						for(int y = 0; y < height; y++)
   4673 						{
   4674 							for(int x = 0; x < width; x += 2)
   4675 							{
   4676 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
   4677 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
   4678 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
   4679 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
   4680 
   4681 								c0 = _mm_avg_epu16(c0, c1);
   4682 								c2 = _mm_avg_epu16(c2, c3);
   4683 								c0 = _mm_avg_epu16(c0, c2);
   4684 
   4685 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
   4686 							}
   4687 
   4688 							source0 += pitch;
   4689 							source1 += pitch;
   4690 							source2 += pitch;
   4691 							source3 += pitch;
   4692 						}
   4693 					}
   4694 					else if(internal.samples == 8)
   4695 					{
   4696 						for(int y = 0; y < height; y++)
   4697 						{
   4698 							for(int x = 0; x < width; x += 2)
   4699 							{
   4700 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
   4701 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
   4702 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
   4703 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
   4704 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
   4705 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
   4706 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
   4707 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
   4708 
   4709 								c0 = _mm_avg_epu16(c0, c1);
   4710 								c2 = _mm_avg_epu16(c2, c3);
   4711 								c4 = _mm_avg_epu16(c4, c5);
   4712 								c6 = _mm_avg_epu16(c6, c7);
   4713 								c0 = _mm_avg_epu16(c0, c2);
   4714 								c4 = _mm_avg_epu16(c4, c6);
   4715 								c0 = _mm_avg_epu16(c0, c4);
   4716 
   4717 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
   4718 							}
   4719 
   4720 							source0 += pitch;
   4721 							source1 += pitch;
   4722 							source2 += pitch;
   4723 							source3 += pitch;
   4724 							source4 += pitch;
   4725 							source5 += pitch;
   4726 							source6 += pitch;
   4727 							source7 += pitch;
   4728 						}
   4729 					}
   4730 					else if(internal.samples == 16)
   4731 					{
   4732 						for(int y = 0; y < height; y++)
   4733 						{
   4734 							for(int x = 0; x < width; x += 2)
   4735 							{
   4736 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
   4737 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
   4738 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
   4739 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
   4740 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
   4741 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
   4742 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
   4743 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
   4744 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
   4745 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
   4746 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
   4747 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
   4748 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
   4749 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
   4750 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
   4751 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
   4752 
   4753 								c0 = _mm_avg_epu16(c0, c1);
   4754 								c2 = _mm_avg_epu16(c2, c3);
   4755 								c4 = _mm_avg_epu16(c4, c5);
   4756 								c6 = _mm_avg_epu16(c6, c7);
   4757 								c8 = _mm_avg_epu16(c8, c9);
   4758 								cA = _mm_avg_epu16(cA, cB);
   4759 								cC = _mm_avg_epu16(cC, cD);
   4760 								cE = _mm_avg_epu16(cE, cF);
   4761 								c0 = _mm_avg_epu16(c0, c2);
   4762 								c4 = _mm_avg_epu16(c4, c6);
   4763 								c8 = _mm_avg_epu16(c8, cA);
   4764 								cC = _mm_avg_epu16(cC, cE);
   4765 								c0 = _mm_avg_epu16(c0, c4);
   4766 								c8 = _mm_avg_epu16(c8, cC);
   4767 								c0 = _mm_avg_epu16(c0, c8);
   4768 
   4769 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
   4770 							}
   4771 
   4772 							source0 += pitch;
   4773 							source1 += pitch;
   4774 							source2 += pitch;
   4775 							source3 += pitch;
   4776 							source4 += pitch;
   4777 							source5 += pitch;
   4778 							source6 += pitch;
   4779 							source7 += pitch;
   4780 							source8 += pitch;
   4781 							source9 += pitch;
   4782 							sourceA += pitch;
   4783 							sourceB += pitch;
   4784 							sourceC += pitch;
   4785 							sourceD += pitch;
   4786 							sourceE += pitch;
   4787 							sourceF += pitch;
   4788 						}
   4789 					}
   4790 					else ASSERT(false);
   4791 				}
   4792 				else
   4793 			#endif
   4794 			{
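				// Scalar fallback: treat each 64-bit pixel as two 32-bit words (hence the
				// 2 * width loop bound) and average the packed 16-bit channels per word.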
   4795 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
   4796 
   4797 				if(internal.samples == 2)
   4798 				{
   4799 					for(int y = 0; y < height; y++)
   4800 					{
   4801 						for(int x = 0; x < 2 * width; x++)
   4802 						{
   4803 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4804 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4805 
   4806 							c0 = AVERAGE(c0, c1);
   4807 
   4808 							*(unsigned int*)(source0 + 4 * x) = c0;
   4809 						}
   4810 
   4811 						source0 += pitch;
   4812 						source1 += pitch;
   4813 					}
   4814 				}
   4815 				else if(internal.samples == 4)
   4816 				{
   4817 					for(int y = 0; y < height; y++)
   4818 					{
   4819 						for(int x = 0; x < 2 * width; x++)
   4820 						{
   4821 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4822 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4823 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4824 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4825 
   4826 							c0 = AVERAGE(c0, c1);
   4827 							c2 = AVERAGE(c2, c3);
   4828 							c0 = AVERAGE(c0, c2);
   4829 
   4830 							*(unsigned int*)(source0 + 4 * x) = c0;
   4831 						}
   4832 
   4833 						source0 += pitch;
   4834 						source1 += pitch;
   4835 						source2 += pitch;
   4836 						source3 += pitch;
   4837 					}
   4838 				}
   4839 				else if(internal.samples == 8)
   4840 				{
   4841 					for(int y = 0; y < height; y++)
   4842 					{
   4843 						for(int x = 0; x < 2 * width; x++)
   4844 						{
   4845 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4846 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4847 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4848 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4849 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4850 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4851 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4852 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4853 
   4854 							c0 = AVERAGE(c0, c1);
   4855 							c2 = AVERAGE(c2, c3);
   4856 							c4 = AVERAGE(c4, c5);
   4857 							c6 = AVERAGE(c6, c7);
   4858 							c0 = AVERAGE(c0, c2);
   4859 							c4 = AVERAGE(c4, c6);
   4860 							c0 = AVERAGE(c0, c4);
   4861 
   4862 							*(unsigned int*)(source0 + 4 * x) = c0;
   4863 						}
   4864 
   4865 						source0 += pitch;
   4866 						source1 += pitch;
   4867 						source2 += pitch;
   4868 						source3 += pitch;
   4869 						source4 += pitch;
   4870 						source5 += pitch;
   4871 						source6 += pitch;
   4872 						source7 += pitch;
   4873 					}
   4874 				}
   4875 				else if(internal.samples == 16)
   4876 				{
   4877 					for(int y = 0; y < height; y++)
   4878 					{
   4879 						for(int x = 0; x < 2 * width; x++)
   4880 						{
   4881 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
   4882 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
   4883 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
   4884 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
   4885 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
   4886 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
   4887 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
   4888 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
   4889 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
   4890 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
   4891 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
   4892 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
   4893 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
   4894 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
   4895 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
   4896 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
   4897 
   4898 							c0 = AVERAGE(c0, c1);
   4899 							c2 = AVERAGE(c2, c3);
   4900 							c4 = AVERAGE(c4, c5);
   4901 							c6 = AVERAGE(c6, c7);
   4902 							c8 = AVERAGE(c8, c9);
   4903 							cA = AVERAGE(cA, cB);
   4904 							cC = AVERAGE(cC, cD);
   4905 							cE = AVERAGE(cE, cF);
   4906 							c0 = AVERAGE(c0, c2);
   4907 							c4 = AVERAGE(c4, c6);
   4908 							c8 = AVERAGE(c8, cA);
   4909 							cC = AVERAGE(cC, cE);
   4910 							c0 = AVERAGE(c0, c4);
   4911 							c8 = AVERAGE(c8, cC);
   4912 							c0 = AVERAGE(c0, c8);
   4913 
   4914 							*(unsigned int*)(source0 + 4 * x) = c0;
   4915 						}
   4916 
   4917 						source0 += pitch;
   4918 						source1 += pitch;
   4919 						source2 += pitch;
   4920 						source3 += pitch;
   4921 						source4 += pitch;
   4922 						source5 += pitch;
   4923 						source6 += pitch;
   4924 						source7 += pitch;
   4925 						source8 += pitch;
   4926 						source9 += pitch;
   4927 						sourceA += pitch;
   4928 						sourceB += pitch;
   4929 						sourceC += pitch;
   4930 						sourceD += pitch;
   4931 						sourceE += pitch;
   4932 						sourceF += pitch;
   4933 					}
   4934 				}
   4935 				else ASSERT(false);
   4936 
   4937 				#undef AVERAGE
   4938 			}
   4939 		}
   4940 		else if(internal.format == FORMAT_R32F)
   4941 		{
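			// FORMAT_R32F: one float per pixel. Samples are summed and scaled by 1/N;
			// the SSE path below processes four pixels per iteration.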
   4942 			#if defined(__i386__) || defined(__x86_64__)
   4943 				if(CPUID::supportsSSE() && (width % 4) == 0)
   4944 				{
   4945 					if(internal.samples == 2)
   4946 					{
   4947 						for(int y = 0; y < height; y++)
   4948 						{
   4949 							for(int x = 0; x < width; x += 4)
   4950 							{
   4951 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
   4952 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
   4953 
   4954 								c0 = _mm_add_ps(c0, c1);
   4955 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
   4956 
   4957 								_mm_store_ps((float*)(source0 + 4 * x), c0);
   4958 							}
   4959 
   4960 							source0 += pitch;
   4961 							source1 += pitch;
   4962 						}
   4963 					}
   4964 					else if(internal.samples == 4)
   4965 					{
   4966 						for(int y = 0; y < height; y++)
   4967 						{
   4968 							for(int x = 0; x < width; x += 4)
   4969 							{
   4970 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
   4971 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
   4972 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
   4973 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
   4974 
   4975 								c0 = _mm_add_ps(c0, c1);
   4976 								c2 = _mm_add_ps(c2, c3);
   4977 								c0 = _mm_add_ps(c0, c2);
   4978 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
   4979 
   4980 								_mm_store_ps((float*)(source0 + 4 * x), c0);
   4981 							}
   4982 
   4983 							source0 += pitch;
   4984 							source1 += pitch;
   4985 							source2 += pitch;
   4986 							source3 += pitch;
   4987 						}
   4988 					}
   4989 					else if(internal.samples == 8)
   4990 					{
   4991 						for(int y = 0; y < height; y++)
   4992 						{
   4993 							for(int x = 0; x < width; x += 4)
   4994 							{
   4995 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
   4996 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
   4997 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
   4998 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
   4999 								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
   5000 								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
   5001 								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
   5002 								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
   5003 
   5004 								c0 = _mm_add_ps(c0, c1);
   5005 								c2 = _mm_add_ps(c2, c3);
   5006 								c4 = _mm_add_ps(c4, c5);
   5007 								c6 = _mm_add_ps(c6, c7);
   5008 								c0 = _mm_add_ps(c0, c2);
   5009 								c4 = _mm_add_ps(c4, c6);
   5010 								c0 = _mm_add_ps(c0, c4);
   5011 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
   5012 
   5013 								_mm_store_ps((float*)(source0 + 4 * x), c0);
   5014 							}
   5015 
   5016 							source0 += pitch;
   5017 							source1 += pitch;
   5018 							source2 += pitch;
   5019 							source3 += pitch;
   5020 							source4 += pitch;
   5021 							source5 += pitch;
   5022 							source6 += pitch;
   5023 							source7 += pitch;
   5024 						}
   5025 					}
   5026 					else if(internal.samples == 16)
   5027 					{
   5028 						for(int y = 0; y < height; y++)
   5029 						{
   5030 							for(int x = 0; x < width; x += 4)
   5031 							{
   5032 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
   5033 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
   5034 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
   5035 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
   5036 								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
   5037 								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
   5038 								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
   5039 								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
   5040 								__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
   5041 								__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
   5042 								__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
   5043 								__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
   5044 								__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
   5045 								__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
   5046 								__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
   5047 								__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
   5048 
   5049 								c0 = _mm_add_ps(c0, c1);
   5050 								c2 = _mm_add_ps(c2, c3);
   5051 								c4 = _mm_add_ps(c4, c5);
   5052 								c6 = _mm_add_ps(c6, c7);
   5053 								c8 = _mm_add_ps(c8, c9);
   5054 								cA = _mm_add_ps(cA, cB);
   5055 								cC = _mm_add_ps(cC, cD);
   5056 								cE = _mm_add_ps(cE, cF);
   5057 								c0 = _mm_add_ps(c0, c2);
   5058 								c4 = _mm_add_ps(c4, c6);
   5059 								c8 = _mm_add_ps(c8, cA);
   5060 								cC = _mm_add_ps(cC, cE);
   5061 								c0 = _mm_add_ps(c0, c4);
   5062 								c8 = _mm_add_ps(c8, cC);
   5063 								c0 = _mm_add_ps(c0, c8);
   5064 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
   5065 
   5066 								_mm_store_ps((float*)(source0 + 4 * x), c0);
   5067 							}
   5068 
   5069 							source0 += pitch;
   5070 							source1 += pitch;
   5071 							source2 += pitch;
   5072 							source3 += pitch;
   5073 							source4 += pitch;
   5074 							source5 += pitch;
   5075 							source6 += pitch;
   5076 							source7 += pitch;
   5077 							source8 += pitch;
   5078 							source9 += pitch;
   5079 							sourceA += pitch;
   5080 							sourceB += pitch;
   5081 							sourceC += pitch;
   5082 							sourceD += pitch;
   5083 							sourceE += pitch;
   5084 							sourceF += pitch;
   5085 						}
   5086 					}
   5087 					else ASSERT(false);
   5088 				}
   5089 				else
   5090 			#endif
   5091 			{
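				// Scalar fallback: accumulate the samples for each pixel and scale by 1/N.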
   5092 				if(internal.samples == 2)
   5093 				{
   5094 					for(int y = 0; y < height; y++)
   5095 					{
   5096 						for(int x = 0; x < width; x++)
   5097 						{
   5098 							float c0 = *(float*)(source0 + 4 * x);
   5099 							float c1 = *(float*)(source1 + 4 * x);
   5100 
   5101 							c0 = c0 + c1;
   5102 							c0 *= 1.0f / 2.0f;
   5103 
   5104 							*(float*)(source0 + 4 * x) = c0;
   5105 						}
   5106 
   5107 						source0 += pitch;
   5108 						source1 += pitch;
   5109 					}
   5110 				}
   5111 				else if(internal.samples == 4)
   5112 				{
   5113 					for(int y = 0; y < height; y++)
   5114 					{
   5115 						for(int x = 0; x < width; x++)
   5116 						{
   5117 							float c0 = *(float*)(source0 + 4 * x);
   5118 							float c1 = *(float*)(source1 + 4 * x);
   5119 							float c2 = *(float*)(source2 + 4 * x);
   5120 							float c3 = *(float*)(source3 + 4 * x);
   5121 
   5122 							c0 = c0 + c1;
   5123 							c2 = c2 + c3;
   5124 							c0 = c0 + c2;
   5125 							c0 *= 1.0f / 4.0f;
   5126 
   5127 							*(float*)(source0 + 4 * x) = c0;
   5128 						}
   5129 
   5130 						source0 += pitch;
   5131 						source1 += pitch;
   5132 						source2 += pitch;
   5133 						source3 += pitch;
   5134 					}
   5135 				}
   5136 				else if(internal.samples == 8)
   5137 				{
   5138 					for(int y = 0; y < height; y++)
   5139 					{
   5140 						for(int x = 0; x < width; x++)
   5141 						{
   5142 							float c0 = *(float*)(source0 + 4 * x);
   5143 							float c1 = *(float*)(source1 + 4 * x);
   5144 							float c2 = *(float*)(source2 + 4 * x);
   5145 							float c3 = *(float*)(source3 + 4 * x);
   5146 							float c4 = *(float*)(source4 + 4 * x);
   5147 							float c5 = *(float*)(source5 + 4 * x);
   5148 							float c6 = *(float*)(source6 + 4 * x);
   5149 							float c7 = *(float*)(source7 + 4 * x);
   5150 
   5151 							c0 = c0 + c1;
   5152 							c2 = c2 + c3;
   5153 							c4 = c4 + c5;
   5154 							c6 = c6 + c7;
   5155 							c0 = c0 + c2;
   5156 							c4 = c4 + c6;
   5157 							c0 = c0 + c4;
   5158 							c0 *= 1.0f / 8.0f;
   5159 
   5160 							*(float*)(source0 + 4 * x) = c0;
   5161 						}
   5162 
   5163 						source0 += pitch;
   5164 						source1 += pitch;
   5165 						source2 += pitch;
   5166 						source3 += pitch;
   5167 						source4 += pitch;
   5168 						source5 += pitch;
   5169 						source6 += pitch;
   5170 						source7 += pitch;
   5171 					}
   5172 				}
   5173 				else if(internal.samples == 16)
   5174 				{
   5175 					for(int y = 0; y < height; y++)
   5176 					{
   5177 						for(int x = 0; x < width; x++)
   5178 						{
   5179 							float c0 = *(float*)(source0 + 4 * x);
   5180 							float c1 = *(float*)(source1 + 4 * x);
   5181 							float c2 = *(float*)(source2 + 4 * x);
   5182 							float c3 = *(float*)(source3 + 4 * x);
   5183 							float c4 = *(float*)(source4 + 4 * x);
   5184 							float c5 = *(float*)(source5 + 4 * x);
   5185 							float c6 = *(float*)(source6 + 4 * x);
   5186 							float c7 = *(float*)(source7 + 4 * x);
   5187 							float c8 = *(float*)(source8 + 4 * x);
   5188 							float c9 = *(float*)(source9 + 4 * x);
   5189 							float cA = *(float*)(sourceA + 4 * x);
   5190 							float cB = *(float*)(sourceB + 4 * x);
   5191 							float cC = *(float*)(sourceC + 4 * x);
   5192 							float cD = *(float*)(sourceD + 4 * x);
   5193 							float cE = *(float*)(sourceE + 4 * x);
   5194 							float cF = *(float*)(sourceF + 4 * x);
   5195 
   5196 							c0 = c0 + c1;
   5197 							c2 = c2 + c3;
   5198 							c4 = c4 + c5;
   5199 							c6 = c6 + c7;
   5200 							c8 = c8 + c9;
   5201 							cA = cA + cB;
   5202 							cC = cC + cD;
   5203 							cE = cE + cF;
   5204 							c0 = c0 + c2;
   5205 							c4 = c4 + c6;
   5206 							c8 = c8 + cA;
   5207 							cC = cC + cE;
   5208 							c0 = c0 + c4;
   5209 							c8 = c8 + cC;
   5210 							c0 = c0 + c8;
   5211 							c0 *= 1.0f / 16.0f;
   5212 
   5213 							*(float*)(source0 + 4 * x) = c0;
   5214 						}
   5215 
   5216 						source0 += pitch;
   5217 						source1 += pitch;
   5218 						source2 += pitch;
   5219 						source3 += pitch;
   5220 						source4 += pitch;
   5221 						source5 += pitch;
   5222 						source6 += pitch;
   5223 						source7 += pitch;
   5224 						source8 += pitch;
   5225 						source9 += pitch;
   5226 						sourceA += pitch;
   5227 						sourceB += pitch;
   5228 						sourceC += pitch;
   5229 						sourceD += pitch;
   5230 						sourceE += pitch;
   5231 						sourceF += pitch;
   5232 					}
   5233 				}
   5234 				else ASSERT(false);
   5235 			}
   5236 		}
   5237 		else if(internal.format == FORMAT_G32R32F)
   5238 		{
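			// FORMAT_G32R32F: two floats per pixel (8 bytes). Same sum-and-scale resolve;
			// the SSE path below processes two pixels per iteration.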
   5239 			#if defined(__i386__) || defined(__x86_64__)
   5240 				if(CPUID::supportsSSE() && (width % 2) == 0)
   5241 				{
   5242 					if(internal.samples == 2)
   5243 					{
   5244 						for(int y = 0; y < height; y++)
   5245 						{
   5246 							for(int x = 0; x < width; x += 2)
   5247 							{
   5248 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
   5249 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
   5250 
   5251 								c0 = _mm_add_ps(c0, c1);
   5252 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
   5253 
   5254 								_mm_store_ps((float*)(source0 + 8 * x), c0);
   5255 							}
   5256 
   5257 							source0 += pitch;
   5258 							source1 += pitch;
   5259 						}
   5260 					}
   5261 					else if(internal.samples == 4)
   5262 					{
   5263 						for(int y = 0; y < height; y++)
   5264 						{
   5265 							for(int x = 0; x < width; x += 2)
   5266 							{
   5267 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
   5268 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
   5269 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
   5270 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
   5271 
   5272 								c0 = _mm_add_ps(c0, c1);
   5273 								c2 = _mm_add_ps(c2, c3);
   5274 								c0 = _mm_add_ps(c0, c2);
   5275 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
   5276 
   5277 								_mm_store_ps((float*)(source0 + 8 * x), c0);
   5278 							}
   5279 
   5280 							source0 += pitch;
   5281 							source1 += pitch;
   5282 							source2 += pitch;
   5283 							source3 += pitch;
   5284 						}
   5285 					}
   5286 					else if(internal.samples == 8)
   5287 					{
   5288 						for(int y = 0; y < height; y++)
   5289 						{
   5290 							for(int x = 0; x < width; x += 2)
   5291 							{
   5292 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
   5293 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
   5294 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
   5295 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
   5296 								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
   5297 								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
   5298 								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
   5299 								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
   5300 
   5301 								c0 = _mm_add_ps(c0, c1);
   5302 								c2 = _mm_add_ps(c2, c3);
   5303 								c4 = _mm_add_ps(c4, c5);
   5304 								c6 = _mm_add_ps(c6, c7);
   5305 								c0 = _mm_add_ps(c0, c2);
   5306 								c4 = _mm_add_ps(c4, c6);
   5307 								c0 = _mm_add_ps(c0, c4);
   5308 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
   5309 
   5310 								_mm_store_ps((float*)(source0 + 8 * x), c0);
   5311 							}
   5312 
   5313 							source0 += pitch;
   5314 							source1 += pitch;
   5315 							source2 += pitch;
   5316 							source3 += pitch;
   5317 							source4 += pitch;
   5318 							source5 += pitch;
   5319 							source6 += pitch;
   5320 							source7 += pitch;
   5321 						}
   5322 					}
   5323 					else if(internal.samples == 16)
   5324 					{
   5325 						for(int y = 0; y < height; y++)
   5326 						{
   5327 							for(int x = 0; x < width; x += 2)
   5328 							{
   5329 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
   5330 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
   5331 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
   5332 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
   5333 								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
   5334 								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
   5335 								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
   5336 								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
   5337 								__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
   5338 								__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
   5339 								__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
   5340 								__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
   5341 								__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
   5342 								__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
   5343 								__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
   5344 								__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
   5345 
   5346 								c0 = _mm_add_ps(c0, c1);
   5347 								c2 = _mm_add_ps(c2, c3);
   5348 								c4 = _mm_add_ps(c4, c5);
   5349 								c6 = _mm_add_ps(c6, c7);
   5350 								c8 = _mm_add_ps(c8, c9);
   5351 								cA = _mm_add_ps(cA, cB);
   5352 								cC = _mm_add_ps(cC, cD);
   5353 								cE = _mm_add_ps(cE, cF);
   5354 								c0 = _mm_add_ps(c0, c2);
   5355 								c4 = _mm_add_ps(c4, c6);
   5356 								c8 = _mm_add_ps(c8, cA);
   5357 								cC = _mm_add_ps(cC, cE);
   5358 								c0 = _mm_add_ps(c0, c4);
   5359 								c8 = _mm_add_ps(c8, cC);
   5360 								c0 = _mm_add_ps(c0, c8);
   5361 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
   5362 
   5363 								_mm_store_ps((float*)(source0 + 8 * x), c0);
   5364 							}
   5365 
   5366 							source0 += pitch;
   5367 							source1 += pitch;
   5368 							source2 += pitch;
   5369 							source3 += pitch;
   5370 							source4 += pitch;
   5371 							source5 += pitch;
   5372 							source6 += pitch;
   5373 							source7 += pitch;
   5374 							source8 += pitch;
   5375 							source9 += pitch;
   5376 							sourceA += pitch;
   5377 							sourceB += pitch;
   5378 							sourceC += pitch;
   5379 							sourceD += pitch;
   5380 							sourceE += pitch;
   5381 							sourceF += pitch;
   5382 						}
   5383 					}
   5384 					else ASSERT(false);
   5385 				}
   5386 				else
   5387 			#endif
   5388 			{
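				// Scalar fallback: iterate over 2 * width floats per row (two channels per pixel).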
   5389 				if(internal.samples == 2)
   5390 				{
   5391 					for(int y = 0; y < height; y++)
   5392 					{
   5393 						for(int x = 0; x < 2 * width; x++)
   5394 						{
   5395 							float c0 = *(float*)(source0 + 4 * x);
   5396 							float c1 = *(float*)(source1 + 4 * x);
   5397 
   5398 							c0 = c0 + c1;
   5399 							c0 *= 1.0f / 2.0f;
   5400 
   5401 							*(float*)(source0 + 4 * x) = c0;
   5402 						}
   5403 
   5404 						source0 += pitch;
   5405 						source1 += pitch;
   5406 					}
   5407 				}
   5408 				else if(internal.samples == 4)
   5409 				{
   5410 					for(int y = 0; y < height; y++)
   5411 					{
   5412 						for(int x = 0; x < 2 * width; x++)
   5413 						{
   5414 							float c0 = *(float*)(source0 + 4 * x);
   5415 							float c1 = *(float*)(source1 + 4 * x);
   5416 							float c2 = *(float*)(source2 + 4 * x);
   5417 							float c3 = *(float*)(source3 + 4 * x);
   5418 
   5419 							c0 = c0 + c1;
   5420 							c2 = c2 + c3;
   5421 							c0 = c0 + c2;
   5422 							c0 *= 1.0f / 4.0f;
   5423 
   5424 							*(float*)(source0 + 4 * x) = c0;
   5425 						}
   5426 
   5427 						source0 += pitch;
   5428 						source1 += pitch;
   5429 						source2 += pitch;
   5430 						source3 += pitch;
   5431 					}
   5432 				}
   5433 				else if(internal.samples == 8)
   5434 				{
   5435 					for(int y = 0; y < height; y++)
   5436 					{
   5437 						for(int x = 0; x < 2 * width; x++)
   5438 						{
   5439 							float c0 = *(float*)(source0 + 4 * x);
   5440 							float c1 = *(float*)(source1 + 4 * x);
   5441 							float c2 = *(float*)(source2 + 4 * x);
   5442 							float c3 = *(float*)(source3 + 4 * x);
   5443 							float c4 = *(float*)(source4 + 4 * x);
   5444 							float c5 = *(float*)(source5 + 4 * x);
   5445 							float c6 = *(float*)(source6 + 4 * x);
   5446 							float c7 = *(float*)(source7 + 4 * x);
   5447 
   5448 							c0 = c0 + c1;
   5449 							c2 = c2 + c3;
   5450 							c4 = c4 + c5;
   5451 							c6 = c6 + c7;
   5452 							c0 = c0 + c2;
   5453 							c4 = c4 + c6;
   5454 							c0 = c0 + c4;
   5455 							c0 *= 1.0f / 8.0f;
   5456 
   5457 							*(float*)(source0 + 4 * x) = c0;
   5458 						}
   5459 
   5460 						source0 += pitch;
   5461 						source1 += pitch;
   5462 						source2 += pitch;
   5463 						source3 += pitch;
   5464 						source4 += pitch;
   5465 						source5 += pitch;
   5466 						source6 += pitch;
   5467 						source7 += pitch;
   5468 					}
   5469 				}
   5470 				else if(internal.samples == 16)
   5471 				{
   5472 					for(int y = 0; y < height; y++)
   5473 					{
   5474 						for(int x = 0; x < 2 * width; x++)
   5475 						{
   5476 							float c0 = *(float*)(source0 + 4 * x);
   5477 							float c1 = *(float*)(source1 + 4 * x);
   5478 							float c2 = *(float*)(source2 + 4 * x);
   5479 							float c3 = *(float*)(source3 + 4 * x);
   5480 							float c4 = *(float*)(source4 + 4 * x);
   5481 							float c5 = *(float*)(source5 + 4 * x);
   5482 							float c6 = *(float*)(source6 + 4 * x);
   5483 							float c7 = *(float*)(source7 + 4 * x);
   5484 							float c8 = *(float*)(source8 + 4 * x);
   5485 							float c9 = *(float*)(source9 + 4 * x);
   5486 							float cA = *(float*)(sourceA + 4 * x);
   5487 							float cB = *(float*)(sourceB + 4 * x);
   5488 							float cC = *(float*)(sourceC + 4 * x);
   5489 							float cD = *(float*)(sourceD + 4 * x);
   5490 							float cE = *(float*)(sourceE + 4 * x);
   5491 							float cF = *(float*)(sourceF + 4 * x);
   5492 
   5493 							c0 = c0 + c1;
   5494 							c2 = c2 + c3;
   5495 							c4 = c4 + c5;
   5496 							c6 = c6 + c7;
   5497 							c8 = c8 + c9;
   5498 							cA = cA + cB;
   5499 							cC = cC + cD;
   5500 							cE = cE + cF;
   5501 							c0 = c0 + c2;
   5502 							c4 = c4 + c6;
   5503 							c8 = c8 + cA;
   5504 							cC = cC + cE;
   5505 							c0 = c0 + c4;
   5506 							c8 = c8 + cC;
   5507 							c0 = c0 + c8;
   5508 							c0 *= 1.0f / 16.0f;
   5509 
   5510 							*(float*)(source0 + 4 * x) = c0;
   5511 						}
   5512 
   5513 						source0 += pitch;
   5514 						source1 += pitch;
   5515 						source2 += pitch;
   5516 						source3 += pitch;
   5517 						source4 += pitch;
   5518 						source5 += pitch;
   5519 						source6 += pitch;
   5520 						source7 += pitch;
   5521 						source8 += pitch;
   5522 						source9 += pitch;
   5523 						sourceA += pitch;
   5524 						sourceB += pitch;
   5525 						sourceC += pitch;
   5526 						sourceD += pitch;
   5527 						sourceE += pitch;
   5528 						sourceF += pitch;
   5529 					}
   5530 				}
   5531 				else ASSERT(false);
   5532 			}
   5533 		}
   5534 		else if(internal.format == FORMAT_A32B32G32R32F ||
   5535 		        internal.format == FORMAT_X32B32G32R32F ||
   5536 		        internal.format == FORMAT_X32B32G32R32F_UNSIGNED)
   5537 		{
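			// FORMAT_A32B32G32R32F (and the X variants): four floats per pixel, so each
			// pixel maps to exactly one 128-bit register in the SSE path below.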
   5538 			#if defined(__i386__) || defined(__x86_64__)
   5539 				if(CPUID::supportsSSE())
   5540 				{
   5541 					if(internal.samples == 2)
   5542 					{
   5543 						for(int y = 0; y < height; y++)
   5544 						{
   5545 							for(int x = 0; x < width; x++)
   5546 							{
   5547 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
   5548 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
   5549 
   5550 								c0 = _mm_add_ps(c0, c1);
   5551 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
   5552 
   5553 								_mm_store_ps((float*)(source0 + 16 * x), c0);
   5554 							}
   5555 
   5556 							source0 += pitch;
   5557 							source1 += pitch;
   5558 						}
   5559 					}
   5560 					else if(internal.samples == 4)
   5561 					{
   5562 						for(int y = 0; y < height; y++)
   5563 						{
   5564 							for(int x = 0; x < width; x++)
   5565 							{
   5566 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
   5567 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
   5568 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
   5569 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
   5570 
   5571 								c0 = _mm_add_ps(c0, c1);
   5572 								c2 = _mm_add_ps(c2, c3);
   5573 								c0 = _mm_add_ps(c0, c2);
   5574 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
   5575 
   5576 								_mm_store_ps((float*)(source0 + 16 * x), c0);
   5577 							}
   5578 
   5579 							source0 += pitch;
   5580 							source1 += pitch;
   5581 							source2 += pitch;
   5582 							source3 += pitch;
   5583 						}
   5584 					}
   5585 					else if(internal.samples == 8)
   5586 					{
   5587 						for(int y = 0; y < height; y++)
   5588 						{
   5589 							for(int x = 0; x < width; x++)
   5590 							{
   5591 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
   5592 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
   5593 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
   5594 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
   5595 								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
   5596 								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
   5597 								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
   5598 								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
   5599 
   5600 								c0 = _mm_add_ps(c0, c1);
   5601 								c2 = _mm_add_ps(c2, c3);
   5602 								c4 = _mm_add_ps(c4, c5);
   5603 								c6 = _mm_add_ps(c6, c7);
   5604 								c0 = _mm_add_ps(c0, c2);
   5605 								c4 = _mm_add_ps(c4, c6);
   5606 								c0 = _mm_add_ps(c0, c4);
   5607 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
   5608 
   5609 								_mm_store_ps((float*)(source0 + 16 * x), c0);
   5610 							}
   5611 
   5612 							source0 += pitch;
   5613 							source1 += pitch;
   5614 							source2 += pitch;
   5615 							source3 += pitch;
   5616 							source4 += pitch;
   5617 							source5 += pitch;
   5618 							source6 += pitch;
   5619 							source7 += pitch;
   5620 						}
   5621 					}
   5622 					else if(internal.samples == 16)
   5623 					{
   5624 						for(int y = 0; y < height; y++)
   5625 						{
   5626 							for(int x = 0; x < width; x++)
   5627 							{
   5628 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
   5629 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
   5630 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
   5631 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
   5632 								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
   5633 								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
   5634 								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
   5635 								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
   5636 								__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
   5637 								__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
   5638 								__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
   5639 								__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
   5640 								__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
   5641 								__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
   5642 								__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
   5643 								__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
   5644 
   5645 								c0 = _mm_add_ps(c0, c1);
   5646 								c2 = _mm_add_ps(c2, c3);
   5647 								c4 = _mm_add_ps(c4, c5);
   5648 								c6 = _mm_add_ps(c6, c7);
   5649 								c8 = _mm_add_ps(c8, c9);
   5650 								cA = _mm_add_ps(cA, cB);
   5651 								cC = _mm_add_ps(cC, cD);
   5652 								cE = _mm_add_ps(cE, cF);
   5653 								c0 = _mm_add_ps(c0, c2);
   5654 								c4 = _mm_add_ps(c4, c6);
   5655 								c8 = _mm_add_ps(c8, cA);
   5656 								cC = _mm_add_ps(cC, cE);
   5657 								c0 = _mm_add_ps(c0, c4);
   5658 								c8 = _mm_add_ps(c8, cC);
   5659 								c0 = _mm_add_ps(c0, c8);
   5660 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
   5661 
   5662 								_mm_store_ps((float*)(source0 + 16 * x), c0);
   5663 							}
   5664 
   5665 							source0 += pitch;
   5666 							source1 += pitch;
   5667 							source2 += pitch;
   5668 							source3 += pitch;
   5669 							source4 += pitch;
   5670 							source5 += pitch;
   5671 							source6 += pitch;
   5672 							source7 += pitch;
   5673 							source8 += pitch;
   5674 							source9 += pitch;
   5675 							sourceA += pitch;
   5676 							sourceB += pitch;
   5677 							sourceC += pitch;
   5678 							sourceD += pitch;
   5679 							sourceE += pitch;
   5680 							sourceF += pitch;
   5681 						}
   5682 					}
   5683 					else ASSERT(false);
   5684 				}
   5685 				else
   5686 			#endif
   5687 			{
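				// Scalar fallback: iterate over 4 * width floats per row (four channels per pixel).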
   5688 				if(internal.samples == 2)
   5689 				{
   5690 					for(int y = 0; y < height; y++)
   5691 					{
   5692 						for(int x = 0; x < 4 * width; x++)
   5693 						{
   5694 							float c0 = *(float*)(source0 + 4 * x);
   5695 							float c1 = *(float*)(source1 + 4 * x);
   5696 
   5697 							c0 = c0 + c1;
   5698 							c0 *= 1.0f / 2.0f;
   5699 
   5700 							*(float*)(source0 + 4 * x) = c0;
   5701 						}
   5702 
   5703 						source0 += pitch;
   5704 						source1 += pitch;
   5705 					}
   5706 				}
   5707 				else if(internal.samples == 4)
   5708 				{
   5709 					for(int y = 0; y < height; y++)
   5710 					{
   5711 						for(int x = 0; x < 4 * width; x++)
   5712 						{
   5713 							float c0 = *(float*)(source0 + 4 * x);
   5714 							float c1 = *(float*)(source1 + 4 * x);
   5715 							float c2 = *(float*)(source2 + 4 * x);
   5716 							float c3 = *(float*)(source3 + 4 * x);
   5717 
   5718 							c0 = c0 + c1;
   5719 							c2 = c2 + c3;
   5720 							c0 = c0 + c2;
   5721 							c0 *= 1.0f / 4.0f;
   5722 
   5723 							*(float*)(source0 + 4 * x) = c0;
   5724 						}
   5725 
   5726 						source0 += pitch;
   5727 						source1 += pitch;
   5728 						source2 += pitch;
   5729 						source3 += pitch;
   5730 					}
   5731 				}
   5732 				else if(internal.samples == 8)
   5733 				{
   5734 					for(int y = 0; y < height; y++)
   5735 					{
   5736 						for(int x = 0; x < 4 * width; x++)
   5737 						{
   5738 							float c0 = *(float*)(source0 + 4 * x);
   5739 							float c1 = *(float*)(source1 + 4 * x);
   5740 							float c2 = *(float*)(source2 + 4 * x);
   5741 							float c3 = *(float*)(source3 + 4 * x);
   5742 							float c4 = *(float*)(source4 + 4 * x);
   5743 							float c5 = *(float*)(source5 + 4 * x);
   5744 							float c6 = *(float*)(source6 + 4 * x);
   5745 							float c7 = *(float*)(source7 + 4 * x);
   5746 
   5747 							c0 = c0 + c1;
   5748 							c2 = c2 + c3;
   5749 							c4 = c4 + c5;
   5750 							c6 = c6 + c7;
   5751 							c0 = c0 + c2;
   5752 							c4 = c4 + c6;
   5753 							c0 = c0 + c4;
   5754 							c0 *= 1.0f / 8.0f;
   5755 
   5756 							*(float*)(source0 + 4 * x) = c0;
   5757 						}
   5758 
   5759 						source0 += pitch;
   5760 						source1 += pitch;
   5761 						source2 += pitch;
   5762 						source3 += pitch;
   5763 						source4 += pitch;
   5764 						source5 += pitch;
   5765 						source6 += pitch;
   5766 						source7 += pitch;
   5767 					}
   5768 				}
   5769 				else if(internal.samples == 16)
   5770 				{
   5771 					for(int y = 0; y < height; y++)
   5772 					{
   5773 						for(int x = 0; x < 4 * width; x++)
   5774 						{
   5775 							float c0 = *(float*)(source0 + 4 * x);
   5776 							float c1 = *(float*)(source1 + 4 * x);
   5777 							float c2 = *(float*)(source2 + 4 * x);
   5778 							float c3 = *(float*)(source3 + 4 * x);
   5779 							float c4 = *(float*)(source4 + 4 * x);
   5780 							float c5 = *(float*)(source5 + 4 * x);
   5781 							float c6 = *(float*)(source6 + 4 * x);
   5782 							float c7 = *(float*)(source7 + 4 * x);
   5783 							float c8 = *(float*)(source8 + 4 * x);
   5784 							float c9 = *(float*)(source9 + 4 * x);
   5785 							float cA = *(float*)(sourceA + 4 * x);
   5786 							float cB = *(float*)(sourceB + 4 * x);
   5787 							float cC = *(float*)(sourceC + 4 * x);
   5788 							float cD = *(float*)(sourceD + 4 * x);
   5789 							float cE = *(float*)(sourceE + 4 * x);
   5790 							float cF = *(float*)(sourceF + 4 * x);
   5791 
   5792 							c0 = c0 + c1;
   5793 							c2 = c2 + c3;
   5794 							c4 = c4 + c5;
   5795 							c6 = c6 + c7;
   5796 							c8 = c8 + c9;
   5797 							cA = cA + cB;
   5798 							cC = cC + cD;
   5799 							cE = cE + cF;
   5800 							c0 = c0 + c2;
   5801 							c4 = c4 + c6;
   5802 							c8 = c8 + cA;
   5803 							cC = cC + cE;
   5804 							c0 = c0 + c4;
   5805 							c8 = c8 + cC;
   5806 							c0 = c0 + c8;
   5807 							c0 *= 1.0f / 16.0f;
   5808 
   5809 							*(float*)(source0 + 4 * x) = c0;
   5810 						}
   5811 
   5812 						source0 += pitch;
   5813 						source1 += pitch;
   5814 						source2 += pitch;
   5815 						source3 += pitch;
   5816 						source4 += pitch;
   5817 						source5 += pitch;
   5818 						source6 += pitch;
   5819 						source7 += pitch;
   5820 						source8 += pitch;
   5821 						source9 += pitch;
   5822 						sourceA += pitch;
   5823 						sourceB += pitch;
   5824 						sourceC += pitch;
   5825 						sourceD += pitch;
   5826 						sourceE += pitch;
   5827 						sourceF += pitch;
   5828 					}
   5829 				}
   5830 				else ASSERT(false);
   5831 			}
   5832 		}
   5833 		else if(internal.format == FORMAT_R5G6B5)
   5834 		{
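			// FORMAT_R5G6B5: 16-bit packed 5:6:5 pixels. Red and blue are isolated with the
			// 0xF81F mask and averaged as bytes (each field stays within one byte), green is
			// isolated with the 0x07E0 mask and averaged as 16-bit words since it straddles
			// the byte boundary, then the fields are recombined. The SSE2 path below
			// processes eight pixels per iteration.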
   5835 			#if defined(__i386__) || defined(__x86_64__)
   5836 				if(CPUID::supportsSSE2() && (width % 8) == 0)
   5837 				{
   5838 					if(internal.samples == 2)
   5839 					{
   5840 						for(int y = 0; y < height; y++)
   5841 						{
   5842 							for(int x = 0; x < width; x += 8)
   5843 							{
   5844 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
   5845 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
   5846 
   5847 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
   5848 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
   5849 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5850 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
   5851 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
   5852 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5853 
   5854 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
   5855 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5856 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
   5857 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5858 								c0 = _mm_or_si128(c0, c1);
   5859 
   5860 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
   5861 							}
   5862 
   5863 							source0 += pitch;
   5864 							source1 += pitch;
   5865 						}
   5866 					}
   5867 					else if(internal.samples == 4)
   5868 					{
   5869 						for(int y = 0; y < height; y++)
   5870 						{
   5871 							for(int x = 0; x < width; x += 8)
   5872 							{
   5873 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
   5874 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
   5875 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
   5876 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
   5877 
   5878 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
   5879 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
   5880 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5881 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
   5882 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
   5883 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5884 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
   5885 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
   5886 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
   5887 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
   5888 
   5889 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
   5890 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
   5891 								c0 = _mm_avg_epu8(c0, c2);
   5892 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5893 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
   5894 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
   5895 								c1 = _mm_avg_epu16(c1, c3);
   5896 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5897 								c0 = _mm_or_si128(c0, c1);
   5898 
   5899 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
   5900 							}
   5901 
   5902 							source0 += pitch;
   5903 							source1 += pitch;
   5904 							source2 += pitch;
   5905 							source3 += pitch;
   5906 						}
   5907 					}
   5908 					else if(internal.samples == 8)
   5909 					{
   5910 						for(int y = 0; y < height; y++)
   5911 						{
   5912 							for(int x = 0; x < width; x += 8)
   5913 							{
   5914 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
   5915 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
   5916 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
   5917 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
   5918 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
   5919 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
   5920 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
   5921 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
   5922 
   5923 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
   5924 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
   5925 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5926 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
   5927 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
   5928 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5929 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
   5930 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
   5931 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
   5932 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
   5933 								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
   5934 								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
   5935 								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
   5936 								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
   5937 								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
   5938 								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
   5939 								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
   5940 								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
   5941 
   5942 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
   5943 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
   5944 								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
   5945 								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
   5946 								c0 = _mm_avg_epu8(c0, c2);
   5947 								c4 = _mm_avg_epu8(c4, c6);
   5948 								c0 = _mm_avg_epu8(c0, c4);
   5949 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5950 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
   5951 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
   5952 								c5 = _mm_avg_epu16(c4__g_, c5__g_);
   5953 								c7 = _mm_avg_epu16(c6__g_, c7__g_);
   5954 								c1 = _mm_avg_epu16(c1, c3);
   5955 								c5 = _mm_avg_epu16(c5, c7);
   5956 								c1 = _mm_avg_epu16(c1, c5);
   5957 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   5958 								c0 = _mm_or_si128(c0, c1);
   5959 
   5960 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
   5961 							}
   5962 
   5963 							source0 += pitch;
   5964 							source1 += pitch;
   5965 							source2 += pitch;
   5966 							source3 += pitch;
   5967 							source4 += pitch;
   5968 							source5 += pitch;
   5969 							source6 += pitch;
   5970 							source7 += pitch;
   5971 						}
   5972 					}
   5973 					else if(internal.samples == 16)
   5974 					{
   5975 						for(int y = 0; y < height; y++)
   5976 						{
   5977 							for(int x = 0; x < width; x += 8)
   5978 							{
   5979 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
   5980 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
   5981 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
   5982 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
   5983 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
   5984 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
   5985 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
   5986 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
   5987 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
   5988 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
   5989 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
   5990 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
   5991 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
   5992 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
   5993 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
   5994 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
   5995 
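								// Channel masks for RGB565: 0xF81F selects the red and blue fields, 0x07E0 selects green.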
   5996 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
   5997 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
   5998 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   5999 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
   6000 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
   6001 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   6002 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
   6003 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
   6004 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
   6005 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
   6006 								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
   6007 								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
   6008 								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
   6009 								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
   6010 								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
   6011 								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
   6012 								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
   6013 								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
   6014 								__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
   6015 								__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
   6016 								__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
   6017 								__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
   6018 								__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
   6019 								__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
   6020 								__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
   6021 								__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
   6022 								__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
   6023 								__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
   6024 								__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
   6025 								__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
   6026 								__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
   6027 								__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
   6028 								__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
   6029 								__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
   6030 
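								// Same pairwise reduction as the 8-sample case, extended to sixteen samples:
								// byte-wise averages for red/blue, word-wise averages for green, then both
								// halves are re-masked and OR'ed back into a single RGB565 result.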
   6031 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
   6032 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
   6033 								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
   6034 								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
   6035 								c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
   6036 								cA = _mm_avg_epu8(cA_r_b, cB_r_b);
   6037 								cC = _mm_avg_epu8(cC_r_b, cD_r_b);
   6038 								cE = _mm_avg_epu8(cE_r_b, cF_r_b);
   6039 								c0 = _mm_avg_epu8(c0, c2);
   6040 								c4 = _mm_avg_epu8(c4, c6);
   6041 								c8 = _mm_avg_epu8(c8, cA);
   6042 								cC = _mm_avg_epu8(cC, cE);
   6043 								c0 = _mm_avg_epu8(c0, c4);
   6044 								c8 = _mm_avg_epu8(c8, cC);
   6045 								c0 = _mm_avg_epu8(c0, c8);
   6046 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
   6047 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
   6048 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
   6049 								c5 = _mm_avg_epu16(c4__g_, c5__g_);
   6050 								c7 = _mm_avg_epu16(c6__g_, c7__g_);
   6051 								c9 = _mm_avg_epu16(c8__g_, c9__g_);
   6052 								cB = _mm_avg_epu16(cA__g_, cB__g_);
   6053 								cD = _mm_avg_epu16(cC__g_, cD__g_);
   6054 								cF = _mm_avg_epu16(cE__g_, cF__g_);
    6055 								c1 = _mm_avg_epu16(c1, c3);
    6056 								c5 = _mm_avg_epu16(c5, c7);
    6057 								c9 = _mm_avg_epu16(c9, cB);
    6058 								cD = _mm_avg_epu16(cD, cF);
    6059 								c1 = _mm_avg_epu16(c1, c5);
    6060 								c9 = _mm_avg_epu16(c9, cD);
    6061 								c1 = _mm_avg_epu16(c1, c9);
   6062 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
   6063 								c0 = _mm_or_si128(c0, c1);
   6064 
   6065 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
   6066 							}
   6067 
   6068 							source0 += pitch;
   6069 							source1 += pitch;
   6070 							source2 += pitch;
   6071 							source3 += pitch;
   6072 							source4 += pitch;
   6073 							source5 += pitch;
   6074 							source6 += pitch;
   6075 							source7 += pitch;
   6076 							source8 += pitch;
   6077 							source9 += pitch;
   6078 							sourceA += pitch;
   6079 							sourceB += pitch;
   6080 							sourceC += pitch;
   6081 							sourceD += pitch;
   6082 							sourceE += pitch;
   6083 							sourceF += pitch;
   6084 						}
   6085 					}
   6086 					else ASSERT(false);
   6087 				}
   6088 				else
   6089 			#endif
   6090 			{
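				// Generic scalar path, used when the SSE2 code above is not available:
				// resolve the samples with a bit-twiddling per-channel average on 16-bit RGB565 values.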
   6091 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
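				// AVERAGE(x, y) averages each RGB565 channel with round-to-nearest-up:
				// (x & y) + ((x ^ y) >> 1) is the carry-free halving, 0x7BEF keeps the shifted
				// bits from leaking into the next channel, and 0x0821 re-adds the per-channel
				// rounding bit, matching the PAVGB/PAVGW rounding used in the SSE2 path.
				// For example, AVERAGE(0xF81F, 0x0000) = 0x8010: red and blue (31) round up to 16, green stays 0.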
   6092 
   6093 				if(internal.samples == 2)
   6094 				{
   6095 					for(int y = 0; y < height; y++)
   6096 					{
   6097 						for(int x = 0; x < width; x++)
   6098 						{
   6099 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
   6100 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
   6101 
   6102 							c0 = AVERAGE(c0, c1);
   6103 
   6104 							*(unsigned short*)(source0 + 2 * x) = c0;
   6105 						}
   6106 
   6107 						source0 += pitch;
   6108 						source1 += pitch;
   6109 					}
   6110 				}
   6111 				else if(internal.samples == 4)
   6112 				{
   6113 					for(int y = 0; y < height; y++)
   6114 					{
   6115 						for(int x = 0; x < width; x++)
   6116 						{
   6117 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
   6118 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
   6119 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
   6120 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
   6121 
   6122 							c0 = AVERAGE(c0, c1);
   6123 							c2 = AVERAGE(c2, c3);
   6124 							c0 = AVERAGE(c0, c2);
   6125 
   6126 							*(unsigned short*)(source0 + 2 * x) = c0;
   6127 						}
   6128 
   6129 						source0 += pitch;
   6130 						source1 += pitch;
   6131 						source2 += pitch;
   6132 						source3 += pitch;
   6133 					}
   6134 				}
   6135 				else if(internal.samples == 8)
   6136 				{
   6137 					for(int y = 0; y < height; y++)
   6138 					{
   6139 						for(int x = 0; x < width; x++)
   6140 						{
   6141 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
   6142 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
   6143 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
   6144 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
   6145 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
   6146 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
   6147 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
   6148 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
   6149 
   6150 							c0 = AVERAGE(c0, c1);
   6151 							c2 = AVERAGE(c2, c3);
   6152 							c4 = AVERAGE(c4, c5);
   6153 							c6 = AVERAGE(c6, c7);
   6154 							c0 = AVERAGE(c0, c2);
   6155 							c4 = AVERAGE(c4, c6);
   6156 							c0 = AVERAGE(c0, c4);
   6157 
   6158 							*(unsigned short*)(source0 + 2 * x) = c0;
   6159 						}
   6160 
   6161 						source0 += pitch;
   6162 						source1 += pitch;
   6163 						source2 += pitch;
   6164 						source3 += pitch;
   6165 						source4 += pitch;
   6166 						source5 += pitch;
   6167 						source6 += pitch;
   6168 						source7 += pitch;
   6169 					}
   6170 				}
   6171 				else if(internal.samples == 16)
   6172 				{
   6173 					for(int y = 0; y < height; y++)
   6174 					{
   6175 						for(int x = 0; x < width; x++)
   6176 						{
   6177 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
   6178 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
   6179 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
   6180 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
   6181 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
   6182 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
   6183 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
   6184 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
   6185 							unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
   6186 							unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
   6187 							unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
   6188 							unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
   6189 							unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
   6190 							unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
   6191 							unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
   6192 							unsigned short cF = *(unsigned short*)(sourceF + 2 * x);
   6193 
   6194 							c0 = AVERAGE(c0, c1);
   6195 							c2 = AVERAGE(c2, c3);
   6196 							c4 = AVERAGE(c4, c5);
   6197 							c6 = AVERAGE(c6, c7);
   6198 							c8 = AVERAGE(c8, c9);
   6199 							cA = AVERAGE(cA, cB);
   6200 							cC = AVERAGE(cC, cD);
   6201 							cE = AVERAGE(cE, cF);
   6202 							c0 = AVERAGE(c0, c2);
   6203 							c4 = AVERAGE(c4, c6);
   6204 							c8 = AVERAGE(c8, cA);
   6205 							cC = AVERAGE(cC, cE);
   6206 							c0 = AVERAGE(c0, c4);
   6207 							c8 = AVERAGE(c8, cC);
   6208 							c0 = AVERAGE(c0, c8);
   6209 
   6210 							*(unsigned short*)(source0 + 2 * x) = c0;
   6211 						}
   6212 
   6213 						source0 += pitch;
   6214 						source1 += pitch;
   6215 						source2 += pitch;
   6216 						source3 += pitch;
   6217 						source4 += pitch;
   6218 						source5 += pitch;
   6219 						source6 += pitch;
   6220 						source7 += pitch;
   6221 						source8 += pitch;
   6222 						source9 += pitch;
   6223 						sourceA += pitch;
   6224 						sourceB += pitch;
   6225 						sourceC += pitch;
   6226 						sourceD += pitch;
   6227 						sourceE += pitch;
   6228 						sourceF += pitch;
   6229 					}
   6230 				}
   6231 				else ASSERT(false);
   6232 
   6233 				#undef AVERAGE
   6234 			}
   6235 		}
   6236 		else
   6237 		{
   6238 		//	UNIMPLEMENTED();
   6239 		}
   6240 	}
   6241 }
   6242