Home | History | Annotate | Download | only in spulibs
      1 /*
      2  * SDL - Simple DirectMedia Layer
      3  * CELL BE Support for PS3 Framebuffer
      4  * Copyright (C) 2008, 2009 International Business Machines Corporation
      5  *
      6  * This library is free software; you can redistribute it and/or modify it
      7  * under the terms of the GNU Lesser General Public License as published
      8  * by the Free Software Foundation; either version 2.1 of the License, or
      9  * (at your option) any later version.
     10  *
     11  * This library is distributed in the hope that it will be useful, but
     12  * WITHOUT ANY WARRANTY; without even the implied warranty of
     13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14  * Lesser General Public License for more details.
     15  *
     16  * You should have received a copy of the GNU Lesser General Public
     17  * License along with this library; if not, write to the Free Software
     18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
     19  * USA
     20  *
 *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
     22  *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
     23  *  SPE code based on research by:
     24  *  Rene Becker
     25  *  Thimo Emmerich
     26  */
     27 
     28 #include "spu_common.h"
     29 
     30 #include <spu_intrinsics.h>
     31 #include <spu_mfcio.h>
     32 
     33 // Debugging
     34 //#define DEBUG
     35 
     36 #ifdef DEBUG
     37 #define deprintf(fmt, args... ) \
     38 	fprintf( stdout, fmt, ##args ); \
     39 	fflush( stdout );
     40 #else
     41 #define deprintf( fmt, args... )
     42 #endif
     43 
/* Local-store copy of the conversion parameters; main() fills it by DMA
 * before one of the converters is called.
 */
struct yuv2rgb_parms_t parms_converter __attribute__((aligned(128)));

/* Input buffers for the planar YUV data. Each plane has two buffers
 * (selected by buf_idx in the converters); one buffer holds 4 lines of Y
 * respectively 2 lines of V and 2 lines of U.
 * There might be the need to retrieve misaligned data, so every line
 * gets 128 bytes of extra room to be able to handle this.
 */
unsigned char y_plane[2][(MAX_HDTV_WIDTH + 128) * 4] __attribute__((aligned(128)));
unsigned char v_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
unsigned char u_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));

/* Output buffer: a maximum of 4 lines BGRA are stored, 4 byte per pixel */
unsigned char bgra[4 * MAX_HDTV_WIDTH * 4] __attribute__((aligned(128)));

/* Clamp bounds used by float_to_char() and vfloat_to_vuint():
 * values above vec_255 are replaced by 255, values below vec_0_1 by 0.1
 */
static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
     60 
     61 void yuv_to_rgb_w16();
     62 void yuv_to_rgb_w32();
     63 
     64 void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr, unsigned int width);
     65 void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width);
     66 
     67 
     68 int main(unsigned long long spe_id __attribute__((unused)), unsigned long long argp __attribute__ ((unused)))
     69 {
     70 	deprintf("[SPU] yuv2rgb_spu is up... (on SPE #%llu)\n", spe_id);
     71 	uint32_t ea_mfc, mbox;
     72 	// send ready message
     73 	spu_write_out_mbox(SPU_READY);
     74 
     75 	while (1) {
     76 		/* Check mailbox */
     77 		mbox = spu_read_in_mbox();
     78 		deprintf("[SPU] Message is %u\n", mbox);
     79 		switch (mbox) {
     80 			case SPU_EXIT:
     81 				deprintf("[SPU] fb_writer goes down...\n");
     82 				return 0;
     83 			case SPU_START:
     84 				break;
     85 			default:
     86 				deprintf("[SPU] Cannot handle message\n");
     87 				continue;
     88 		}
     89 
     90 		/* Tag Manager setup */
     91 		unsigned int tag_id;
     92 		tag_id = mfc_multi_tag_reserve(1);
     93 		if (tag_id == MFC_TAG_INVALID) {
     94 			deprintf("[SPU] Failed to reserve mfc tags on yuv2rgb_converter\n");
     95 			return 0;
     96 		}
     97 
     98 		/* DMA transfer for the input parameters */
     99 		ea_mfc = spu_read_in_mbox();
    100 		deprintf("[SPU] Message on yuv2rgb_converter is %u\n", ea_mfc);
    101 		spu_mfcdma32(&parms_converter, (unsigned int)ea_mfc, sizeof(struct yuv2rgb_parms_t), tag_id, MFC_GET_CMD);
    102 		DMA_WAIT_TAG(tag_id);
    103 
    104 		/* There are alignment issues that involve handling of special cases
    105 		 * a width of 32 results in a width of 16 in the chrominance
    106 		 * --> choose the proper handling to optimize the performance
    107 		 */
    108 		deprintf("[SPU] Convert %ix%i from YUV to RGB\n", parms_converter.src_pixel_width, parms_converter.src_pixel_height);
    109 		if (parms_converter.src_pixel_width & 0x1f) {
    110 			deprintf("[SPU] Using yuv_to_rgb_w16\n");
    111 			yuv_to_rgb_w16();
    112 		} else {
    113 			deprintf("[SPU] Using yuv_to_rgb_w32\n");
    114 			yuv_to_rgb_w32();
    115 		}
    116 
    117 		mfc_multi_tag_release(tag_id, 1);
    118 		deprintf("[SPU] yuv2rgb_spu... done!\n");
    119 		/* Send FIN message */
    120 		spu_write_out_mbox(SPU_FIN);
    121 	}
    122 
    123 	return 0;
    124 }
    125 
    126 
    127 /*
    128  * float_to_char()
    129  *
    130  * converts a float to a character using saturated
    131  * arithmetic
    132  *
    133  * @param s float for conversion
    134  * @returns converted character
    135  */
    136 inline static unsigned char float_to_char(float s) {
    137 	vector float vec_s = spu_splats(s);
    138 	vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s);
    139 	vec_s = spu_sel(vec_s, vec_0_1, select_1);
    140 
    141 	vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255);
    142 	vec_s = spu_sel(vec_s, vec_255, select_2);
    143 	return (unsigned char) spu_extract(vec_s,0);
    144 }
    145 
    146 
    147 /*
    148  * vfloat_to_vuint()
    149  *
    150  * converts a float vector to an unsinged int vector using saturated
    151  * arithmetic
    152  *
    153  * @param vec_s float vector for conversion
    154  * @returns converted unsigned int vector
    155  */
    156 inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
    157 	vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s);
    158 	vec_s = spu_sel(vec_s, vec_0_1, select_1);
    159 
    160 	vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255);
    161 	vec_s = spu_sel(vec_s, vec_255, select_2);
    162 	return spu_convtu(vec_s,0);
    163 }
    164 
    165 
    166 void yuv_to_rgb_w16() {
    167 	// Pixel dimensions of the picture
    168 	uint32_t width, height;
    169 
    170 	// Extract parameters
    171 	width = parms_converter.src_pixel_width;
    172 	height = parms_converter.src_pixel_height;
    173 
    174 	// Plane data management
    175 	// Y
    176 	unsigned char* ram_addr_y = parms_converter.y_plane;
    177 	// V
    178 	unsigned char* ram_addr_v = parms_converter.v_plane;
    179 	// U
    180 	unsigned char* ram_addr_u = parms_converter.u_plane;
    181 
    182 	// BGRA
    183 	unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
    184 
    185 	// Strides
    186 	unsigned int stride_y = width;
    187 	unsigned int stride_vu = width>>1;
    188 
    189 	// Buffer management
    190 	unsigned int buf_idx = 0;
    191 	unsigned int size_4lines_y = stride_y<<2;
    192 	unsigned int size_2lines_y = stride_y<<1;
    193 	unsigned int size_2lines_vu = stride_vu<<1;
    194 
    195 	// 2*width*4byte_per_pixel
    196 	unsigned int size_2lines_bgra = width<<3;
    197 
    198 
    199 	// start double-buffered processing
    200 	// 4 lines y
    201 	spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);
    202 
    203 	// 2 lines v
    204 	spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
    205 
    206 	// 2 lines u
    207 	spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
    208 
    209 	// Wait for these transfers to be completed
    210 	DMA_WAIT_TAG((RETR_BUF + buf_idx));
    211 
    212 	unsigned int i;
    213 	for(i=0; i<(height>>2)-1; i++) {
    214 
    215 		buf_idx^=1;
    216 
    217 		// 4 lines y
    218 		spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);
    219 
    220 		// 2 lines v
    221 		spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
    222 
    223 		// 2 lines u
    224 		spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
    225 
    226 		DMA_WAIT_TAG((RETR_BUF + buf_idx));
    227 
    228 		buf_idx^=1;
    229 
    230 
    231 		// Convert YUV to BGRA, store it back (first two lines)
    232 		yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
    233 
    234 		// Next two lines
    235 		yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
    236 				v_plane[buf_idx] + stride_vu,
    237 				u_plane[buf_idx] + stride_vu,
    238 				bgra + size_2lines_bgra,
    239 				width);
    240 
    241 		// Wait for previous storing transfer to be completed
    242 		DMA_WAIT_TAG(STR_BUF);
    243 
    244 		// Store converted lines in two steps->max transfer size 16384
    245 		spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
    246 		ram_addr_bgra += size_2lines_bgra;
    247 		spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
    248 		ram_addr_bgra += size_2lines_bgra;
    249 
    250 		// Move 4 lines
    251 		ram_addr_y += size_4lines_y;
    252 		ram_addr_v += size_2lines_vu;
    253 		ram_addr_u += size_2lines_vu;
    254 
    255 		buf_idx^=1;
    256 	}
    257 
    258 	// Convert YUV to BGRA, store it back (first two lines)
    259 	yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
    260 
    261 	// Next two lines
    262 	yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
    263 			v_plane[buf_idx] + stride_vu,
    264 			u_plane[buf_idx] + stride_vu,
    265 			bgra + size_2lines_bgra,
    266 			width);
    267 
    268 	// Wait for previous storing transfer to be completed
    269 	DMA_WAIT_TAG(STR_BUF);
    270 	spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
    271 	ram_addr_bgra += size_2lines_bgra;
    272 	spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
    273 
    274 	// wait for previous storing transfer to be completed
    275 	DMA_WAIT_TAG(STR_BUF);
    276 
    277 }
    278 
    279 
    280 void yuv_to_rgb_w32() {
    281 	// Pixel dimensions of the picture
    282 	uint32_t width, height;
    283 
    284 	// Extract parameters
    285 	width = parms_converter.src_pixel_width;
    286 	height = parms_converter.src_pixel_height;
    287 
    288 	// Plane data management
    289 	// Y
    290 	unsigned char* ram_addr_y = parms_converter.y_plane;
    291 	// V
    292 	unsigned char* ram_addr_v = parms_converter.v_plane;
    293 	// U
    294 	unsigned char* ram_addr_u = parms_converter.u_plane;
    295 
    296 	// BGRA
    297 	unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
    298 
    299 	// Strides
    300 	unsigned int stride_y = width;
    301 	unsigned int stride_vu = width>>1;
    302 
    303 	// Buffer management
    304 	unsigned int buf_idx = 0;
    305 	unsigned int size_4lines_y = stride_y<<2;
    306 	unsigned int size_2lines_y = stride_y<<1;
    307 	unsigned int size_2lines_vu = stride_vu<<1;
    308 
    309 	// 2*width*4byte_per_pixel
    310 	unsigned int size_2lines_bgra = width<<3;
    311 
    312 	// start double-buffered processing
    313 	// 4 lines y
    314 	spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
    315 	// 2 lines v
    316 	spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
    317 	// 2 lines u
    318 	spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
    319 
    320 	// Wait for these transfers to be completed
    321 	DMA_WAIT_TAG((RETR_BUF + buf_idx));
    322 
    323 	unsigned int i;
    324 	for(i=0; i < (height>>2)-1; i++) {
    325 		buf_idx^=1;
    326 		// 4 lines y
    327 		spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
    328 		deprintf("4lines = %d\n", size_4lines_y);
    329 		// 2 lines v
    330 		spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
    331 		deprintf("2lines = %d\n", size_2lines_vu);
    332 		// 2 lines u
    333 		spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
    334 		deprintf("2lines = %d\n", size_2lines_vu);
    335 
    336 		DMA_WAIT_TAG((RETR_BUF + buf_idx));
    337 
    338 		buf_idx^=1;
    339 
    340 		// Convert YUV to BGRA, store it back (first two lines)
    341 		yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
    342 
    343 		// Next two lines
    344 		yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
    345 				v_plane[buf_idx] + stride_vu,
    346 				u_plane[buf_idx] + stride_vu,
    347 				bgra + size_2lines_bgra,
    348 				width);
    349 
    350 		// Wait for previous storing transfer to be completed
    351 		DMA_WAIT_TAG(STR_BUF);
    352 
    353 		// Store converted lines in two steps->max transfer size 16384
    354 		spu_mfcdma32(bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
    355 		ram_addr_bgra += size_2lines_bgra;
    356 		spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
    357 		ram_addr_bgra += size_2lines_bgra;
    358 
    359 		// Move 4 lines
    360 		ram_addr_y += size_4lines_y;
    361 		ram_addr_v += size_2lines_vu;
    362 		ram_addr_u += size_2lines_vu;
    363 
    364 		buf_idx^=1;
    365 	}
    366 
    367 	// Convert YUV to BGRA, store it back (first two lines)
    368 	yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
    369 
    370 	// Next two lines
    371 	yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
    372 			v_plane[buf_idx] + stride_vu,
    373 			u_plane[buf_idx] + stride_vu,
    374 			bgra + size_2lines_bgra,
    375 			width);
    376 
    377 	// Wait for previous storing transfer to be completed
    378 	DMA_WAIT_TAG(STR_BUF);
    379 	spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
    380 	ram_addr_bgra += size_2lines_bgra;
    381 	spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
    382 
    383 	// Wait for previous storing transfer to be completed
    384 	DMA_WAIT_TAG(STR_BUF);
    385 }
    386 
    387 
    388 /* Some vectors needed by the yuv 2 rgb conversion algorithm */
    389 const vector float vec_minus_128 = { -128.0f, -128.0f, -128.0f, -128.0f };
    390 const vector unsigned char vec_null = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    391 const vector unsigned char vec_char2int_first = { 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x13 };
    392 const vector unsigned char vec_char2int_second = { 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x17 };
    393 const vector unsigned char vec_char2int_third = { 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, 0x00, 0x1B };
    394 const vector unsigned char vec_char2int_fourth = { 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x1F };
    395 
    396 const vector float vec_R_precalc_coeff = {1.403f, 1.403f, 1.403f, 1.403f};
    397 const vector float vec_Gu_precalc_coeff = {-0.344f, -0.344f, -0.344f, -0.344f};
    398 const vector float vec_Gv_precalc_coeff = {-0.714f, -0.714f, -0.714f, -0.714f};
    399 const vector float vec_B_precalc_coeff = {1.773f, 1.773f, 1.773f, 1.773f};
    400 
    401 const vector unsigned int vec_alpha =  { 255 << 24, 255 << 24, 255 << 24, 255 << 24 };
    402 
    403 const vector unsigned char vec_select_floats_upper = { 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07 };
    404 const vector unsigned char vec_select_floats_lower = { 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F };
    405 
    406 
    407 /*
    408  * yuv_to_rgb_w16()
    409  *
    410  * processes to line of yuv-input, width has to be a multiple of 16
    411  * two lines of yuv are taken as input
    412  *
    413  * @param y_addr address of the y plane in local store
    414  * @param v_addr address of the v plane in local store
    415  * @param u_addr address of the u plane in local store
    416  * @param bgra_addr_ address of the bgra output buffer
    417  * @param width the width in pixel
    418  */
    419 void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
    420 	// each pixel is stored as an integer
    421 	unsigned int* bgra_addr = (unsigned int*) bgra_addr_;
    422 
    423 	unsigned int x;
    424 	for(x = 0; x < width; x+=2) {
    425 		// Gehe zweischrittig durch die zeile, da jeder u und v wert fuer 4 pixel(zwei hoch, zwei breit) gilt
    426 		const unsigned char Y_1 = *(y_addr + x);
    427 		const unsigned char Y_2 = *(y_addr + x + 1);
    428 		const unsigned char Y_3 = *(y_addr + x + width);
    429 		const unsigned char Y_4 = *(y_addr + x + width + 1);
    430 		const unsigned char U = *(u_addr + (x >> 1));
    431 		const unsigned char V = *(v_addr + (x >> 1));
    432 
    433 		float V_minus_128 = (float)((float)V - 128.0f);
    434 		float U_minus_128 = (float)((float)U - 128.0f);
    435 
    436 		float R_precalculate = 1.403f * V_minus_128;
    437 		float G_precalculate = -(0.344f * U_minus_128 + 0.714f * V_minus_128);
    438 		float B_precalculate = 1.773f * U_minus_128;
    439 
    440 		const unsigned char R_1 = float_to_char((Y_1 + R_precalculate));
    441 		const unsigned char R_2 = float_to_char((Y_2 + R_precalculate));
    442 		const unsigned char R_3 = float_to_char((Y_3 + R_precalculate));
    443 		const unsigned char R_4 = float_to_char((Y_4 + R_precalculate));
    444 		const unsigned char G_1 = float_to_char((Y_1 + G_precalculate));
    445 		const unsigned char G_2 = float_to_char((Y_2 + G_precalculate));
    446 		const unsigned char G_3 = float_to_char((Y_3 + G_precalculate));
    447 		const unsigned char G_4 = float_to_char((Y_4 + G_precalculate));
    448 		const unsigned char B_1 = float_to_char((Y_1 + B_precalculate));
    449 		const unsigned char B_2 = float_to_char((Y_2 + B_precalculate));
    450 		const unsigned char B_3 = float_to_char((Y_3 + B_precalculate));
    451 		const unsigned char B_4 = float_to_char((Y_4 + B_precalculate));
    452 
    453 		*(bgra_addr + x) = (B_1 << 0)| (G_1 << 8) | (R_1 << 16) | (255 << 24);
    454 		*(bgra_addr + x + 1) = (B_2 << 0)| (G_2 << 8) | (R_2 << 16) | (255 << 24);
    455 		*(bgra_addr + x + width) = (B_3 << 0)| (G_3 << 8) | (R_3 << 16) | (255 << 24);
    456 		*(bgra_addr + x + width + 1) = (B_4 << 0)| (G_4 << 8) | (R_4 << 16) | (255 << 24);
    457 	}
    458 }
    459 
    460 
/*
 * yuv_to_rgb_w32_line()
 *
 * processes two lines of yuv-input with vector code, 32 pixels of both
 * lines per step; width has to be a multiple of 32.
 * The second line is expected directly behind the first one (at offset
 * width) in both the y input and the bgra output.
 * NOTE(review): the vector loads and stores presumably need 16 byte
 * aligned addresses - confirm against the callers.
 *
 * @param y_addr address of the y plane in local store
 * @param v_addr address of the v plane in local store
 * @param u_addr address of the u plane in local store
 * @param bgra_addr_ address of the bgra output buffer
 * @param width the width in pixel
 */
void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
	// each pixel is stored as an integer
	unsigned int* bgra_addr = (unsigned int*) bgra_addr_;

	unsigned int x;
	for(x = 0; x < width; x+=32) {
		// Every u and v value applies to 4 pixels (two high, two wide), so
		// 32 pixels of two lines need 16 u and 16 v values

		// Load 32 y values of the upper line (Y_1, Y_2), 32 of the lower
		// line (Y_3, Y_4) and the 16 u and 16 v values shared by them
		const vector unsigned char vchar_Y_1 = *((vector unsigned char*)(y_addr + x));
		const vector unsigned char vchar_Y_2 = *((vector unsigned char*)(y_addr + x + 16));
		const vector unsigned char vchar_Y_3 = *((vector unsigned char*)(y_addr + x + width));
		const vector unsigned char vchar_Y_4 = *((vector unsigned char*)(y_addr + x + width + 16));
		const vector unsigned char vchar_U = *((vector unsigned char*)(u_addr + (x >> 1)));
		const vector unsigned char vchar_V = *((vector unsigned char*)(v_addr + (x >> 1)));

		// Widen the u values to floats, four at a time, and subtract 128
		const vector float vfloat_U_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_first), 0),vec_minus_128);
		const vector float vfloat_U_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_second), 0),vec_minus_128);
		const vector float vfloat_U_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_third), 0),vec_minus_128);
		const vector float vfloat_U_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_fourth), 0),vec_minus_128);

		// Same for the v values
		const vector float vfloat_V_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_first), 0),vec_minus_128);
		const vector float vfloat_V_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_second), 0),vec_minus_128);
		const vector float vfloat_V_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_third), 0),vec_minus_128);
		const vector float vfloat_V_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_fourth), 0),vec_minus_128);

		// Widen the y values to floats: Y_1..Y_8 hold the upper line,
		// Y_9..Y_16 the lower line, four pixels per vector
		vector float Y_1 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_first), 0);
		vector float Y_2 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_second), 0);
		vector float Y_3 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_third), 0);
		vector float Y_4 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_fourth), 0);
		vector float Y_5 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_first), 0);
		vector float Y_6 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_second), 0);
		vector float Y_7 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_third), 0);
		vector float Y_8 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_fourth), 0);
		vector float Y_9 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_first), 0);
		vector float Y_10 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_second), 0);
		vector float Y_11 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_third), 0);
		vector float Y_12 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_fourth), 0);
		vector float Y_13 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_first), 0);
		vector float Y_14 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_second), 0);
		vector float Y_15 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_third), 0);
		vector float Y_16 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_fourth), 0);

		// Red: chrominance contribution per v value ...
		const vector float R1a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_1);
		const vector float R2a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_2);
		const vector float R3a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_3);
		const vector float R4a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_4);

		// ... duplicated so that two neighbouring pixels share one value
		const vector float R1_precalculate = spu_shuffle(R1a_precalculate,  R1a_precalculate, vec_select_floats_upper);
		const vector float R2_precalculate = spu_shuffle(R1a_precalculate,  R1a_precalculate, vec_select_floats_lower);
		const vector float R3_precalculate = spu_shuffle(R2a_precalculate,  R2a_precalculate, vec_select_floats_upper);
		const vector float R4_precalculate = spu_shuffle(R2a_precalculate,  R2a_precalculate, vec_select_floats_lower);
		const vector float R5_precalculate = spu_shuffle(R3a_precalculate,  R3a_precalculate, vec_select_floats_upper);
		const vector float R6_precalculate = spu_shuffle(R3a_precalculate,  R3a_precalculate, vec_select_floats_lower);
		const vector float R7_precalculate = spu_shuffle(R4a_precalculate,  R4a_precalculate, vec_select_floats_upper);
		const vector float R8_precalculate = spu_shuffle(R4a_precalculate,  R4a_precalculate, vec_select_floats_lower);


		// Green: contribution of u and v (both coefficients are negative)
		const vector float G1a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_1, spu_mul(vfloat_V_1, vec_Gv_precalc_coeff));
		const vector float G2a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_2, spu_mul(vfloat_V_2, vec_Gv_precalc_coeff));
		const vector float G3a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_3, spu_mul(vfloat_V_3, vec_Gv_precalc_coeff));
		const vector float G4a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_4, spu_mul(vfloat_V_4, vec_Gv_precalc_coeff));

		const vector float G1_precalculate = spu_shuffle(G1a_precalculate,  G1a_precalculate, vec_select_floats_upper);
		const vector float G2_precalculate = spu_shuffle(G1a_precalculate,  G1a_precalculate, vec_select_floats_lower);
		const vector float G3_precalculate = spu_shuffle(G2a_precalculate,  G2a_precalculate, vec_select_floats_upper);
		const vector float G4_precalculate = spu_shuffle(G2a_precalculate,  G2a_precalculate, vec_select_floats_lower);
		const vector float G5_precalculate = spu_shuffle(G3a_precalculate,  G3a_precalculate, vec_select_floats_upper);
		const vector float G6_precalculate = spu_shuffle(G3a_precalculate,  G3a_precalculate, vec_select_floats_lower);
		const vector float G7_precalculate = spu_shuffle(G4a_precalculate,  G4a_precalculate, vec_select_floats_upper);
		const vector float G8_precalculate = spu_shuffle(G4a_precalculate,  G4a_precalculate, vec_select_floats_lower);


		// Blue: chrominance contribution per u value
		const vector float B1a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_1);
		const vector float B2a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_2);
		const vector float B3a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_3);
		const vector float B4a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_4);

		const vector float B1_precalculate = spu_shuffle(B1a_precalculate,  B1a_precalculate, vec_select_floats_upper);
		const vector float B2_precalculate = spu_shuffle(B1a_precalculate,  B1a_precalculate, vec_select_floats_lower);
		const vector float B3_precalculate = spu_shuffle(B2a_precalculate,  B2a_precalculate, vec_select_floats_upper);
		const vector float B4_precalculate = spu_shuffle(B2a_precalculate,  B2a_precalculate, vec_select_floats_lower);
		const vector float B5_precalculate = spu_shuffle(B3a_precalculate,  B3a_precalculate, vec_select_floats_upper);
		const vector float B6_precalculate = spu_shuffle(B3a_precalculate,  B3a_precalculate, vec_select_floats_lower);
		const vector float B7_precalculate = spu_shuffle(B4a_precalculate,  B4a_precalculate, vec_select_floats_upper);
		const vector float B8_precalculate = spu_shuffle(B4a_precalculate,  B4a_precalculate, vec_select_floats_lower);


		// Add the luminance and saturate; the lower line (9..16) reuses
		// the chrominance contributions 1..8 of the upper line
		const vector unsigned int  R_1 = vfloat_to_vuint(spu_add( Y_1, R1_precalculate));
		const vector unsigned int  R_2 = vfloat_to_vuint(spu_add( Y_2, R2_precalculate));
		const vector unsigned int  R_3 = vfloat_to_vuint(spu_add( Y_3, R3_precalculate));
		const vector unsigned int  R_4 = vfloat_to_vuint(spu_add( Y_4, R4_precalculate));
		const vector unsigned int  R_5 = vfloat_to_vuint(spu_add( Y_5, R5_precalculate));
		const vector unsigned int  R_6 = vfloat_to_vuint(spu_add( Y_6, R6_precalculate));
		const vector unsigned int  R_7 = vfloat_to_vuint(spu_add( Y_7, R7_precalculate));
		const vector unsigned int  R_8 = vfloat_to_vuint(spu_add( Y_8, R8_precalculate));
		const vector unsigned int  R_9 = vfloat_to_vuint(spu_add( Y_9, R1_precalculate));
		const vector unsigned int R_10 = vfloat_to_vuint(spu_add(Y_10, R2_precalculate));
		const vector unsigned int R_11 = vfloat_to_vuint(spu_add(Y_11, R3_precalculate));
		const vector unsigned int R_12 = vfloat_to_vuint(spu_add(Y_12, R4_precalculate));
		const vector unsigned int R_13 = vfloat_to_vuint(spu_add(Y_13, R5_precalculate));
		const vector unsigned int R_14 = vfloat_to_vuint(spu_add(Y_14, R6_precalculate));
		const vector unsigned int R_15 = vfloat_to_vuint(spu_add(Y_15, R7_precalculate));
		const vector unsigned int R_16 = vfloat_to_vuint(spu_add(Y_16, R8_precalculate));

		const vector unsigned int  G_1 = vfloat_to_vuint(spu_add( Y_1, G1_precalculate));
		const vector unsigned int  G_2 = vfloat_to_vuint(spu_add( Y_2, G2_precalculate));
		const vector unsigned int  G_3 = vfloat_to_vuint(spu_add( Y_3, G3_precalculate));
		const vector unsigned int  G_4 = vfloat_to_vuint(spu_add( Y_4, G4_precalculate));
		const vector unsigned int  G_5 = vfloat_to_vuint(spu_add( Y_5, G5_precalculate));
		const vector unsigned int  G_6 = vfloat_to_vuint(spu_add( Y_6, G6_precalculate));
		const vector unsigned int  G_7 = vfloat_to_vuint(spu_add( Y_7, G7_precalculate));
		const vector unsigned int  G_8 = vfloat_to_vuint(spu_add( Y_8, G8_precalculate));
		const vector unsigned int  G_9 = vfloat_to_vuint(spu_add( Y_9, G1_precalculate));
		const vector unsigned int G_10 = vfloat_to_vuint(spu_add(Y_10, G2_precalculate));
		const vector unsigned int G_11 = vfloat_to_vuint(spu_add(Y_11, G3_precalculate));
		const vector unsigned int G_12 = vfloat_to_vuint(spu_add(Y_12, G4_precalculate));
		const vector unsigned int G_13 = vfloat_to_vuint(spu_add(Y_13, G5_precalculate));
		const vector unsigned int G_14 = vfloat_to_vuint(spu_add(Y_14, G6_precalculate));
		const vector unsigned int G_15 = vfloat_to_vuint(spu_add(Y_15, G7_precalculate));
		const vector unsigned int G_16 = vfloat_to_vuint(spu_add(Y_16, G8_precalculate));

		const vector unsigned int  B_1 = vfloat_to_vuint(spu_add( Y_1, B1_precalculate));
		const vector unsigned int  B_2 = vfloat_to_vuint(spu_add( Y_2, B2_precalculate));
		const vector unsigned int  B_3 = vfloat_to_vuint(spu_add( Y_3, B3_precalculate));
		const vector unsigned int  B_4 = vfloat_to_vuint(spu_add( Y_4, B4_precalculate));
		const vector unsigned int  B_5 = vfloat_to_vuint(spu_add( Y_5, B5_precalculate));
		const vector unsigned int  B_6 = vfloat_to_vuint(spu_add( Y_6, B6_precalculate));
		const vector unsigned int  B_7 = vfloat_to_vuint(spu_add( Y_7, B7_precalculate));
		const vector unsigned int  B_8 = vfloat_to_vuint(spu_add( Y_8, B8_precalculate));
		const vector unsigned int  B_9 = vfloat_to_vuint(spu_add( Y_9, B1_precalculate));
		const vector unsigned int B_10 = vfloat_to_vuint(spu_add(Y_10, B2_precalculate));
		const vector unsigned int B_11 = vfloat_to_vuint(spu_add(Y_11, B3_precalculate));
		const vector unsigned int B_12 = vfloat_to_vuint(spu_add(Y_12, B4_precalculate));
		const vector unsigned int B_13 = vfloat_to_vuint(spu_add(Y_13, B5_precalculate));
		const vector unsigned int B_14 = vfloat_to_vuint(spu_add(Y_14, B6_precalculate));
		const vector unsigned int B_15 = vfloat_to_vuint(spu_add(Y_15, B7_precalculate));
		const vector unsigned int B_16 = vfloat_to_vuint(spu_add(Y_16, B8_precalculate));

		// Pack four pixels per store: the R values are moved two bytes and
		// the G values one byte to the left and combined with B and alpha.
		// The upper line goes to bgra_addr + x, the lower line follows at
		// an offset of width pixels.
		*((vector unsigned int*)(bgra_addr + x)) = spu_or(spu_or(vec_alpha,  B_1), spu_or(spu_slqwbyte( R_1, 2),spu_slqwbyte(G_1, 1)));
		*((vector unsigned int*)(bgra_addr + x + 4)) = spu_or(spu_or(vec_alpha,  B_2), spu_or(spu_slqwbyte( R_2, 2),spu_slqwbyte(G_2, 1)));
		*((vector unsigned int*)(bgra_addr + x + 8)) = spu_or(spu_or(vec_alpha,  B_3), spu_or(spu_slqwbyte( R_3, 2),spu_slqwbyte(G_3, 1)));
		*((vector unsigned int*)(bgra_addr + x + 12)) = spu_or(spu_or(vec_alpha,  B_4), spu_or(spu_slqwbyte( R_4, 2),spu_slqwbyte(G_4, 1)));
		*((vector unsigned int*)(bgra_addr + x + 16)) = spu_or(spu_or(vec_alpha,  B_5), spu_or(spu_slqwbyte( R_5, 2),spu_slqwbyte(G_5, 1)));
		*((vector unsigned int*)(bgra_addr + x + 20)) = spu_or(spu_or(vec_alpha,  B_6), spu_or(spu_slqwbyte( R_6, 2),spu_slqwbyte(G_6, 1)));
		*((vector unsigned int*)(bgra_addr + x + 24)) = spu_or(spu_or(vec_alpha,  B_7), spu_or(spu_slqwbyte( R_7, 2),spu_slqwbyte(G_7, 1)));
		*((vector unsigned int*)(bgra_addr + x + 28)) = spu_or(spu_or(vec_alpha,  B_8), spu_or(spu_slqwbyte( R_8, 2),spu_slqwbyte(G_8, 1)));
		*((vector unsigned int*)(bgra_addr + x + width)) = spu_or(spu_or(vec_alpha,  B_9), spu_or(spu_slqwbyte( R_9, 2),spu_slqwbyte(G_9, 1)));
		*((vector unsigned int*)(bgra_addr + x + width + 4)) = spu_or(spu_or(vec_alpha, B_10), spu_or(spu_slqwbyte(R_10, 2),spu_slqwbyte(G_10, 1)));
		*((vector unsigned int*)(bgra_addr + x + width + 8)) = spu_or(spu_or(vec_alpha, B_11), spu_or(spu_slqwbyte(R_11, 2),spu_slqwbyte(G_11, 1)));
		*((vector unsigned int*)(bgra_addr + x + width + 12)) = spu_or(spu_or(vec_alpha, B_12), spu_or(spu_slqwbyte(R_12, 2),spu_slqwbyte(G_12, 1)));
		*((vector unsigned int*)(bgra_addr + x + width + 16)) = spu_or(spu_or(vec_alpha, B_13), spu_or(spu_slqwbyte(R_13, 2),spu_slqwbyte(G_13, 1)));
		*((vector unsigned int*)(bgra_addr + x + width + 20)) = spu_or(spu_or(vec_alpha, B_14), spu_or(spu_slqwbyte(R_14, 2),spu_slqwbyte(G_14, 1)));
		*((vector unsigned int*)(bgra_addr + x + width + 24)) = spu_or(spu_or(vec_alpha, B_15), spu_or(spu_slqwbyte(R_15, 2),spu_slqwbyte(G_15, 1)));
		*((vector unsigned int*)(bgra_addr + x + width + 28)) = spu_or(spu_or(vec_alpha, B_16), spu_or(spu_slqwbyte(R_16, 2),spu_slqwbyte(G_16, 1)));
	}
}
    629 
    630