Home | History | Annotate | Download | only in spulibs
      1 /*
      2  * SDL - Simple DirectMedia Layer
      3  * CELL BE Support for PS3 Framebuffer
      4  * Copyright (C) 2008, 2009 International Business Machines Corporation
      5  *
      6  * This library is free software; you can redistribute it and/or modify it
      7  * under the terms of the GNU Lesser General Public License as published
      8  * by the Free Software Foundation; either version 2.1 of the License, or
      9  * (at your option) any later version.
     10  *
     11  * This library is distributed in the hope that it will be useful, but
     12  * WITHOUT ANY WARRANTY; without even the implied warranty of
     13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14  * Lesser General Public License for more details.
     15  *
     16  * You should have received a copy of the GNU Lesser General Public
     17  * License along with this library; if not, write to the Free Software
     18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
     19  * USA
     20  *
     21  *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
     22  *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
     23  *  SPE code based on research by:
     24  *  Rene Becker
     25  *  Thimo Emmerich
     26  */
     27 
     28 #include "spu_common.h"
     29 
     30 #include <spu_intrinsics.h>
     31 #include <spu_mfcio.h>
     32 
     33 // Debugging
     34 //#define DEBUG
     35 
     36 #ifdef DEBUG
     37 #define deprintf(fmt, args... ) \
     38 	fprintf( stdout, fmt, ##args ); \
     39 	fflush( stdout );
     40 #else
     41 #define deprintf( fmt, args... )
     42 #endif
     43 
/* Scaling parameters. main() fills this block by DMA from the effective
 * address it receives in argp before any scaling routine runs. */
struct scale_parms_t parms __attribute__((aligned(128)));

/* Double-buffered input planes (first index = buffer 0/1).
 * A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored
 * (counted over both buffers: 4 lines Y, 2 lines V, 2 lines U per buffer).
 * There might be the need to retrieve misaligned data; the incoming v and u
 * planes are adjusted to be able to handle this (add 128).
 */
unsigned char y_plane[2][(MAX_HDTV_WIDTH+128)*4] __attribute__((aligned(128)));
unsigned char v_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
unsigned char u_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));

/* temp-buffer for scaling: 4 lines Y, therefore 2 lines V, 2 lines U
 * (again counted over both buffers; first index = buffer 0/1) */
unsigned char scaled_y_plane[2][MAX_HDTV_WIDTH*2] __attribute__((aligned(128)));
unsigned char scaled_v_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
unsigned char scaled_u_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));

/* some vectors needed by the float to int conversion:
 * vfloat_to_vuint() clamps every lane into [vec_0_1, vec_255] */
static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
     62 
/* Line scalers, defined elsewhere in this file. The scaling routines call
 * the w16 variant for Y lines and the w8 variant for V/U lines.
 * NOTE(review): src presumably points at the upper of two source lines that
 * lie src_linestride bytes apart, blended with vf_NSweight -- confirm
 * against the definitions. */
void bilinear_scale_line_w8(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
void bilinear_scale_line_w16(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);

/* Scaling drivers; main() picks one depending on whether the source and
 * destination widths are multiples of 32 (w32) or not (w16). */
void scale_srcw16_dstw16();
void scale_srcw16_dstw32();
void scale_srcw32_dstw16();
void scale_srcw32_dstw32();
     70 
     71 int main( unsigned long long spe_id __attribute__((unused)), unsigned long long argp )
     72 {
     73 	deprintf("[SPU] bilin_scaler_spu is up... (on SPE #%llu)\n", spe_id);
     74 	/* DMA transfer for the input parameters */
     75 	spu_mfcdma32(&parms, (unsigned int)argp, sizeof(struct scale_parms_t), TAG_INIT, MFC_GET_CMD);
     76 	DMA_WAIT_TAG(TAG_INIT);
     77 
     78 	deprintf("[SPU] Scale %ux%u to %ux%u\n", parms.src_pixel_width, parms.src_pixel_height,
     79 			parms.dst_pixel_width, parms.dst_pixel_height);
     80 
     81 	if(parms.src_pixel_width & 0x1f) {
     82 		if(parms.dst_pixel_width & 0x1F) {
     83 			deprintf("[SPU] Using scale_srcw16_dstw16\n");
     84 			scale_srcw16_dstw16();
     85 		} else {
     86 			deprintf("[SPU] Using scale_srcw16_dstw32\n");
     87 			scale_srcw16_dstw32();
     88 		}
     89 	} else {
     90 		if(parms.dst_pixel_width & 0x1F) {
     91 			deprintf("[SPU] Using scale_srcw32_dstw16\n");
     92 			scale_srcw32_dstw16();
     93 		} else {
     94 			deprintf("[SPU] Using scale_srcw32_dstw32\n");
     95 			scale_srcw32_dstw32();
     96 		}
     97 	}
     98 	deprintf("[SPU] bilin_scaler_spu... done!\n");
     99 
    100 	return 0;
    101 }
    102 
    103 
    104 /*
    105  * vfloat_to_vuint()
    106  *
    107  * converts a float vector to an unsinged int vector using saturated
    108  * arithmetic
    109  *
    110  * @param vec_s float vector for conversion
    111  * @returns converted unsigned int vector
    112  */
    113 inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
    114 	vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s);
    115 	vec_s = spu_sel(vec_s, vec_0_1, select_1);
    116 
    117 	vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255);
    118 	vec_s = spu_sel(vec_s, vec_255, select_2);
    119 	return spu_convtu(vec_s,0);
    120 }
    121 
    122 
    123 /*
    124  * scale_srcw16_dstw16()
    125  *
    126  * processes an input image of width 16
    127  * scaling is done to a width 16
    128  * result stored in RAM
    129  */
    130 void scale_srcw16_dstw16() {
    131 	// extract parameters
    132 	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
    133 
    134 	unsigned int src_width = parms.src_pixel_width;
    135 	unsigned int src_height = parms.src_pixel_height;
    136 	unsigned int dst_width = parms.dst_pixel_width;
    137 	unsigned int dst_height = parms.dst_pixel_height;
    138 
    139 	// YVU
    140 	unsigned int src_linestride_y = src_width;
    141 	unsigned int src_dbl_linestride_y = src_width<<1;
    142 	unsigned int src_linestride_vu = src_width>>1;
    143 	unsigned int src_dbl_linestride_vu = src_width;
    144 
    145 	// scaled YVU
    146 	unsigned int scaled_src_linestride_y = dst_width;
    147 
    148 	// ram addresses
    149 	unsigned char* src_addr_y = parms.y_plane;
    150 	unsigned char* src_addr_v = parms.v_plane;
    151 	unsigned char* src_addr_u = parms.u_plane;
    152 
    153 	// for handling misalignment, addresses are precalculated
    154 	unsigned char* precalc_src_addr_v = src_addr_v;
    155 	unsigned char* precalc_src_addr_u = src_addr_u;
    156 
    157 	unsigned int dst_picture_size = dst_width*dst_height;
    158 
    159 	// Sizes for destination
    160 	unsigned int dst_dbl_linestride_y = dst_width<<1;
    161 	unsigned int dst_dbl_linestride_vu = dst_width>>1;
    162 
    163 	// Perform address calculation for Y, V and U in main memory with dst_addr as base
    164 	unsigned char* dst_addr_main_memory_y = dst_addr;
    165 	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
    166 	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
    167 
    168 	// calculate scale factors
    169 	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
    170 	float y_scale = (float)src_height/(float)dst_height;
    171 
    172 	// double buffered processing
    173 	// buffer switching
    174 	unsigned int curr_src_idx = 0;
    175 	unsigned int curr_dst_idx = 0;
    176 	unsigned int next_src_idx, next_dst_idx;
    177 
    178 	// 2 lines y as output, upper and lowerline
    179 	unsigned int curr_interpl_y_upper = 0;
    180 	unsigned int next_interpl_y_upper;
    181 	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
    182 	// only 1 line v/u output, both planes have the same dimension
    183 	unsigned int curr_interpl_vu = 0;
    184 	unsigned int next_interpl_vu;
    185 
    186 	// weights, calculated in every loop iteration
    187 	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
    188 	vector float vf_next_NSweight_y_upper;
    189 	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
    190 	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
    191 	vector float vf_next_NSweight_vu;
    192 
    193 	// line indices for the src picture
    194 	float curr_src_y_upper = 0.0f, next_src_y_upper;
    195 	float curr_src_y_lower, next_src_y_lower;
    196 	float curr_src_vu = 0.0f, next_src_vu;
    197 
    198 	// line indices for the dst picture
    199 	unsigned int dst_y=0, dst_vu=0;
    200 
    201 	// offset for the v and u plane to handle misalignement
    202 	unsigned int curr_lsoff_v = 0, next_lsoff_v;
    203 	unsigned int curr_lsoff_u = 0, next_lsoff_u;
    204 
    205 	// calculate lower line indices
    206 	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
    207 	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
    208 	// lower line weight
    209 	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
    210 
    211 
    212 	// start partially double buffered processing
    213 	// get initial data, 2 sets of y, 1 set v, 1 set u
    214 	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
    215 	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
    216 			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
    217 			src_dbl_linestride_y,
    218 			RETR_BUF,
    219 			0, 0 );
    220 	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
    221 	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
    222 
    223 	/* iteration loop
    224 	 * within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
    225 	 * the scaled output is 2 lines y, 1 line v, 1 line u
    226 	 * the yuv2rgb-converted output is stored to RAM
    227 	 */
    228 	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
    229 		dst_y = dst_vu<<1;
    230 
    231 		// calculate next indices
    232 		next_src_vu = ((float)dst_vu+1)*y_scale;
    233 		next_src_y_upper = ((float)dst_y+2)*y_scale;
    234 		next_src_y_lower = ((float)dst_y+3)*y_scale;
    235 
    236 		next_interpl_vu = (unsigned int) next_src_vu;
    237 		next_interpl_y_upper = (unsigned int) next_src_y_upper;
    238 		next_interpl_y_lower = (unsigned int) next_src_y_lower;
    239 
    240 		// calculate weight NORTH-SOUTH
    241 		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
    242 		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
    243 		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
    244 
    245 		// get next lines
    246 		next_src_idx = curr_src_idx^1;
    247 		next_dst_idx = curr_dst_idx^1;
    248 
    249 		// 4 lines y
    250 		mfc_get( y_plane[next_src_idx],
    251 				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
    252 				src_dbl_linestride_y,
    253 				RETR_BUF+next_src_idx,
    254 				0, 0 );
    255 		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
    256 				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
    257 				src_dbl_linestride_y,
    258 				RETR_BUF+next_src_idx,
    259 				0, 0 );
    260 
    261 		// 2 lines v
    262 		precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
    263 		next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
    264 		mfc_get( v_plane[next_src_idx],
    265 				((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
    266 				src_dbl_linestride_vu+(next_lsoff_v<<1),
    267 				RETR_BUF+next_src_idx,
    268 				0, 0 );
    269 		// 2 lines u
    270 		precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
    271 		next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
    272 		mfc_get( u_plane[next_src_idx],
    273 				((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
    274 				src_dbl_linestride_vu+(next_lsoff_v<<1),
    275 				RETR_BUF+next_src_idx,
    276 				0, 0 );
    277 
    278 		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
    279 
    280 		// scaling
    281 		// work line y_upper
    282 		bilinear_scale_line_w16( y_plane[curr_src_idx],
    283 				scaled_y_plane[curr_src_idx],
    284 				dst_width,
    285 				vf_x_scale,
    286 				vf_curr_NSweight_y_upper,
    287 				src_linestride_y );
    288 		// work line y_lower
    289 		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
    290 				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
    291 				dst_width,
    292 				vf_x_scale,
    293 				vf_curr_NSweight_y_lower,
    294 				src_linestride_y );
    295 		// work line v
    296 		bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
    297 				scaled_v_plane[curr_src_idx],
    298 				dst_width>>1,
    299 				vf_x_scale,
    300 				vf_curr_NSweight_vu,
    301 				src_linestride_vu );
    302 		// work line u
    303 		bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
    304 				scaled_u_plane[curr_src_idx],
    305 				dst_width>>1,
    306 				vf_x_scale,
    307 				vf_curr_NSweight_vu,
    308 				src_linestride_vu );
    309 
    310 
    311 		// Store the result back to main memory into a destination buffer in YUV format
    312 		//---------------------------------------------------------------------------------------------
    313 		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
    314 
    315 		// Perform three DMA transfers to 3 different locations in the main memory!
    316 		// dst_width:	Pixel width of destination image
    317 		// dst_addr:	Destination address in main memory
    318 		// dst_vu:	Counter which is incremented one by one
    319 		// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
    320 		mfc_put(	scaled_y_plane[curr_src_idx],					// What from local store (addr)
    321 				(unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
    322 				dst_dbl_linestride_y,						// Two Y lines (depending on the widht of the destination resolution)
    323 				STR_BUF+curr_dst_idx,						// Tag
    324 				0, 0 );
    325 
    326 		mfc_put(	scaled_v_plane[curr_src_idx],					// What from local store (addr)
    327 				(unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
    328 				dst_dbl_linestride_vu,						// Two V lines (depending on the widht of the destination resolution)
    329 				STR_BUF+curr_dst_idx,						// Tag
    330 				0, 0 );
    331 
    332 		mfc_put(	scaled_u_plane[curr_src_idx],					// What from local store (addr)
    333 				(unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
    334 				dst_dbl_linestride_vu,						// Two U lines (depending on the widht of the destination resolution)
    335 				STR_BUF+curr_dst_idx,						// Tag
    336 				0, 0 );
    337 		//---------------------------------------------------------------------------------------------
    338 
    339 
    340 		// update for next cycle
    341 		curr_src_idx = next_src_idx;
    342 		curr_dst_idx = next_dst_idx;
    343 
    344 		curr_interpl_y_upper = next_interpl_y_upper;
    345 		curr_interpl_y_lower = next_interpl_y_lower;
    346 		curr_interpl_vu = next_interpl_vu;
    347 
    348 		vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
    349 		vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
    350 		vf_curr_NSweight_vu = vf_next_NSweight_vu;
    351 
    352 		curr_src_y_upper = next_src_y_upper;
    353 		curr_src_y_lower = next_src_y_lower;
    354 		curr_src_vu = next_src_vu;
    355 
    356 		curr_lsoff_v = next_lsoff_v;
    357 		curr_lsoff_u = next_lsoff_u;
    358 	}
    359 
    360 
    361 
    362 	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
    363 
    364 	// scaling
    365 	// work line y_upper
    366 	bilinear_scale_line_w16( y_plane[curr_src_idx],
    367 			scaled_y_plane[curr_src_idx],
    368 			dst_width,
    369 			vf_x_scale,
    370 			vf_curr_NSweight_y_upper,
    371 			src_linestride_y );
    372 	// work line y_lower
    373 	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
    374 			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
    375 			dst_width,
    376 			vf_x_scale,
    377 			vf_curr_NSweight_y_lower,
    378 			src_linestride_y );
    379 	// work line v
    380 	bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
    381 			scaled_v_plane[curr_src_idx],
    382 			dst_width>>1,
    383 			vf_x_scale,
    384 			vf_curr_NSweight_vu,
    385 			src_linestride_vu );
    386 	// work line u
    387 	bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
    388 			scaled_u_plane[curr_src_idx],
    389 			dst_width>>1,
    390 			vf_x_scale,
    391 			vf_curr_NSweight_vu,
    392 			src_linestride_vu );
    393 
    394 
    395 	// Store the result back to main memory into a destination buffer in YUV format
    396 	//---------------------------------------------------------------------------------------------
    397 	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
    398 
    399 	// Perform three DMA transfers to 3 different locations in the main memory!
    400 	// dst_width:	Pixel width of destination image
    401 	// dst_addr:	Destination address in main memory
    402 	// dst_vu:	Counter which is incremented one by one
    403 	// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
    404 	mfc_put(	scaled_y_plane[curr_src_idx],					// What from local store (addr)
    405 			(unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
    406 			dst_dbl_linestride_y,						// Two Y lines (depending on the widht of the destination resolution)
    407 			STR_BUF+curr_dst_idx,						// Tag
    408 			0, 0 );
    409 
    410 	mfc_put(	scaled_v_plane[curr_src_idx],					// What from local store (addr)
    411 			(unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
    412 			dst_dbl_linestride_vu,						// Two V lines (depending on the widht of the destination resolution)
    413 			STR_BUF+curr_dst_idx,						// Tag
    414 			0, 0 );
    415 
    416 	mfc_put(	scaled_u_plane[curr_src_idx],					// What from local store (addr)
    417 			(unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
    418 			dst_dbl_linestride_vu,						// Two U lines (depending on the widht of the destination resolution)
    419 			STR_BUF+curr_dst_idx,						// Tag
    420 			0, 0 );
    421 
    422 	// wait for completion
    423 	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
    424 	//---------------------------------------------------------------------------------------------
    425 }
    426 
    427 
    428 /*
    429  * scale_srcw16_dstw32()
    430  *
    431  * processes an input image of width 16
    432  * scaling is done to a width 32
    433  * yuv2rgb conversion on a width of 32
    434  * result stored in RAM
    435  */
    436 void scale_srcw16_dstw32() {
    437 	// extract parameters
    438 	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
    439 
    440 	unsigned int src_width = parms.src_pixel_width;
    441 	unsigned int src_height = parms.src_pixel_height;
    442 	unsigned int dst_width = parms.dst_pixel_width;
    443 	unsigned int dst_height = parms.dst_pixel_height;
    444 
    445 	// YVU
    446 	unsigned int src_linestride_y = src_width;
    447 	unsigned int src_dbl_linestride_y = src_width<<1;
    448 	unsigned int src_linestride_vu = src_width>>1;
    449 	unsigned int src_dbl_linestride_vu = src_width;
    450 	// scaled YVU
    451 	unsigned int scaled_src_linestride_y = dst_width;
    452 
    453 	// ram addresses
    454 	unsigned char* src_addr_y = parms.y_plane;
    455 	unsigned char* src_addr_v = parms.v_plane;
    456 	unsigned char* src_addr_u = parms.u_plane;
    457 
    458 	unsigned int dst_picture_size = dst_width*dst_height;
    459 
    460 	// Sizes for destination
    461 	unsigned int dst_dbl_linestride_y = dst_width<<1;
    462 	unsigned int dst_dbl_linestride_vu = dst_width>>1;
    463 
    464 	// Perform address calculation for Y, V and U in main memory with dst_addr as base
    465 	unsigned char* dst_addr_main_memory_y = dst_addr;
    466 	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
    467 	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
    468 
    469 
    470 	// for handling misalignment, addresses are precalculated
    471 	unsigned char* precalc_src_addr_v = src_addr_v;
    472 	unsigned char* precalc_src_addr_u = src_addr_u;
    473 
    474 	// calculate scale factors
    475 	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
    476 	float y_scale = (float)src_height/(float)dst_height;
    477 
    478 	// double buffered processing
    479 	// buffer switching
    480 	unsigned int curr_src_idx = 0;
    481 	unsigned int curr_dst_idx = 0;
    482 	unsigned int next_src_idx, next_dst_idx;
    483 
    484 	// 2 lines y as output, upper and lowerline
    485 	unsigned int curr_interpl_y_upper = 0;
    486 	unsigned int next_interpl_y_upper;
    487 	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
    488 	// only 1 line v/u output, both planes have the same dimension
    489 	unsigned int curr_interpl_vu = 0;
    490 	unsigned int next_interpl_vu;
    491 
    492 	// weights, calculated in every loop iteration
    493 	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
    494 	vector float vf_next_NSweight_y_upper;
    495 	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
    496 	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
    497 	vector float vf_next_NSweight_vu;
    498 
    499 	// line indices for the src picture
    500 	float curr_src_y_upper = 0.0f, next_src_y_upper;
    501 	float curr_src_y_lower, next_src_y_lower;
    502 	float curr_src_vu = 0.0f, next_src_vu;
    503 
    504 	// line indices for the dst picture
    505 	unsigned int dst_y=0, dst_vu=0;
    506 
    507 	// offset for the v and u plane to handle misalignement
    508 	unsigned int curr_lsoff_v = 0, next_lsoff_v;
    509 	unsigned int curr_lsoff_u = 0, next_lsoff_u;
    510 
    511 	// calculate lower line idices
    512 	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
    513 	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
    514 	// lower line weight
    515 	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
    516 
    517 
    518 	// start partially double buffered processing
    519 	// get initial data, 2 sets of y, 1 set v, 1 set u
    520 	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
    521 	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
    522 			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
    523 			src_dbl_linestride_y,
    524 			RETR_BUF,
    525 			0, 0 );
    526 	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
    527 	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
    528 
    529 	// iteration loop
    530 	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
    531 	// the scaled output is 2 lines y, 1 line v, 1 line u
    532 	// the yuv2rgb-converted output is stored to RAM
    533 	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
    534 		dst_y = dst_vu<<1;
    535 
    536 		// calculate next indices
    537 		next_src_vu = ((float)dst_vu+1)*y_scale;
    538 		next_src_y_upper = ((float)dst_y+2)*y_scale;
    539 		next_src_y_lower = ((float)dst_y+3)*y_scale;
    540 
    541 		next_interpl_vu = (unsigned int) next_src_vu;
    542 		next_interpl_y_upper = (unsigned int) next_src_y_upper;
    543 		next_interpl_y_lower = (unsigned int) next_src_y_lower;
    544 
    545 		// calculate weight NORTH-SOUTH
    546 		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
    547 		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
    548 		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
    549 
    550 		// get next lines
    551 		next_src_idx = curr_src_idx^1;
    552 		next_dst_idx = curr_dst_idx^1;
    553 
    554 		// 4 lines y
    555 		mfc_get( y_plane[next_src_idx],
    556 				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
    557 				src_dbl_linestride_y,
    558 				RETR_BUF+next_src_idx,
    559 				0, 0 );
    560 		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
    561 				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
    562 				src_dbl_linestride_y,
    563 				RETR_BUF+next_src_idx,
    564 				0, 0 );
    565 
    566 		// 2 lines v
    567 		precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
    568 		next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
    569 		mfc_get( v_plane[next_src_idx],
    570 				((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
    571 				src_dbl_linestride_vu+(next_lsoff_v<<1),
    572 				RETR_BUF+next_src_idx,
    573 				0, 0 );
    574 		// 2 lines u
    575 		precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
    576 		next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
    577 		mfc_get( u_plane[next_src_idx],
    578 				((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
    579 				src_dbl_linestride_vu+(next_lsoff_v<<1),
    580 				RETR_BUF+next_src_idx,
    581 				0, 0 );
    582 
    583 		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
    584 
    585 		// scaling
    586 		// work line y_upper
    587 		bilinear_scale_line_w16( y_plane[curr_src_idx],
    588 				scaled_y_plane[curr_src_idx],
    589 				dst_width,
    590 				vf_x_scale,
    591 				vf_curr_NSweight_y_upper,
    592 				src_linestride_y );
    593 		// work line y_lower
    594 		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
    595 				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
    596 				dst_width,
    597 				vf_x_scale,
    598 				vf_curr_NSweight_y_lower,
    599 				src_linestride_y );
    600 		// work line v
    601 		bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
    602 				scaled_v_plane[curr_src_idx],
    603 				dst_width>>1,
    604 				vf_x_scale,
    605 				vf_curr_NSweight_vu,
    606 				src_linestride_vu );
    607 		// work line u
    608 		bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
    609 				scaled_u_plane[curr_src_idx],
    610 				dst_width>>1,
    611 				vf_x_scale,
    612 				vf_curr_NSweight_vu,
    613 				src_linestride_vu );
    614 
    615 		//---------------------------------------------------------------------------------------------
    616 		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
    617 
    618 		// Perform three DMA transfers to 3 different locations in the main memory!
    619 		// dst_width:	Pixel width of destination image
    620 		// dst_addr:	Destination address in main memory
    621 		// dst_vu:	Counter which is incremented one by one
    622 		// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
    623 
    624 		mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
    625 				(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
    626 				dst_dbl_linestride_y,								// Two Y lines (depending on the widht of the destination resolution)
    627 				STR_BUF+curr_dst_idx,								// Tag
    628 				0, 0 );
    629 
    630 		mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
    631 				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
    632 				dst_dbl_linestride_vu,								// Two V lines (depending on the widht of the destination resolution)
    633 				STR_BUF+curr_dst_idx,								// Tag
    634 				0, 0 );
    635 
    636 		mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
    637 				(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
    638 				dst_dbl_linestride_vu,								// Two U lines (depending on the widht of the destination resolution)
    639 				STR_BUF+curr_dst_idx,								// Tag
    640 				0, 0 );
    641 		//---------------------------------------------------------------------------------------------
    642 
    643 
    644 		// update for next cycle
    645 		curr_src_idx = next_src_idx;
    646 		curr_dst_idx = next_dst_idx;
    647 
    648 		curr_interpl_y_upper = next_interpl_y_upper;
    649 		curr_interpl_y_lower = next_interpl_y_lower;
    650 		curr_interpl_vu = next_interpl_vu;
    651 
    652 		vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
    653 		vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
    654 		vf_curr_NSweight_vu = vf_next_NSweight_vu;
    655 
    656 		curr_src_y_upper = next_src_y_upper;
    657 		curr_src_y_lower = next_src_y_lower;
    658 		curr_src_vu = next_src_vu;
    659 
    660 		curr_lsoff_v = next_lsoff_v;
    661 		curr_lsoff_u = next_lsoff_u;
    662 	}
    663 
    664 
    665 
    666 	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
    667 
    668 	// scaling
    669 	// work line y_upper
    670 	bilinear_scale_line_w16( y_plane[curr_src_idx],
    671 			scaled_y_plane[curr_src_idx],
    672 			dst_width,
    673 			vf_x_scale,
    674 			vf_curr_NSweight_y_upper,
    675 			src_linestride_y );
    676 	// work line y_lower
    677 	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
    678 			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
    679 			dst_width,
    680 			vf_x_scale,
    681 			vf_curr_NSweight_y_lower,
    682 			src_linestride_y );
    683 	// work line v
    684 	bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
    685 			scaled_v_plane[curr_src_idx],
    686 			dst_width>>1,
    687 			vf_x_scale,
    688 			vf_curr_NSweight_vu,
    689 			src_linestride_vu );
    690 	// work line u
    691 	bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
    692 			scaled_u_plane[curr_src_idx],
    693 			dst_width>>1,
    694 			vf_x_scale,
    695 			vf_curr_NSweight_vu,
    696 			src_linestride_vu );
    697 
    698 	//---------------------------------------------------------------------------------------------
    699 	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
    700 
    701 	// Perform three DMA transfers to 3 different locations in the main memory!
    702 	// dst_width:	Pixel width of destination image
    703 	// dst_addr:	Destination address in main memory
    704 	// dst_vu:	Counter which is incremented one by one
    705 	// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
    706 
    707 	mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
    708 			(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
    709 			dst_dbl_linestride_y,								// Two Y lines (depending on the widht of the destination resolution)
    710 			STR_BUF+curr_dst_idx,								// Tag
    711 			0, 0 );
    712 
    713 	mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
    714 			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
    715 			dst_dbl_linestride_vu,								// Two V lines (depending on the widht of the destination resolution)
    716 			STR_BUF+curr_dst_idx,								// Tag
    717 			0, 0 );
    718 
    719 	mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
    720 			(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
    721 			dst_dbl_linestride_vu,								// Two U lines (depending on the widht of the destination resolution)
    722 			STR_BUF+curr_dst_idx,								// Tag
    723 			0, 0 );
    724 
    725 	// wait for completion
    726 	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
    727 	//---------------------------------------------------------------------------------------------
    728 }
    729 
    730 
    731 /*
    732  * scale_srcw32_dstw16()
    733  *
    734  * processes an input image of width 32
    735  * scaling is done to a width 16
    736  * yuv2rgb conversion on a width of 16
    737  * result stored in RAM
    738  */
    739 void scale_srcw32_dstw16() {
    740 	// extract parameters
    741 	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
    742 
    743 	unsigned int src_width = parms.src_pixel_width;
    744 	unsigned int src_height = parms.src_pixel_height;
    745 	unsigned int dst_width = parms.dst_pixel_width;
    746 	unsigned int dst_height = parms.dst_pixel_height;
    747 
    748 	// YVU
    749 	unsigned int src_linestride_y = src_width;
    750 	unsigned int src_dbl_linestride_y = src_width<<1;
    751 	unsigned int src_linestride_vu = src_width>>1;
    752 	unsigned int src_dbl_linestride_vu = src_width;
    753 	// scaled YVU
    754 	unsigned int scaled_src_linestride_y = dst_width;
    755 
    756 	// ram addresses
    757 	unsigned char* src_addr_y = parms.y_plane;
    758 	unsigned char* src_addr_v = parms.v_plane;
    759 	unsigned char* src_addr_u = parms.u_plane;
    760 
    761 	unsigned int dst_picture_size = dst_width*dst_height;
    762 
    763 	// Sizes for destination
    764 	unsigned int dst_dbl_linestride_y = dst_width<<1;
    765 	unsigned int dst_dbl_linestride_vu = dst_width>>1;
    766 
    767 	// Perform address calculation for Y, V and U in main memory with dst_addr as base
    768 	unsigned char* dst_addr_main_memory_y = dst_addr;
    769 	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
    770 	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
    771 
    772 	// calculate scale factors
    773 	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
    774 	float y_scale = (float)src_height/(float)dst_height;
    775 
    776 	// double buffered processing
    777 	// buffer switching
    778 	unsigned int curr_src_idx = 0;
    779 	unsigned int curr_dst_idx = 0;
    780 	unsigned int next_src_idx, next_dst_idx;
    781 
    782 	// 2 lines y as output, upper and lowerline
    783 	unsigned int curr_interpl_y_upper = 0;
    784 	unsigned int next_interpl_y_upper;
    785 	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
    786 	// only 1 line v/u output, both planes have the same dimension
    787 	unsigned int curr_interpl_vu = 0;
    788 	unsigned int next_interpl_vu;
    789 
    790 	// weights, calculated in every loop iteration
    791 	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
    792 	vector float vf_next_NSweight_y_upper;
    793 	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
    794 	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
    795 	vector float vf_next_NSweight_vu;
    796 
    797 	// line indices for the src picture
    798 	float curr_src_y_upper = 0.0f, next_src_y_upper;
    799 	float curr_src_y_lower, next_src_y_lower;
    800 	float curr_src_vu = 0.0f, next_src_vu;
    801 
    802 	// line indices for the dst picture
    803 	unsigned int dst_y=0, dst_vu=0;
    804 
    805 	// calculate lower line idices
    806 	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
    807 	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
    808 	// lower line weight
    809 	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
    810 
    811 
    812 	// start partially double buffered processing
    813 	// get initial data, 2 sets of y, 1 set v, 1 set u
    814 	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
    815 	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
    816 			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
    817 			src_dbl_linestride_y,
    818 			RETR_BUF,
    819 			0, 0 );
    820 	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
    821 	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
    822 
    823 	// iteration loop
    824 	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
    825 	// the scaled output is 2 lines y, 1 line v, 1 line u
    826 	// the yuv2rgb-converted output is stored to RAM
    827 	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
    828 		dst_y = dst_vu<<1;
    829 
    830 		// calculate next indices
    831 		next_src_vu = ((float)dst_vu+1)*y_scale;
    832 		next_src_y_upper = ((float)dst_y+2)*y_scale;
    833 		next_src_y_lower = ((float)dst_y+3)*y_scale;
    834 
    835 		next_interpl_vu = (unsigned int) next_src_vu;
    836 		next_interpl_y_upper = (unsigned int) next_src_y_upper;
    837 		next_interpl_y_lower = (unsigned int) next_src_y_lower;
    838 
    839 		// calculate weight NORTH-SOUTH
    840 		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
    841 		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
    842 		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
    843 
    844 		// get next lines
    845 		next_src_idx = curr_src_idx^1;
    846 		next_dst_idx = curr_dst_idx^1;
    847 
    848 		// 4 lines y
    849 		mfc_get( y_plane[next_src_idx],
    850 				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
    851 				src_dbl_linestride_y,
    852 				RETR_BUF+next_src_idx,
    853 				0, 0 );
    854 		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
    855 				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
    856 				src_dbl_linestride_y,
    857 				RETR_BUF+next_src_idx,
    858 				0, 0 );
    859 
    860 		// 2 lines v
    861 		mfc_get( v_plane[next_src_idx],
    862 				(unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
    863 				src_dbl_linestride_vu,
    864 				RETR_BUF+next_src_idx,
    865 				0, 0 );
    866 		// 2 lines u
    867 		mfc_get( u_plane[next_src_idx],
    868 				(unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
    869 				src_dbl_linestride_vu,
    870 				RETR_BUF+next_src_idx,
    871 				0, 0 );
    872 
    873 		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
    874 
    875 		// scaling
    876 		// work line y_upper
    877 		bilinear_scale_line_w16( y_plane[curr_src_idx],
    878 				scaled_y_plane[curr_src_idx],
    879 				dst_width,
    880 				vf_x_scale,
    881 				vf_curr_NSweight_y_upper,
    882 				src_linestride_y );
    883 		// work line y_lower
    884 		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
    885 				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
    886 				dst_width,
    887 				vf_x_scale,
    888 				vf_curr_NSweight_y_lower,
    889 				src_linestride_y );
    890 		// work line v
    891 		bilinear_scale_line_w16( v_plane[curr_src_idx],
    892 				scaled_v_plane[curr_src_idx],
    893 				dst_width>>1,
    894 				vf_x_scale,
    895 				vf_curr_NSweight_vu,
    896 				src_linestride_vu );
    897 		// work line u
    898 		bilinear_scale_line_w16( u_plane[curr_src_idx],
    899 				scaled_u_plane[curr_src_idx],
    900 				dst_width>>1,
    901 				vf_x_scale,
    902 				vf_curr_NSweight_vu,
    903 				src_linestride_vu );
    904 
    905 		//---------------------------------------------------------------------------------------------
    906 		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
    907 
    908 		// Perform three DMA transfers to 3 different locations in the main memory!
    909 		// dst_width:	Pixel width of destination image
    910 		// dst_addr:	Destination address in main memory
    911 		// dst_vu:	Counter which is incremented one by one
    912 		// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
    913 
    914 		mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
    915 				(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
    916 				dst_dbl_linestride_y,								// Two Y lines (depending on the widht of the destination resolution)
    917 				STR_BUF+curr_dst_idx,								// Tag
    918 				0, 0 );
    919 
    920 		mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
    921 				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
    922 				dst_dbl_linestride_vu,								// Two V lines (depending on the widht of the destination resolution)
    923 				STR_BUF+curr_dst_idx,								// Tag
    924 				0, 0 );
    925 
    926 		mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
    927 				(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
    928 				dst_dbl_linestride_vu,								// Two U lines (depending on the widht of the destination resolution)
    929 				STR_BUF+curr_dst_idx,								// Tag
    930 				0, 0 );
    931 		//---------------------------------------------------------------------------------------------
    932 
    933 
    934 		// update for next cycle
    935 		curr_src_idx = next_src_idx;
    936 		curr_dst_idx = next_dst_idx;
    937 
    938 		curr_interpl_y_upper = next_interpl_y_upper;
    939 		curr_interpl_y_lower = next_interpl_y_lower;
    940 		curr_interpl_vu = next_interpl_vu;
    941 
    942 		vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
    943 		vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
    944 		vf_curr_NSweight_vu = vf_next_NSweight_vu;
    945 
    946 		curr_src_y_upper = next_src_y_upper;
    947 		curr_src_y_lower = next_src_y_lower;
    948 		curr_src_vu = next_src_vu;
    949 	}
    950 
    951 
    952 
    953 	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
    954 
    955 	// scaling
    956 	// work line y_upper
    957 	bilinear_scale_line_w16( y_plane[curr_src_idx],
    958 			scaled_y_plane[curr_src_idx],
    959 			dst_width,
    960 			vf_x_scale,
    961 			vf_curr_NSweight_y_upper,
    962 			src_linestride_y );
    963 	// work line y_lower
    964 	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
    965 			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
    966 			dst_width,
    967 			vf_x_scale,
    968 			vf_curr_NSweight_y_lower,
    969 			src_linestride_y );
    970 	// work line v
    971 	bilinear_scale_line_w16( v_plane[curr_src_idx],
    972 			scaled_v_plane[curr_src_idx],
    973 			dst_width>>1,
    974 			vf_x_scale,
    975 			vf_curr_NSweight_vu,
    976 			src_linestride_vu );
    977 	// work line u
    978 	bilinear_scale_line_w16( u_plane[curr_src_idx],
    979 			scaled_u_plane[curr_src_idx],
    980 			dst_width>>1,
    981 			vf_x_scale,
    982 			vf_curr_NSweight_vu,
    983 			src_linestride_vu );
    984 
    985 
    986 	//---------------------------------------------------------------------------------------------
    987 	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
    988 
    989 	// Perform three DMA transfers to 3 different locations in the main memory!
    990 	// dst_width:	Pixel width of destination image
    991 	// dst_addr:	Destination address in main memory
    992 	// dst_vu:	Counter which is incremented one by one
    993 	// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
    994 
    995 	mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
    996 			(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
    997 			dst_dbl_linestride_y,								// Two Y lines (depending on the widht of the destination resolution)
    998 			STR_BUF+curr_dst_idx,								// Tag
    999 			0, 0 );
   1000 
   1001 	mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
   1002 			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
   1003 			dst_dbl_linestride_vu,								// Two V lines (depending on the widht of the destination resolution)
   1004 			STR_BUF+curr_dst_idx,								// Tag
   1005 			0, 0 );
   1006 
   1007 	mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
   1008 			(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
   1009 			dst_dbl_linestride_vu,								// Two U lines (depending on the widht of the destination resolution)
   1010 			STR_BUF+curr_dst_idx,								// Tag
   1011 			0, 0 );
   1012 
   1013 	// wait for completion
   1014 	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
   1015 	//---------------------------------------------------------------------------------------------
   1016 }
   1017 
   1018 
   1019 /**
   1020  * scale_srcw32_dstw32()
   1021  *
   1022  * processes an input image of width 32
   1023  * scaling is done to a width 32
   1024  * yuv2rgb conversion on a width of 32
   1025  * result stored in RAM
   1026  */
   1027 void scale_srcw32_dstw32() {
   1028 	// extract parameters
   1029 	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
   1030 
   1031 	unsigned int src_width = parms.src_pixel_width;
   1032 	unsigned int src_height = parms.src_pixel_height;
   1033 	unsigned int dst_width = parms.dst_pixel_width;
   1034 	unsigned int dst_height = parms.dst_pixel_height;
   1035 
   1036 	// YVU
   1037 	unsigned int src_linestride_y = src_width;
   1038 	unsigned int src_dbl_linestride_y = src_width<<1;
   1039 	unsigned int src_linestride_vu = src_width>>1;
   1040 	unsigned int src_dbl_linestride_vu = src_width;
   1041 
   1042 	// scaled YVU
   1043 	unsigned int scaled_src_linestride_y = dst_width;
   1044 
   1045 	// ram addresses
   1046 	unsigned char* src_addr_y = parms.y_plane;
   1047 	unsigned char* src_addr_v = parms.v_plane;
   1048 	unsigned char* src_addr_u = parms.u_plane;
   1049 
   1050 	unsigned int dst_picture_size = dst_width*dst_height;
   1051 
   1052 	// Sizes for destination
   1053 	unsigned int dst_dbl_linestride_y = dst_width<<1;
   1054 	unsigned int dst_dbl_linestride_vu = dst_width>>1;
   1055 
   1056 	// Perform address calculation for Y, V and U in main memory with dst_addr as base
   1057 	unsigned char* dst_addr_main_memory_y = dst_addr;
   1058 	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
   1059 	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
   1060 
   1061 	// calculate scale factors
   1062 	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
   1063 	float y_scale = (float)src_height/(float)dst_height;
   1064 
   1065 	// double buffered processing
   1066 	// buffer switching
   1067 	unsigned int curr_src_idx = 0;
   1068 	unsigned int curr_dst_idx = 0;
   1069 	unsigned int next_src_idx, next_dst_idx;
   1070 
   1071 	// 2 lines y as output, upper and lowerline
   1072 	unsigned int curr_interpl_y_upper = 0;
   1073 	unsigned int next_interpl_y_upper;
   1074 	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
   1075 	// only 1 line v/u output, both planes have the same dimension
   1076 	unsigned int curr_interpl_vu = 0;
   1077 	unsigned int next_interpl_vu;
   1078 
   1079 	// weights, calculated in every loop iteration
   1080 	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
   1081 	vector float vf_next_NSweight_y_upper;
   1082 	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
   1083 	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
   1084 	vector float vf_next_NSweight_vu;
   1085 
   1086 	// line indices for the src picture
   1087 	float curr_src_y_upper = 0.0f, next_src_y_upper;
   1088 	float curr_src_y_lower, next_src_y_lower;
   1089 	float curr_src_vu = 0.0f, next_src_vu;
   1090 
   1091 	// line indices for the dst picture
   1092 	unsigned int dst_y=0, dst_vu=0;
   1093 
   1094 	// calculate lower line idices
   1095 	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
   1096 	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
   1097 	// lower line weight
   1098 	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
   1099 
   1100 
   1101 	// start partially double buffered processing
   1102 	// get initial data, 2 sets of y, 1 set v, 1 set u
   1103 	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
   1104 	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
   1105 			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
   1106 			src_dbl_linestride_y,
   1107 			RETR_BUF,
   1108 			0, 0 );
   1109 	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
   1110 	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
   1111 
   1112 	// iteration loop
   1113 	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
   1114 	// the scaled output is 2 lines y, 1 line v, 1 line u
   1115 	// the yuv2rgb-converted output is stored to RAM
   1116 	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
   1117 		dst_y = dst_vu<<1;
   1118 
   1119 		// calculate next indices
   1120 		next_src_vu = ((float)dst_vu+1)*y_scale;
   1121 		next_src_y_upper = ((float)dst_y+2)*y_scale;
   1122 		next_src_y_lower = ((float)dst_y+3)*y_scale;
   1123 
   1124 		next_interpl_vu = (unsigned int) next_src_vu;
   1125 		next_interpl_y_upper = (unsigned int) next_src_y_upper;
   1126 		next_interpl_y_lower = (unsigned int) next_src_y_lower;
   1127 
   1128 		// calculate weight NORTH-SOUTH
   1129 		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
   1130 		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
   1131 		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
   1132 
   1133 		// get next lines
   1134 		next_src_idx = curr_src_idx^1;
   1135 		next_dst_idx = curr_dst_idx^1;
   1136 
   1137 		// 4 lines y
   1138 		mfc_get( y_plane[next_src_idx],
   1139 				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
   1140 				src_dbl_linestride_y,
   1141 				RETR_BUF+next_src_idx,
   1142 				0, 0 );
   1143 		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
   1144 				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
   1145 				src_dbl_linestride_y,
   1146 				RETR_BUF+next_src_idx,
   1147 				0, 0 );
   1148 
   1149 		// 2 lines v
   1150 		mfc_get( v_plane[next_src_idx],
   1151 				(unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
   1152 				src_dbl_linestride_vu,
   1153 				RETR_BUF+next_src_idx,
   1154 				0, 0 );
   1155 		// 2 lines u
   1156 		mfc_get( u_plane[next_src_idx],
   1157 				(unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
   1158 				src_dbl_linestride_vu,
   1159 				RETR_BUF+next_src_idx,
   1160 				0, 0 );
   1161 
   1162 		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
   1163 
   1164 		// scaling
   1165 		// work line y_upper
   1166 		bilinear_scale_line_w16( y_plane[curr_src_idx],
   1167 				scaled_y_plane[curr_src_idx],
   1168 				dst_width,
   1169 				vf_x_scale,
   1170 				vf_curr_NSweight_y_upper,
   1171 				src_linestride_y );
   1172 		// work line y_lower
   1173 		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
   1174 				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
   1175 				dst_width,
   1176 				vf_x_scale,
   1177 				vf_curr_NSweight_y_lower,
   1178 				src_linestride_y );
   1179 		// work line v
   1180 		bilinear_scale_line_w16( v_plane[curr_src_idx],
   1181 				scaled_v_plane[curr_src_idx],
   1182 				dst_width>>1,
   1183 				vf_x_scale,
   1184 				vf_curr_NSweight_vu,
   1185 				src_linestride_vu );
   1186 		// work line u
   1187 		bilinear_scale_line_w16( u_plane[curr_src_idx],
   1188 				scaled_u_plane[curr_src_idx],
   1189 				dst_width>>1,
   1190 				vf_x_scale,
   1191 				vf_curr_NSweight_vu,
   1192 				src_linestride_vu );
   1193 
   1194 
   1195 
   1196 		// Store the result back to main memory into a destination buffer in YUV format
   1197 		//---------------------------------------------------------------------------------------------
   1198 		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
   1199 
   1200 		// Perform three DMA transfers to 3 different locations in the main memory!
   1201 		// dst_width:	Pixel width of destination image
   1202 		// dst_addr:	Destination address in main memory
   1203 		// dst_vu:	Counter which is incremented one by one
   1204 		// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
   1205 
   1206 		mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
   1207 				(unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
   1208 				dst_dbl_linestride_y,								// Two Y lines (depending on the widht of the destination resolution)
   1209 				STR_BUF+curr_dst_idx,								// Tag
   1210 				0, 0 );
   1211 
   1212 		mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
   1213 				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
   1214 				dst_dbl_linestride_vu,								// Two V lines (depending on the widht of the destination resolution)
   1215 				STR_BUF+curr_dst_idx,								// Tag
   1216 				0, 0 );
   1217 
   1218 		mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
   1219 				(unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
   1220 				dst_dbl_linestride_vu,								// Two U lines (depending on the widht of the destination resolution)
   1221 				STR_BUF+curr_dst_idx,								// Tag
   1222 				0, 0 );
   1223 		//---------------------------------------------------------------------------------------------
   1224 
   1225 
   1226 		// update for next cycle
   1227 		curr_src_idx = next_src_idx;
   1228 		curr_dst_idx = next_dst_idx;
   1229 
   1230 		curr_interpl_y_upper = next_interpl_y_upper;
   1231 		curr_interpl_y_lower = next_interpl_y_lower;
   1232 		curr_interpl_vu = next_interpl_vu;
   1233 
   1234 		vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
   1235 		vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
   1236 		vf_curr_NSweight_vu = vf_next_NSweight_vu;
   1237 
   1238 		curr_src_y_upper = next_src_y_upper;
   1239 		curr_src_y_lower = next_src_y_lower;
   1240 		curr_src_vu = next_src_vu;
   1241 	}
   1242 
   1243 
   1244 
   1245 	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
   1246 
   1247 	// scaling
   1248 	// work line y_upper
   1249 	bilinear_scale_line_w16( y_plane[curr_src_idx],
   1250 			scaled_y_plane[curr_src_idx],
   1251 			dst_width,
   1252 			vf_x_scale,
   1253 			vf_curr_NSweight_y_upper,
   1254 			src_linestride_y );
   1255 	// work line y_lower
   1256 	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
   1257 			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
   1258 			dst_width,
   1259 			vf_x_scale,
   1260 			vf_curr_NSweight_y_lower,
   1261 			src_linestride_y );
   1262 	// work line v
   1263 	bilinear_scale_line_w16( v_plane[curr_src_idx],
   1264 			scaled_v_plane[curr_src_idx],
   1265 			dst_width>>1,
   1266 			vf_x_scale,
   1267 			vf_curr_NSweight_vu,
   1268 			src_linestride_vu );
   1269 	// work line u
   1270 	bilinear_scale_line_w16( u_plane[curr_src_idx],
   1271 			scaled_u_plane[curr_src_idx],
   1272 			dst_width>>1,
   1273 			vf_x_scale,
   1274 			vf_curr_NSweight_vu,
   1275 			src_linestride_vu );
   1276 
   1277 
   1278 	// Store the result back to main memory into a destination buffer in YUV format
   1279 	//---------------------------------------------------------------------------------------------
   1280 	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
   1281 
   1282 	// Perform three DMA transfers to 3 different locations in the main memory!
   1283 	// dst_width:	Pixel width of destination image
   1284 	// dst_addr:	Destination address in main memory
   1285 	// dst_vu:	Counter which is incremented one by one
   1286 	// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
   1287 
   1288 	mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
   1289 			(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
   1290 			dst_dbl_linestride_y,								// Two Y lines (depending on the widht of the destination resolution)
   1291 			STR_BUF+curr_dst_idx,								// Tag
   1292 			0, 0 );
   1293 
   1294 	mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
   1295 			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
   1296 			dst_dbl_linestride_vu,								// Two V lines (depending on the widht of the destination resolution)
   1297 			STR_BUF+curr_dst_idx,								// Tag
   1298 			0, 0 );
   1299 
   1300 	mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
   1301 			(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
   1302 			dst_dbl_linestride_vu,								// Two U lines (depending on the widht of the destination resolution)
   1303 			STR_BUF+curr_dst_idx,								// Tag
   1304 			0, 0 );
   1305 
   1306 	// wait for completion
   1307 	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
   1308 	//---------------------------------------------------------------------------------------------
   1309 }
   1310 
   1311 
   1312 /*
   1313  * bilinear_scale_line_w8()
   1314  *
   1315  * processes a line of yuv-input, width has to be a multiple of 8
   1316  * scaled yuv-output is written to local store buffer
   1317  *
   1318  * @param src buffer for 2 lines input
   1319  * @param dst_ buffer for 1 line output
   1320  * @param dst_width the width of the destination line
   1321  * @param vf_x_scale a float vector, at each entry is the x_scale-factor
   1322  * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
   1323  * @param src_linestride the stride of the srcline
   1324  */
   1325 void bilinear_scale_line_w8( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
   1326 
   1327 	unsigned char* dst = dst_;
   1328 
   1329 	unsigned int dst_x;
   1330 	for( dst_x=0; dst_x<dst_width; dst_x+=8) {
   1331 		// address calculation for loading the 4 surrounding pixel of each calculated
   1332 		// destination pixel
   1333 		vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
   1334 		// lower range->first 4 pixel
   1335 		// upper range->next 4 pixel
   1336 		vector unsigned int vui_inc_dst_x_lower_range = { 0, 1, 2, 3 };
   1337 		vector unsigned int vui_inc_dst_x_upper_range = { 4, 5, 6, 7 };
   1338 		vector unsigned int vui_dst_x_lower_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_lower_range );
   1339 		vector unsigned int vui_dst_x_upper_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_upper_range );
   1340 
   1341 		// calculate weight EAST-WEST
   1342 		vector float vf_dst_x_lower_range = spu_convtf( vui_dst_x_lower_range, 0 );
   1343 		vector float vf_dst_x_upper_range = spu_convtf( vui_dst_x_upper_range, 0 );
   1344 		vector float vf_src_x_lower_range = spu_mul( vf_dst_x_lower_range, vf_x_scale );
   1345 		vector float vf_src_x_upper_range = spu_mul( vf_dst_x_upper_range, vf_x_scale );
   1346 		vector unsigned int vui_interpl_x_lower_range = spu_convtu( vf_src_x_lower_range, 0 );
   1347 		vector unsigned int vui_interpl_x_upper_range = spu_convtu( vf_src_x_upper_range, 0 );
   1348 		vector float vf_interpl_x_lower_range = spu_convtf( vui_interpl_x_lower_range, 0 );
   1349 		vector float vf_interpl_x_upper_range = spu_convtf( vui_interpl_x_upper_range, 0 );
   1350 		vector float vf_EWweight_lower_range = spu_sub( vf_src_x_lower_range, vf_interpl_x_lower_range );
   1351 		vector float vf_EWweight_upper_range = spu_sub( vf_src_x_upper_range, vf_interpl_x_upper_range );
   1352 
   1353 		// calculate address offset
   1354 		//
   1355 		// pixel NORTH WEST
   1356 		vector unsigned int vui_off_pixelNW_lower_range = vui_interpl_x_lower_range;
   1357 		vector unsigned int vui_off_pixelNW_upper_range = vui_interpl_x_upper_range;
   1358 
   1359 		// pixel NORTH EAST-->(offpixelNW+1)
   1360 		vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
   1361 		vector unsigned int vui_off_pixelNE_lower_range = spu_add( vui_off_pixelNW_lower_range, vui_add_1 );
   1362 		vector unsigned int vui_off_pixelNE_upper_range = spu_add( vui_off_pixelNW_upper_range, vui_add_1 );
   1363 
   1364 		// SOUTH-WEST-->(offpixelNW+src_linestride)
   1365 		vector unsigned int vui_srclinestride = spu_splats( src_linestride );
   1366 		vector unsigned int vui_off_pixelSW_lower_range = spu_add( vui_srclinestride, vui_off_pixelNW_lower_range );
   1367 		vector unsigned int vui_off_pixelSW_upper_range = spu_add( vui_srclinestride, vui_off_pixelNW_upper_range );
   1368 
   1369 		// SOUTH-EAST-->(offpixelNW+src_linestride+1)
   1370 		vector unsigned int vui_off_pixelSE_lower_range = spu_add( vui_srclinestride, vui_off_pixelNE_lower_range );
   1371 		vector unsigned int vui_off_pixelSE_upper_range = spu_add( vui_srclinestride, vui_off_pixelNE_upper_range );
   1372 
   1373 		// calculate each address
   1374 		vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
   1375 		vector unsigned int vui_addr_pixelNW_lower_range = spu_add( vui_src_ls, vui_off_pixelNW_lower_range );
   1376 		vector unsigned int vui_addr_pixelNW_upper_range = spu_add( vui_src_ls, vui_off_pixelNW_upper_range );
   1377 		vector unsigned int vui_addr_pixelNE_lower_range = spu_add( vui_src_ls, vui_off_pixelNE_lower_range );
   1378 		vector unsigned int vui_addr_pixelNE_upper_range = spu_add( vui_src_ls, vui_off_pixelNE_upper_range );
   1379 
   1380 		vector unsigned int vui_addr_pixelSW_lower_range = spu_add( vui_src_ls, vui_off_pixelSW_lower_range );
   1381 		vector unsigned int vui_addr_pixelSW_upper_range = spu_add( vui_src_ls, vui_off_pixelSW_upper_range );
   1382 		vector unsigned int vui_addr_pixelSE_lower_range = spu_add( vui_src_ls, vui_off_pixelSE_lower_range );
   1383 		vector unsigned int vui_addr_pixelSE_upper_range = spu_add( vui_src_ls, vui_off_pixelSE_upper_range );
   1384 
   1385 		// get each pixel
   1386 		//
   1387 		// scalar load, afterwards insertion into the right position
   1388 		// NORTH WEST
   1389 		vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
   1390 		vector unsigned char vuc_pixel_NW_lower_range = spu_insert(
   1391 				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 0 )), null_vector, 3 );
   1392 		vuc_pixel_NW_lower_range = spu_insert(
   1393 				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 1 )),
   1394 				vuc_pixel_NW_lower_range, 7 );
   1395 		vuc_pixel_NW_lower_range = spu_insert(
   1396 				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 2 )),
   1397 				vuc_pixel_NW_lower_range, 11 );
   1398 		vuc_pixel_NW_lower_range = spu_insert(
   1399 				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 3 )),
   1400 				vuc_pixel_NW_lower_range, 15 );
   1401 
   1402 		vector unsigned char vuc_pixel_NW_upper_range = spu_insert(
   1403 				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 0 )), null_vector, 3 );
   1404 		vuc_pixel_NW_upper_range = spu_insert(
   1405 				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 1 )),
   1406 				vuc_pixel_NW_upper_range, 7 );
   1407 		vuc_pixel_NW_upper_range = spu_insert(
   1408 				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 2 )),
   1409 				vuc_pixel_NW_upper_range, 11 );
   1410 		vuc_pixel_NW_upper_range = spu_insert(
   1411 				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 3 )),
   1412 				vuc_pixel_NW_upper_range, 15 );
   1413 
   1414 		// NORTH EAST
   1415 		vector unsigned char vuc_pixel_NE_lower_range = spu_insert(
   1416 				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 0 )), null_vector, 3 );
   1417 		vuc_pixel_NE_lower_range = spu_insert(
   1418 				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 1 )),
   1419 				vuc_pixel_NE_lower_range, 7 );
   1420 		vuc_pixel_NE_lower_range = spu_insert(
   1421 				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 2 )),
   1422 				vuc_pixel_NE_lower_range, 11 );
   1423 		vuc_pixel_NE_lower_range = spu_insert(
   1424 				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 3 )),
   1425 				vuc_pixel_NE_lower_range, 15 );
   1426 
   1427 		vector unsigned char vuc_pixel_NE_upper_range = spu_insert(
   1428 				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 0 )), null_vector, 3 );
   1429 		vuc_pixel_NE_upper_range = spu_insert(
   1430 				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 1 )),
   1431 				vuc_pixel_NE_upper_range, 7 );
   1432 		vuc_pixel_NE_upper_range = spu_insert(
   1433 				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 2 )),
   1434 				vuc_pixel_NE_upper_range, 11 );
   1435 		vuc_pixel_NE_upper_range = spu_insert(
   1436 				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 3 )),
   1437 				vuc_pixel_NE_upper_range, 15 );
   1438 
   1439 
   1440 		// SOUTH WEST
   1441 		vector unsigned char vuc_pixel_SW_lower_range = spu_insert(
   1442 				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 0 )), null_vector, 3 );
   1443 		vuc_pixel_SW_lower_range = spu_insert(
   1444 				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 1 )),
   1445 				vuc_pixel_SW_lower_range, 7 );
   1446 		vuc_pixel_SW_lower_range = spu_insert(
   1447 				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 2 )),
   1448 				vuc_pixel_SW_lower_range, 11 );
   1449 		vuc_pixel_SW_lower_range = spu_insert(
   1450 				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 3 )),
   1451 				vuc_pixel_SW_lower_range, 15 );
   1452 
   1453 		vector unsigned char vuc_pixel_SW_upper_range = spu_insert(
   1454 				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 0 )), null_vector, 3 );
   1455 		vuc_pixel_SW_upper_range = spu_insert(
   1456 				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 1 )),
   1457 				vuc_pixel_SW_upper_range, 7 );
   1458 		vuc_pixel_SW_upper_range = spu_insert(
   1459 				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 2 )),
   1460 				vuc_pixel_SW_upper_range, 11 );
   1461 		vuc_pixel_SW_upper_range = spu_insert(
   1462 				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 3 )),
   1463 				vuc_pixel_SW_upper_range, 15 );
   1464 
   1465 		// SOUTH EAST
   1466 		vector unsigned char vuc_pixel_SE_lower_range = spu_insert(
   1467 				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 0 )), null_vector, 3 );
   1468 		vuc_pixel_SE_lower_range = spu_insert(
   1469 				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 1 )),
   1470 				vuc_pixel_SE_lower_range, 7 );
   1471 		vuc_pixel_SE_lower_range = spu_insert(
   1472 				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 2 )),
   1473 				vuc_pixel_SE_lower_range, 11 );
   1474 		vuc_pixel_SE_lower_range = spu_insert(
   1475 				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 3 )),
   1476 				vuc_pixel_SE_lower_range, 15 );
   1477 
   1478 		vector unsigned char vuc_pixel_SE_upper_range = spu_insert(
   1479 				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 0 )), null_vector, 3 );
   1480 		vuc_pixel_SE_upper_range = spu_insert(
   1481 				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 1 )),
   1482 				vuc_pixel_SE_upper_range, 7 );
   1483 		vuc_pixel_SE_upper_range = spu_insert(
   1484 				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 2 )),
   1485 				vuc_pixel_SE_upper_range, 11 );
   1486 		vuc_pixel_SE_upper_range = spu_insert(
   1487 				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 3 )),
   1488 				vuc_pixel_SE_upper_range, 15 );
   1489 
   1490 
   1491 		// convert to float
   1492 		vector float vf_pixel_NW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_lower_range, 0 );
   1493 		vector float vf_pixel_NW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_upper_range, 0 );
   1494 
   1495 		vector float vf_pixel_SW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_lower_range, 0 );
   1496 		vector float vf_pixel_SW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_upper_range, 0 );
   1497 
   1498 		vector float vf_pixel_NE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_lower_range, 0 );
   1499 		vector float vf_pixel_NE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_upper_range, 0 );
   1500 
   1501 		vector float vf_pixel_SE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_lower_range, 0 );
   1502 		vector float vf_pixel_SE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_upper_range, 0 );
   1503 
   1504 
   1505 
   1506 		// first linear interpolation: EWtop
   1507 		// EWtop = NW + EWweight*(NE-NW)
   1508 		//
   1509 		// lower range
   1510 		vector float vf_EWtop_lower_range_tmp = spu_sub( vf_pixel_NE_lower_range, vf_pixel_NW_lower_range );
   1511 		vector float vf_EWtop_lower_range = spu_madd( vf_EWweight_lower_range,
   1512 								vf_EWtop_lower_range_tmp,
   1513 								vf_pixel_NW_lower_range );
   1514 
   1515 		// upper range
   1516 		vector float vf_EWtop_upper_range_tmp = spu_sub( vf_pixel_NE_upper_range, vf_pixel_NW_upper_range );
   1517 		vector float vf_EWtop_upper_range = spu_madd( vf_EWweight_upper_range,
   1518 								vf_EWtop_upper_range_tmp,
   1519 								vf_pixel_NW_upper_range );
   1520 
   1521 
   1522 
   1523 		// second linear interpolation: EWbottom
   1524 		// EWbottom = SW + EWweight*(SE-SW)
   1525 		//
   1526 		// lower range
   1527 		vector float vf_EWbottom_lower_range_tmp = spu_sub( vf_pixel_SE_lower_range, vf_pixel_SW_lower_range );
   1528 		vector float vf_EWbottom_lower_range = spu_madd( vf_EWweight_lower_range,
   1529 								vf_EWbottom_lower_range_tmp,
   1530 								vf_pixel_SW_lower_range );
   1531 
   1532 		// upper range
   1533 		vector float vf_EWbottom_upper_range_tmp = spu_sub( vf_pixel_SE_upper_range, vf_pixel_SW_upper_range );
   1534 		vector float vf_EWbottom_upper_range = spu_madd( vf_EWweight_upper_range,
   1535 								vf_EWbottom_upper_range_tmp,
   1536 								vf_pixel_SW_upper_range );
   1537 
   1538 
   1539 
   1540 		// third linear interpolation: the bilinear interpolated value
   1541 		// result = EWtop + NSweight*(EWbottom-EWtop);
   1542 		//
   1543 		// lower range
   1544 		vector float vf_result_lower_range_tmp = spu_sub( vf_EWbottom_lower_range, vf_EWtop_lower_range );
   1545 		vector float vf_result_lower_range = spu_madd( vf_NSweight,
   1546 								vf_result_lower_range_tmp,
   1547 								vf_EWtop_lower_range );
   1548 
   1549 		// upper range
   1550 		vector float vf_result_upper_range_tmp = spu_sub( vf_EWbottom_upper_range, vf_EWtop_upper_range );
   1551 		vector float vf_result_upper_range = spu_madd( vf_NSweight,
   1552 								vf_result_upper_range_tmp,
   1553 								vf_EWtop_upper_range );
   1554 
   1555 
   1556 		// convert back: using saturated arithmetic
   1557 		vector unsigned int vui_result_lower_range = vfloat_to_vuint( vf_result_lower_range );
   1558 		vector unsigned int vui_result_upper_range = vfloat_to_vuint( vf_result_upper_range );
   1559 
   1560 		// merge results->lower,upper
   1561 		vector unsigned char vuc_mask_merge_result = { 0x03, 0x07, 0x0B, 0x0F,
   1562 							       0x13, 0x17, 0x1B, 0x1F,
   1563 							       0x00, 0x00, 0x00, 0x00,
   1564 							       0x00, 0x00, 0x00, 0x00 };
   1565 
   1566 		vector unsigned char vuc_result = spu_shuffle( (vector unsigned char) vui_result_lower_range,
   1567 								(vector unsigned char) vui_result_upper_range,
   1568 								vuc_mask_merge_result );
   1569 
   1570 		// partial storing
   1571 		vector unsigned char vuc_mask_out = { 0x00, 0x00, 0x00, 0x00,
   1572 						      0x00, 0x00, 0x00, 0x00,
   1573 						      0xFF, 0xFF, 0xFF, 0xFF,
   1574 						      0xFF, 0xFF, 0xFF, 0xFF };
   1575 
   1576 
   1577 		// get currently stored data
   1578 		vector unsigned char vuc_orig = *((vector unsigned char*)dst);
   1579 
   1580 		// clear currently stored data
   1581 		vuc_orig = spu_and( vuc_orig,
   1582 				spu_rlqwbyte( vuc_mask_out, ((unsigned int)dst)&0x0F) );
   1583 
   1584 		// rotate result according to storing address
   1585 		vuc_result = spu_rlqwbyte( vuc_result, ((unsigned int)dst)&0x0F );
   1586 
   1587 		// store result
   1588 		*((vector unsigned char*)dst) = spu_or( vuc_result,
   1589 							vuc_orig );
   1590 		dst += 8;
   1591 	}
   1592 }
   1593 
   1594 
   1595 /*
   1596  * bilinear_scale_line_w16()
   1597  *
   1598  * Processes one line of YUV input; the width has to be a multiple of 16.
   1599  * The scaled YUV output is written to a local store buffer.
   1600  *
   1601  * @param src buffer holding the 2 input lines
   1602  * @param dst_ buffer for 1 line of output
   1603  * @param dst_width the width of the destination line
   1604  * @param vf_x_scale a float vector holding the x_scale factor in each entry
   1605  * @param vf_NSweight a float vector holding the NORTH/SOUTH weight for the current line in each entry
   1606  * @param src_linestride the stride of the source line
   1607  */
   1608 void bilinear_scale_line_w16( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
   1609 
   1610 	unsigned char* dst = dst_;
   1611 
   1612 	unsigned int dst_x;
   1613 	for( dst_x=0; dst_x<dst_width; dst_x+=16) {
   1614 		// address calculation for loading the 4 surrounding pixel of each calculated
   1615 		// destination pixel
   1616 		vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
   1617 		// parallelised processing
   1618 		// first range->pixel 1 2 3 4
   1619 		// second range->pixel 5 6 7 8
   1620 		// third range->pixel 9 10 11 12
   1621 		// fourth range->pixel 13 14 15 16
   1622 		vector unsigned int vui_inc_dst_x_first_range = { 0, 1, 2, 3 };
   1623 		vector unsigned int vui_inc_dst_x_second_range = { 4, 5, 6, 7 };
   1624 		vector unsigned int vui_inc_dst_x_third_range = { 8, 9, 10, 11 };
   1625 		vector unsigned int vui_inc_dst_x_fourth_range = { 12, 13, 14, 15 };
   1626 		vector unsigned int vui_dst_x_first_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_first_range );
   1627 		vector unsigned int vui_dst_x_second_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_second_range );
   1628 		vector unsigned int vui_dst_x_third_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_third_range );
   1629 		vector unsigned int vui_dst_x_fourth_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_fourth_range );
   1630 
   1631 		// calculate weight EAST-WEST
   1632 		vector float vf_dst_x_first_range = spu_convtf( vui_dst_x_first_range, 0 );
   1633 		vector float vf_dst_x_second_range = spu_convtf( vui_dst_x_second_range, 0 );
   1634 		vector float vf_dst_x_third_range = spu_convtf( vui_dst_x_third_range, 0 );
   1635 		vector float vf_dst_x_fourth_range = spu_convtf( vui_dst_x_fourth_range, 0 );
   1636 		vector float vf_src_x_first_range = spu_mul( vf_dst_x_first_range, vf_x_scale );
   1637 		vector float vf_src_x_second_range = spu_mul( vf_dst_x_second_range, vf_x_scale );
   1638 		vector float vf_src_x_third_range = spu_mul( vf_dst_x_third_range, vf_x_scale );
   1639 		vector float vf_src_x_fourth_range = spu_mul( vf_dst_x_fourth_range, vf_x_scale );
   1640 		vector unsigned int vui_interpl_x_first_range = spu_convtu( vf_src_x_first_range, 0 );
   1641 		vector unsigned int vui_interpl_x_second_range = spu_convtu( vf_src_x_second_range, 0 );
   1642 		vector unsigned int vui_interpl_x_third_range = spu_convtu( vf_src_x_third_range, 0 );
   1643 		vector unsigned int vui_interpl_x_fourth_range = spu_convtu( vf_src_x_fourth_range, 0 );
   1644 		vector float vf_interpl_x_first_range = spu_convtf( vui_interpl_x_first_range, 0 );
   1645 		vector float vf_interpl_x_second_range = spu_convtf( vui_interpl_x_second_range, 0 );
   1646 		vector float vf_interpl_x_third_range = spu_convtf( vui_interpl_x_third_range, 0 );
   1647 		vector float vf_interpl_x_fourth_range = spu_convtf( vui_interpl_x_fourth_range, 0 );
   1648 		vector float vf_EWweight_first_range = spu_sub( vf_src_x_first_range, vf_interpl_x_first_range );
   1649 		vector float vf_EWweight_second_range = spu_sub( vf_src_x_second_range, vf_interpl_x_second_range );
   1650 		vector float vf_EWweight_third_range = spu_sub( vf_src_x_third_range, vf_interpl_x_third_range );
   1651 		vector float vf_EWweight_fourth_range = spu_sub( vf_src_x_fourth_range, vf_interpl_x_fourth_range );
   1652 
   1653 		// calculate address offset
   1654 		//
   1655 		// pixel NORTH WEST
   1656 		vector unsigned int vui_off_pixelNW_first_range = vui_interpl_x_first_range;
   1657 		vector unsigned int vui_off_pixelNW_second_range = vui_interpl_x_second_range;
   1658 		vector unsigned int vui_off_pixelNW_third_range = vui_interpl_x_third_range;
   1659 		vector unsigned int vui_off_pixelNW_fourth_range = vui_interpl_x_fourth_range;
   1660 
   1661 		// pixel NORTH EAST-->(offpixelNW+1)
   1662 		vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
   1663 		vector unsigned int vui_off_pixelNE_first_range = spu_add( vui_off_pixelNW_first_range, vui_add_1 );
   1664 		vector unsigned int vui_off_pixelNE_second_range = spu_add( vui_off_pixelNW_second_range, vui_add_1 );
   1665 		vector unsigned int vui_off_pixelNE_third_range = spu_add( vui_off_pixelNW_third_range, vui_add_1 );
   1666 		vector unsigned int vui_off_pixelNE_fourth_range = spu_add( vui_off_pixelNW_fourth_range, vui_add_1 );
   1667 
   1668 		// SOUTH-WEST-->(offpixelNW+src_linestride)
   1669 		vector unsigned int vui_srclinestride = spu_splats( src_linestride );
   1670 		vector unsigned int vui_off_pixelSW_first_range = spu_add( vui_srclinestride, vui_off_pixelNW_first_range );
   1671 		vector unsigned int vui_off_pixelSW_second_range = spu_add( vui_srclinestride, vui_off_pixelNW_second_range );
   1672 		vector unsigned int vui_off_pixelSW_third_range = spu_add( vui_srclinestride, vui_off_pixelNW_third_range );
   1673 		vector unsigned int vui_off_pixelSW_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNW_fourth_range );
   1674 
   1675 		// SOUTH-EAST-->(offpixelNW+src_linestride+1)
   1676 		vector unsigned int vui_off_pixelSE_first_range = spu_add( vui_srclinestride, vui_off_pixelNE_first_range );
   1677 		vector unsigned int vui_off_pixelSE_second_range = spu_add( vui_srclinestride, vui_off_pixelNE_second_range );
   1678 		vector unsigned int vui_off_pixelSE_third_range = spu_add( vui_srclinestride, vui_off_pixelNE_third_range );
   1679 		vector unsigned int vui_off_pixelSE_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNE_fourth_range );
   1680 
   1681 		// calculate each address
   1682 		vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
   1683 		vector unsigned int vui_addr_pixelNW_first_range = spu_add( vui_src_ls, vui_off_pixelNW_first_range );
   1684 		vector unsigned int vui_addr_pixelNW_second_range = spu_add( vui_src_ls, vui_off_pixelNW_second_range );
   1685 		vector unsigned int vui_addr_pixelNW_third_range = spu_add( vui_src_ls, vui_off_pixelNW_third_range );
   1686 		vector unsigned int vui_addr_pixelNW_fourth_range = spu_add( vui_src_ls, vui_off_pixelNW_fourth_range );
   1687 
   1688 		vector unsigned int vui_addr_pixelNE_first_range = spu_add( vui_src_ls, vui_off_pixelNE_first_range );
   1689 		vector unsigned int vui_addr_pixelNE_second_range = spu_add( vui_src_ls, vui_off_pixelNE_second_range );
   1690 		vector unsigned int vui_addr_pixelNE_third_range = spu_add( vui_src_ls, vui_off_pixelNE_third_range );
   1691 		vector unsigned int vui_addr_pixelNE_fourth_range = spu_add( vui_src_ls, vui_off_pixelNE_fourth_range );
   1692 
   1693 		vector unsigned int vui_addr_pixelSW_first_range = spu_add( vui_src_ls, vui_off_pixelSW_first_range );
   1694 		vector unsigned int vui_addr_pixelSW_second_range = spu_add( vui_src_ls, vui_off_pixelSW_second_range );
   1695 		vector unsigned int vui_addr_pixelSW_third_range = spu_add( vui_src_ls, vui_off_pixelSW_third_range );
   1696 		vector unsigned int vui_addr_pixelSW_fourth_range = spu_add( vui_src_ls, vui_off_pixelSW_fourth_range );
   1697 
   1698 		vector unsigned int vui_addr_pixelSE_first_range = spu_add( vui_src_ls, vui_off_pixelSE_first_range );
   1699 		vector unsigned int vui_addr_pixelSE_second_range = spu_add( vui_src_ls, vui_off_pixelSE_second_range );
   1700 		vector unsigned int vui_addr_pixelSE_third_range = spu_add( vui_src_ls, vui_off_pixelSE_third_range );
   1701 		vector unsigned int vui_addr_pixelSE_fourth_range = spu_add( vui_src_ls, vui_off_pixelSE_fourth_range );
   1702 
   1703 
   1704 		// get each pixel
   1705 		//
   1706 		// scalar load, afterwards insertion into the right position
   1707 		// NORTH WEST
   1708 		// first range
   1709 		vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
   1710 		vector unsigned char vuc_pixel_NW_first_range = spu_insert(
   1711 				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 0 )), null_vector, 3 );
   1712 		vuc_pixel_NW_first_range = spu_insert(
   1713 				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 1 )),
   1714 				vuc_pixel_NW_first_range, 7 );
   1715 		vuc_pixel_NW_first_range = spu_insert(
   1716 				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 2 )),
   1717 				vuc_pixel_NW_first_range, 11 );
   1718 		vuc_pixel_NW_first_range = spu_insert(
   1719 				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 3 )),
   1720 				vuc_pixel_NW_first_range, 15 );
   1721 		// second range
   1722 		vector unsigned char vuc_pixel_NW_second_range = spu_insert(
   1723 				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 0 )), null_vector, 3 );
   1724 		vuc_pixel_NW_second_range = spu_insert(
   1725 				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 1 )),
   1726 				vuc_pixel_NW_second_range, 7 );
   1727 		vuc_pixel_NW_second_range = spu_insert(
   1728 				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 2 )),
   1729 				vuc_pixel_NW_second_range, 11 );
   1730 		vuc_pixel_NW_second_range = spu_insert(
   1731 				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 3 )),
   1732 				vuc_pixel_NW_second_range, 15 );
   1733 		// third range
   1734 		vector unsigned char vuc_pixel_NW_third_range = spu_insert(
   1735 				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 0 )), null_vector, 3 );
   1736 		vuc_pixel_NW_third_range = spu_insert(
   1737 				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 1 )),
   1738 				vuc_pixel_NW_third_range, 7 );
   1739 		vuc_pixel_NW_third_range = spu_insert(
   1740 				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 2 )),
   1741 				vuc_pixel_NW_third_range, 11 );
   1742 		vuc_pixel_NW_third_range = spu_insert(
   1743 				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 3 )),
   1744 				vuc_pixel_NW_third_range, 15 );
   1745 		// fourth range
   1746 		vector unsigned char vuc_pixel_NW_fourth_range = spu_insert(
   1747 				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 0 )), null_vector, 3 );
   1748 		vuc_pixel_NW_fourth_range = spu_insert(
   1749 				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 1 )),
   1750 				vuc_pixel_NW_fourth_range, 7 );
   1751 		vuc_pixel_NW_fourth_range = spu_insert(
   1752 				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 2 )),
   1753 				vuc_pixel_NW_fourth_range, 11 );
   1754 		vuc_pixel_NW_fourth_range = spu_insert(
   1755 				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 3 )),
   1756 				vuc_pixel_NW_fourth_range, 15 );
   1757 
   1758 		// NORTH EAST
   1759 		// first range
   1760 		vector unsigned char vuc_pixel_NE_first_range = spu_insert(
   1761 				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 0 )), null_vector, 3 );
   1762 		vuc_pixel_NE_first_range = spu_insert(
   1763 				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 1 )),
   1764 				vuc_pixel_NE_first_range, 7 );
   1765 		vuc_pixel_NE_first_range = spu_insert(
   1766 				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 2 )),
   1767 				vuc_pixel_NE_first_range, 11 );
   1768 		vuc_pixel_NE_first_range = spu_insert(
   1769 				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 3 )),
   1770 				vuc_pixel_NE_first_range, 15 );
   1771 		// second range
   1772 		vector unsigned char vuc_pixel_NE_second_range = spu_insert(
   1773 				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 0 )), null_vector, 3 );
   1774 		vuc_pixel_NE_second_range = spu_insert(
   1775 				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 1 )),
   1776 				vuc_pixel_NE_second_range, 7 );
   1777 		vuc_pixel_NE_second_range = spu_insert(
   1778 				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 2 )),
   1779 				vuc_pixel_NE_second_range, 11 );
   1780 		vuc_pixel_NE_second_range = spu_insert(
   1781 				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 3 )),
   1782 				vuc_pixel_NE_second_range, 15 );
   1783 		// third range
   1784 		vector unsigned char vuc_pixel_NE_third_range = spu_insert(
   1785 				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 0 )), null_vector, 3 );
   1786 		vuc_pixel_NE_third_range = spu_insert(
   1787 				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 1 )),
   1788 				vuc_pixel_NE_third_range, 7 );
   1789 		vuc_pixel_NE_third_range = spu_insert(
   1790 				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 2 )),
   1791 				vuc_pixel_NE_third_range, 11 );
   1792 		vuc_pixel_NE_third_range = spu_insert(
   1793 				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 3 )),
   1794 				vuc_pixel_NE_third_range, 15 );
   1795 		// fourth range
   1796 		vector unsigned char vuc_pixel_NE_fourth_range = spu_insert(
   1797 				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 0 )), null_vector, 3 );
   1798 		vuc_pixel_NE_fourth_range = spu_insert(
   1799 				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 1 )),
   1800 				vuc_pixel_NE_fourth_range, 7 );
   1801 		vuc_pixel_NE_fourth_range = spu_insert(
   1802 				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 2 )),
   1803 				vuc_pixel_NE_fourth_range, 11 );
   1804 		vuc_pixel_NE_fourth_range = spu_insert(
   1805 				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 3 )),
   1806 				vuc_pixel_NE_fourth_range, 15 );
   1807 
   1808 		// SOUTH WEST
   1809 		// first range
   1810 		vector unsigned char vuc_pixel_SW_first_range = spu_insert(
   1811 				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 0 )), null_vector, 3 );
   1812 		vuc_pixel_SW_first_range = spu_insert(
   1813 				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 1 )),
   1814 				vuc_pixel_SW_first_range, 7 );
   1815 		vuc_pixel_SW_first_range = spu_insert(
   1816 				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 2 )),
   1817 				vuc_pixel_SW_first_range, 11 );
   1818 		vuc_pixel_SW_first_range = spu_insert(
   1819 				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 3 )),
   1820 				vuc_pixel_SW_first_range, 15 );
   1821 		// second range
   1822 		vector unsigned char vuc_pixel_SW_second_range = spu_insert(
   1823 				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 0 )), null_vector, 3 );
   1824 		vuc_pixel_SW_second_range = spu_insert(
   1825 				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 1 )),
   1826 				vuc_pixel_SW_second_range, 7 );
   1827 		vuc_pixel_SW_second_range = spu_insert(
   1828 				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 2 )),
   1829 				vuc_pixel_SW_second_range, 11 );
   1830 		vuc_pixel_SW_second_range = spu_insert(
   1831 				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 3 )),
   1832 				vuc_pixel_SW_second_range, 15 );
   1833 		// third range
   1834 		vector unsigned char vuc_pixel_SW_third_range = spu_insert(
   1835 				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 0 )), null_vector, 3 );
   1836 		vuc_pixel_SW_third_range = spu_insert(
   1837 				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 1 )),
   1838 				vuc_pixel_SW_third_range, 7 );
   1839 		vuc_pixel_SW_third_range = spu_insert(
   1840 				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 2 )),
   1841 				vuc_pixel_SW_third_range, 11 );
   1842 		vuc_pixel_SW_third_range = spu_insert(
   1843 				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 3 )),
   1844 				vuc_pixel_SW_third_range, 15 );
   1845 		// fourth range
   1846 		vector unsigned char vuc_pixel_SW_fourth_range = spu_insert(
   1847 				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 0 )), null_vector, 3 );
   1848 		vuc_pixel_SW_fourth_range = spu_insert(
   1849 				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 1 )),
   1850 				vuc_pixel_SW_fourth_range, 7 );
   1851 		vuc_pixel_SW_fourth_range = spu_insert(
   1852 				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 2 )),
   1853 				vuc_pixel_SW_fourth_range, 11 );
   1854 		vuc_pixel_SW_fourth_range = spu_insert(
   1855 				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 3 )),
   1856 				vuc_pixel_SW_fourth_range, 15 );
   1857 
   1858 		// SOUTH EAST
   1859 		// first range
   1860 		vector unsigned char vuc_pixel_SE_first_range = spu_insert(
   1861 				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 0 )), null_vector, 3 );
   1862 		vuc_pixel_SE_first_range = spu_insert(
   1863 				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 1 )),
   1864 				vuc_pixel_SE_first_range, 7 );
   1865 		vuc_pixel_SE_first_range = spu_insert(
   1866 				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 2 )),
   1867 				vuc_pixel_SE_first_range, 11 );
   1868 		vuc_pixel_SE_first_range = spu_insert(
   1869 				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 3 )),
   1870 				vuc_pixel_SE_first_range, 15 );
   1871 		// second range
   1872 		vector unsigned char vuc_pixel_SE_second_range = spu_insert(
   1873 				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 0 )), null_vector, 3 );
   1874 		vuc_pixel_SE_second_range = spu_insert(
   1875 				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 1 )),
   1876 				vuc_pixel_SE_second_range, 7 );
   1877 		vuc_pixel_SE_second_range = spu_insert(
   1878 				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 2 )),
   1879 				vuc_pixel_SE_second_range, 11 );
   1880 		vuc_pixel_SE_second_range = spu_insert(
   1881 				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 3 )),
   1882 				vuc_pixel_SE_second_range, 15 );
   1883 		// third range
   1884 		vector unsigned char vuc_pixel_SE_third_range = spu_insert(
   1885 				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 0 )), null_vector, 3 );
   1886 		vuc_pixel_SE_third_range = spu_insert(
   1887 				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 1 )),
   1888 				vuc_pixel_SE_third_range, 7 );
   1889 		vuc_pixel_SE_third_range = spu_insert(
   1890 				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 2 )),
   1891 				vuc_pixel_SE_third_range, 11 );
   1892 		vuc_pixel_SE_third_range = spu_insert(
   1893 				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 3 )),
   1894 				vuc_pixel_SE_third_range, 15 );
   1895 		// fourth range
   1896 		vector unsigned char vuc_pixel_SE_fourth_range = spu_insert(
   1897 				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 0 )), null_vector, 3 );
   1898 		vuc_pixel_SE_fourth_range = spu_insert(
   1899 				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 1 )),
   1900 				vuc_pixel_SE_fourth_range, 7 );
   1901 		vuc_pixel_SE_fourth_range = spu_insert(
   1902 				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 2 )),
   1903 				vuc_pixel_SE_fourth_range, 11 );
   1904 		vuc_pixel_SE_fourth_range = spu_insert(
   1905 				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 3 )),
   1906 				vuc_pixel_SE_fourth_range, 15 );
   1907 
   1908 
   1909 
   1910 		// convert to float
   1911 		vector float vf_pixel_NW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_first_range, 0 );
   1912 		vector float vf_pixel_NW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_second_range, 0 );
   1913 		vector float vf_pixel_NW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_third_range, 0 );
   1914 		vector float vf_pixel_NW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_fourth_range, 0 );
   1915 
   1916 		vector float vf_pixel_NE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_first_range, 0 );
   1917 		vector float vf_pixel_NE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_second_range, 0 );
   1918 		vector float vf_pixel_NE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_third_range, 0 );
   1919 		vector float vf_pixel_NE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_fourth_range, 0 );
   1920 
   1921 		vector float vf_pixel_SW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_first_range, 0 );
   1922 		vector float vf_pixel_SW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_second_range, 0 );
   1923 		vector float vf_pixel_SW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_third_range, 0 );
   1924 		vector float vf_pixel_SW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_fourth_range, 0 );
   1925 
   1926 		vector float vf_pixel_SE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_first_range, 0 );
   1927 		vector float vf_pixel_SE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_second_range, 0 );
   1928 		vector float vf_pixel_SE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_third_range, 0 );
   1929 		vector float vf_pixel_SE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_fourth_range, 0 );
   1930 
   1931 		// first linear interpolation: EWtop
   1932 		// EWtop = NW + EWweight*(NE-NW)
   1933 		//
   1934 		// first range
   1935 		vector float vf_EWtop_first_range_tmp = spu_sub( vf_pixel_NE_first_range, vf_pixel_NW_first_range );
   1936 		vector float vf_EWtop_first_range = spu_madd( vf_EWweight_first_range,
   1937 								vf_EWtop_first_range_tmp,
   1938 								vf_pixel_NW_first_range );
   1939 
   1940 		// second range
   1941 		vector float vf_EWtop_second_range_tmp = spu_sub( vf_pixel_NE_second_range, vf_pixel_NW_second_range );
   1942 		vector float vf_EWtop_second_range = spu_madd( vf_EWweight_second_range,
   1943 								vf_EWtop_second_range_tmp,
   1944 								vf_pixel_NW_second_range );
   1945 
   1946 		// third range
   1947 		vector float vf_EWtop_third_range_tmp = spu_sub( vf_pixel_NE_third_range, vf_pixel_NW_third_range );
   1948 		vector float vf_EWtop_third_range = spu_madd( vf_EWweight_third_range,
   1949 								vf_EWtop_third_range_tmp,
   1950 								vf_pixel_NW_third_range );
   1951 
   1952 		// fourth range
   1953 		vector float vf_EWtop_fourth_range_tmp = spu_sub( vf_pixel_NE_fourth_range, vf_pixel_NW_fourth_range );
   1954 		vector float vf_EWtop_fourth_range = spu_madd( vf_EWweight_fourth_range,
   1955 								vf_EWtop_fourth_range_tmp,
   1956 								vf_pixel_NW_fourth_range );
   1957 
   1958 
   1959 
   1960 		// second linear interpolation: EWbottom
   1961 		// EWbottom = SW + EWweight*(SE-SW)
   1962 		//
   1963 		// first range
   1964 		vector float vf_EWbottom_first_range_tmp = spu_sub( vf_pixel_SE_first_range, vf_pixel_SW_first_range );
   1965 		vector float vf_EWbottom_first_range = spu_madd( vf_EWweight_first_range,
   1966 								vf_EWbottom_first_range_tmp,
   1967 								vf_pixel_SW_first_range );
   1968 
   1969 		// second range
   1970 		vector float vf_EWbottom_second_range_tmp = spu_sub( vf_pixel_SE_second_range, vf_pixel_SW_second_range );
   1971 		vector float vf_EWbottom_second_range = spu_madd( vf_EWweight_second_range,
   1972 								vf_EWbottom_second_range_tmp,
   1973 								vf_pixel_SW_second_range );
   1974 		// third range
   1975 		vector float vf_EWbottom_third_range_tmp = spu_sub( vf_pixel_SE_third_range, vf_pixel_SW_third_range );
   1976 		vector float vf_EWbottom_third_range = spu_madd( vf_EWweight_third_range,
   1977 								vf_EWbottom_third_range_tmp,
   1978 								vf_pixel_SW_third_range );
   1979 
   1980 		// fourth range
   1981 		vector float vf_EWbottom_fourth_range_tmp = spu_sub( vf_pixel_SE_fourth_range, vf_pixel_SW_fourth_range );
   1982 		vector float vf_EWbottom_fourth_range = spu_madd( vf_EWweight_fourth_range,
   1983 								vf_EWbottom_fourth_range_tmp,
   1984 								vf_pixel_SW_fourth_range );
   1985 
   1986 
   1987 
   1988 		// third linear interpolation: the bilinear interpolated value
   1989 		// result = EWtop + NSweight*(EWbottom-EWtop);
   1990 		//
   1991 		// first range
   1992 		vector float vf_result_first_range_tmp = spu_sub( vf_EWbottom_first_range, vf_EWtop_first_range );
   1993 		vector float vf_result_first_range = spu_madd( vf_NSweight,
   1994 								vf_result_first_range_tmp,
   1995 								vf_EWtop_first_range );
   1996 
   1997 		// second range
   1998 		vector float vf_result_second_range_tmp = spu_sub( vf_EWbottom_second_range, vf_EWtop_second_range );
   1999 		vector float vf_result_second_range = spu_madd( vf_NSweight,
   2000 								vf_result_second_range_tmp,
   2001 								vf_EWtop_second_range );
   2002 
   2003 		// third range
   2004 		vector float vf_result_third_range_tmp = spu_sub( vf_EWbottom_third_range, vf_EWtop_third_range );
   2005 		vector float vf_result_third_range = spu_madd( vf_NSweight,
   2006 								vf_result_third_range_tmp,
   2007 								vf_EWtop_third_range );
   2008 
   2009 		// fourth range
   2010 		vector float vf_result_fourth_range_tmp = spu_sub( vf_EWbottom_fourth_range, vf_EWtop_fourth_range );
   2011 		vector float vf_result_fourth_range = spu_madd( vf_NSweight,
   2012 								vf_result_fourth_range_tmp,
   2013 								vf_EWtop_fourth_range );
   2014 
   2015 
   2016 
   2017 		// convert back: using saturated arithmetic
   2018 		vector unsigned int vui_result_first_range = vfloat_to_vuint( vf_result_first_range );
   2019 		vector unsigned int vui_result_second_range = vfloat_to_vuint( vf_result_second_range );
   2020 		vector unsigned int vui_result_third_range = vfloat_to_vuint( vf_result_third_range );
   2021 		vector unsigned int vui_result_fourth_range = vfloat_to_vuint( vf_result_fourth_range );
   2022 
   2023 		// merge results->lower,upper
   2024 		vector unsigned char vuc_mask_merge_result_first_second = { 0x03, 0x07, 0x0B, 0x0F,
   2025 							       		    0x13, 0x17, 0x1B, 0x1F,
   2026 							       		    0x00, 0x00, 0x00, 0x00,
   2027 							       		    0x00, 0x00, 0x00, 0x00 };
   2028 
   2029 		vector unsigned char vuc_mask_merge_result_third_fourth = { 0x00, 0x00, 0x00, 0x00,
   2030 							       		    0x00, 0x00, 0x00, 0x00,
   2031 									    0x03, 0x07, 0x0B, 0x0F,
   2032 							       		    0x13, 0x17, 0x1B, 0x1F };
   2033 
   2034 		vector unsigned char vuc_result_first_second =
   2035 						spu_shuffle( (vector unsigned char) vui_result_first_range,
   2036 								 (vector unsigned char) vui_result_second_range,
   2037 								vuc_mask_merge_result_first_second );
   2038 
   2039 		vector unsigned char vuc_result_third_fourth =
   2040 						spu_shuffle( (vector unsigned char) vui_result_third_range,
   2041 								 (vector unsigned char) vui_result_fourth_range,
   2042 								vuc_mask_merge_result_third_fourth );
   2043 
   2044 		// store result
   2045 		*((vector unsigned char*)dst) = spu_or( vuc_result_first_second,
   2046 							vuc_result_third_fourth );
   2047 		dst += 16;
   2048 	}
   2049 }
   2050 
   2051