/*
 * SDL - Simple DirectMedia Layer
 * CELL BE Support for PS3 Framebuffer
 * Copyright (C) 2008, 2009 International Business Machines Corporation
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 *
 * Martin Lowinski <lowinski [at] de [dot] ibm [dot] com>
 * Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
 * SPE code based on research by:
 * Rene Becker
 * Thimo Emmerich
 */

#include "spu_common.h"

#include <spu_intrinsics.h>
#include <spu_mfcio.h>

// Debugging
//#define DEBUG

#ifdef DEBUG
#define deprintf(fmt, args... ) \
    fprintf( stdout, fmt, ##args ); \
    fflush( stdout );
#else
#define deprintf( fmt, args... )
#endif

struct yuv2rgb_parms_t parms_converter __attribute__((aligned(128)));

/* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored.
 * Misaligned data may need to be retrieved; the incoming v and u planes
 * are padded to be able to handle this (add 128).
 */
unsigned char y_plane[2][(MAX_HDTV_WIDTH + 128) * 4] __attribute__((aligned(128)));
unsigned char v_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
unsigned char u_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));

/* A maximum of 4 lines BGRA are stored, 4 bytes per pixel */
unsigned char bgra[4 * MAX_HDTV_WIDTH * 4] __attribute__((aligned(128)));

/* Some vectors needed by the float to int conversion */
static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };

void yuv_to_rgb_w16();
void yuv_to_rgb_w32();

void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr, unsigned int width);
void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width);


int main(unsigned long long spe_id __attribute__((unused)), unsigned long long argp __attribute__ ((unused)))
{
    deprintf("[SPU] yuv2rgb_spu is up... (on SPE #%llu)\n", spe_id);
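    /* Control flow: announce SPU_READY once, then loop on the mailbox.
     * SPU_START is followed by the effective address of a yuv2rgb_parms_t,
     * which is fetched via DMA; after each converted frame SPU_FIN is
     * written back. SPU_EXIT terminates the program. */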
    uint32_t ea_mfc, mbox;
    // send ready message
    spu_write_out_mbox(SPU_READY);

    while (1) {
        /* Check mailbox */
        mbox = spu_read_in_mbox();
        deprintf("[SPU] Message is %u\n", mbox);
        switch (mbox) {
        case SPU_EXIT:
            deprintf("[SPU] yuv2rgb_spu goes down...\n");
            return 0;
        case SPU_START:
            break;
        default:
            deprintf("[SPU] Cannot handle message\n");
            continue;
        }

        /* Tag Manager setup */
        unsigned int tag_id;
        tag_id = mfc_multi_tag_reserve(1);
        if (tag_id == MFC_TAG_INVALID) {
            deprintf("[SPU] Failed to reserve mfc tags on yuv2rgb_converter\n");
            return 0;
        }

        /* DMA transfer for the input parameters */
        ea_mfc = spu_read_in_mbox();
        deprintf("[SPU] Message on yuv2rgb_converter is %u\n", ea_mfc);
        spu_mfcdma32(&parms_converter, (unsigned int)ea_mfc, sizeof(struct yuv2rgb_parms_t), tag_id, MFC_GET_CMD);
        DMA_WAIT_TAG(tag_id);

        /* Alignment issues require special-case handling:
         * a Y width of 32 pixels corresponds to a chrominance width of 16
         * --> choose the proper handler to optimize the performance
         */
        deprintf("[SPU] Convert %ix%i from YUV to RGB\n", parms_converter.src_pixel_width, parms_converter.src_pixel_height);
        if (parms_converter.src_pixel_width & 0x1f) {
            deprintf("[SPU] Using yuv_to_rgb_w16\n");
            yuv_to_rgb_w16();
        } else {
            deprintf("[SPU] Using yuv_to_rgb_w32\n");
            yuv_to_rgb_w32();
        }

        mfc_multi_tag_release(tag_id, 1);
        deprintf("[SPU] yuv2rgb_spu... done!\n");
        /* Send FIN message */
        spu_write_out_mbox(SPU_FIN);
    }

    return 0;
}


/*
 * float_to_char()
 *
 * converts a float to a character using saturated
 * arithmetic
 *
 * @param s float for conversion
 * @returns converted character
 */
inline static unsigned char float_to_char(float s) {
    vector float vec_s = spu_splats(s);
    vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s);
    vec_s = spu_sel(vec_s, vec_0_1, select_1);

    vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255);
    vec_s = spu_sel(vec_s, vec_255, select_2);
    return (unsigned char) spu_extract(vec_s, 0);
}


/*
 * vfloat_to_vuint()
 *
 * converts a float vector to an unsigned int vector using saturated
 * arithmetic
 *
 * @param vec_s float vector for conversion
 * @returns converted unsigned int vector
 */
inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
    vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s);
    vec_s = spu_sel(vec_s, vec_0_1, select_1);

    vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255);
    vec_s = spu_sel(vec_s, vec_255, select_2);
    return spu_convtu(vec_s, 0);
}


void yuv_to_rgb_w16() {
    // Pixel dimensions of the picture
    uint32_t width, height;

    // Extract parameters
    width = parms_converter.src_pixel_width;
    height = parms_converter.src_pixel_height;

    // Plane data management
    // Y
    unsigned char* ram_addr_y = parms_converter.y_plane;
    // V
    unsigned char* ram_addr_v = parms_converter.v_plane;
    // U
    unsigned char* ram_addr_u = parms_converter.u_plane;

    // BGRA
    unsigned char* ram_addr_bgra = parms_converter.dstBuffer;

    // Strides
    unsigned int stride_y = width;
    unsigned int stride_vu = width>>1;

    // Buffer management
    unsigned int buf_idx = 0;
    unsigned int size_4lines_y = stride_y<<2;
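    /* Four Y lines are fetched per DMA chunk; the chroma planes are
     * subsampled 2x2 (4:2:0 layout), so two U and two V lines at half the
     * Y stride cover the same picture area. */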
    unsigned int size_2lines_y = stride_y<<1;
    unsigned int size_2lines_vu = stride_vu<<1;

    // 2*width*4byte_per_pixel
    unsigned int size_2lines_bgra = width<<3;


    // start double-buffered processing
    // 4 lines y
    spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);

    // 2 lines v
    spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);

    // 2 lines u
    spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);

    // Wait for these transfers to be completed
    DMA_WAIT_TAG((RETR_BUF + buf_idx));

    unsigned int i;
    for(i=0; i<(height>>2)-1; i++) {

        buf_idx^=1;

        // 4 lines y
        spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);

        // 2 lines v
        spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);

        // 2 lines u
        spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);

        DMA_WAIT_TAG((RETR_BUF + buf_idx));

        buf_idx^=1;


        // Convert YUV to BGRA, store it back (first two lines)
        yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);

        // Next two lines
        yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
                v_plane[buf_idx] + stride_vu,
                u_plane[buf_idx] + stride_vu,
                bgra + size_2lines_bgra,
                width);

        // Wait for previous storing transfer to be completed
        DMA_WAIT_TAG(STR_BUF);

        // Store converted lines in two steps (max DMA transfer size is 16384 bytes)
        spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
        ram_addr_bgra += size_2lines_bgra;
        spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
        ram_addr_bgra += size_2lines_bgra;

        // Move 4 lines
        ram_addr_y += size_4lines_y;
        ram_addr_v += size_2lines_vu;
        ram_addr_u += size_2lines_vu;

        buf_idx^=1;
    }

    // Convert YUV to BGRA, store it back (first two lines)
    yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);

    // Next two lines
    yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
            v_plane[buf_idx] + stride_vu,
            u_plane[buf_idx] + stride_vu,
            bgra + size_2lines_bgra,
            width);

    // Wait for previous storing transfer to be completed
    DMA_WAIT_TAG(STR_BUF);
    spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
    ram_addr_bgra += size_2lines_bgra;
    spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);

    // wait for previous storing transfer to be completed
    DMA_WAIT_TAG(STR_BUF);

}


void yuv_to_rgb_w32() {
    // Pixel dimensions of the picture
    uint32_t width, height;

    // Extract parameters
    width = parms_converter.src_pixel_width;
    height = parms_converter.src_pixel_height;

    // Plane data management
    // Y
    unsigned char* ram_addr_y = parms_converter.y_plane;
    // V
    unsigned char* ram_addr_v = parms_converter.v_plane;
    // U
    unsigned char* ram_addr_u = parms_converter.u_plane;

    // BGRA
    unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
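    /* y_plane/v_plane/u_plane are ping-pong buffers in local store; buf_idx
     * (declared below) selects the half that holds the lines currently
     * being converted. */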
    // Strides
    unsigned int stride_y = width;
    unsigned int stride_vu = width>>1;

    // Buffer management
    unsigned int buf_idx = 0;
    unsigned int size_4lines_y = stride_y<<2;
    unsigned int size_2lines_y = stride_y<<1;
    unsigned int size_2lines_vu = stride_vu<<1;

    // 2*width*4byte_per_pixel
    unsigned int size_2lines_bgra = width<<3;

    // start double-buffered processing
    // 4 lines y
    spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
    // 2 lines v
    spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
    // 2 lines u
    spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);

    // Wait for these transfers to be completed
    DMA_WAIT_TAG((RETR_BUF + buf_idx));

    unsigned int i;
    for(i=0; i < (height>>2)-1; i++) {
        buf_idx^=1;
        // 4 lines y
        spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
        deprintf("4lines = %d\n", size_4lines_y);
        // 2 lines v
        spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
        deprintf("2lines = %d\n", size_2lines_vu);
        // 2 lines u
        spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
        deprintf("2lines = %d\n", size_2lines_vu);

        DMA_WAIT_TAG((RETR_BUF + buf_idx));

        buf_idx^=1;

        // Convert YUV to BGRA, store it back (first two lines)
        yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);

        // Next two lines
        yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
                v_plane[buf_idx] + stride_vu,
                u_plane[buf_idx] + stride_vu,
                bgra + size_2lines_bgra,
                width);

        // Wait for previous storing transfer to be completed
        DMA_WAIT_TAG(STR_BUF);

        // Store converted lines in two steps (max DMA transfer size is 16384 bytes)
        spu_mfcdma32(bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
        ram_addr_bgra += size_2lines_bgra;
        spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
        ram_addr_bgra += size_2lines_bgra;

        // Move 4 lines
        ram_addr_y += size_4lines_y;
        ram_addr_v += size_2lines_vu;
        ram_addr_u += size_2lines_vu;

        buf_idx^=1;
    }

    // Convert YUV to BGRA, store it back (first two lines)
    yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);

    // Next two lines
    yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
            v_plane[buf_idx] + stride_vu,
            u_plane[buf_idx] + stride_vu,
            bgra + size_2lines_bgra,
            width);

    // Wait for previous storing transfer to be completed
    DMA_WAIT_TAG(STR_BUF);
    spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
    ram_addr_bgra += size_2lines_bgra;
    spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);

    // Wait for previous storing transfer to be completed
    DMA_WAIT_TAG(STR_BUF);
}


/* Some vectors needed by the yuv 2 rgb conversion algorithm */
const vector float vec_minus_128 = { -128.0f, -128.0f, -128.0f, -128.0f };
const vector unsigned char vec_null = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
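/* The byte-shuffle patterns below zero-extend four consecutive source bytes
 * into the low byte of each 32-bit word: pattern bytes 0x10..0x1F select
 * bytes from the second shuffle operand, 0x00 selects the zero byte supplied
 * by vec_null. The result can be fed directly into spu_convtf(). */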
const vector unsigned char vec_char2int_first = { 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x13 };
const vector unsigned char vec_char2int_second = { 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x17 };
const vector unsigned char vec_char2int_third = { 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, 0x00, 0x1B };
const vector unsigned char vec_char2int_fourth = { 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x1F };

const vector float vec_R_precalc_coeff = {1.403f, 1.403f, 1.403f, 1.403f};
const vector float vec_Gu_precalc_coeff = {-0.344f, -0.344f, -0.344f, -0.344f};
const vector float vec_Gv_precalc_coeff = {-0.714f, -0.714f, -0.714f, -0.714f};
const vector float vec_B_precalc_coeff = {1.773f, 1.773f, 1.773f, 1.773f};

const vector unsigned int vec_alpha = { 255 << 24, 255 << 24, 255 << 24, 255 << 24 };

const vector unsigned char vec_select_floats_upper = { 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07 };
const vector unsigned char vec_select_floats_lower = { 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F };


/*
 * yuv_to_rgb_w16_line()
 *
 * processes two lines of yuv input, the width has to be a multiple of 16
 *
 * @param y_addr address of the y plane in local store
 * @param v_addr address of the v plane in local store
 * @param u_addr address of the u plane in local store
 * @param bgra_addr_ address of the bgra output buffer
 * @param width the width in pixels
 */
void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
    // each pixel is stored as an integer
    unsigned int* bgra_addr = (unsigned int*) bgra_addr_;

    unsigned int x;
    for(x = 0; x < width; x+=2) {
        // Step through the line two pixels at a time, since each u and v
        // value applies to four pixels (two high, two wide)
        const unsigned char Y_1 = *(y_addr + x);
        const unsigned char Y_2 = *(y_addr + x + 1);
        const unsigned char Y_3 = *(y_addr + x + width);
        const unsigned char Y_4 = *(y_addr + x + width + 1);
        const unsigned char U = *(u_addr + (x >> 1));
        const unsigned char V = *(v_addr + (x >> 1));

        float V_minus_128 = (float)((float)V - 128.0f);
        float U_minus_128 = (float)((float)U - 128.0f);

        float R_precalculate = 1.403f * V_minus_128;
        float G_precalculate = -(0.344f * U_minus_128 + 0.714f * V_minus_128);
        float B_precalculate = 1.773f * U_minus_128;

        const unsigned char R_1 = float_to_char((Y_1 + R_precalculate));
        const unsigned char R_2 = float_to_char((Y_2 + R_precalculate));
        const unsigned char R_3 = float_to_char((Y_3 + R_precalculate));
        const unsigned char R_4 = float_to_char((Y_4 + R_precalculate));
        const unsigned char G_1 = float_to_char((Y_1 + G_precalculate));
        const unsigned char G_2 = float_to_char((Y_2 + G_precalculate));
        const unsigned char G_3 = float_to_char((Y_3 + G_precalculate));
        const unsigned char G_4 = float_to_char((Y_4 + G_precalculate));
        const unsigned char B_1 = float_to_char((Y_1 + B_precalculate));
        const unsigned char B_2 = float_to_char((Y_2 + B_precalculate));
        const unsigned char B_3 = float_to_char((Y_3 + B_precalculate));
        const unsigned char B_4 = float_to_char((Y_4 + B_precalculate));

        *(bgra_addr + x) = (B_1 << 0) | (G_1 << 8) | (R_1 << 16) | (255 << 24);
        *(bgra_addr + x + 1) = (B_2 << 0) | (G_2 << 8) | (R_2 << 16) | (255 << 24);
        *(bgra_addr + x + width) = (B_3 << 0) | (G_3 << 8) | (R_3 << 16) | (255 << 24);
        *(bgra_addr + x + width + 1) = (B_4 << 0) | (G_4 << 8) | (R_4 << 16) | (255 << 24);
    }
}


/*
 * yuv_to_rgb_w32_line()
 *
 * processes two lines of yuv input, the width has to be a multiple of 32
 *
 * @param y_addr address of the y plane in local store
 * @param v_addr address of the v plane in local store
 * @param u_addr address of the u plane in local store
 * @param bgra_addr_ address of the bgra output buffer
 * @param width the width in pixels
 */
void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
    // each pixel is stored as an integer
    unsigned int* bgra_addr = (unsigned int*) bgra_addr_;

    unsigned int x;
    for(x = 0; x < width; x+=32) {
        // Step through the line 32 pixels at a time; each u and v value
        // applies to four pixels (two high, two wide)

        const vector unsigned char vchar_Y_1 = *((vector unsigned char*)(y_addr + x));
        const vector unsigned char vchar_Y_2 = *((vector unsigned char*)(y_addr + x + 16));
        const vector unsigned char vchar_Y_3 = *((vector unsigned char*)(y_addr + x + width));
        const vector unsigned char vchar_Y_4 = *((vector unsigned char*)(y_addr + x + width + 16));
        const vector unsigned char vchar_U = *((vector unsigned char*)(u_addr + (x >> 1)));
        const vector unsigned char vchar_V = *((vector unsigned char*)(v_addr + (x >> 1)));

        const vector float vfloat_U_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_first), 0), vec_minus_128);
        const vector float vfloat_U_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_second), 0), vec_minus_128);
        const vector float vfloat_U_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_third), 0), vec_minus_128);
        const vector float vfloat_U_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_fourth), 0), vec_minus_128);

        const vector float vfloat_V_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_first), 0), vec_minus_128);
        const vector float vfloat_V_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_second), 0), vec_minus_128);
        const vector float vfloat_V_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_third), 0), vec_minus_128);
        const vector float vfloat_V_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_fourth), 0), vec_minus_128);

        vector float Y_1 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_first), 0);
        vector float Y_2 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_second), 0);
        vector float Y_3 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_third), 0);
        vector float Y_4 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_fourth), 0);
        vector float Y_5 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_first), 0);
        vector float Y_6 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_second), 0);
        vector float Y_7 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_third), 0);
        vector float Y_8 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_fourth), 0);
        vector float Y_9 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_first), 0);
        vector float Y_10 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_second), 0);
        vector float Y_11 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_third), 0);
        vector float Y_12 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_fourth), 0);
        vector float Y_13 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_first), 0);
        vector float Y_14 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_second), 0);
        vector float Y_15 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_third), 0);
        vector float Y_16 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_fourth), 0);

        const vector float R1a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_1);
        const vector float R2a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_2);
        const vector float R3a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_3);
        const vector float R4a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_4);

        const vector float R1_precalculate = spu_shuffle(R1a_precalculate, R1a_precalculate, vec_select_floats_upper);
        const vector float R2_precalculate = spu_shuffle(R1a_precalculate, R1a_precalculate, vec_select_floats_lower);
        const vector float R3_precalculate = spu_shuffle(R2a_precalculate, R2a_precalculate, vec_select_floats_upper);
        const vector float R4_precalculate = spu_shuffle(R2a_precalculate, R2a_precalculate, vec_select_floats_lower);
        const vector float R5_precalculate = spu_shuffle(R3a_precalculate, R3a_precalculate, vec_select_floats_upper);
        const vector float R6_precalculate = spu_shuffle(R3a_precalculate, R3a_precalculate, vec_select_floats_lower);
        const vector float R7_precalculate = spu_shuffle(R4a_precalculate, R4a_precalculate, vec_select_floats_upper);
        const vector float R8_precalculate = spu_shuffle(R4a_precalculate, R4a_precalculate, vec_select_floats_lower);


        const vector float G1a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_1, spu_mul(vfloat_V_1, vec_Gv_precalc_coeff));
        const vector float G2a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_2, spu_mul(vfloat_V_2, vec_Gv_precalc_coeff));
        const vector float G3a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_3, spu_mul(vfloat_V_3, vec_Gv_precalc_coeff));
        const vector float G4a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_4, spu_mul(vfloat_V_4, vec_Gv_precalc_coeff));

        const vector float G1_precalculate = spu_shuffle(G1a_precalculate, G1a_precalculate, vec_select_floats_upper);
        const vector float G2_precalculate = spu_shuffle(G1a_precalculate, G1a_precalculate, vec_select_floats_lower);
        const vector float G3_precalculate = spu_shuffle(G2a_precalculate, G2a_precalculate, vec_select_floats_upper);
        const vector float G4_precalculate = spu_shuffle(G2a_precalculate, G2a_precalculate, vec_select_floats_lower);
        const vector float G5_precalculate = spu_shuffle(G3a_precalculate, G3a_precalculate, vec_select_floats_upper);
        const vector float G6_precalculate = spu_shuffle(G3a_precalculate, G3a_precalculate, vec_select_floats_lower);
        const vector float G7_precalculate = spu_shuffle(G4a_precalculate, G4a_precalculate, vec_select_floats_upper);
        const vector float G8_precalculate = spu_shuffle(G4a_precalculate, G4a_precalculate, vec_select_floats_lower);


        const vector float B1a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_1);
        const vector float B2a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_2);
        const vector float B3a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_3);
        const vector float B4a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_4);

        const vector float B1_precalculate = spu_shuffle(B1a_precalculate, B1a_precalculate, vec_select_floats_upper);
        const vector float B2_precalculate = spu_shuffle(B1a_precalculate, B1a_precalculate, vec_select_floats_lower);
        const vector float B3_precalculate = spu_shuffle(B2a_precalculate, B2a_precalculate, vec_select_floats_upper);
        const vector float B4_precalculate = spu_shuffle(B2a_precalculate, B2a_precalculate, vec_select_floats_lower);
        const vector float B5_precalculate = spu_shuffle(B3a_precalculate, B3a_precalculate, vec_select_floats_upper);
        const vector float B6_precalculate = spu_shuffle(B3a_precalculate, B3a_precalculate, vec_select_floats_lower);
        const vector float B7_precalculate = spu_shuffle(B4a_precalculate, B4a_precalculate, vec_select_floats_upper);
        const vector float B8_precalculate = spu_shuffle(B4a_precalculate, B4a_precalculate, vec_select_floats_lower);


        const vector unsigned int R_1 = vfloat_to_vuint(spu_add( Y_1, R1_precalculate));
        const vector unsigned int R_2 = vfloat_to_vuint(spu_add( Y_2, R2_precalculate));
        const vector unsigned int R_3 = vfloat_to_vuint(spu_add( Y_3, R3_precalculate));
        const vector unsigned int R_4 = vfloat_to_vuint(spu_add( Y_4, R4_precalculate));
        const vector unsigned int R_5 = vfloat_to_vuint(spu_add( Y_5, R5_precalculate));
        const vector unsigned int R_6 = vfloat_to_vuint(spu_add( Y_6, R6_precalculate));
        const vector unsigned int R_7 = vfloat_to_vuint(spu_add( Y_7, R7_precalculate));
        const vector unsigned int R_8 = vfloat_to_vuint(spu_add( Y_8, R8_precalculate));
        const vector unsigned int R_9 = vfloat_to_vuint(spu_add( Y_9, R1_precalculate));
        const vector unsigned int R_10 = vfloat_to_vuint(spu_add(Y_10, R2_precalculate));
        const vector unsigned int R_11 = vfloat_to_vuint(spu_add(Y_11, R3_precalculate));
        const vector unsigned int R_12 = vfloat_to_vuint(spu_add(Y_12, R4_precalculate));
        const vector unsigned int R_13 = vfloat_to_vuint(spu_add(Y_13, R5_precalculate));
        const vector unsigned int R_14 = vfloat_to_vuint(spu_add(Y_14, R6_precalculate));
        const vector unsigned int R_15 = vfloat_to_vuint(spu_add(Y_15, R7_precalculate));
        const vector unsigned int R_16 = vfloat_to_vuint(spu_add(Y_16, R8_precalculate));

        const vector unsigned int G_1 = vfloat_to_vuint(spu_add( Y_1, G1_precalculate));
        const vector unsigned int G_2 = vfloat_to_vuint(spu_add( Y_2, G2_precalculate));
        const vector unsigned int G_3 = vfloat_to_vuint(spu_add( Y_3, G3_precalculate));
        const vector unsigned int G_4 = vfloat_to_vuint(spu_add( Y_4, G4_precalculate));
        const vector unsigned int G_5 = vfloat_to_vuint(spu_add( Y_5, G5_precalculate));
        const vector unsigned int G_6 = vfloat_to_vuint(spu_add( Y_6, G6_precalculate));
        const vector unsigned int G_7 = vfloat_to_vuint(spu_add( Y_7, G7_precalculate));
        const vector unsigned int G_8 = vfloat_to_vuint(spu_add( Y_8, G8_precalculate));
        const vector unsigned int G_9 = vfloat_to_vuint(spu_add( Y_9, G1_precalculate));
        const vector unsigned int G_10 = vfloat_to_vuint(spu_add(Y_10, G2_precalculate));
        const vector unsigned int G_11 = vfloat_to_vuint(spu_add(Y_11, G3_precalculate));
        const vector unsigned int G_12 = vfloat_to_vuint(spu_add(Y_12, G4_precalculate));
        const vector unsigned int G_13 = vfloat_to_vuint(spu_add(Y_13, G5_precalculate));
        const vector unsigned int G_14 = vfloat_to_vuint(spu_add(Y_14, G6_precalculate));
        const vector unsigned int G_15 = vfloat_to_vuint(spu_add(Y_15, G7_precalculate));
        const vector unsigned int G_16 = vfloat_to_vuint(spu_add(Y_16, G8_precalculate));

        const vector unsigned int B_1 = vfloat_to_vuint(spu_add( Y_1, B1_precalculate));
        const vector unsigned int B_2 = vfloat_to_vuint(spu_add( Y_2, B2_precalculate));
        const vector unsigned int B_3 = vfloat_to_vuint(spu_add( Y_3, B3_precalculate));
        const vector unsigned int B_4 = vfloat_to_vuint(spu_add( Y_4, B4_precalculate));
        const vector unsigned int B_5 = vfloat_to_vuint(spu_add( Y_5, B5_precalculate));
        const vector unsigned int B_6 = vfloat_to_vuint(spu_add( Y_6, B6_precalculate));
        const vector unsigned int B_7 = vfloat_to_vuint(spu_add( Y_7, B7_precalculate));
        const vector unsigned int B_8 = vfloat_to_vuint(spu_add( Y_8, B8_precalculate));
        const vector unsigned int B_9 = vfloat_to_vuint(spu_add( Y_9, B1_precalculate));
        const vector unsigned int B_10 = vfloat_to_vuint(spu_add(Y_10, B2_precalculate));
        const vector unsigned int B_11 = vfloat_to_vuint(spu_add(Y_11, B3_precalculate));
        const vector unsigned int B_12 = vfloat_to_vuint(spu_add(Y_12, B4_precalculate));
        const vector unsigned int B_13 = vfloat_to_vuint(spu_add(Y_13, B5_precalculate));
        const vector unsigned int B_14 = vfloat_to_vuint(spu_add(Y_14, B6_precalculate));
        const vector unsigned int B_15 = vfloat_to_vuint(spu_add(Y_15, B7_precalculate));
        const vector unsigned int B_16 = vfloat_to_vuint(spu_add(Y_16, B8_precalculate));

        *((vector unsigned int*)(bgra_addr + x)) = spu_or(spu_or(vec_alpha, B_1), spu_or(spu_slqwbyte( R_1, 2), spu_slqwbyte(G_1, 1)));
        *((vector unsigned int*)(bgra_addr + x + 4)) = spu_or(spu_or(vec_alpha, B_2), spu_or(spu_slqwbyte( R_2, 2), spu_slqwbyte(G_2, 1)));
        *((vector unsigned int*)(bgra_addr + x + 8)) = spu_or(spu_or(vec_alpha, B_3), spu_or(spu_slqwbyte( R_3, 2), spu_slqwbyte(G_3, 1)));
        *((vector unsigned int*)(bgra_addr + x + 12)) = spu_or(spu_or(vec_alpha, B_4), spu_or(spu_slqwbyte( R_4, 2), spu_slqwbyte(G_4, 1)));
        *((vector unsigned int*)(bgra_addr + x + 16)) = spu_or(spu_or(vec_alpha, B_5), spu_or(spu_slqwbyte( R_5, 2), spu_slqwbyte(G_5, 1)));
        *((vector unsigned int*)(bgra_addr + x + 20)) = spu_or(spu_or(vec_alpha, B_6), spu_or(spu_slqwbyte( R_6, 2), spu_slqwbyte(G_6, 1)));
        *((vector unsigned int*)(bgra_addr + x + 24)) = spu_or(spu_or(vec_alpha, B_7), spu_or(spu_slqwbyte( R_7, 2), spu_slqwbyte(G_7, 1)));
        *((vector unsigned int*)(bgra_addr + x + 28)) = spu_or(spu_or(vec_alpha, B_8), spu_or(spu_slqwbyte( R_8, 2), spu_slqwbyte(G_8, 1)));
        *((vector unsigned int*)(bgra_addr + x + width)) = spu_or(spu_or(vec_alpha, B_9), spu_or(spu_slqwbyte( R_9, 2), spu_slqwbyte(G_9, 1)));
        *((vector unsigned int*)(bgra_addr + x + width + 4)) = spu_or(spu_or(vec_alpha, B_10), spu_or(spu_slqwbyte(R_10, 2), spu_slqwbyte(G_10, 1)));
        *((vector unsigned int*)(bgra_addr + x + width + 8)) = spu_or(spu_or(vec_alpha, B_11), spu_or(spu_slqwbyte(R_11, 2), spu_slqwbyte(G_11, 1)));
        *((vector unsigned int*)(bgra_addr + x + width + 12)) = spu_or(spu_or(vec_alpha, B_12), spu_or(spu_slqwbyte(R_12, 2), spu_slqwbyte(G_12, 1)));
        *((vector unsigned int*)(bgra_addr + x + width + 16)) = spu_or(spu_or(vec_alpha, B_13), spu_or(spu_slqwbyte(R_13, 2), spu_slqwbyte(G_13, 1)));
        *((vector unsigned int*)(bgra_addr + x + width + 20)) = spu_or(spu_or(vec_alpha, B_14), spu_or(spu_slqwbyte(R_14, 2), spu_slqwbyte(G_14, 1)));
        *((vector unsigned int*)(bgra_addr + x + width + 24)) = spu_or(spu_or(vec_alpha, B_15), spu_or(spu_slqwbyte(R_15, 2), spu_slqwbyte(G_15, 1)));
        *((vector unsigned int*)(bgra_addr + x + width + 28)) = spu_or(spu_or(vec_alpha, B_16), spu_or(spu_slqwbyte(R_16, 2), spu_slqwbyte(G_16, 1)));
    }
}
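/*
 * Usage sketch (not part of this file): a minimal, hypothetical PPU-side
 * driver loop using libspe2, assuming the SPE context `spu` is already
 * running this program in its own thread and `parms_ea` is the 128-byte
 * aligned effective address of a filled struct yuv2rgb_parms_t.
 *
 *   unsigned int msg;
 *
 *   // Wait for the converter to announce itself
 *   while (spe_out_mbox_status(spu) == 0)
 *       ;
 *   spe_out_mbox_read(spu, &msg, 1);              // expect SPU_READY
 *
 *   // Kick off one frame: SPU_START, then the parameter address
 *   msg = SPU_START;
 *   spe_in_mbox_write(spu, &msg, 1, SPE_MBOX_ALL_BLOCKING);
 *   msg = (unsigned int) parms_ea;
 *   spe_in_mbox_write(spu, &msg, 1, SPE_MBOX_ALL_BLOCKING);
 *
 *   // Wait for completion
 *   while (spe_out_mbox_status(spu) == 0)
 *       ;
 *   spe_out_mbox_read(spu, &msg, 1);              // expect SPU_FIN
 *
 *   // Shut the converter down when done
 *   msg = SPU_EXIT;
 *   spe_in_mbox_write(spu, &msg, 1, SPE_MBOX_ALL_BLOCKING);
 */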