1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 12 /**************************************************************************** 13 * 14 * Module Title : scaleopt.cpp 15 * 16 * Description : Optimized scaling functions 17 * 18 ****************************************************************************/ 19 #include "pragmas.h" 20 21 /**************************************************************************** 22 * Module Statics 23 ****************************************************************************/ 24 #if 0 25 __declspec(align(16)) const static unsigned short one_fifth[] = { 51, 51, 51, 51 }; 26 __declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 }; 27 __declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 }; 28 __declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 }; 29 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 }; 30 __declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1}; 31 __declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102, 51 }; 32 __declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 }; 33 __declspec(align(16)) const static unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0}; 34 __declspec(align(16)) const static unsigned short const35_2[] = { 154, 51, 205, 102 }; 35 __declspec(align(16)) const static unsigned short const35_1[] = { 102, 205, 51, 154 }; 36 #endif 37 38 #include "vpx_scale/vpxscale.h" 39 #include "vpx_mem/vpx_mem.h" 40 41 
/**************************************************************************** 42 * 43 * ROUTINE : horizontal_line_3_5_scale_mmx 44 * 45 * INPUTS : const unsigned char *source : 46 * unsigned int source_width : 47 * unsigned char *dest : 48 * unsigned int dest_width : 49 * 50 * OUTPUTS : None. 51 * 52 * RETURNS : void 53 * 54 * FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels. 55 * 56 * SPECIAL NOTES : None. 57 * 58 ****************************************************************************/ 59 static 60 void horizontal_line_3_5_scale_mmx 61 ( 62 const unsigned char *source, 63 unsigned int source_width, 64 unsigned char *dest, 65 unsigned int dest_width 66 ) 67 { 68 __declspec(align(16)) unsigned short const35_2[] = { 154, 51, 205, 102 }; 69 __declspec(align(16)) unsigned short const35_1[] = { 102, 205, 51, 154 }; 70 __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; 71 72 (void) dest_width; 73 74 __asm 75 { 76 77 push ebx 78 79 mov esi, source 80 mov edi, dest 81 82 mov ecx, source_width 83 lea edx, [esi+ecx-3]; 84 85 movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx 86 movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx 87 88 movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx 89 pxor mm7, mm7 // clear mm7 90 91 horiz_line_3_5_loop: 92 93 mov eax, DWORD PTR [esi] // eax = 00 01 02 03 94 mov ebx, eax 95 96 and ebx, 0xffff00 // ebx = xx 01 02 xx 97 mov ecx, eax // ecx = 00 01 02 03 98 99 and eax, 0xffff0000 // eax = xx xx 02 03 100 xor ecx, eax // ecx = 00 01 xx xx 101 102 shr ebx, 8 // ebx = 01 02 xx xx 103 or eax, ebx // eax = 01 02 02 03 104 105 shl ebx, 16 // ebx = xx xx 01 02 106 movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx 107 108 or ebx, ecx // ebx = 00 01 01 02 109 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx 110 111 movd mm0, ebx // mm0 = 00 01 01 02 112 pmullw mm1, mm6 // 113 114 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx 115 pmullw mm0, mm5 // 116 117 mov [edi], ebx // writeoutput 00 xx xx 
xx 118 add esi, 3 119 120 add edi, 5 121 paddw mm0, mm1 122 123 paddw mm0, mm4 124 psrlw mm0, 8 125 126 cmp esi, edx 127 packuswb mm0, mm7 128 129 movd DWORD Ptr [edi-4], mm0 130 jl horiz_line_3_5_loop 131 132 //Exit: 133 mov eax, DWORD PTR [esi] // eax = 00 01 02 03 134 mov ebx, eax 135 136 and ebx, 0xffff00 // ebx = xx 01 02 xx 137 mov ecx, eax // ecx = 00 01 02 03 138 139 and eax, 0xffff0000 // eax = xx xx 02 03 140 xor ecx, eax // ecx = 00 01 xx xx 141 142 shr ebx, 8 // ebx = 01 02 xx xx 143 or eax, ebx // eax = 01 02 02 03 144 145 shl eax, 8 // eax = xx 01 02 02 146 and eax, 0xffff0000 // eax = xx xx 02 02 147 148 or eax, ebx // eax = 01 02 02 02 149 150 shl ebx, 16 // ebx = xx xx 01 02 151 movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx 152 153 or ebx, ecx // ebx = 00 01 01 02 154 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx 155 156 movd mm0, ebx // mm0 = 00 01 01 02 157 pmullw mm1, mm6 // 158 159 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx 160 pmullw mm0, mm5 // 161 162 mov [edi], ebx // writeoutput 00 xx xx xx 163 paddw mm0, mm1 164 165 paddw mm0, mm4 166 psrlw mm0, 8 167 168 packuswb mm0, mm7 169 movd DWORD Ptr [edi+1], mm0 170 171 pop ebx 172 173 } 174 175 /* 176 const unsigned char *src = source; 177 unsigned char *des = dest; 178 unsigned int a, b, c ; 179 unsigned int i; 180 (void) dest_width; 181 182 for ( i=0; i<source_width-3; i+=3 ) 183 { 184 a = src[0]; 185 b = src[1]; 186 des [0] = (UINT8) (a); 187 // 2 * left + 3 * right /5 188 des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8); 189 c = src[2] ; 190 // 4 * left + 1 * right /5 191 des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8); 192 // 1 * left + 4 * right /5 193 des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8); 194 195 a = src[3]; 196 // 3 * left + 2 * right /5 197 des [4] = (UINT8) (( c * 154 + a * 102 + 128 ) >> 8); 198 199 src += 3; 200 des += 5; 201 } 202 203 a = src[0]; 204 b = src[1]; 205 des [0] = (UINT8) (a); 206 // 2 * left + 3 * right /5 207 des [1] = (UINT8) (( 
a * 102 + 154 * b + 128 ) >> 8); 208 c = src[2] ; 209 // 4 * left + 1 * right /5 210 des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8); 211 // 1 * left + 4 * right /5 212 des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8); 213 214 des [4] = (UINT8) (c); 215 */ 216 } 217 218 219 /**************************************************************************** 220 * 221 * ROUTINE : horizontal_line_4_5_scale_mmx 222 * 223 * INPUTS : const unsigned char *source : 224 * unsigned int source_width : 225 * unsigned char *dest : 226 * unsigned int dest_width : 227 * 228 * OUTPUTS : None. 229 * 230 * RETURNS : void 231 * 232 * FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels. 233 * 234 * SPECIAL NOTES : None. 235 * 236 ****************************************************************************/ 237 static 238 void horizontal_line_4_5_scale_mmx 239 ( 240 const unsigned char *source, 241 unsigned int source_width, 242 unsigned char *dest, 243 unsigned int dest_width 244 ) 245 { 246 __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; 247 __declspec(align(16)) unsigned short const45_2[] = {205, 154, 102, 51 }; 248 __declspec(align(16)) unsigned short const45_1[] = { 51, 102, 154, 205 }; 249 __declspec(align(16)) unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0}; 250 251 (void)dest_width; 252 253 __asm 254 { 255 256 mov esi, source 257 mov edi, dest 258 259 mov ecx, source_width 260 lea edx, [esi+ecx-8]; 261 262 movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx 263 movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx 264 265 movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx 266 pxor mm7, mm7 // clear mm7 267 268 horiz_line_4_5_loop: 269 270 movq mm0, QWORD PTR [esi] // mm0 = 00 01 02 03 04 05 06 07 271 movq mm1, QWORD PTR [esi+1]; // mm1 = 01 02 03 04 05 06 07 08 272 273 movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07 274 movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08 275 276 movd DWORD PTR [edi], mm0 // write output 00 xx xx xx 277 
punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx 278 279 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx 280 pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205 281 282 pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51 283 punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx 284 285 movd DWORD PTR [edi+5], mm2 // write ouput 05 xx xx xx 286 pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205 287 288 punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx 289 pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51 290 291 paddw mm0, mm1 // added round values 292 paddw mm0, mm4 293 294 psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx 295 packuswb mm0, mm7 296 297 movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04 298 add edi, 10 299 300 add esi, 8 301 paddw mm2, mm3 // 302 303 paddw mm2, mm4 // added round values 304 cmp esi, edx 305 306 psrlw mm2, 8 307 packuswb mm2, mm7 308 309 movd DWORD PTR [edi-4], mm2 // writeoutput 06 07 08 09 310 jl horiz_line_4_5_loop 311 312 //Exit: 313 movq mm0, [esi] // mm0 = 00 01 02 03 04 05 06 07 314 movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07 315 316 movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07 317 psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00 318 319 movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00 320 pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00 321 322 psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07 323 por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07 324 325 movq mm3, mm1 326 327 movd DWORD PTR [edi], mm0 // write output 00 xx xx xx 328 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx 329 330 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx 331 pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205 332 333 pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51 334 punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx 335 336 movd DWORD PTR [edi+5], mm2 // write ouput 05 xx xx xx 337 pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205 338 339 punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx 340 pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51 341 342 paddw 
mm0, mm1 // added round values 343 paddw mm0, mm4 344 345 psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx 346 packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx 347 348 movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04 349 paddw mm2, mm3 // 350 351 paddw mm2, mm4 // added round values 352 psrlw mm2, 8 353 354 packuswb mm2, mm7 355 movd DWORD PTR [edi+6], mm2 // writeoutput 06 07 08 09 356 357 358 } 359 /* 360 const unsigned char *src = source; 361 unsigned char *des = dest; 362 unsigned int a, b, c ; 363 unsigned i; 364 (void) dest_width; 365 366 for ( i=0; i<source_width-4; i+=4 ) 367 { 368 a = src[0]; 369 b = src[1]; 370 des [0] = (UINT8) a; 371 des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8); 372 c = src[2] * 154; 373 a = src[3]; 374 des [2] = (UINT8) (( b * 102 + c + 128) >> 8); 375 des [3] = (UINT8) (( c + 102 * a + 128) >> 8); 376 b = src[4]; 377 des [4] = (UINT8) (( a * 205 + 51 * b + 128) >> 8); 378 379 src += 4; 380 des += 5; 381 } 382 383 a = src[0]; 384 b = src[1]; 385 des [0] = (UINT8) (a); 386 des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8); 387 c = src[2] * 154; 388 a = src[3]; 389 des [2] = (UINT8) (( b * 102 + c + 128) >> 8); 390 des [3] = (UINT8) (( c + 102 * a + 128) >> 8); 391 des [4] = (UINT8) (a); 392 */ 393 } 394 395 /**************************************************************************** 396 * 397 * ROUTINE : vertical_band_4_5_scale_mmx 398 * 399 * INPUTS : unsigned char *dest : 400 * unsigned int dest_pitch : 401 * unsigned int dest_width : 402 * 403 * OUTPUTS : None. 404 * 405 * RETURNS : void 406 * 407 * FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels. 408 * 409 * SPECIAL NOTES : The routine uses the first line of the band below 410 * the current band. The function also has a "C" only 411 * version. 
*
 ****************************************************************************/
/*
 * In-place vertical 4 -> 5 interpolation of one band, eight columns per loop
 * pass. With a, b, c, d the rows at dest + 0..3 * dest_pitch and an the row
 * at dest + 5 * dest_pitch (first row of the next band):
 *     row 1 = ( 51*a + 205*b + 128) >> 8
 *     row 2 = (102*b + 154*c + 128) >> 8
 *     row 3 = (154*c + 102*d + 128) >> 8
 *     row 4 = (205*d +  51*an + 128) >> 8
 * Row 0 is left as it is. The loop body runs before the width test, so at
 * least eight columns are always processed, and whole 8-byte groups are
 * read/written even when dest_width is not a multiple of 8.
 * MMX registers are left dirty (no EMMS is executed here).
 */
static
void vertical_band_4_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{

    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };

    __asm
    {

        mov         esi, dest                   // esi = row 0 (read and written in place)
        mov         ecx, dest_pitch             // Get the pitch size

        lea         edi, [esi+ecx*2]            // two lines below
        add         edi, ecx                    // three lines below

        pxor        mm7, mm7                    // clear out mm7
        mov         edx, dest_width             // Loop counter (columns left)

        vs_4_5_loop:

        movq        mm0, QWORD ptr [esi]        // a = row 0
        movq        mm1, QWORD ptr [esi+ecx]    // b = row 1

        movq        mm2, mm0                    // Make a copy
        punpcklbw   mm0, mm7                    // unpack low to word

        movq        mm5, one_fifth
        punpckhbw   mm2, mm7                    // unpack high to word

        pmullw      mm0, mm5                    // a * 1/5

        movq        mm3, mm1                    // make a copy
        punpcklbw   mm1, mm7                    // unpack low to word

        pmullw      mm2, mm5                    // a * 1/5
        movq        mm6, four_fifths            // constant 4/5

        movq        mm4, mm1                    // copy of low b
        pmullw      mm4, mm6                    // b * 4/5

        punpckhbw   mm3, mm7                    // unpack high to word
        movq        mm5, mm3                    // copy of high b

        pmullw      mm5, mm6                    // b * 4/5
        paddw       mm0, mm4                    // a * 1/5 + b * 4/5

        paddw       mm2, mm5                    // a * 1/5 + b * 4/5
        paddw       mm0, round_values           // + 128

        paddw       mm2, round_values           // + 128
        psrlw       mm0, 8

        psrlw       mm2, 8
        packuswb    mm0, mm2                    // des [1]

        movq        QWORD ptr [esi+ecx], mm0    // write des[1]
        movq        mm0, [esi+ecx*2]            // mm0 = src[2]

        // mm1, mm3 --- Src[1] (b, unpacked low/high)
        // mm0 --- Src[2]
        // mm7 for unpacking

        movq        mm5, two_fifths
        movq        mm2, mm0                    // make a copy

        pmullw      mm1, mm5                    // b * 2/5
        movq        mm6, three_fifths


        punpcklbw   mm0, mm7                    // unpack low to word
        pmullw      mm3, mm5                    // b * 2/5

        movq        mm4, mm0                    // make copy of c
        punpckhbw   mm2, mm7                    // unpack high to word

        pmullw      mm4, mm6                    // c * 3/5
        movq        mm5, mm2

        pmullw      mm5, mm6                    // c * 3/5
        paddw       mm1, mm4                    // b * 2/5 + c * 3/5

        paddw       mm3, mm5                    // b * 2/5 + c * 3/5
        paddw       mm1, round_values           // + 128

        paddw       mm3, round_values           // + 128
        psrlw       mm1, 8

        psrlw       mm3, 8
        packuswb    mm1, mm3                    // des[2]

        movq        QWORD ptr [esi+ecx*2], mm1  // write des[2]
        movq        mm1, [edi]                  // mm1=Src[3];

        // mm0, mm2 --- Src[2] (c, unpacked low/high)
        // mm1 --- Src[3]
        // mm6 --- 3/5
        // mm7 for unpacking

        pmullw      mm0, mm6                    // c * 3/5
        movq        mm5, two_fifths             // mm5 = 2/5

        movq        mm3, mm1                    // make a copy
        pmullw      mm2, mm6                    // c * 3/5

        punpcklbw   mm1, mm7                    // unpack low
        movq        mm4, mm1                    // make a copy

        punpckhbw   mm3, mm7                    // unpack high
        pmullw      mm4, mm5                    // d * 2/5

        movq        mm6, mm3                    // make a copy
        pmullw      mm6, mm5                    // d * 2/5

        paddw       mm0, mm4                    // c * 3/5 + d * 2/5
        paddw       mm2, mm6                    // c * 3/5 + d * 2/5

        paddw       mm0, round_values           // + 128
        paddw       mm2, round_values           // + 128

        psrlw       mm0, 8
        psrlw       mm2, 8

        packuswb    mm0, mm2                    // des[3]
        movq        QWORD ptr [edi], mm0        // write des[3]

        //  mm1, mm3 --- Src[3] (d, unpacked low/high)
        //  mm7 -- cleared for unpacking

        movq        mm0, [edi+ecx*2]            // mm0, Src[0] of the next group

        movq        mm5, four_fifths            // mm5 = 4/5
        pmullw      mm1, mm5                    // d * 4/5

        movq        mm6, one_fifth              // mm6 = 1/5
        movq        mm2, mm0                    // make a copy

        pmullw      mm3, mm5                    // d * 4/5
        punpcklbw   mm0, mm7                    // unpack low

        pmullw      mm0, mm6                    // an * 1/5
        punpckhbw   mm2, mm7                    // unpack high

        paddw       mm1, mm0                    // d * 4/5 + an * 1/5
        pmullw      mm2, mm6                    // an * 1/5

        paddw       mm3, mm2                    // d * 4/5 + an * 1/5
        paddw       mm1, round_values           // + 128

        paddw       mm3, round_values           // + 128
        psrlw       mm1, 8

        psrlw       mm3, 8
        packuswb    mm1, mm3                    // des[4]

        movq        QWORD ptr [edi+ecx], mm1    // write des[4]

        add         edi, 8                      // next eight columns
        add         esi, 8

        sub         edx, 8
        jg          vs_4_5_loop
    }
}

/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_4_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : None
 *
 *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
 *
 *  SPECIAL NOTES : Unlike vertical_band_4_5_scale_mmx, no row below the band
 *                  is read: output row 4 is a plain copy of source row 3.
 *                  The function also has a "C" only version.
 *
 ****************************************************************************/
/*
 * In-place, eight columns per loop pass. With a, b, c, d the rows at
 * dest + 0..3 * dest_pitch:
 *     row 1 = ( 51*a + 205*b + 128) >> 8
 *     row 2 = (102*b + 154*c + 128) >> 8
 *     row 3 = (154*c + 102*d + 128) >> 8
 *     row 4 = d
 * At least eight columns are always processed (the width test follows the
 * loop body). MMX registers are left dirty (no EMMS is executed here).
 */
static
void last_vertical_band_4_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };

    __asm
    {
        mov         esi, dest                   // esi = row 0 (read and written in place)
        mov         ecx, dest_pitch             // Get the pitch size

        lea         edi, [esi+ecx*2]            // two lines below
        add         edi, ecx                    // three lines below

        pxor        mm7, mm7                    // clear out mm7
        mov         edx, dest_width             // Loop counter (columns left)

        last_vs_4_5_loop:

        movq        mm0, QWORD ptr [esi]        // a = row 0
        movq        mm1, QWORD ptr [esi+ecx]    // b = row 1

        movq        mm2, mm0                    // Make a copy
        punpcklbw   mm0, mm7                    // unpack low to word

        movq        mm5, one_fifth
        punpckhbw   mm2, mm7                    // unpack high to word

        pmullw      mm0, mm5                    // a * 1/5

        movq        mm3, mm1                    // make a copy
        punpcklbw   mm1, mm7                    // unpack low to word

        pmullw      mm2, mm5                    // a * 1/5
        movq        mm6, four_fifths            // constant 4/5

        movq        mm4, mm1                    // copy of low b
        pmullw      mm4, mm6                    // b * 4/5

        punpckhbw   mm3, mm7                    // unpack high to word
        movq        mm5, mm3                    // copy of high b

        pmullw      mm5, mm6                    // b * 4/5
        paddw       mm0, mm4                    // a * 1/5 + b * 4/5

        paddw       mm2, mm5                    // a * 1/5 + b * 4/5
        paddw       mm0, round_values           // + 128

        paddw       mm2, round_values           // + 128
        psrlw       mm0, 8

        psrlw       mm2, 8
        packuswb    mm0, mm2                    // des [1]

        movq        QWORD ptr [esi+ecx], mm0    // write des[1]
        movq        mm0, [esi+ecx*2]            // mm0 = src[2]

        // mm1, mm3 --- Src[1] (b, unpacked low/high)
        // mm0 --- Src[2]
        // mm7 for unpacking

        movq        mm5, two_fifths
        movq        mm2, mm0                    // make a copy

        pmullw      mm1, mm5                    // b * 2/5
        movq        mm6, three_fifths


        punpcklbw   mm0, mm7                    // unpack low to word
        pmullw      mm3, mm5                    // b * 2/5

        movq        mm4, mm0                    // make copy of c
        punpckhbw   mm2, mm7                    // unpack high to word

        pmullw      mm4, mm6                    // c * 3/5
        movq        mm5, mm2

        pmullw      mm5, mm6                    // c * 3/5
        paddw       mm1, mm4                    // b * 2/5 + c * 3/5

        paddw       mm3, mm5                    // b * 2/5 + c * 3/5
        paddw       mm1, round_values           // + 128

        paddw       mm3, round_values           // + 128
        psrlw       mm1, 8

        psrlw       mm3, 8
        packuswb    mm1, mm3                    // des[2]

        movq        QWORD ptr [esi+ecx*2], mm1  // write des[2]
        movq        mm1, [edi]                  // mm1=Src[3];

        movq        QWORD ptr [edi+ecx], mm1    // write des[4] (copy of d, taken before row 3 is overwritten)

        // mm0, mm2 --- Src[2] (c, unpacked low/high)
        // mm1 --- Src[3]
        // mm6 --- 3/5
        // mm7 for unpacking

        pmullw      mm0, mm6                    // c * 3/5
        movq        mm5, two_fifths             // mm5 = 2/5

        movq        mm3, mm1                    // make a copy
        pmullw      mm2, mm6                    // c * 3/5

        punpcklbw   mm1, mm7                    // unpack low
        movq        mm4, mm1                    // make a copy

        punpckhbw   mm3, mm7                    // unpack high
        pmullw      mm4, mm5                    // d * 2/5

        movq        mm6, mm3                    // make a copy
        pmullw      mm6, mm5                    // d * 2/5

        paddw       mm0, mm4                    // c * 3/5 + d * 2/5
        paddw       mm2, mm6                    // c * 3/5 + d * 2/5

        paddw       mm0, round_values           // + 128
        paddw       mm2, round_values           // + 128

        psrlw       mm0, 8
        psrlw       mm2, 8

        packuswb    mm0, mm2                    // des[3]
        movq        QWORD ptr [edi], mm0        // write des[3]

        //  mm1, mm3 --- Src[3]
        //  mm7 -- cleared for unpacking
        add         edi, 8                      // next eight columns
        add         esi, 8

        sub         edx, 8
        jg          last_vs_4_5_loop
    }
}

/****************************************************************************
 *
 *  ROUTINE       : vertical_band_3_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has a "C" only
 *                  version.
 *
 ****************************************************************************/
/*
 * In-place, eight columns per loop pass. With a, b, c the rows at
 * dest + 0..2 * dest_pitch and an the row at dest + 5 * dest_pitch (first
 * row of the next band):
 *     row 1 = (102*a + 154*b + 128) >> 8
 *     row 2 = (205*b +  51*c + 128) >> 8
 *     row 3 = ( 51*b + 205*c + 128) >> 8
 *     row 4 = (154*c + 102*an + 128) >> 8
 * mm7 doubles as a scratch register in the middle of the loop and is zeroed
 * again before it is next needed for unpacking. At least eight columns are
 * always processed (the width test follows the loop body).
 * MMX registers are left dirty (no EMMS is executed here).
 */
static
void vertical_band_3_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };

    __asm
    {
        mov         esi, dest                   // esi = row 0 (read and written in place)
        mov         ecx, dest_pitch             // Get the pitch size

        lea         edi, [esi+ecx*2]            // two lines below
        add         edi, ecx                    // three lines below

        pxor        mm7, mm7                    // clear out mm7
        mov         edx, dest_width             // Loop counter (columns left)

        vs_3_5_loop:

        movq        mm0, QWORD ptr [esi]        // a = row 0
        movq        mm1, QWORD ptr [esi+ecx]    // b = row 1

        movq        mm2, mm0                    // Make a copy
        punpcklbw   mm0, mm7                    // unpack low to word

        movq        mm5, two_fifths             // mm5 = 2/5
        punpckhbw   mm2, mm7                    // unpack high to word

        pmullw      mm0, mm5                    // a * 2/5

        movq        mm3, mm1                    // make a copy
        punpcklbw   mm1, mm7                    // unpack low to word

        pmullw      mm2, mm5                    // a * 2/5
        movq        mm6, three_fifths           // mm6 = 3/5

        movq        mm4, mm1                    // copy of low b
        pmullw      mm4, mm6                    // b * 3/5

        punpckhbw   mm3, mm7                    // unpack high to word
        movq        mm5, mm3                    // copy of high b

        pmullw      mm5, mm6                    // b * 3/5
        paddw       mm0, mm4                    // a * 2/5 + b * 3/5

        paddw       mm2, mm5                    // a * 2/5 + b * 3/5
        paddw       mm0, round_values           // + 128

        paddw       mm2, round_values           // + 128
        psrlw       mm0, 8

        psrlw       mm2, 8
        packuswb    mm0, mm2                    // des [1]

        movq        QWORD ptr [esi+ecx], mm0    // write des[1]
        movq        mm0, [esi+ecx*2]            // mm0 = src[2]

        // mm1, mm3 --- Src[1] (b, unpacked low/high)
        // mm0 --- Src[2]
        // mm7 for unpacking

        movq        mm4, mm1                    // b low
        pmullw      mm1, four_fifths            // b * 4/5 low

        movq        mm5, mm3                    // b high
        pmullw      mm3, four_fifths            // b * 4/5 high

        movq        mm2, mm0                    // c
        pmullw      mm4, one_fifth              // b * 1/5

        punpcklbw   mm0, mm7                    // c low
        pmullw      mm5, one_fifth              // b * 1/5

        movq        mm6, mm0                    // make copy of c low
        punpckhbw   mm2, mm7                    // c high (last use of mm7 as zero in this phase)

        pmullw      mm6, one_fifth              // c * 1/5 low
        movq        mm7, mm2                    // make copy of c high (mm7 is scratch from here)

        pmullw      mm7, one_fifth              // c * 1/5 high
        paddw       mm1, mm6                    // b * 4/5 + c * 1/5 low

        paddw       mm3, mm7                    // b * 4/5 + c * 1/5 high
        movq        mm6, mm0                    // make copy of c low

        pmullw      mm6, four_fifths            // c * 4/5 low
        movq        mm7, mm2                    // make copy of c high

        pmullw      mm7, four_fifths            // c * 4/5 high

        paddw       mm4, mm6                    // b * 1/5 + c * 4/5 low
        paddw       mm5, mm7                    // b * 1/5 + c * 4/5 high

        paddw       mm1, round_values           // + 128
        paddw       mm3, round_values           // + 128

        psrlw       mm1, 8
        psrlw       mm3, 8

        packuswb    mm1, mm3                    // des[2]
        movq        QWORD ptr [esi+ecx*2], mm1  // write des[2]

        paddw       mm4, round_values           // + 128
        paddw       mm5, round_values           // + 128

        psrlw       mm4, 8
        psrlw       mm5, 8

        packuswb    mm4, mm5                    // des[3]
        movq        QWORD ptr [edi], mm4        // write des[3]

        //  mm0, mm2 --- Src[2] (c, unpacked low/high)

        pxor        mm7, mm7                    // clear mm7 for unpacking
        movq        mm1, [edi+ecx*2]            // mm1 = Src[0] of the next group

        movq        mm5, three_fifths           // mm5 = 3/5
        pmullw      mm0, mm5                    // c * 3/5

        movq        mm6, two_fifths             // mm6 = 2/5
        movq        mm3, mm1                    // make a copy

        pmullw      mm2, mm5                    // c * 3/5
        punpcklbw   mm1, mm7                    // unpack low

        pmullw      mm1, mm6                    // an * 2/5
        punpckhbw   mm3, mm7                    // unpack high

        paddw       mm0, mm1                    // c * 3/5 + an * 2/5
        pmullw      mm3, mm6                    // an * 2/5

        paddw       mm2, mm3                    // c * 3/5 + an * 2/5
        paddw       mm0, round_values           // + 128

        paddw       mm2, round_values           // + 128
        psrlw       mm0, 8

        psrlw       mm2, 8
        packuswb    mm0, mm2                    // des[4]

        movq        QWORD ptr [edi+ecx], mm0    // write des[4]

        add         edi, 8                      // next eight columns
        add         esi, 8

        sub         edx, 8
        jg          vs_3_5_loop
    }
}

/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_3_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 3 to 5 up-scaling of the last 3-pixel high band in an
 *                  image.
 *
 *  SPECIAL NOTES : Unlike vertical_band_3_5_scale_mmx, no row below the
 *                  band is read; the last output row is a copy of the last
 *                  source row. The function also has a "C" only
 *                  version.
944 * 945 ****************************************************************************/ 946 static 947 void last_vertical_band_3_5_scale_mmx 948 ( 949 unsigned char *dest, 950 unsigned int dest_pitch, 951 unsigned int dest_width 952 ) 953 { 954 __declspec(align(16)) unsigned short one_fifth[] = { 51, 51, 51, 51 }; 955 __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 }; 956 __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 }; 957 __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 }; 958 __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; 959 __asm 960 { 961 mov esi, dest // Get the source and destination pointer 962 mov ecx, dest_pitch // Get the pitch size 963 964 lea edi, [esi+ecx*2] // tow lines below 965 add edi, ecx // three lines below 966 967 pxor mm7, mm7 // clear out mm7 968 mov edx, dest_width // Loop counter 969 970 971 last_vs_3_5_loop: 972 973 movq mm0, QWORD ptr [esi] // src[0]; 974 movq mm1, QWORD ptr [esi+ecx] // src[1]; 975 976 movq mm2, mm0 // Make a copy 977 punpcklbw mm0, mm7 // unpack low to word 978 979 movq mm5, two_fifths // mm5 = 2/5 980 punpckhbw mm2, mm7 // unpack high to word 981 982 pmullw mm0, mm5 // a * 2/5 983 984 movq mm3, mm1 // make a copy 985 punpcklbw mm1, mm7 // unpack low to word 986 987 pmullw mm2, mm5 // a * 2/5 988 movq mm6, three_fifths // mm6 = 3/5 989 990 movq mm4, mm1 // copy of low b 991 pmullw mm4, mm6 // b * 3/5 992 993 punpckhbw mm3, mm7 // unpack high to word 994 movq mm5, mm3 // copy of high b 995 996 pmullw mm5, mm6 // b * 3/5 997 paddw mm0, mm4 // a * 2/5 + b * 3/5 998 999 paddw mm2, mm5 // a * 2/5 + b * 3/5 1000 paddw mm0, round_values // + 128 1001 1002 paddw mm2, round_values // + 128 1003 psrlw mm0, 8 1004 1005 psrlw mm2, 8 1006 packuswb mm0, mm2 // des [1] 1007 1008 movq QWORD ptr [esi+ecx], mm0 // write des[1] 1009 movq mm0, [esi+ecx*2] // mm0 = src[2] 1010 1011 1012 1013 // mm1, mm3 --- Src[1] 1014 // mm0 --- 
Src[2] 1015 // mm7 for unpacking 1016 1017 movq mm4, mm1 // b low 1018 pmullw mm1, four_fifths // b * 4/5 low 1019 1020 movq QWORD ptr [edi+ecx], mm0 // write des[4] 1021 1022 movq mm5, mm3 // b high 1023 pmullw mm3, four_fifths // b * 4/5 high 1024 1025 movq mm2, mm0 // c 1026 pmullw mm4, one_fifth // b * 1/5 1027 1028 punpcklbw mm0, mm7 // c low 1029 pmullw mm5, one_fifth // b * 1/5 1030 1031 movq mm6, mm0 // make copy of c low 1032 punpckhbw mm2, mm7 // c high 1033 1034 pmullw mm6, one_fifth // c * 1/5 low 1035 movq mm7, mm2 // make copy of c high 1036 1037 pmullw mm7, one_fifth // c * 1/5 high 1038 paddw mm1, mm6 // b * 4/5 + c * 1/5 low 1039 1040 paddw mm3, mm7 // b * 4/5 + c * 1/5 high 1041 movq mm6, mm0 // make copy of c low 1042 1043 pmullw mm6, four_fifths // c * 4/5 low 1044 movq mm7, mm2 // make copy of c high 1045 1046 pmullw mm7, four_fifths // c * 4/5 high 1047 1048 paddw mm4, mm6 // b * 1/5 + c * 4/5 low 1049 paddw mm5, mm7 // b * 1/5 + c * 4/5 high 1050 1051 paddw mm1, round_values // + 128 1052 paddw mm3, round_values // + 128 1053 1054 psrlw mm1, 8 1055 psrlw mm3, 8 1056 1057 packuswb mm1, mm3 // des[2] 1058 movq QWORD ptr [esi+ecx*2], mm1 // write des[2] 1059 1060 paddw mm4, round_values // + 128 1061 paddw mm5, round_values // + 128 1062 1063 psrlw mm4, 8 1064 psrlw mm5, 8 1065 1066 packuswb mm4, mm5 // des[3] 1067 movq QWORD ptr [edi], mm4 // write des[3] 1068 1069 // mm0, mm2 --- Src[3] 1070 1071 add edi, 8 1072 add esi, 8 1073 1074 sub edx, 8 1075 jg last_vs_3_5_loop 1076 } 1077 } 1078 1079 /**************************************************************************** 1080 * 1081 * ROUTINE : vertical_band_1_2_scale_mmx 1082 * 1083 * INPUTS : unsigned char *dest : 1084 * unsigned int dest_pitch : 1085 * unsigned int dest_width : 1086 * 1087 * OUTPUTS : None. 1088 * 1089 * RETURNS : void 1090 * 1091 * FUNCTION : 1 to 2 up-scaling of a band of pixels. 
*
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has a "C" only
 *                  version.
 *
 ****************************************************************************/
/*
 * Fills the row at dest + dest_pitch with the rounded average of the rows
 * at dest and dest + 2 * dest_pitch: (a + b + 1) >> 1 per pixel. Eight
 * columns are handled per loop pass; the width test follows the loop body,
 * so at least eight columns are always processed.
 * MMX registers are left dirty (no EMMS is executed here).
 */
static
void vertical_band_1_2_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __declspec(align(16))unsigned short four_ones[] = { 1, 1, 1, 1};

    __asm
    {

        mov         esi, dest                   // esi = upper source row
        mov         ecx, dest_pitch             // Get the pitch size

        pxor        mm7, mm7                    // clear out mm7
        mov         edx, dest_width             // Loop counter (columns left)

        vs_1_2_loop:

        movq        mm0, [esi]                  // get Src[0]
        movq        mm1, [esi + ecx * 2]        // get Src[1] (two rows down)

        movq        mm2, mm0                    // make copy before unpack
        movq        mm3, mm1                    // make copy before unpack

        punpcklbw   mm0, mm7                    // low Src[0]
        movq        mm6, four_ones              // mm6= 1, 1, 1, 1

        punpcklbw   mm1, mm7                    // low Src[1]
        paddw       mm0, mm1                    // low (a + b)

        punpckhbw   mm2, mm7                    // high Src[0]
        paddw       mm0, mm6                    // low (a + b + 1)

        punpckhbw   mm3, mm7                    // high Src[1]
        paddw       mm2, mm3                    // high (a + b )

        psraw       mm0, 1                      // low (a + b +1 )/2
        paddw       mm2, mm6                    // high (a + b + 1)

        psraw       mm2, 1                      // high (a + b + 1)/2
        packuswb    mm0, mm2                    // pack results

        movq        [esi+ecx], mm0              // write out eight bytes to the row in between
        add         esi, 8

        sub         edx, 8
        jg          vs_1_2_loop
    }

}

/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_1_2_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 1 to 2 up-scaling of the last band of pixels.
 *
 *  SPECIAL NOTES : Only the band's own row is read; it is duplicated into
 *                  the row directly below it.
The function also has a "C" only
 *                  version.
 *
 ****************************************************************************/
/*
 * Copies the row at dest into the row at dest + dest_pitch, eight bytes per
 * loop pass. The width test follows the loop body, so at least eight bytes
 * are always copied, and whole 8-byte groups are copied even when
 * dest_width is not a multiple of 8.
 * MMX register mm0 is left dirty (no EMMS is executed here).
 */
static
void last_vertical_band_1_2_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {
        mov         esi, dest                   // esi = source row
        mov         ecx, dest_pitch             // Get the pitch size

        mov         edx, dest_width             // Loop counter (columns left)

        last_vs_1_2_loop:

        movq        mm0, [esi]                  // get Src[0]
        movq        [esi+ecx], mm0              // write out eight bytes one row down

        add         esi, 8
        sub         edx, 8                      // sets the flags tested by jg

        jg          last_vs_1_2_loop
    }
}

/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_1_2_scale_mmx
 *
 *  INPUTS        : const unsigned char *source :
 *                  unsigned int source_width   :
 *                  unsigned char *dest         :
 *                  unsigned int dest_width     :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
 *
 *  SPECIAL NOTES : None.
1214 * 1215 ****************************************************************************/ 1216 static 1217 void horizontal_line_1_2_scale_mmx 1218 ( 1219 const unsigned char *source, 1220 unsigned int source_width, 1221 unsigned char *dest, 1222 unsigned int dest_width 1223 ) 1224 { 1225 __declspec(align(16))unsigned short four_ones[] = { 1, 1, 1, 1}; 1226 1227 (void) dest_width; 1228 1229 __asm 1230 { 1231 mov esi, source 1232 mov edi, dest 1233 1234 pxor mm7, mm7 1235 movq mm6, four_ones 1236 1237 mov ecx, source_width 1238 1239 hs_1_2_loop: 1240 1241 movq mm0, [esi] 1242 movq mm1, [esi+1] 1243 1244 movq mm2, mm0 1245 movq mm3, mm1 1246 1247 movq mm4, mm0 1248 punpcklbw mm0, mm7 1249 1250 punpcklbw mm1, mm7 1251 paddw mm0, mm1 1252 1253 paddw mm0, mm6 1254 punpckhbw mm2, mm7 1255 1256 punpckhbw mm3, mm7 1257 paddw mm2, mm3 1258 1259 paddw mm2, mm6 1260 psraw mm0, 1 1261 1262 psraw mm2, 1 1263 packuswb mm0, mm2 1264 1265 movq mm2, mm4 1266 punpcklbw mm2, mm0 1267 1268 movq [edi], mm2 1269 punpckhbw mm4, mm0 1270 1271 movq [edi+8], mm4 1272 add esi, 8 1273 1274 add edi, 16 1275 sub ecx, 8 1276 1277 cmp ecx, 8 1278 jg hs_1_2_loop 1279 1280 // last eight pixel 1281 1282 movq mm0, [esi] 1283 movq mm1, mm0 1284 1285 movq mm2, mm0 1286 movq mm3, mm1 1287 1288 psrlq mm1, 8 1289 psrlq mm3, 56 1290 1291 psllq mm3, 56 1292 por mm1, mm3 1293 1294 movq mm3, mm1 1295 movq mm4, mm0 1296 1297 punpcklbw mm0, mm7 1298 punpcklbw mm1, mm7 1299 1300 paddw mm0, mm1 1301 paddw mm0, mm6 1302 1303 punpckhbw mm2, mm7 1304 punpckhbw mm3, mm7 1305 1306 paddw mm2, mm3 1307 paddw mm2, mm6 1308 1309 psraw mm0, 1 1310 psraw mm2, 1 1311 1312 packuswb mm0, mm2 1313 movq mm2, mm4 1314 1315 punpcklbw mm2, mm0 1316 movq [edi], mm2 1317 1318 punpckhbw mm4, mm0 1319 movq [edi+8], mm4 1320 } 1321 } 1322 1323 1324 1325 1326 1327 1328 /**************************************************************************** 1329 * 1330 * ROUTINE : horizontal_line_5_4_scale_mmx 1331 * 1332 * INPUTS : const unsigned 
char *source : Pointer to source data. 1333 * unsigned int source_width : Stride of source. 1334 * unsigned char *dest : Pointer to destination data. 1335 * unsigned int dest_width : Stride of destination (NOT USED). 1336 * 1337 * OUTPUTS : None. 1338 * 1339 * RETURNS : void 1340 * 1341 * FUNCTION : Copies horizontal line of pixels from source to 1342 * destination scaling up by 4 to 5. 1343 * 1344 * SPECIAL NOTES : None. 1345 * 1346 ****************************************************************************/ 1347 static 1348 void horizontal_line_5_4_scale_mmx 1349 ( 1350 const unsigned char *source, 1351 unsigned int source_width, 1352 unsigned char *dest, 1353 unsigned int dest_width 1354 ) 1355 { 1356 1357 __declspec(align(16)) const unsigned short const54_2[] = { 0, 64, 128, 192 }; 1358 __declspec(align(16)) const unsigned short const54_1[] = {256, 192, 128, 64 }; 1359 __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; 1360 /* 1361 unsigned i; 1362 unsigned int a, b, c, d, e; 1363 unsigned char *des = dest; 1364 const unsigned char *src = source; 1365 1366 (void) dest_width; 1367 1368 for ( i=0; i<source_width; i+=5 ) 1369 { 1370 a = src[0]; 1371 b = src[1]; 1372 c = src[2]; 1373 d = src[3]; 1374 e = src[4]; 1375 1376 des[0] = a; 1377 des[1] = ((b*192 + c* 64 + 128)>>8); 1378 des[2] = ((c*128 + d*128 + 128)>>8); 1379 des[3] = ((d* 64 + e*192 + 128)>>8); 1380 1381 src += 5; 1382 des += 4; 1383 } 1384 */ 1385 __asm 1386 { 1387 1388 mov esi, source ; 1389 mov edi, dest ; 1390 1391 mov ecx, source_width ; 1392 movq mm5, const54_1 ; 1393 1394 pxor mm7, mm7 ; 1395 movq mm6, const54_2 ; 1396 1397 movq mm4, round_values ; 1398 lea edx, [esi+ecx] ; 1399 horizontal_line_5_4_loop: 1400 1401 movq mm0, QWORD PTR [esi] ; 1402 00 01 02 03 04 05 06 07 1403 movq mm1, mm0 ; 1404 00 01 02 03 04 05 06 07 1405 1406 psrlq mm0, 8 ; 1407 01 02 03 04 05 06 07 xx 1408 punpcklbw mm1, mm7 ; 1409 xx 00 xx 01 xx 02 xx 03 1410 1411 punpcklbw mm0, mm7 ; 1412 xx 01 
xx 02 xx 03 xx 04 1413 pmullw mm1, mm5 1414 1415 pmullw mm0, mm6 1416 add esi, 5 1417 1418 add edi, 4 1419 paddw mm1, mm0 1420 1421 paddw mm1, mm4 1422 psrlw mm1, 8 1423 1424 cmp esi, edx 1425 packuswb mm1, mm7 1426 1427 movd DWORD PTR [edi-4], mm1 1428 1429 jl horizontal_line_5_4_loop 1430 1431 } 1432 1433 } 1434 1435 static 1436 void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) 1437 { 1438 1439 __declspec(align(16)) const unsigned short one_fourths[] = { 64, 64, 64, 64 }; 1440 __declspec(align(16)) const unsigned short two_fourths[] = { 128, 128, 128, 128 }; 1441 __declspec(align(16)) const unsigned short three_fourths[] = { 192, 192, 192, 192 }; 1442 1443 __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; 1444 __asm 1445 { 1446 push ebx 1447 1448 mov esi, source // Get the source and destination pointer 1449 mov ecx, src_pitch // Get the pitch size 1450 1451 mov edi, dest // tow lines below 1452 pxor mm7, mm7 // clear out mm7 1453 1454 mov edx, dest_pitch // Loop counter 1455 mov ebx, dest_width 1456 1457 vs_5_4_loop: 1458 1459 movd mm0, DWORD ptr [esi] // src[0]; 1460 movd mm1, DWORD ptr [esi+ecx] // src[1]; 1461 1462 movd mm2, DWORD ptr [esi+ecx*2] 1463 lea eax, [esi+ecx*2] // 1464 1465 punpcklbw mm1, mm7 1466 punpcklbw mm2, mm7 1467 1468 movq mm3, mm2 1469 pmullw mm1, three_fourths 1470 1471 pmullw mm2, one_fourths 1472 movd mm4, [eax+ecx] 1473 1474 pmullw mm3, two_fourths 1475 punpcklbw mm4, mm7 1476 1477 movq mm5, mm4 1478 pmullw mm4, two_fourths 1479 1480 paddw mm1, mm2 1481 movd mm6, [eax+ecx*2] 1482 1483 pmullw mm5, one_fourths 1484 paddw mm1, round_values; 1485 1486 paddw mm3, mm4 1487 psrlw mm1, 8 1488 1489 punpcklbw mm6, mm7 1490 paddw mm3, round_values 1491 1492 pmullw mm6, three_fourths 1493 psrlw mm3, 8 1494 1495 packuswb mm1, mm7 1496 packuswb mm3, mm7 1497 1498 movd DWORD PTR [edi], mm0 1499 movd DWORD PTR [edi+edx], mm1 
1500 1501 1502 paddw mm5, mm6 1503 movd DWORD PTR [edi+edx*2], mm3 1504 1505 lea eax, [edi+edx*2] 1506 paddw mm5, round_values 1507 1508 psrlw mm5, 8 1509 add edi, 4 1510 1511 packuswb mm5, mm7 1512 movd DWORD PTR [eax+edx], mm5 1513 1514 add esi, 4 1515 sub ebx, 4 1516 1517 jg vs_5_4_loop 1518 1519 pop ebx 1520 } 1521 } 1522 1523 1524 1525 static 1526 void horizontal_line_5_3_scale_mmx 1527 ( 1528 const unsigned char *source, 1529 unsigned int source_width, 1530 unsigned char *dest, 1531 unsigned int dest_width 1532 ) 1533 { 1534 __declspec(align(16)) const unsigned short const53_1[] = { 0, 85, 171, 0 }; 1535 __declspec(align(16)) const unsigned short const53_2[] = {256, 171, 85, 0 }; 1536 __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; 1537 __asm 1538 { 1539 1540 mov esi, source ; 1541 mov edi, dest ; 1542 1543 mov ecx, source_width ; 1544 movq mm5, const53_1 ; 1545 1546 pxor mm7, mm7 ; 1547 movq mm6, const53_2 ; 1548 1549 movq mm4, round_values ; 1550 lea edx, [esi+ecx-5] ; 1551 horizontal_line_5_3_loop: 1552 1553 movq mm0, QWORD PTR [esi] ; 1554 00 01 02 03 04 05 06 07 1555 movq mm1, mm0 ; 1556 00 01 02 03 04 05 06 07 1557 1558 psllw mm0, 8 ; 1559 xx 00 xx 02 xx 04 xx 06 1560 psrlw mm1, 8 ; 1561 01 xx 03 xx 05 xx 07 xx 1562 1563 psrlw mm0, 8 ; 1564 00 xx 02 xx 04 xx 06 xx 1565 psllq mm1, 16 ; 1566 xx xx 01 xx 03 xx 05 xx 1567 1568 pmullw mm0, mm6 1569 1570 pmullw mm1, mm5 1571 add esi, 5 1572 1573 add edi, 3 1574 paddw mm1, mm0 1575 1576 paddw mm1, mm4 1577 psrlw mm1, 8 1578 1579 cmp esi, edx 1580 packuswb mm1, mm7 1581 1582 movd DWORD PTR [edi-3], mm1 1583 jl horizontal_line_5_3_loop 1584 1585 //exit condition 1586 movq mm0, QWORD PTR [esi] ; 1587 00 01 02 03 04 05 06 07 1588 movq mm1, mm0 ; 1589 00 01 02 03 04 05 06 07 1590 1591 psllw mm0, 8 ; 1592 xx 00 xx 02 xx 04 xx 06 1593 psrlw mm1, 8 ; 1594 01 xx 03 xx 05 xx 07 xx 1595 1596 psrlw mm0, 8 ; 1597 00 xx 02 xx 04 xx 06 xx 1598 psllq mm1, 16 ; 1599 xx xx 01 xx 03 xx 05 xx 1600 
1601 pmullw mm0, mm6 1602 1603 pmullw mm1, mm5 1604 paddw mm1, mm0 1605 1606 paddw mm1, mm4 1607 psrlw mm1, 8 1608 1609 packuswb mm1, mm7 1610 movd eax, mm1 1611 1612 mov edx, eax 1613 shr edx, 16 1614 1615 mov WORD PTR[edi], ax 1616 mov BYTE PTR[edi+2], dl 1617 1618 } 1619 1620 } 1621 1622 1623 static 1624 void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) 1625 { 1626 __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; 1627 __declspec(align(16)) const unsigned short one_thirds[] = { 85, 85, 85, 85 }; 1628 __declspec(align(16)) const unsigned short two_thirds[] = { 171, 171, 171, 171 }; 1629 1630 __asm 1631 { 1632 push ebx 1633 1634 mov esi, source // Get the source and destination pointer 1635 mov ecx, src_pitch // Get the pitch size 1636 1637 mov edi, dest // tow lines below 1638 pxor mm7, mm7 // clear out mm7 1639 1640 mov edx, dest_pitch // Loop counter 1641 movq mm5, one_thirds 1642 1643 movq mm6, two_thirds 1644 mov ebx, dest_width; 1645 1646 vs_5_3_loop: 1647 1648 movd mm0, DWORD ptr [esi] // src[0]; 1649 movd mm1, DWORD ptr [esi+ecx] // src[1]; 1650 1651 movd mm2, DWORD ptr [esi+ecx*2] 1652 lea eax, [esi+ecx*2] // 1653 1654 punpcklbw mm1, mm7 1655 punpcklbw mm2, mm7 1656 1657 pmullw mm1, mm5 1658 pmullw mm2, mm6 1659 1660 movd mm3, DWORD ptr [eax+ecx] 1661 movd mm4, DWORD ptr [eax+ecx*2] 1662 1663 punpcklbw mm3, mm7 1664 punpcklbw mm4, mm7 1665 1666 pmullw mm3, mm6 1667 pmullw mm4, mm5 1668 1669 1670 movd DWORD PTR [edi], mm0 1671 paddw mm1, mm2 1672 1673 paddw mm1, round_values 1674 psrlw mm1, 8 1675 1676 packuswb mm1, mm7 1677 paddw mm3, mm4 1678 1679 paddw mm3, round_values 1680 movd DWORD PTR [edi+edx], mm1 1681 1682 psrlw mm3, 8 1683 packuswb mm3, mm7 1684 1685 movd DWORD PTR [edi+edx*2], mm3 1686 1687 1688 add edi, 4 1689 add esi, 4 1690 1691 sub ebx, 4 1692 jg vs_5_3_loop 1693 1694 pop ebx 1695 } 1696 } 1697 1698 1699 1700 
1701 /**************************************************************************** 1702 * 1703 * ROUTINE : horizontal_line_2_1_scale 1704 * 1705 * INPUTS : const unsigned char *source : 1706 * unsigned int source_width : 1707 * unsigned char *dest : 1708 * unsigned int dest_width : 1709 * 1710 * OUTPUTS : None. 1711 * 1712 * RETURNS : void 1713 * 1714 * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels. 1715 * 1716 * SPECIAL NOTES : None. 1717 * 1718 ****************************************************************************/ 1719 static 1720 void horizontal_line_2_1_scale_mmx 1721 ( 1722 const unsigned char *source, 1723 unsigned int source_width, 1724 unsigned char *dest, 1725 unsigned int dest_width 1726 ) 1727 { 1728 (void) dest_width; 1729 1730 __asm 1731 { 1732 mov esi, source 1733 mov edi, dest 1734 1735 pxor mm7, mm7 1736 mov ecx, dest_width 1737 1738 xor edx, edx 1739 hs_2_1_loop: 1740 1741 movq mm0, [esi+edx*2] 1742 psllw mm0, 8 1743 1744 psrlw mm0, 8 1745 packuswb mm0, mm7 1746 1747 movd DWORD Ptr [edi+edx], mm0; 1748 add edx, 4 1749 1750 cmp edx, ecx 1751 jl hs_2_1_loop 1752 1753 } 1754 } 1755 1756 1757 1758 static 1759 void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) 1760 { 1761 vpx_memcpy(dest, source, dest_width); 1762 } 1763 1764 1765 1766 static 1767 void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) 1768 { 1769 1770 __declspec(align(16)) const unsigned short three_sixteenths[] = { 48, 48, 48, 48 }; 1771 __declspec(align(16)) const unsigned short ten_sixteenths[] = { 160, 160, 160, 160 }; 1772 __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; 1773 __asm 1774 { 1775 mov esi, source 1776 mov edi, dest 1777 1778 mov eax, src_pitch 1779 mov edx, dest_width 1780 1781 pxor mm7, mm7 1782 sub esi, eax //back one line 
1783 1784 1785 lea ecx, [esi+edx]; 1786 movq mm6, round_values; 1787 1788 movq mm5, three_sixteenths; 1789 movq mm4, ten_sixteenths; 1790 1791 vs_2_1_i_loop: 1792 movd mm0, [esi] // 1793 movd mm1, [esi+eax] // 1794 1795 movd mm2, [esi+eax*2] // 1796 punpcklbw mm0, mm7 1797 1798 pmullw mm0, mm5 1799 punpcklbw mm1, mm7 1800 1801 pmullw mm1, mm4 1802 punpcklbw mm2, mm7 1803 1804 pmullw mm2, mm5 1805 paddw mm0, round_values 1806 1807 paddw mm1, mm2 1808 paddw mm0, mm1 1809 1810 psrlw mm0, 8 1811 packuswb mm0, mm7 1812 1813 movd DWORD PTR [edi], mm0 1814 add esi, 4 1815 1816 add edi, 4; 1817 cmp esi, ecx 1818 jl vs_2_1_i_loop 1819 1820 } 1821 } 1822 1823 void 1824 register_mmxscalers(void) 1825 { 1826 vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx; 1827 vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx; 1828 vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx; 1829 vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx; 1830 vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx; 1831 vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx; 1832 vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx; 1833 vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx; 1834 vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx; 1835 1836 vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c; 1837 vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c; 1838 vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c; 1839 vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c; 1840 vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c; 1841 vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c; 1842 1843 1844 1845 vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx; 1846 vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx; 1847 vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx; 1848 
vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx; 1849 vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx; 1850 vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx; 1851 vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx; 1852 1853 } 1854