/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


/****************************************************************************
*
*   Module Title :     scaleopt.cpp
*
*   Description  :     Optimized scaling functions
*
****************************************************************************/
#include "pragmas.h"



/****************************************************************************
*  Module Statics
*
*  All weights are 8-bit fixed point fractions (value / 256).  The scaling
*  routines multiply 16-bit unpacked pixels by a pair of weights, add
*  round_values (128, i.e. one half) and shift right by 8.
*
*      51 / 256 ~ 1/5     102 / 256 ~ 2/5
*     154 / 256 ~ 3/5     205 / 256 ~ 4/5
*
*  Every complementary pair of weights sums to exactly 256, so the largest
*  intermediate value is 255 * 256 + 128, which still fits an unsigned word.
****************************************************************************/
/* Uniform per-row weights used by the vertical band scalers. */
__declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };
__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
/* Rounding term (0.5 in Q8) added before the final ">> 8". */
__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
/* Rounding term for the 1:2 scalers, which average with "(a + b + 1) >> 1". */
__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
/* Per-lane weights for horizontal_line_4_5_scale_mmx: const45_1 scales   */
/* the left neighbours, const45_2 the right neighbours.                   */
__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };
__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
/* Selects byte 6 of a quadword; used by horizontal_line_4_5_scale_mmx to */
/* replicate the last source pixel at the end of a line.                  */
__declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
/* Per-lane weights for horizontal_line_3_5_scale_mmx: const35_1 scales   */
/* the left neighbours, const35_2 the right neighbours.                   */
__declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };
__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };



#include "vpx_scale/vpxscale.h"
#include "vpx_mem/vpx_mem.h"

/**************************************************************************** 44 * 45 * ROUTINE : horizontal_line_3_5_scale_mmx 46 * 47 * INPUTS : const unsigned char *source : 48 * unsigned int source_width : 49 * unsigned char *dest : 50 * unsigned int dest_width : 51 * 52 * OUTPUTS : None. 53 * 54 * RETURNS : void 55 * 56 * FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels. 57 * 58 * SPECIAL NOTES : None. 59 * 60 ****************************************************************************/ 61 static 62 void horizontal_line_3_5_scale_mmx 63 ( 64 const unsigned char *source, 65 unsigned int source_width, 66 unsigned char *dest, 67 unsigned int dest_width 68 ) 69 { 70 (void) dest_width; 71 72 __asm 73 { 74 75 push ebx 76 77 mov esi, source 78 mov edi, dest 79 80 mov ecx, source_width 81 lea edx, [esi+ecx-3]; 82 83 movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx 84 movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx 85 86 movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx 87 pxor mm7, mm7 // clear mm7 88 89 horiz_line_3_5_loop: 90 91 mov eax, DWORD PTR [esi] // eax = 00 01 02 03 92 mov ebx, eax 93 94 and ebx, 0xffff00 // ebx = xx 01 02 xx 95 mov ecx, eax // ecx = 00 01 02 03 96 97 and eax, 0xffff0000 // eax = xx xx 02 03 98 xor ecx, eax // ecx = 00 01 xx xx 99 100 shr ebx, 8 // ebx = 01 02 xx xx 101 or eax, ebx // eax = 01 02 02 03 102 103 shl ebx, 16 // ebx = xx xx 01 02 104 movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx 105 106 or ebx, ecx // ebx = 00 01 01 02 107 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx 108 109 movd mm0, ebx // mm0 = 00 01 01 02 110 pmullw mm1, mm6 // 111 112 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx 113 pmullw mm0, mm5 // 114 115 mov [edi], ebx // writeoutput 00 xx xx xx 116 add esi, 3 117 118 add edi, 5 119 paddw mm0, mm1 120 121 paddw mm0, mm4 122 psrlw mm0, 8 123 124 cmp esi, edx 125 packuswb mm0, mm7 126 127 movd DWORD Ptr [edi-4], mm0 128 jl horiz_line_3_5_loop 129 130 //Exit: 131 mov eax, DWORD PTR 
[esi] // eax = 00 01 02 03 132 mov ebx, eax 133 134 and ebx, 0xffff00 // ebx = xx 01 02 xx 135 mov ecx, eax // ecx = 00 01 02 03 136 137 and eax, 0xffff0000 // eax = xx xx 02 03 138 xor ecx, eax // ecx = 00 01 xx xx 139 140 shr ebx, 8 // ebx = 01 02 xx xx 141 or eax, ebx // eax = 01 02 02 03 142 143 shl eax, 8 // eax = xx 01 02 02 144 and eax, 0xffff0000 // eax = xx xx 02 02 145 146 or eax, ebx // eax = 01 02 02 02 147 148 shl ebx, 16 // ebx = xx xx 01 02 149 movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx 150 151 or ebx, ecx // ebx = 00 01 01 02 152 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx 153 154 movd mm0, ebx // mm0 = 00 01 01 02 155 pmullw mm1, mm6 // 156 157 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx 158 pmullw mm0, mm5 // 159 160 mov [edi], ebx // writeoutput 00 xx xx xx 161 paddw mm0, mm1 162 163 paddw mm0, mm4 164 psrlw mm0, 8 165 166 packuswb mm0, mm7 167 movd DWORD Ptr [edi+1], mm0 168 169 pop ebx 170 171 } 172 173 } 174 175 176 /**************************************************************************** 177 * 178 * ROUTINE : horizontal_line_4_5_scale_mmx 179 * 180 * INPUTS : const unsigned char *source : 181 * unsigned int source_width : 182 * unsigned char *dest : 183 * unsigned int dest_width : 184 * 185 * OUTPUTS : None. 186 * 187 * RETURNS : void 188 * 189 * FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels. 190 * 191 * SPECIAL NOTES : None. 
192 * 193 ****************************************************************************/ 194 static 195 void horizontal_line_4_5_scale_mmx 196 ( 197 const unsigned char *source, 198 unsigned int source_width, 199 unsigned char *dest, 200 unsigned int dest_width 201 ) 202 { 203 (void)dest_width; 204 205 __asm 206 { 207 208 mov esi, source 209 mov edi, dest 210 211 mov ecx, source_width 212 lea edx, [esi+ecx-8]; 213 214 movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx 215 movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx 216 217 movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx 218 pxor mm7, mm7 // clear mm7 219 220 horiz_line_4_5_loop: 221 222 movq mm0, QWORD PTR [esi] // mm0 = 00 01 02 03 04 05 06 07 223 movq mm1, QWORD PTR [esi+1]; // mm1 = 01 02 03 04 05 06 07 08 224 225 movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07 226 movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08 227 228 movd DWORD PTR [edi], mm0 // write output 00 xx xx xx 229 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx 230 231 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx 232 pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205 233 234 pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51 235 punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx 236 237 movd DWORD PTR [edi+5], mm2 // write ouput 05 xx xx xx 238 pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205 239 240 punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx 241 pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51 242 243 paddw mm0, mm1 // added round values 244 paddw mm0, mm4 245 246 psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx 247 packuswb mm0, mm7 248 249 movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04 250 add edi, 10 251 252 add esi, 8 253 paddw mm2, mm3 // 254 255 paddw mm2, mm4 // added round values 256 cmp esi, edx 257 258 psrlw mm2, 8 259 packuswb mm2, mm7 260 261 movd DWORD PTR [edi-4], mm2 // writeoutput 06 07 08 09 262 jl horiz_line_4_5_loop 263 264 //Exit: 265 movq mm0, [esi] // mm0 = 00 01 02 03 04 05 06 07 266 movq mm1, 
mm0 // mm1 = 00 01 02 03 04 05 06 07 267 268 movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07 269 psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00 270 271 movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00 272 pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00 273 274 psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07 275 por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07 276 277 movq mm3, mm1 278 279 movd DWORD PTR [edi], mm0 // write output 00 xx xx xx 280 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx 281 282 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx 283 pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205 284 285 pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51 286 punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx 287 288 movd DWORD PTR [edi+5], mm2 // write ouput 05 xx xx xx 289 pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205 290 291 punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx 292 pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51 293 294 paddw mm0, mm1 // added round values 295 paddw mm0, mm4 296 297 psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx 298 packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx 299 300 movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04 301 paddw mm2, mm3 // 302 303 paddw mm2, mm4 // added round values 304 psrlw mm2, 8 305 306 packuswb mm2, mm7 307 movd DWORD PTR [edi+6], mm2 // writeoutput 06 07 08 09 308 309 310 } 311 } 312 313 /**************************************************************************** 314 * 315 * ROUTINE : vertical_band_4_5_scale_mmx 316 * 317 * INPUTS : unsigned char *dest : 318 * unsigned int dest_pitch : 319 * unsigned int dest_width : 320 * 321 * OUTPUTS : None. 322 * 323 * RETURNS : void 324 * 325 * FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels. 326 * 327 * SPECIAL NOTES : The routine uses the first line of the band below 328 * the current band. The function also has a "C" only 329 * version. 
*
 ****************************************************************************/
/*
 * Implementation notes for vertical_band_4_5_scale_mmx.
 *
 * The band is scaled in place.  With row k meaning dest + k * dest_pitch,
 * the four source rows are read from rows 0, 1, 2 and 3 (a, b, c, d) and
 * the first row of the next band (an) from row 5.  Row 0 is left untouched
 * and rows 1..4 are written:
 *
 *     row 1 = ( 51*a + 205*b  + 128) >> 8
 *     row 2 = (102*b + 154*c  + 128) >> 8
 *     row 3 = (154*c + 102*d  + 128) >> 8
 *     row 4 = (205*d +  51*an + 128) >> 8
 *
 * Each source row is loaded before the output row that overwrites it is
 * stored, which is what makes the in-place update work.
 *
 * Eight columns are handled per iteration and the loop runs until the
 * column counter is <= 0, so a dest_width that is not a multiple of 8 is
 * effectively rounded up to the next multiple of 8.
 *
 * NOTE(review): no emms is issued here - presumably the caller clears the
 * MMX state; confirm.
 */
static
void vertical_band_4_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {

        mov         esi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch              // Get the pitch size

        lea         edi,    [esi+ecx*2]             // two lines below
        add         edi,    ecx                     // three lines below

        pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width              // Loop counter

    vs_4_5_loop:

        movq        mm0,    QWORD ptr [esi]         // src[0];
        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];

        movq        mm2,    mm0                     // Make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word

        movq        mm5,    one_fifth
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm0,    mm5                     // a * 1/5

        movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word

        pmullw      mm2,    mm5                     // a * 1/5
        movq        mm6,    four_fifths             // constant

        movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 4/5

        punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b

        pmullw      mm5,    mm6                     // b * 4/5
        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5

        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
        paddw       mm0,    round_values            // + 128

        paddw       mm2,    round_values            // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des [1]

        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]

        // mm1, mm3 --- Src[1] (unpacked low / high words)
        // mm0 --- Src[2]
        // mm7 for unpacking

        movq        mm5,    two_fifths
        movq        mm2,    mm0                     // make a copy

        pmullw      mm1,    mm5                     // b * 2/5
        movq        mm6,    three_fifths


        punpcklbw   mm0,    mm7                     // unpack low to word
        pmullw      mm3,    mm5                     // b * 2/5

        movq        mm4,    mm0                     // make copy of c
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm4,    mm6                     // c * 3/5
        movq        mm5,    mm2

        pmullw      mm5,    mm6                     // c * 3/5
        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5

        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
        paddw       mm1,    round_values            // + 128

        paddw       mm3,    round_values            // + 128
        psrlw       mm1,    8

        psrlw       mm3,    8
        packuswb    mm1,    mm3                     // des[2]

        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
        movq        mm1,    [edi]                   // mm1=Src[3];

        // mm0, mm2 --- Src[2] (unpacked low / high words)
        // mm1 --- Src[3]
        // mm6 --- 3/5
        // mm7 for unpacking

        pmullw      mm0,    mm6                     // c * 3/5
        movq        mm5,    two_fifths              // mm5 = 2/5

        movq        mm3,    mm1                     // make a copy
        pmullw      mm2,    mm6                     // c * 3/5

        punpcklbw   mm1,    mm7                     // unpack low
        movq        mm4,    mm1                     // make a copy

        punpckhbw   mm3,    mm7                     // unpack high
        pmullw      mm4,    mm5                     // d * 2/5

        movq        mm6,    mm3                     // make a copy
        pmullw      mm6,    mm5                     // d * 2/5

        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5

        paddw       mm0,    round_values            // + 128
        paddw       mm2,    round_values            // + 128

        psrlw       mm0,    8
        psrlw       mm2,    8

        packuswb    mm0,    mm2                     // des[3]
        movq        QWORD ptr [edi], mm0            // write des[3]

        //  mm1, mm3 --- Src[3] (unpacked low / high words)
        //  mm7 -- cleared for unpacking

        movq        mm0,    [edi+ecx*2]             // mm0, Src[0] of the next group

        movq        mm5,    four_fifths             // mm5 = 4/5
        pmullw      mm1,    mm5                     // d * 4/5

        movq        mm6,    one_fifth               // mm6 = 1/5
        movq        mm2,    mm0                     // make a copy

        pmullw      mm3,    mm5                     // d * 4/5
        punpcklbw   mm0,    mm7                     // unpack low

        pmullw      mm0,    mm6                     // an * 1/5
        punpckhbw   mm2,    mm7                     // unpack high

        paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
        pmullw      mm2,    mm6                     // an * 1/5

        paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
        paddw       mm1,    round_values            // + 128

        paddw       mm3,    round_values            // + 128
        psrlw       mm1,    8

        psrlw       mm3,    8
        packuswb    mm1,    mm3                     // des[4]

        movq        QWORD ptr [edi+ecx], mm1        // write des[4]

        add         edi,    8                       // next 8 columns
        add         esi,    8

        sub         edx,    8
        jg          vs_4_5_loop
    }
}

/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_4_5_scale_mmx
*
 *  INPUTS        : unsigned char *dest    : Top row of the band (scaled in place).
 *                  unsigned int dest_pitch : Distance in bytes between rows.
 *                  unsigned int dest_width : Number of columns to process.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : None
 *
 *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
 *
 *  SPECIAL NOTES : Unlike vertical_band_4_5_scale_mmx this routine does not
 *                  read the band below: output row 4 is a plain copy of
 *                  source row 3.  With row k = dest + k * dest_pitch and
 *                  source rows a, b, c, d in rows 0..3:
 *                      row 1 = ( 51*a + 205*b + 128) >> 8
 *                      row 2 = (102*b + 154*c + 128) >> 8
 *                      row 3 = (154*c + 102*d + 128) >> 8
 *                      row 4 = d
 *                  Eight columns are handled per iteration, so a dest_width
 *                  that is not a multiple of 8 is effectively rounded up.
 *                  The function also has a "C" only version.
 *                  NOTE(review): no emms is issued here - presumably the
 *                  caller clears the MMX state; confirm.
 *
 ****************************************************************************/
static
void last_vertical_band_4_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {
        mov         esi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch              // Get the pitch size

        lea         edi,    [esi+ecx*2]             // two lines below
        add         edi,    ecx                     // three lines below

        pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width              // Loop counter

    last_vs_4_5_loop:

        movq        mm0,    QWORD ptr [esi]         // src[0];
        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];

        movq        mm2,    mm0                     // Make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word

        movq        mm5,    one_fifth
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm0,    mm5                     // a * 1/5

        movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word

        pmullw      mm2,    mm5                     // a * 1/5
        movq        mm6,    four_fifths             // constant

        movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 4/5

        punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b

        pmullw      mm5,    mm6                     // b * 4/5
        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5

        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
        paddw       mm0,    round_values            // + 128

        paddw       mm2,    round_values            // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des [1]

        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]

        // mm1, mm3 --- Src[1] (unpacked low / high words)
        // mm0 --- Src[2]
        // mm7 for unpacking

        movq        mm5,    two_fifths
        movq        mm2,    mm0                     // make a copy

        pmullw      mm1,    mm5                     // b * 2/5
        movq        mm6,    three_fifths


        punpcklbw   mm0,    mm7                     // unpack low to word
        pmullw      mm3,    mm5                     // b * 2/5

        movq        mm4,    mm0                     // make copy of c
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm4,    mm6                     // c * 3/5
        movq        mm5,    mm2

        pmullw      mm5,    mm6                     // c * 3/5
        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5

        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
        paddw       mm1,    round_values            // + 128

        paddw       mm3,    round_values            // + 128
        psrlw       mm1,    8

        psrlw       mm3,    8
        packuswb    mm1,    mm3                     // des[2]

        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
        movq        mm1,    [edi]                   // mm1=Src[3];

        // No band below the last one: des[4] is src[3] unchanged.  It is
        // stored before src[3] is overwritten by des[3] further down.
        movq        QWORD ptr [edi+ecx], mm1        // write des[4];

        // mm0, mm2 --- Src[2] (unpacked low / high words)
        // mm1 --- Src[3]
        // mm6 --- 3/5
        // mm7 for unpacking

        pmullw      mm0,    mm6                     // c * 3/5
        movq        mm5,    two_fifths              // mm5 = 2/5

        movq        mm3,    mm1                     // make a copy
        pmullw      mm2,    mm6                     // c * 3/5

        punpcklbw   mm1,    mm7                     // unpack low
        movq        mm4,    mm1                     // make a copy

        punpckhbw   mm3,    mm7                     // unpack high
        pmullw      mm4,    mm5                     // d * 2/5

        movq        mm6,    mm3                     // make a copy
        pmullw      mm6,    mm5                     // d * 2/5

        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5

        paddw       mm0,    round_values            // + 128
        paddw       mm2,    round_values            // + 128

        psrlw       mm0,    8
        psrlw       mm2,    8

        packuswb    mm0,    mm2                     // des[3]
        movq        QWORD ptr [edi], mm0            // write des[3]

        //  mm1, mm3 --- Src[3] (unpacked low / high words)
        //  mm7 -- cleared for unpacking
        add         edi,    8                       // next 8 columns
        add         esi,    8

        sub         edx,    8
        jg          last_vs_4_5_loop
    }
}

/****************************************************************************
 *
 *  ROUTINE       : vertical_band_3_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
*
 *  RETURNS       : void
 *
 *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has a "C" only
 *                  version.
 *                  The band is scaled in place.  With row k meaning
 *                  dest + k * dest_pitch, source rows a, b, c are read from
 *                  rows 0, 1, 2 and the first row of the next band (an)
 *                  from row 5.  Row 0 is left untouched:
 *                      row 1 = (102*a + 154*b  + 128) >> 8
 *                      row 2 = (205*b +  51*c  + 128) >> 8
 *                      row 3 = ( 51*b + 205*c  + 128) >> 8
 *                      row 4 = (154*c + 102*an + 128) >> 8
 *                  Eight columns are handled per iteration, so a dest_width
 *                  that is not a multiple of 8 is effectively rounded up.
 *                  NOTE(review): no emms is issued here - presumably the
 *                  caller clears the MMX state; confirm.
 *
 ****************************************************************************/
static
void vertical_band_3_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {
        mov         esi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch              // Get the pitch size

        lea         edi,    [esi+ecx*2]             // two lines below
        add         edi,    ecx                     // three lines below

        pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width              // Loop counter

    vs_3_5_loop:

        movq        mm0,    QWORD ptr [esi]         // src[0];
        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];

        movq        mm2,    mm0                     // Make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word

        movq        mm5,    two_fifths              // mm5 = 2/5
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm0,    mm5                     // a * 2/5

        movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word

        pmullw      mm2,    mm5                     // a * 2/5
        movq        mm6,    three_fifths            // mm6 = 3/5

        movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 3/5

        punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b

        pmullw      mm5,    mm6                     // b * 3/5
        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5

        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
        paddw       mm0,    round_values            // + 128

        paddw       mm2,    round_values            // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des [1]

        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]

        // mm1, mm3 --- Src[1] (unpacked low / high words)
        // mm0 --- Src[2]
        // mm7 for unpacking

        movq        mm4,    mm1                     // b low
        pmullw      mm1,    four_fifths             // b * 4/5 low

        movq        mm5,    mm3                     // b high
        pmullw      mm3,    four_fifths             // b * 4/5 high

        movq        mm2,    mm0                     // c
        pmullw      mm4,    one_fifth               // b * 1/5

        punpcklbw   mm0,    mm7                     // c low
        pmullw      mm5,    one_fifth               // b * 1/5

        movq        mm6,    mm0                     // make copy of c low
        punpckhbw   mm2,    mm7                     // c high

        // From here on mm7 is used as a scratch register; it is zeroed
        // again below before the next unpack.
        pmullw      mm6,    one_fifth               // c * 1/5 low
        movq        mm7,    mm2                     // make copy of c high

        pmullw      mm7,    one_fifth               // c * 1/5 high
        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low

        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
        movq        mm6,    mm0                     // make copy of c low

        pmullw      mm6,    four_fifths             // c * 4/5 low
        movq        mm7,    mm2                     // make copy of c high

        pmullw      mm7,    four_fifths             // c * 4/5 high

        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high

        paddw       mm1,    round_values            // + 128
        paddw       mm3,    round_values            // + 128

        psrlw       mm1,    8
        psrlw       mm3,    8

        packuswb    mm1,    mm3                     // des[2]
        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]

        paddw       mm4,    round_values            // + 128
        paddw       mm5,    round_values            // + 128

        psrlw       mm4,    8
        psrlw       mm5,    8

        packuswb    mm4,    mm5                     // des[3]
        movq        QWORD ptr [edi], mm4            // write des[3]

        //  mm0, mm2 --- Src[2] (c, unpacked low / high words)

        pxor        mm7,    mm7                     // clear mm7 for unpacking
        movq        mm1,    [edi+ecx*2]             // mm1 = Src[0] of the next group

        movq        mm5,    three_fifths            // mm5 = 3/5
        pmullw      mm0,    mm5                     // c * 3/5

        movq        mm6,    two_fifths              // mm6 = 2/5
        movq        mm3,    mm1                     // make a copy

        pmullw      mm2,    mm5                     // c * 3/5
        punpcklbw   mm1,    mm7                     // unpack low

        pmullw      mm1,    mm6                     // an * 2/5
        punpckhbw   mm3,    mm7                     // unpack high

        paddw       mm0,    mm1                     // c * 3/5 + an * 2/5
        pmullw      mm3,    mm6                     // an * 2/5

        paddw       mm2,    mm3                     // c * 3/5 + an * 2/5
        paddw       mm0,    round_values            // + 128

        paddw       mm2,    round_values            // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des[4]

        movq        QWORD ptr [edi+ecx], mm0        // write des[4]

        add         edi,    8                       // next 8 columns
        add         esi,    8

        sub         edx,    8
        jg          vs_3_5_loop
    }
}

/**************************************************************************** 827 * 828 * ROUTINE : last_vertical_band_3_5_scale_mmx 829 * 830 * INPUTS : unsigned char *dest : 831 * unsigned int dest_pitch : 832 * unsigned int dest_width : 833 * 834 * OUTPUTS : None. 835 * 836 * RETURNS : void 837 * 838 * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels. 839 * 840 * SPECIAL NOTES : The routine uses the first line of the band below 841 * the current band. The function also has an "C" only 842 * version. 843 * 844 ****************************************************************************/ 845 static 846 void last_vertical_band_3_5_scale_mmx 847 ( 848 unsigned char *dest, 849 unsigned int dest_pitch, 850 unsigned int dest_width 851 ) 852 { 853 __asm 854 { 855 mov esi, dest // Get the source and destination pointer 856 mov ecx, dest_pitch // Get the pitch size 857 858 lea edi, [esi+ecx*2] // tow lines below 859 add edi, ecx // three lines below 860 861 pxor mm7, mm7 // clear out mm7 862 mov edx, dest_width // Loop counter 863 864 865 last_vs_3_5_loop: 866 867 movq mm0, QWORD ptr [esi] // src[0]; 868 movq mm1, QWORD ptr [esi+ecx] // src[1]; 869 870 movq mm2, mm0 // Make a copy 871 punpcklbw mm0, mm7 // unpack low to word 872 873 movq mm5, two_fifths // mm5 = 2/5 874 punpckhbw mm2, mm7 // unpack high to word 875 876 pmullw mm0, mm5 // a * 2/5 877 878 movq mm3, mm1 // make a copy 879 punpcklbw mm1, mm7 // unpack low to word 880 881 pmullw mm2, mm5 // a * 2/5 882 movq mm6, three_fifths // mm6 = 3/5 883 884 movq mm4, mm1 // copy of low b 885 pmullw mm4, mm6 // b * 3/5 886 887 punpckhbw mm3, mm7 // unpack high to word 888 movq mm5, mm3 // copy of high b 889 890 pmullw mm5, mm6 // b * 3/5 891 paddw mm0, mm4 // a * 2/5 + b * 3/5 892 893 paddw mm2, mm5 // a * 2/5 + b * 3/5 894 paddw mm0, round_values // + 128 895 896 paddw mm2, round_values // + 128 897 psrlw mm0, 8 898 899 psrlw mm2, 8 900 packuswb mm0, mm2 // des [1] 901 902 movq QWORD ptr [esi+ecx], mm0 // 
write des[1] 903 movq mm0, [esi+ecx*2] // mm0 = src[2] 904 905 906 907 // mm1, mm3 --- Src[1] 908 // mm0 --- Src[2] 909 // mm7 for unpacking 910 911 movq mm4, mm1 // b low 912 pmullw mm1, four_fifths // b * 4/5 low 913 914 movq QWORD ptr [edi+ecx], mm0 // write des[4] 915 916 movq mm5, mm3 // b high 917 pmullw mm3, four_fifths // b * 4/5 high 918 919 movq mm2, mm0 // c 920 pmullw mm4, one_fifth // b * 1/5 921 922 punpcklbw mm0, mm7 // c low 923 pmullw mm5, one_fifth // b * 1/5 924 925 movq mm6, mm0 // make copy of c low 926 punpckhbw mm2, mm7 // c high 927 928 pmullw mm6, one_fifth // c * 1/5 low 929 movq mm7, mm2 // make copy of c high 930 931 pmullw mm7, one_fifth // c * 1/5 high 932 paddw mm1, mm6 // b * 4/5 + c * 1/5 low 933 934 paddw mm3, mm7 // b * 4/5 + c * 1/5 high 935 movq mm6, mm0 // make copy of c low 936 937 pmullw mm6, four_fifths // c * 4/5 low 938 movq mm7, mm2 // make copy of c high 939 940 pmullw mm7, four_fifths // c * 4/5 high 941 942 paddw mm4, mm6 // b * 1/5 + c * 4/5 low 943 paddw mm5, mm7 // b * 1/5 + c * 4/5 high 944 945 paddw mm1, round_values // + 128 946 paddw mm3, round_values // + 128 947 948 psrlw mm1, 8 949 psrlw mm3, 8 950 951 packuswb mm1, mm3 // des[2] 952 movq QWORD ptr [esi+ecx*2], mm1 // write des[2] 953 954 paddw mm4, round_values // + 128 955 paddw mm5, round_values // + 128 956 957 psrlw mm4, 8 958 psrlw mm5, 8 959 960 packuswb mm4, mm5 // des[3] 961 movq QWORD ptr [edi], mm4 // write des[3] 962 963 // mm0, mm2 --- Src[3] 964 965 add edi, 8 966 add esi, 8 967 968 sub edx, 8 969 jg last_vs_3_5_loop 970 } 971 } 972 973 /**************************************************************************** 974 * 975 * ROUTINE : vertical_band_1_2_scale_mmx 976 * 977 * INPUTS : unsigned char *dest : 978 * unsigned int dest_pitch : 979 * unsigned int dest_width : 980 * 981 * OUTPUTS : None. 982 * 983 * RETURNS : void 984 * 985 * FUNCTION : 1 to 2 up-scaling of a band of pixels. 
*
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has a "C" only
 *                  version.
 *                  With row k meaning dest + k * dest_pitch, rows 0 and 2
 *                  are read (a and b) and row 1 is written as
 *                  (a + b + 1) >> 1.  Rows 0 and 2 are not modified.
 *                  Eight columns are handled per iteration, so a dest_width
 *                  that is not a multiple of 8 is effectively rounded up.
 *                  NOTE(review): no emms is issued here - presumably the
 *                  caller clears the MMX state; confirm.
 *
 ****************************************************************************/
static
void vertical_band_1_2_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {

        mov         esi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch              // Get the pitch size

        pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width              // Loop counter

    vs_1_2_loop:

        movq        mm0,    [esi]                   // get Src[0] (row 0)
        movq        mm1,    [esi + ecx * 2]         // get Src[1] (row 2)

        movq        mm2,    mm0                     // make copy before unpack
        movq        mm3,    mm1                     // make copy before unpack

        punpcklbw   mm0,    mm7                     // low Src[0]
        movq        mm6,    four_ones               // mm6= 1, 1, 1, 1

        punpcklbw   mm1,    mm7                     // low Src[1]
        paddw       mm0,    mm1                     // low (a + b)

        punpckhbw   mm2,    mm7                     // high Src[0]
        paddw       mm0,    mm6                     // low (a + b + 1)

        punpckhbw   mm3,    mm7                     // high Src[1]
        paddw       mm2,    mm3                     // high (a + b )

        psraw       mm0,    1                       // low (a + b +1 )/2
        paddw       mm2,    mm6                     // high (a + b + 1)

        psraw       mm2,    1                       // high (a + b + 1)/2
        packuswb    mm0,    mm2                     // pack results

        movq        [esi+ecx], mm0                  // write out eight bytes (row 1)
        add         esi,    8

        sub         edx,    8
        jg          vs_1_2_loop
    }

}

/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_1_2_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 1 to 2 up-scaling of band of pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has an "C" only
 *                  version.
*
 ****************************************************************************/
/*
 * Implementation notes for last_vertical_band_1_2_scale_mmx.
 *
 * Despite the banner above, the code below never reads a line from another
 * band: it copies the row at dest to the row at dest + dest_pitch, eight
 * bytes per iteration.  A dest_width that is not a multiple of 8 is
 * effectively rounded up to the next multiple of 8.
 *
 * NOTE(review): no emms is issued here - presumably the caller clears the
 * MMX state; confirm.
 */
static
void last_vertical_band_1_2_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {
        mov         esi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch              // Get the pitch size

        mov         edx,    dest_width              // Loop counter

    last_vs_1_2_loop:

        movq        mm0,    [esi]                   // get Src[0]
        movq        [esi+ecx], mm0                  // write out eight bytes

        add         esi,    8
        sub         edx,    8

        jg          last_vs_1_2_loop
    }
}

/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_1_2_scale_mmx
 *
 *  INPUTS        : const unsigned char *source :
 *                  unsigned int source_width    :
 *                  unsigned char *dest         :
 *                  unsigned int dest_width      :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
 *
 *  SPECIAL NOTES : None.
1106 * 1107 ****************************************************************************/ 1108 static 1109 void horizontal_line_1_2_scale_mmx 1110 ( 1111 const unsigned char *source, 1112 unsigned int source_width, 1113 unsigned char *dest, 1114 unsigned int dest_width 1115 ) 1116 { 1117 (void) dest_width; 1118 1119 __asm 1120 { 1121 mov esi, source 1122 mov edi, dest 1123 1124 pxor mm7, mm7 1125 movq mm6, four_ones 1126 1127 mov ecx, source_width 1128 1129 hs_1_2_loop: 1130 1131 movq mm0, [esi] 1132 movq mm1, [esi+1] 1133 1134 movq mm2, mm0 1135 movq mm3, mm1 1136 1137 movq mm4, mm0 1138 punpcklbw mm0, mm7 1139 1140 punpcklbw mm1, mm7 1141 paddw mm0, mm1 1142 1143 paddw mm0, mm6 1144 punpckhbw mm2, mm7 1145 1146 punpckhbw mm3, mm7 1147 paddw mm2, mm3 1148 1149 paddw mm2, mm6 1150 psraw mm0, 1 1151 1152 psraw mm2, 1 1153 packuswb mm0, mm2 1154 1155 movq mm2, mm4 1156 punpcklbw mm2, mm0 1157 1158 movq [edi], mm2 1159 punpckhbw mm4, mm0 1160 1161 movq [edi+8], mm4 1162 add esi, 8 1163 1164 add edi, 16 1165 sub ecx, 8 1166 1167 cmp ecx, 8 1168 jg hs_1_2_loop 1169 1170 // last eight pixel 1171 1172 movq mm0, [esi] 1173 movq mm1, mm0 1174 1175 movq mm2, mm0 1176 movq mm3, mm1 1177 1178 psrlq mm1, 8 1179 psrlq mm3, 56 1180 1181 psllq mm3, 56 1182 por mm1, mm3 1183 1184 movq mm3, mm1 1185 movq mm4, mm0 1186 1187 punpcklbw mm0, mm7 1188 punpcklbw mm1, mm7 1189 1190 paddw mm0, mm1 1191 paddw mm0, mm6 1192 1193 punpckhbw mm2, mm7 1194 punpckhbw mm3, mm7 1195 1196 paddw mm2, mm3 1197 paddw mm2, mm6 1198 1199 psraw mm0, 1 1200 psraw mm2, 1 1201 1202 packuswb mm0, mm2 1203 movq mm2, mm4 1204 1205 punpcklbw mm2, mm0 1206 movq [edi], mm2 1207 1208 punpckhbw mm4, mm0 1209 movq [edi+8], mm4 1210 } 1211 } 1212 1213 1214 1215 1216 1217 __declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 }; 1218 __declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 }; 1219 1220 1221 
/**************************************************************************** 1222 * 1223 * ROUTINE : horizontal_line_5_4_scale_mmx 1224 * 1225 * INPUTS : const unsigned char *source : Pointer to source data. 1226 * unsigned int source_width : Stride of source. 1227 * unsigned char *dest : Pointer to destination data. 1228 * unsigned int dest_width : Stride of destination (NOT USED). 1229 * 1230 * OUTPUTS : None. 1231 * 1232 * RETURNS : void 1233 * 1234 * FUNCTION : Copies horizontal line of pixels from source to 1235 * destination scaling up by 4 to 5. 1236 * 1237 * SPECIAL NOTES : None. 1238 * 1239 ****************************************************************************/ 1240 static 1241 void horizontal_line_5_4_scale_mmx 1242 ( 1243 const unsigned char *source, 1244 unsigned int source_width, 1245 unsigned char *dest, 1246 unsigned int dest_width 1247 ) 1248 { 1249 /* 1250 unsigned i; 1251 unsigned int a, b, c, d, e; 1252 unsigned char *des = dest; 1253 const unsigned char *src = source; 1254 1255 (void) dest_width; 1256 1257 for ( i=0; i<source_width; i+=5 ) 1258 { 1259 a = src[0]; 1260 b = src[1]; 1261 c = src[2]; 1262 d = src[3]; 1263 e = src[4]; 1264 1265 des[0] = a; 1266 des[1] = ((b*192 + c* 64 + 128)>>8); 1267 des[2] = ((c*128 + d*128 + 128)>>8); 1268 des[3] = ((d* 64 + e*192 + 128)>>8); 1269 1270 src += 5; 1271 des += 4; 1272 } 1273 */ 1274 (void) dest_width; 1275 1276 __asm 1277 { 1278 1279 mov esi, source ; 1280 mov edi, dest ; 1281 1282 mov ecx, source_width ; 1283 movq mm5, const54_1 ; 1284 1285 pxor mm7, mm7 ; 1286 movq mm6, const54_2 ; 1287 1288 movq mm4, round_values ; 1289 lea edx, [esi+ecx] ; 1290 horizontal_line_5_4_loop: 1291 1292 movq mm0, QWORD PTR [esi] ; 1293 00 01 02 03 04 05 06 07 1294 movq mm1, mm0 ; 1295 00 01 02 03 04 05 06 07 1296 1297 psrlq mm0, 8 ; 1298 01 02 03 04 05 06 07 xx 1299 punpcklbw mm1, mm7 ; 1300 xx 00 xx 01 xx 02 xx 03 1301 1302 punpcklbw mm0, mm7 ; 1303 xx 01 xx 02 xx 03 xx 04 1304 pmullw mm1, mm5 1305 1306 pmullw 
mm0, mm6 1307 add esi, 5 1308 1309 add edi, 4 1310 paddw mm1, mm0 1311 1312 paddw mm1, mm4 1313 psrlw mm1, 8 1314 1315 cmp esi, edx 1316 packuswb mm1, mm7 1317 1318 movd DWORD PTR [edi-4], mm1 1319 1320 jl horizontal_line_5_4_loop 1321 1322 } 1323 1324 } 1325 __declspec(align(16)) const static unsigned short one_fourths[] = { 64, 64, 64, 64 }; 1326 __declspec(align(16)) const static unsigned short two_fourths[] = { 128, 128, 128, 128 }; 1327 __declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 }; 1328 1329 static 1330 void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) 1331 { 1332 1333 __asm 1334 { 1335 push ebx 1336 1337 mov esi, source // Get the source and destination pointer 1338 mov ecx, src_pitch // Get the pitch size 1339 1340 mov edi, dest // tow lines below 1341 pxor mm7, mm7 // clear out mm7 1342 1343 mov edx, dest_pitch // Loop counter 1344 mov ebx, dest_width 1345 1346 vs_5_4_loop: 1347 1348 movd mm0, DWORD ptr [esi] // src[0]; 1349 movd mm1, DWORD ptr [esi+ecx] // src[1]; 1350 1351 movd mm2, DWORD ptr [esi+ecx*2] 1352 lea eax, [esi+ecx*2] // 1353 1354 punpcklbw mm1, mm7 1355 punpcklbw mm2, mm7 1356 1357 movq mm3, mm2 1358 pmullw mm1, three_fourths 1359 1360 pmullw mm2, one_fourths 1361 movd mm4, [eax+ecx] 1362 1363 pmullw mm3, two_fourths 1364 punpcklbw mm4, mm7 1365 1366 movq mm5, mm4 1367 pmullw mm4, two_fourths 1368 1369 paddw mm1, mm2 1370 movd mm6, [eax+ecx*2] 1371 1372 pmullw mm5, one_fourths 1373 paddw mm1, round_values; 1374 1375 paddw mm3, mm4 1376 psrlw mm1, 8 1377 1378 punpcklbw mm6, mm7 1379 paddw mm3, round_values 1380 1381 pmullw mm6, three_fourths 1382 psrlw mm3, 8 1383 1384 packuswb mm1, mm7 1385 packuswb mm3, mm7 1386 1387 movd DWORD PTR [edi], mm0 1388 movd DWORD PTR [edi+edx], mm1 1389 1390 1391 paddw mm5, mm6 1392 movd DWORD PTR [edi+edx*2], mm3 1393 1394 lea eax, [edi+edx*2] 1395 paddw mm5, 
round_values 1396 1397 psrlw mm5, 8 1398 add edi, 4 1399 1400 packuswb mm5, mm7 1401 movd DWORD PTR [eax+edx], mm5 1402 1403 add esi, 4 1404 sub ebx, 4 1405 1406 jg vs_5_4_loop 1407 1408 pop ebx 1409 } 1410 } 1411 1412 1413 __declspec(align(16)) const static unsigned short const53_1[] = { 0, 85, 171, 0 }; 1414 __declspec(align(16)) const static unsigned short const53_2[] = {256, 171, 85, 0 }; 1415 1416 1417 static 1418 void horizontal_line_5_3_scale_mmx 1419 ( 1420 const unsigned char *source, 1421 unsigned int source_width, 1422 unsigned char *dest, 1423 unsigned int dest_width 1424 ) 1425 { 1426 1427 (void) dest_width; 1428 __asm 1429 { 1430 1431 mov esi, source ; 1432 mov edi, dest ; 1433 1434 mov ecx, source_width ; 1435 movq mm5, const53_1 ; 1436 1437 pxor mm7, mm7 ; 1438 movq mm6, const53_2 ; 1439 1440 movq mm4, round_values ; 1441 lea edx, [esi+ecx-5] ; 1442 horizontal_line_5_3_loop: 1443 1444 movq mm0, QWORD PTR [esi] ; 1445 00 01 02 03 04 05 06 07 1446 movq mm1, mm0 ; 1447 00 01 02 03 04 05 06 07 1448 1449 psllw mm0, 8 ; 1450 xx 00 xx 02 xx 04 xx 06 1451 psrlw mm1, 8 ; 1452 01 xx 03 xx 05 xx 07 xx 1453 1454 psrlw mm0, 8 ; 1455 00 xx 02 xx 04 xx 06 xx 1456 psllq mm1, 16 ; 1457 xx xx 01 xx 03 xx 05 xx 1458 1459 pmullw mm0, mm6 1460 1461 pmullw mm1, mm5 1462 add esi, 5 1463 1464 add edi, 3 1465 paddw mm1, mm0 1466 1467 paddw mm1, mm4 1468 psrlw mm1, 8 1469 1470 cmp esi, edx 1471 packuswb mm1, mm7 1472 1473 movd DWORD PTR [edi-3], mm1 1474 jl horizontal_line_5_3_loop 1475 1476 //exit condition 1477 movq mm0, QWORD PTR [esi] ; 1478 00 01 02 03 04 05 06 07 1479 movq mm1, mm0 ; 1480 00 01 02 03 04 05 06 07 1481 1482 psllw mm0, 8 ; 1483 xx 00 xx 02 xx 04 xx 06 1484 psrlw mm1, 8 ; 1485 01 xx 03 xx 05 xx 07 xx 1486 1487 psrlw mm0, 8 ; 1488 00 xx 02 xx 04 xx 06 xx 1489 psllq mm1, 16 ; 1490 xx xx 01 xx 03 xx 05 xx 1491 1492 pmullw mm0, mm6 1493 1494 pmullw mm1, mm5 1495 paddw mm1, mm0 1496 1497 paddw mm1, mm4 1498 psrlw mm1, 8 1499 1500 packuswb mm1, mm7 1501 movd 
eax, mm1 1502 1503 mov edx, eax 1504 shr edx, 16 1505 1506 mov WORD PTR[edi], ax 1507 mov BYTE PTR[edi+2], dl 1508 1509 } 1510 1511 } 1512 1513 __declspec(align(16)) const static unsigned short one_thirds[] = { 85, 85, 85, 85 }; 1514 __declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 }; 1515 1516 static 1517 void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) 1518 { 1519 1520 __asm 1521 { 1522 push ebx 1523 1524 mov esi, source // Get the source and destination pointer 1525 mov ecx, src_pitch // Get the pitch size 1526 1527 mov edi, dest // tow lines below 1528 pxor mm7, mm7 // clear out mm7 1529 1530 mov edx, dest_pitch // Loop counter 1531 movq mm5, one_thirds 1532 1533 movq mm6, two_thirds 1534 mov ebx, dest_width; 1535 1536 vs_5_3_loop: 1537 1538 movd mm0, DWORD ptr [esi] // src[0]; 1539 movd mm1, DWORD ptr [esi+ecx] // src[1]; 1540 1541 movd mm2, DWORD ptr [esi+ecx*2] 1542 lea eax, [esi+ecx*2] // 1543 1544 punpcklbw mm1, mm7 1545 punpcklbw mm2, mm7 1546 1547 pmullw mm1, mm5 1548 pmullw mm2, mm6 1549 1550 movd mm3, DWORD ptr [eax+ecx] 1551 movd mm4, DWORD ptr [eax+ecx*2] 1552 1553 punpcklbw mm3, mm7 1554 punpcklbw mm4, mm7 1555 1556 pmullw mm3, mm6 1557 pmullw mm4, mm5 1558 1559 1560 movd DWORD PTR [edi], mm0 1561 paddw mm1, mm2 1562 1563 paddw mm1, round_values 1564 psrlw mm1, 8 1565 1566 packuswb mm1, mm7 1567 paddw mm3, mm4 1568 1569 paddw mm3, round_values 1570 movd DWORD PTR [edi+edx], mm1 1571 1572 psrlw mm3, 8 1573 packuswb mm3, mm7 1574 1575 movd DWORD PTR [edi+edx*2], mm3 1576 1577 1578 add edi, 4 1579 add esi, 4 1580 1581 sub ebx, 4 1582 jg vs_5_3_loop 1583 1584 pop ebx 1585 } 1586 } 1587 1588 1589 1590 1591 /**************************************************************************** 1592 * 1593 * ROUTINE : horizontal_line_2_1_scale 1594 * 1595 * INPUTS : const unsigned char *source : 1596 * unsigned int source_width : 
1597 * unsigned char *dest : 1598 * unsigned int dest_width : 1599 * 1600 * OUTPUTS : None. 1601 * 1602 * RETURNS : void 1603 * 1604 * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels. 1605 * 1606 * SPECIAL NOTES : None. 1607 * 1608 ****************************************************************************/ 1609 static 1610 void horizontal_line_2_1_scale_mmx 1611 ( 1612 const unsigned char *source, 1613 unsigned int source_width, 1614 unsigned char *dest, 1615 unsigned int dest_width 1616 ) 1617 { 1618 (void) dest_width; 1619 (void) source_width; 1620 __asm 1621 { 1622 mov esi, source 1623 mov edi, dest 1624 1625 pxor mm7, mm7 1626 mov ecx, dest_width 1627 1628 xor edx, edx 1629 hs_2_1_loop: 1630 1631 movq mm0, [esi+edx*2] 1632 psllw mm0, 8 1633 1634 psrlw mm0, 8 1635 packuswb mm0, mm7 1636 1637 movd DWORD Ptr [edi+edx], mm0; 1638 add edx, 4 1639 1640 cmp edx, ecx 1641 jl hs_2_1_loop 1642 1643 } 1644 } 1645 1646 1647 1648 static 1649 void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) 1650 { 1651 (void) dest_pitch; 1652 (void) src_pitch; 1653 vpx_memcpy(dest, source, dest_width); 1654 } 1655 1656 1657 __declspec(align(16)) const static unsigned short three_sixteenths[] = { 48, 48, 48, 48 }; 1658 __declspec(align(16)) const static unsigned short ten_sixteenths[] = { 160, 160, 160, 160 }; 1659 1660 static 1661 void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) 1662 { 1663 1664 (void) dest_pitch; 1665 __asm 1666 { 1667 mov esi, source 1668 mov edi, dest 1669 1670 mov eax, src_pitch 1671 mov edx, dest_width 1672 1673 pxor mm7, mm7 1674 sub esi, eax //back one line 1675 1676 1677 lea ecx, [esi+edx]; 1678 movq mm6, round_values; 1679 1680 movq mm5, three_sixteenths; 1681 movq mm4, ten_sixteenths; 1682 1683 vs_2_1_i_loop: 1684 movd mm0, [esi] // 1685 movd mm1, 
[esi+eax] // 1686 1687 movd mm2, [esi+eax*2] // 1688 punpcklbw mm0, mm7 1689 1690 pmullw mm0, mm5 1691 punpcklbw mm1, mm7 1692 1693 pmullw mm1, mm4 1694 punpcklbw mm2, mm7 1695 1696 pmullw mm2, mm5 1697 paddw mm0, round_values 1698 1699 paddw mm1, mm2 1700 paddw mm0, mm1 1701 1702 psrlw mm0, 8 1703 packuswb mm0, mm7 1704 1705 movd DWORD PTR [edi], mm0 1706 add esi, 4 1707 1708 add edi, 4; 1709 cmp esi, ecx 1710 jl vs_2_1_i_loop 1711 1712 } 1713 } 1714 1715 1716 1717 void 1718 register_mmxscalers(void) 1719 { 1720 vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx; 1721 vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx; 1722 vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx; 1723 vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx; 1724 vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx; 1725 vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx; 1726 vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx; 1727 vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx; 1728 vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx; 1729 1730 vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c; 1731 vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c; 1732 vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c; 1733 vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c; 1734 vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c; 1735 vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c; 1736 1737 1738 1739 vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx; 1740 vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx; 1741 vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx; 1742 vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx; 1743 vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx; 1744 vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx; 1745 
vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx; 1746 1747 1748 1749 1750 } 1751