/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


/****************************************************************************
*
*   Module Title :     scaleopt.cpp
*
*   Description  :     Optimized scaling functions
*
****************************************************************************/
#include "pragmas.h"



/****************************************************************************
*  Module Statics
****************************************************************************/
/*
 * Blend weights in units of 1/256.  The routines that use these tables
 * multiply unpacked pixels with pmullw, add round_values and shift right by
 * 8, so 51/102/154/205 stand in for 1/5, 2/5, 3/5 and 4/5.  Complementary
 * weights sum to 256 (51 + 205, 102 + 154).  Each table holds four 16-bit
 * lanes, i.e. one MMX register.
 */
__declspec(align(16)) const static unsigned short one_fifth[]    = {  51,  51,  51,  51 };
__declspec(align(16)) const static unsigned short two_fifths[]   = { 102, 102, 102, 102 };
__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
__declspec(align(16)) const static unsigned short four_fifths[]  = { 205, 205, 205, 205 };
/* 128 = half of 256: rounding term added before the >> 8. */
__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
/* Rounding term for the (a + b + 1) >> 1 averages in the 1:2 routines. */
__declspec(align(16)) const static unsigned short four_ones[]    = { 1, 1, 1, 1};
/* Per-lane weights for the 4:5 horizontal scaler: const45_1 multiplies      */
/* pixels n..n+3, const45_2 multiplies pixels n+1..n+4.                      */
__declspec(align(16)) const static unsigned short const45_2[]    = {205, 154, 102,  51 };
__declspec(align(16)) const static unsigned short const45_1[]    = { 51, 102, 154, 205 };
/* Selects byte 6 of a quadword; the 4:5 tail uses it to duplicate the last  */
/* source pixel into byte 7.                                                 */
__declspec(align(16)) const static unsigned char  mask45[]       = { 0, 0, 0, 0, 0, 0, 255, 0};
/* Per-lane weights for the 3:5 horizontal scaler (see const45_* above).     */
__declspec(align(16)) const static unsigned short const35_2[]    = { 154,  51, 205, 102 };
__declspec(align(16)) const static unsigned short const35_1[]    = { 102, 205,  51, 154 };



#include "vpx_scale/vpxscale.h"
#include "vpx_mem/vpx_mem.h"


/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_3_5_scale_mmx
 *
 *  INPUTS        : const unsigned char *source : First pixel of the source line.
 *                  unsigned int source_width    : Number of source pixels; the
 *                                                 loop runs until the read
 *                                                 pointer reaches
 *                                                 source + source_width - 3.
 *                  unsigned char *dest         : First pixel of the output line.
 *                  unsigned int dest_width      : Not used.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
 *                  For source pixels p0 p1 p2 and p3 (the first pixel of the
 *                  next group) the five outputs are
 *                      out0 = p0
 *                      out1 = (102*p0 + 154*p1 + 128) >> 8
 *                      out2 = (205*p1 +  51*p2 + 128) >> 8
 *                      out3 = ( 51*p1 + 205*p2 + 128) >> 8
 *                      out4 = (154*p2 + 102*p3 + 128) >> 8
 *
 *  SPECIAL NOTES : The code after the loop handles one more group with p3
 *                  replaced by p2.  The loop body runs once before its bound
 *                  is tested.  Every group is fetched with a 4-byte load.
 *                  No emms is issued here.
 *                  NOTE(review): presumably the caller guarantees at least
 *                  two groups, a readable byte past the line, and clears the
 *                  MMX state afterwards - confirm.
 *
 ****************************************************************************/
static
void horizontal_line_3_5_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    (void) dest_width;

    __asm
    {

        push rbx                                // rbx is used as scratch below

        mov         rsi,    source
        mov         rdi,    dest

        mov         ecx,    source_width
        lea         rdx,    [rsi+rcx-3];        // rdx = loop bound (start of the last group)

        movq        mm5,    const35_1           // mm5 = 66 xx cd xx 33 xx 9a xx
        movq        mm6,    const35_2           // mm6 = 9a xx 33 xx cd xx 66 xx

        movq        mm4,    round_values        // mm4 = 80 xx 80 xx 80 xx 80 xx
        pxor        mm7,    mm7                 // clear mm7 (zero source for byte->word unpack)

        horiz_line_3_5_loop:

        // Byte diagrams list bytes from lowest to highest address.
        mov         eax,    DWORD PTR [rsi]     // eax = 00 01 02 03
        mov         ebx,    eax

        and         ebx,    0xffff00            // ebx = xx 01 02 xx
        mov         ecx,    eax                 // ecx = 00 01 02 03

        and         eax,    0xffff0000          // eax = xx xx 02 03
        xor         ecx,    eax                 // ecx = 00 01 xx xx

        shr         ebx,    8                   // ebx = 01 02 xx xx
        or          eax,    ebx                 // eax = 01 02 02 03

        shl         ebx,    16                  // ebx = xx xx 01 02
        movd        mm1,    eax                 // mm1 = 01 02 02 03 xx xx xx xx

        or          ebx,    ecx                 // ebx = 00 01 01 02
        punpcklbw   mm1,    mm7                 // mm1 = 01 xx 02 xx 02 xx 03 xx

        movd        mm0,    ebx                 // mm0 = 00 01 01 02
        pmullw      mm1,    mm6                 // right-hand neighbours * const35_2

        punpcklbw   mm0,    mm7                 // mm0 = 00 xx 01 xx 01 xx 02 xx
        pmullw      mm0,    mm5                 // left-hand neighbours * const35_1

        mov         [rdi],  ebx                 // write output 00 xx xx xx (bytes 1..3 are overwritten below)
        add         rsi,    3

        add         rdi,    5
        paddw       mm0,    mm1                 // sum of the two weighted terms

        paddw       mm0,    mm4                 // + 128
        psrlw       mm0,    8

        cmp         rsi,    rdx                 // flags are consumed by the jl below; the MMX
        packuswb    mm0,    mm7                 // instructions in between leave them untouched

        movd        DWORD Ptr [rdi-4], mm0      // write outputs 01 02 03 04 of this group
        jl          horiz_line_3_5_loop

//Exit: last group - there is no pixel 03, so pixel 02 is used in its place
        mov         eax,    DWORD PTR [rsi]     // eax = 00 01 02 03
        mov         ebx,    eax

        and         ebx,    0xffff00            // ebx = xx 01 02 xx
        mov         ecx,    eax                 // ecx = 00 01 02 03

        and         eax,    0xffff0000          // eax = xx xx 02 03
        xor         ecx,    eax                 // ecx = 00 01 xx xx

        shr         ebx,    8                   // ebx = 01 02 xx xx
        or          eax,    ebx                 // eax = 01 02 02 03

        shl         eax,    8                   // eax = xx 01 02 02
        and         eax,    0xffff0000          // eax = xx xx 02 02

        or          eax,    ebx                 // eax = 01 02 02 02

        shl         ebx,    16                  // ebx = xx xx 01 02
        movd        mm1,    eax                 // mm1 = 01 02 02 02 xx xx xx xx

        or          ebx,    ecx                 // ebx = 00 01 01 02
        punpcklbw   mm1,    mm7                 // mm1 = 01 xx 02 xx 02 xx 02 xx

        movd        mm0,    ebx                 // mm0 = 00 01 01 02
        pmullw      mm1,    mm6                 // right-hand neighbours * const35_2

        punpcklbw   mm0,    mm7                 // mm0 = 00 xx 01 xx 01 xx 02 xx
        pmullw      mm0,    mm5                 // left-hand neighbours * const35_1

        mov         [rdi],  ebx                 // write output 00 xx xx xx
        paddw       mm0,    mm1                 // sum of the two weighted terms

        paddw       mm0,    mm4                 // + 128
        psrlw       mm0,    8

        packuswb    mm0,    mm7
        movd        DWORD Ptr [rdi+1], mm0      // write outputs 01 02 03 04 of the last group

        pop rbx

    }

}


/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_4_5_scale_mmx
 *
 *  INPUTS        : const unsigned char *source :
 *                  unsigned int source_width    :
 *                  unsigned char *dest         :
 *                  unsigned int dest_width      :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
/*
 * Eight source pixels p0..p7 plus p8 (the first pixel of the next group)
 * produce ten output pixels per iteration:
 *     out0 = p0                               out5 = p4
 *     out1 = ( 51*p0 + 205*p1 + 128) >> 8     out6 = ( 51*p4 + 205*p5 + 128) >> 8
 *     out2 = (102*p1 + 154*p2 + 128) >> 8     out7 = (102*p5 + 154*p6 + 128) >> 8
 *     out3 = (154*p2 + 102*p3 + 128) >> 8     out8 = (154*p6 + 102*p7 + 128) >> 8
 *     out4 = (205*p3 +  51*p4 + 128) >> 8     out9 = (205*p7 +  51*p8 + 128) >> 8
 * The code after the loop handles one more group of eight with p8 replaced
 * by p7.  The loop body runs once before its bound (source + source_width - 8)
 * is tested.  dest_width is not used.  No emms is issued here.
 * NOTE(review): presumably callers guarantee at least two groups of eight
 * and clear the MMX state afterwards - confirm.
 */
static
void horizontal_line_4_5_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    (void)dest_width;

    __asm
    {

        mov         rsi,    source
        mov         rdi,    dest

        mov         ecx,    source_width
        lea         rdx,    [rsi+rcx-8];        // rdx = loop bound (start of the last group)

        movq        mm5,    const45_1           // mm5 = 33 xx 66 xx 9a xx cd xx
        movq        mm6,    const45_2           // mm6 = cd xx 9a xx 66 xx 33 xx

        movq        mm4,    round_values        // mm4 = 80 xx 80 xx 80 xx 80 xx
        pxor        mm7,    mm7                 // clear mm7 (zero source for byte->word unpack)

        horiz_line_4_5_loop:

        movq        mm0,    QWORD PTR [rsi]     // mm0 = 00 01 02 03 04 05 06 07
        movq        mm1,    QWORD PTR [rsi+1];  // mm1 = 01 02 03 04 05 06 07 08

        movq        mm2,    mm0                 // mm2 = 00 01 02 03 04 05 06 07
        movq        mm3,    mm1                 // mm3 = 01 02 03 04 05 06 07 08

        movd        DWORD PTR [rdi],  mm0       // write output 00 xx xx xx (bytes 1..3 are overwritten below)
        punpcklbw   mm0,    mm7                 // mm0 = 00 xx 01 xx 02 xx 03 xx

        punpcklbw   mm1,    mm7                 // mm1 = 01 xx 02 xx 03 xx 04 xx
        pmullw      mm0,    mm5                 // 00* 51 01*102 02*154 03*205

        pmullw      mm1,    mm6                 // 01*205 02*154 03*102 04* 51
        punpckhbw   mm2,    mm7                 // mm2 = 04 xx 05 xx 06 xx 07 xx

        movd        DWORD PTR [rdi+5], mm2      // write output 05 xx xx xx (low byte is source pixel 04)
        pmullw      mm2,    mm5                 // 04* 51 05*102 06*154 07*205

        punpckhbw   mm3,    mm7                 // mm3 = 05 xx 06 xx 07 xx 08 xx
        pmullw      mm3,    mm6                 // 05*205 06*154 07*102 08* 51

        paddw       mm0,    mm1                 // sum of the two weighted terms
        paddw       mm0,    mm4                 // added round values

        psrlw       mm0,    8                   // output: 01 xx 02 xx 03 xx 04 xx
        packuswb    mm0,    mm7

        movd        DWORD PTR [rdi+1], mm0      // write output 01 02 03 04
        add         rdi,    10

        add         rsi,    8
        paddw       mm2,    mm3                 // sum of the two weighted terms

        paddw       mm2,    mm4                 // added round values
        cmp         rsi,    rdx                 // flags are consumed by the jl below

        psrlw       mm2,    8
        packuswb    mm2,    mm7

        movd        DWORD PTR [rdi-4], mm2      // write output 06 07 08 09
        jl          horiz_line_4_5_loop

//Exit: last group of eight - there is no pixel 08, so pixel 07 is used in its place
        movq        mm0,    [rsi]               // mm0 = 00 01 02 03 04 05 06 07
        movq        mm1,    mm0                 // mm1 = 00 01 02 03 04 05 06 07

        movq        mm2,    mm0                 // mm2 = 00 01 02 03 04 05 06 07
        psrlq       mm1,    8                   // mm1 = 01 02 03 04 05 06 07 00

        movq        mm3,    mask45              // mm3 = 00 00 00 00 00 00 ff 00
        pand        mm3,    mm1                 // mm3 = 00 00 00 00 00 00 07 00

        psllq       mm3,    8                   // mm3 = 00 00 00 00 00 00 00 07
        por         mm1,    mm3                 // mm1 = 01 02 03 04 05 06 07 07

        movq        mm3,    mm1

        movd        DWORD PTR [rdi], mm0        // write output 00 xx xx xx
        punpcklbw   mm0,    mm7                 // mm0 = 00 xx 01 xx 02 xx 03 xx

        punpcklbw   mm1,    mm7                 // mm1 = 01 xx 02 xx 03 xx 04 xx
        pmullw      mm0,    mm5                 // 00* 51 01*102 02*154 03*205

        pmullw      mm1,    mm6                 // 01*205 02*154 03*102 04* 51
        punpckhbw   mm2,    mm7                 // mm2 = 04 xx 05 xx 06 xx 07 xx

        movd        DWORD PTR [rdi+5], mm2      // write output 05 xx xx xx
        pmullw      mm2,    mm5                 // 04* 51 05*102 06*154 07*205

        punpckhbw   mm3,    mm7                 // mm3 = 05 xx 06 xx 07 xx 07 xx
        pmullw      mm3,    mm6                 // 05*205 06*154 07*102 07* 51

        paddw       mm0,    mm1                 // sum of the two weighted terms
        paddw       mm0,    mm4                 // added round values

        psrlw       mm0,    8                   // output: 01 xx 02 xx 03 xx 04 xx
        packuswb    mm0,    mm7                 // 01 02 03 04 xx xx xx xx

        movd        DWORD PTR [rdi+1], mm0      // write output 01 02 03 04
        paddw       mm2,    mm3                 // sum of the two weighted terms

        paddw       mm2,    mm4                 // added round values
        psrlw       mm2,    8

        packuswb    mm2,    mm7
        movd        DWORD PTR [rdi+6], mm2      // write output 06 07 08 09


    }
}

/****************************************************************************
 *
 *  ROUTINE       : vertical_band_4_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has a "C" only
 *                  version.
 *
 ****************************************************************************/
/*
 * Works in place on the rows starting at dest, eight columns per iteration.
 * With a, b, c, d the rows at dest + 0..3 * dest_pitch and an the row at
 * dest + 5 * dest_pitch (Src[0] of the next group):
 *     row 1 = ( 51*a + 205*b  + 128) >> 8
 *     row 2 = (102*b + 154*c  + 128) >> 8
 *     row 3 = (154*c + 102*d  + 128) >> 8
 *     row 4 = (205*d +  51*an + 128) >> 8
 * Row 0 is left as it is.  The column loop steps by 8 until the counter is
 * <= 0, so a dest_width that is not a multiple of 8 is rounded up.
 * No emms is issued here.
 */
static
void vertical_band_4_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {

        mov         rsi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch              // Get the pitch size

        lea         rdi,    [rsi+rcx*2]             // two lines below
        add         rdi,    rcx                     // three lines below

        pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width              // Loop counter

        vs_4_5_loop:

        movq        mm0,    QWORD ptr [rsi]         // src[0];
        movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];

        movq        mm2,    mm0                     // Make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word

        movq        mm5,    one_fifth
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm0,    mm5                     // a * 1/5

        movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word

        pmullw      mm2,    mm5                     // a * 1/5
        movq        mm6,    four_fifths             // constant

        movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 4/5

        punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b

        pmullw      mm5,    mm6                     // b * 4/5
        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5

        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
        paddw       mm0,    round_values            // + 128

        paddw       mm2,    round_values            // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des [1]

        movq        QWORD ptr [rsi+rcx], mm0        // write des[1] (the unpacked b is still held in mm1/mm3)
        movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]

        // mm1, mm3 --- Src[1]
        // mm0 --- Src[2]
        // mm7 for unpacking

        movq        mm5,    two_fifths
        movq        mm2,    mm0                     // make a copy

        pmullw      mm1,    mm5                     // b * 2/5
        movq        mm6,    three_fifths


        punpcklbw   mm0,    mm7                     // unpack low to word
        pmullw      mm3,    mm5                     // b * 2/5

        movq        mm4,    mm0                     // make copy of c
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm4,    mm6                     // c * 3/5
        movq        mm5,    mm2

        pmullw      mm5,    mm6                     // c * 3/5
        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5

        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
        paddw       mm1,    round_values            // + 128

        paddw       mm3,    round_values            // + 128
        psrlw       mm1,    8

        psrlw       mm3,    8
        packuswb    mm1,    mm3                     // des[2]

        movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2] (the unpacked c is still held in mm0/mm2)
        movq        mm1,    [rdi]                   // mm1=Src[3];

        // mm0, mm2 --- Src[2]
        // mm1 --- Src[3]
        // mm6 --- 3/5
        // mm7 for unpacking

        pmullw      mm0,    mm6                     // c * 3/5
        movq        mm5,    two_fifths              // mm5 = 2/5

        movq        mm3,    mm1                     // make a copy
        pmullw      mm2,    mm6                     // c * 3/5

        punpcklbw   mm1,    mm7                     // unpack low
        movq        mm4,    mm1                     // make a copy

        punpckhbw   mm3,    mm7                     // unpack high
        pmullw      mm4,    mm5                     // d * 2/5

        movq        mm6,    mm3                     // make a copy
        pmullw      mm6,    mm5                     // d * 2/5

        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5

        paddw       mm0,    round_values            // + 128
        paddw       mm2,    round_values            // + 128

        psrlw       mm0,    8
        psrlw       mm2,    8

        packuswb    mm0,    mm2                     // des[3]
        movq        QWORD ptr [rdi], mm0            // write des[3] (the unpacked d is still held in mm1/mm3)

        // mm1, mm3 --- Src[3]
        // mm7 -- cleared for unpacking

        movq        mm0,    [rdi+rcx*2]             // mm0, Src[0] of the next group (row 5)

        movq        mm5,    four_fifths             // mm5 = 4/5
        pmullw      mm1,    mm5                     // d * 4/5

        movq        mm6,    one_fifth               // mm6 = 1/5
        movq        mm2,    mm0                     // make a copy

        pmullw      mm3,    mm5                     // d * 4/5
        punpcklbw   mm0,    mm7                     // unpack low

        pmullw      mm0,    mm6                     // an * 1/5
        punpckhbw   mm2,    mm7                     // unpack high

        paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
        pmullw      mm2,    mm6                     // an * 1/5

        paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
        paddw       mm1,    round_values            // + 128

        paddw       mm3,    round_values            // + 128
        psrlw       mm1,    8

        psrlw       mm3,    8
        packuswb    mm1,    mm3                     // des[4]

        movq        QWORD ptr [rdi+rcx], mm1        // write des[4]

        add         rdi,    8                       // next eight columns
        add         rsi,    8

        sub         rdx,    8
        jg          vs_4_5_loop
    }
}

/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_4_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    : First row of the band; rows are
 *                                           read and rewritten in place.
 *                  unsigned int dest_pitch : Byte distance between rows.
 *                  unsigned int dest_width : Number of columns; processed
 *                                            eight at a time.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : None
 *
 *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
 *                  With a, b, c, d the rows at dest + 0..3 * dest_pitch:
 *                      row 1 = ( 51*a + 205*b + 128) >> 8
 *                      row 2 = (102*b + 154*c + 128) >> 8
 *                      row 3 = (154*c + 102*d + 128) >> 8
 *                      row 4 = d
 *
 *  SPECIAL NOTES : Unlike vertical_band_4_5_scale_mmx this routine does not
 *                  read a line from a band below; row 4 is a copy of the
 *                  original row 3. The function also has a "C" only
 *                  version.
 *
 ****************************************************************************/
static
void last_vertical_band_4_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {
        mov         rsi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch              // Get the pitch size

        lea         rdi,    [rsi+rcx*2]             // two lines below
        add         rdi,    rcx                     // three lines below

        pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width              // Loop counter

        last_vs_4_5_loop:

        movq        mm0,    QWORD ptr [rsi]         // src[0];
        movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];

        movq        mm2,    mm0                     // Make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word

        movq        mm5,    one_fifth
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm0,    mm5                     // a * 1/5

        movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word

        pmullw      mm2,    mm5                     // a * 1/5
        movq        mm6,    four_fifths             // constant

        movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 4/5

        punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b

        pmullw      mm5,    mm6                     // b * 4/5
        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5

        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
        paddw       mm0,    round_values            // + 128

        paddw       mm2,    round_values            // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des [1]

        movq        QWORD ptr [rsi+rcx], mm0        // write des[1]
        movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]

        // mm1, mm3 --- Src[1]
        // mm0 --- Src[2]
        // mm7 for unpacking

        movq        mm5,    two_fifths
        movq        mm2,    mm0                     // make a copy

        pmullw      mm1,    mm5                     // b * 2/5
        movq        mm6,    three_fifths


        punpcklbw   mm0,    mm7                     // unpack low to word
        pmullw      mm3,    mm5                     // b * 2/5

        movq        mm4,    mm0                     // make copy of c
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm4,    mm6                     // c * 3/5
        movq        mm5,    mm2

        pmullw      mm5,    mm6                     // c * 3/5
        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5

        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
        paddw       mm1,    round_values            // + 128

        paddw       mm3,    round_values            // + 128
        psrlw       mm1,    8

        psrlw       mm3,    8
        packuswb    mm1,    mm3                     // des[2]

        movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]
        movq        mm1,    [rdi]                   // mm1=Src[3];

        movq        QWORD ptr [rdi+rcx], mm1        // write des[4]; plain copy of Src[3], taken before row 3 is rewritten

        // mm0, mm2 --- Src[2]
        // mm1 --- Src[3]
        // mm6 --- 3/5
        // mm7 for unpacking

        pmullw      mm0,    mm6                     // c * 3/5
        movq        mm5,    two_fifths              // mm5 = 2/5

        movq        mm3,    mm1                     // make a copy
        pmullw      mm2,    mm6                     // c * 3/5

        punpcklbw   mm1,    mm7                     // unpack low
        movq        mm4,    mm1                     // make a copy

        punpckhbw   mm3,    mm7                     // unpack high
        pmullw      mm4,    mm5                     // d * 2/5

        movq        mm6,    mm3                     // make a copy
        pmullw      mm6,    mm5                     // d * 2/5

        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5

        paddw       mm0,    round_values            // + 128
        paddw       mm2,    round_values            // + 128

        psrlw       mm0,    8
        psrlw       mm2,    8

        packuswb    mm0,    mm2                     // des[3]
        movq        QWORD ptr [rdi], mm0            // write des[3]

        // mm1, mm3 --- Src[3]
        // mm7 -- cleared for unpacking
        add         rdi,    8                       // next eight columns
        add         rsi,    8

        sub         rdx,    8
        jg          last_vs_4_5_loop
    }
}

/****************************************************************************
 *
 *  ROUTINE       : vertical_band_3_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has a "C" only
 *                  version.
 *
 ****************************************************************************/
/*
 * Works in place on the rows starting at dest, eight columns per iteration.
 * With a, b, c the rows at dest + 0..2 * dest_pitch and an the row at
 * dest + 5 * dest_pitch (Src[0] of the next group):
 *     row 1 = (102*a + 154*b  + 128) >> 8
 *     row 2 = (205*b +  51*c  + 128) >> 8
 *     row 3 = ( 51*b + 205*c  + 128) >> 8
 *     row 4 = (154*c + 102*an + 128) >> 8
 * Row 0 is left as it is.  mm7 doubles as a scratch register in the middle
 * of the loop body and is zeroed again before it is next used for unpacking.
 * No emms is issued here.
 */
static
void vertical_band_3_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {
        mov         rsi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch              // Get the pitch size

        lea         rdi,    [rsi+rcx*2]             // two lines below
        add         rdi,    rcx                     // three lines below

        pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width              // Loop counter

        vs_3_5_loop:

        movq        mm0,    QWORD ptr [rsi]         // src[0];
        movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];

        movq        mm2,    mm0                     // Make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word

        movq        mm5,    two_fifths              // mm5 = 2/5
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm0,    mm5                     // a * 2/5

        movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word

        pmullw      mm2,    mm5                     // a * 2/5
        movq        mm6,    three_fifths            // mm6 = 3/5

        movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 3/5

        punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b

        pmullw      mm5,    mm6                     // b * 3/5
        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5

        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
        paddw       mm0,    round_values            // + 128

        paddw       mm2,    round_values            // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des [1]

        movq        QWORD ptr [rsi+rcx], mm0        // write des[1] (the unpacked b is still held in mm1/mm3)
        movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]

        // mm1, mm3 --- Src[1]
        // mm0 --- Src[2]
        // mm7 for unpacking

        movq        mm4,    mm1                     // b low
        pmullw      mm1,    four_fifths             // b * 4/5 low

        movq        mm5,    mm3                     // b high
        pmullw      mm3,    four_fifths             // b * 4/5 high

        movq        mm2,    mm0                     // c
        pmullw      mm4,    one_fifth               // b * 1/5

        punpcklbw   mm0,    mm7                     // c low
        pmullw      mm5,    one_fifth               // b * 1/5

        movq        mm6,    mm0                     // make copy of c low
        punpckhbw   mm2,    mm7                     // c high

        pmullw      mm6,    one_fifth               // c * 1/5 low
        movq        mm7,    mm2                     // make copy of c high (mm7 is no longer zero from here)

        pmullw      mm7,    one_fifth               // c * 1/5 high
        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low

        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
        movq        mm6,    mm0                     // make copy of c low

        pmullw      mm6,    four_fifths             // c * 4/5 low
        movq        mm7,    mm2                     // make copy of c high

        pmullw      mm7,    four_fifths             // c * 4/5 high

        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high

        paddw       mm1,    round_values            // + 128
        paddw       mm3,    round_values            // + 128

        psrlw       mm1,    8
        psrlw       mm3,    8

        packuswb    mm1,    mm3                     // des[2]
        movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]

        paddw       mm4,    round_values            // + 128
        paddw       mm5,    round_values            // + 128

        psrlw       mm4,    8
        psrlw       mm5,    8

        packuswb    mm4,    mm5                     // des[3]
        movq        QWORD ptr [rdi], mm4            // write des[3]

        // mm0, mm2 --- Src[2] (c), unpacked to words

        pxor        mm7,    mm7                     // clear mm7 for unpacking
        movq        mm1,    [rdi+rcx*2]             // mm1 = Src[0] of the next group (row 5)

        movq        mm5,    three_fifths            // mm5 = 3/5
        pmullw      mm0,    mm5                     // c * 3/5

        movq        mm6,    two_fifths              // mm6 = 2/5
        movq        mm3,    mm1                     // make a copy

        pmullw      mm2,    mm5                     // c * 3/5
        punpcklbw   mm1,    mm7                     // unpack low

        pmullw      mm1,    mm6                     // an * 2/5
        punpckhbw   mm3,    mm7                     // unpack high

        paddw       mm0,    mm1                     // c * 3/5 + an * 2/5
        pmullw      mm3,    mm6                     // an * 2/5

        paddw       mm2,    mm3                     // c * 3/5 + an * 2/5
        paddw       mm0,    round_values            // + 128

        paddw       mm2,    round_values            // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des[4]

        movq        QWORD ptr [rdi+rcx], mm0        // write des[4]

        add         rdi,    8                       // next eight columns
        add         rsi,    8

        sub         rdx,    8
        jg          vs_3_5_loop
    }
}

/**************************************************************************** 827 * 828 * ROUTINE : last_vertical_band_3_5_scale_mmx 829 * 830 * INPUTS : unsigned char *dest : 831 * unsigned int dest_pitch : 832 * unsigned int dest_width : 833 * 834 * OUTPUTS : None. 835 * 836 * RETURNS : void 837 * 838 * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels. 839 * 840 * SPECIAL NOTES : The routine uses the first line of the band below 841 * the current band. The function also has an "C" only 842 * version. 843 * 844 ****************************************************************************/ 845 static 846 void last_vertical_band_3_5_scale_mmx 847 ( 848 unsigned char *dest, 849 unsigned int dest_pitch, 850 unsigned int dest_width 851 ) 852 { 853 __asm 854 { 855 mov rsi, dest // Get the source and destination pointer 856 mov ecx, dest_pitch // Get the pitch size 857 858 lea rdi, [rsi+rcx*2] // tow lines below 859 add rdi, rcx // three lines below 860 861 pxor mm7, mm7 // clear out mm7 862 mov edx, dest_width // Loop counter 863 864 865 last_vs_3_5_loop: 866 867 movq mm0, QWORD ptr [rsi] // src[0]; 868 movq mm1, QWORD ptr [rsi+rcx] // src[1]; 869 870 movq mm2, mm0 // Make a copy 871 punpcklbw mm0, mm7 // unpack low to word 872 873 movq mm5, two_fifths // mm5 = 2/5 874 punpckhbw mm2, mm7 // unpack high to word 875 876 pmullw mm0, mm5 // a * 2/5 877 878 movq mm3, mm1 // make a copy 879 punpcklbw mm1, mm7 // unpack low to word 880 881 pmullw mm2, mm5 // a * 2/5 882 movq mm6, three_fifths // mm6 = 3/5 883 884 movq mm4, mm1 // copy of low b 885 pmullw mm4, mm6 // b * 3/5 886 887 punpckhbw mm3, mm7 // unpack high to word 888 movq mm5, mm3 // copy of high b 889 890 pmullw mm5, mm6 // b * 3/5 891 paddw mm0, mm4 // a * 2/5 + b * 3/5 892 893 paddw mm2, mm5 // a * 2/5 + b * 3/5 894 paddw mm0, round_values // + 128 895 896 paddw mm2, round_values // + 128 897 psrlw mm0, 8 898 899 psrlw mm2, 8 900 packuswb mm0, mm2 // des [1] 901 902 movq QWORD ptr [rsi+rcx], mm0 // 
write des[1] 903 movq mm0, [rsi+rcx*2] // mm0 = src[2] 904 905 906 907 // mm1, mm3 --- Src[1] 908 // mm0 --- Src[2] 909 // mm7 for unpacking 910 911 movq mm4, mm1 // b low 912 pmullw mm1, four_fifths // b * 4/5 low 913 914 movq QWORD ptr [rdi+rcx], mm0 // write des[4] 915 916 movq mm5, mm3 // b high 917 pmullw mm3, four_fifths // b * 4/5 high 918 919 movq mm2, mm0 // c 920 pmullw mm4, one_fifth // b * 1/5 921 922 punpcklbw mm0, mm7 // c low 923 pmullw mm5, one_fifth // b * 1/5 924 925 movq mm6, mm0 // make copy of c low 926 punpckhbw mm2, mm7 // c high 927 928 pmullw mm6, one_fifth // c * 1/5 low 929 movq mm7, mm2 // make copy of c high 930 931 pmullw mm7, one_fifth // c * 1/5 high 932 paddw mm1, mm6 // b * 4/5 + c * 1/5 low 933 934 paddw mm3, mm7 // b * 4/5 + c * 1/5 high 935 movq mm6, mm0 // make copy of c low 936 937 pmullw mm6, four_fifths // c * 4/5 low 938 movq mm7, mm2 // make copy of c high 939 940 pmullw mm7, four_fifths // c * 4/5 high 941 942 paddw mm4, mm6 // b * 1/5 + c * 4/5 low 943 paddw mm5, mm7 // b * 1/5 + c * 4/5 high 944 945 paddw mm1, round_values // + 128 946 paddw mm3, round_values // + 128 947 948 psrlw mm1, 8 949 psrlw mm3, 8 950 951 packuswb mm1, mm3 // des[2] 952 movq QWORD ptr [rsi+rcx*2], mm1 // write des[2] 953 954 paddw mm4, round_values // + 128 955 paddw mm5, round_values // + 128 956 957 psrlw mm4, 8 958 psrlw mm5, 8 959 960 packuswb mm4, mm5 // des[3] 961 movq QWORD ptr [rdi], mm4 // write des[3] 962 963 // mm0, mm2 --- Src[3] 964 965 add rdi, 8 966 add rsi, 8 967 968 sub rdx, 8 969 jg last_vs_3_5_loop 970 } 971 } 972 973 /**************************************************************************** 974 * 975 * ROUTINE : vertical_band_1_2_scale_mmx 976 * 977 * INPUTS : unsigned char *dest : 978 * unsigned int dest_pitch : 979 * unsigned int dest_width : 980 * 981 * OUTPUTS : None. 982 * 983 * RETURNS : void 984 * 985 * FUNCTION : 1 to 2 up-scaling of a band of pixels. 
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has a "C" only
 *                  version.
 *
 ****************************************************************************/
/*
 * Fills the row at dest + dest_pitch with the rounded average of the rows
 * at dest and dest + 2 * dest_pitch:
 *     row 1 = (row 0 + row 2 + 1) >> 1
 * Eight columns are handled per iteration; the loop steps by 8 until the
 * counter is <= 0.  No emms is issued here.
 */
static
void vertical_band_1_2_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {

        mov         rsi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch              // Get the pitch size

        pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width              // Loop counter

        vs_1_2_loop:

        movq        mm0,    [rsi]                   // get Src[0]
        movq        mm1,    [rsi + rcx * 2]         // get Src[1] (two rows down)

        movq        mm2,    mm0                     // make copy before unpack
        movq        mm3,    mm1                     // make copy before unpack

        punpcklbw   mm0,    mm7                     // low Src[0]
        movq        mm6,    four_ones               // mm6= 1, 1, 1, 1

        punpcklbw   mm1,    mm7                     // low Src[1]
        paddw       mm0,    mm1                     // low (a + b)

        punpckhbw   mm2,    mm7                     // high Src[0]
        paddw       mm0,    mm6                     // low (a + b + 1)

        punpckhbw   mm3,    mm7                     // high Src[1]
        paddw       mm2,    mm3                     // high (a + b )

        psraw       mm0,    1                       // low (a + b +1 )/2
        paddw       mm2,    mm6                     // high (a + b + 1)

        psraw       mm2,    1                       // high (a + b + 1)/2
        packuswb    mm0,    mm2                     // pack results

        movq        [rsi+rcx], mm0                  // write out eight bytes to the row in between
        add         rsi,    8

        sub         rdx,    8
        jg          vs_1_2_loop
    }

}

/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_1_2_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 1 to 2 up-scaling of band of pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has a "C" only
 *                  version.
 *
 ****************************************************************************/
/*
 * Copies the row at dest to the row at dest + dest_pitch, eight bytes per
 * iteration; the loop steps by 8 until the counter is <= 0.  No other row
 * is read.  No emms is issued here.
 */
static
void last_vertical_band_1_2_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {
        mov         rsi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch              // Get the pitch size

        mov         edx,    dest_width              // Loop counter

        last_vs_1_2_loop:

        movq        mm0,    [rsi]                   // get Src[0]
        movq        [rsi+rcx], mm0                  // write out eight bytes to the next row

        add         rsi,    8
        sub         rdx,    8

        jg          last_vs_1_2_loop
    }
}

/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_1_2_scale_mmx
 *
 *  INPUTS        : const unsigned char *source :
 *                  unsigned int source_width    :
 *                  unsigned char *dest         :
 *                  unsigned int dest_width      :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
/*
 * Eight source pixels become sixteen output pixels per iteration:
 *     out[2i]     = src[i]
 *     out[2i + 1] = (src[i] + src[i + 1] + 1) >> 1
 * The code after the loop handles one more group of eight in which the last
 * pixel is averaged with itself instead of with a ninth pixel.
 * dest_width is not used.  No emms is issued here.
 * NOTE(review): the loop body runs once before the remaining count is
 * tested, so a source_width of 8 or less looks like it would process pixels
 * past the line - confirm that callers never pass such widths.
 */
static
void horizontal_line_1_2_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    (void) dest_width;

    __asm
    {
        mov         rsi,    source
        mov         rdi,    dest

        pxor        mm7,    mm7                     // zero source for byte->word unpack
        movq        mm6,    four_ones               // rounding term

        mov         ecx,    source_width            // remaining source pixels

        hs_1_2_loop:

        movq        mm0,    [rsi]                   // pixels 0..7
        movq        mm1,    [rsi+1]                 // pixels 1..8 (right-hand neighbours)

        movq        mm2,    mm0
        movq        mm3,    mm1

        movq        mm4,    mm0                     // keep the original pixels for interleaving
        punpcklbw   mm0,    mm7

        punpcklbw   mm1,    mm7
        paddw       mm0,    mm1                     // low  (a + b)

        paddw       mm0,    mm6                     // low  (a + b + 1)
        punpckhbw   mm2,    mm7

        punpckhbw   mm3,    mm7
        paddw       mm2,    mm3                     // high (a + b)

        paddw       mm2,    mm6                     // high (a + b + 1)
        psraw       mm0,    1

        psraw       mm2,    1
        packuswb    mm0,    mm2                     // mm0 = eight averaged pixels

        movq        mm2,    mm4
        punpcklbw   mm2,    mm0                     // interleave originals with averages (pixels 0..3)

        movq        [rdi],  mm2
        punpckhbw   mm4,    mm0                     // interleave originals with averages (pixels 4..7)

        movq        [rdi+8], mm4
        add         rsi,    8

        add         rdi,    16
        sub         rcx,    8

        cmp         rcx,    8                       // keep looping while more than eight pixels remain
        jg          hs_1_2_loop

// last eight pixel

        movq        mm0,    [rsi]                   // pixels 0..7
        movq        mm1,    mm0

        movq        mm2,    mm0
        movq        mm3,    mm1

        psrlq       mm1,    8                       // pixels 1..7, top byte zero
        psrlq       mm3,    56                      // isolate pixel 7 ...

        psllq       mm3,    56                      // ... and move it back to the top byte
        por         mm1,    mm3                     // neighbours = pixels 1..7 followed by pixel 7 again

        movq        mm3,    mm1
        movq        mm4,    mm0                     // keep the original pixels for interleaving

        punpcklbw   mm0,    mm7
        punpcklbw   mm1,    mm7

        paddw       mm0,    mm1                     // low  (a + b)
        paddw       mm0,    mm6                     // low  (a + b + 1)

        punpckhbw   mm2,    mm7
        punpckhbw   mm3,    mm7

        paddw       mm2,    mm3                     // high (a + b)
        paddw       mm2,    mm6                     // high (a + b + 1)

        psraw       mm0,    1
        psraw       mm2,    1

        packuswb    mm0,    mm2                     // mm0 = eight averaged pixels
        movq        mm2,    mm4

        punpcklbw   mm2,    mm0                     // interleave originals with averages (pixels 0..3)
        movq        [rdi],  mm2

        punpckhbw   mm4,    mm0                     // interleave originals with averages (pixels 4..7)
        movq        [rdi+8], mm4
    }
}





/* Per-lane weights, in 1/256ths, for the 5:4 horizontal scaler:        */
/* const54_1 multiplies pixels 0..3, const54_2 multiplies pixels 1..4.  */
__declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };
__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };


/**************************************************************************** 1222 * 1223 * ROUTINE : horizontal_line_5_4_scale_mmx 1224 * 1225 * INPUTS : const unsigned char *source : Pointer to source data. 1226 * unsigned int source_width : Stride of source. 1227 * unsigned char *dest : Pointer to destination data. 1228 * unsigned int dest_width : Stride of destination (NOT USED). 1229 * 1230 * OUTPUTS : None. 1231 * 1232 * RETURNS : void 1233 * 1234 * FUNCTION : Copies horizontal line of pixels from source to 1235 * destination scaling up by 4 to 5. 1236 * 1237 * SPECIAL NOTES : None. 1238 * 1239 ****************************************************************************/ 1240 static 1241 void horizontal_line_5_4_scale_mmx 1242 ( 1243 const unsigned char *source, 1244 unsigned int source_width, 1245 unsigned char *dest, 1246 unsigned int dest_width 1247 ) 1248 { 1249 /* 1250 unsigned i; 1251 unsigned int a, b, c, d, e; 1252 unsigned char *des = dest; 1253 const unsigned char *src = source; 1254 1255 (void) dest_width; 1256 1257 for ( i=0; i<source_width; i+=5 ) 1258 { 1259 a = src[0]; 1260 b = src[1]; 1261 c = src[2]; 1262 d = src[3]; 1263 e = src[4]; 1264 1265 des[0] = a; 1266 des[1] = ((b*192 + c* 64 + 128)>>8); 1267 des[2] = ((c*128 + d*128 + 128)>>8); 1268 des[3] = ((d* 64 + e*192 + 128)>>8); 1269 1270 src += 5; 1271 des += 4; 1272 } 1273 */ 1274 __asm 1275 { 1276 1277 mov rsi, source ; 1278 mov rdi, dest ; 1279 1280 mov ecx, source_width ; 1281 movq mm5, const54_1 ; 1282 1283 pxor mm7, mm7 ; 1284 movq mm6, const54_2 ; 1285 1286 movq mm4, round_values ; 1287 lea rdx, [rsi+rcx] ; 1288 horizontal_line_5_4_loop: 1289 1290 movq mm0, QWORD PTR [rsi] ; 1291 00 01 02 03 04 05 06 07 1292 movq mm1, mm0 ; 1293 00 01 02 03 04 05 06 07 1294 1295 psrlq mm0, 8 ; 1296 01 02 03 04 05 06 07 xx 1297 punpcklbw mm1, mm7 ; 1298 xx 00 xx 01 xx 02 xx 03 1299 1300 punpcklbw mm0, mm7 ; 1301 xx 01 xx 02 xx 03 xx 04 1302 pmullw mm1, mm5 1303 1304 pmullw mm0, mm6 1305 add rsi, 5 1306 
1307 add rdi, 4 1308 paddw mm1, mm0 1309 1310 paddw mm1, mm4 1311 psrlw mm1, 8 1312 1313 cmp rsi, rdx 1314 packuswb mm1, mm7 1315 1316 movd DWORD PTR [rdi-4], mm1 1317 1318 jl horizontal_line_5_4_loop 1319 1320 } 1321 1322 } 1323 __declspec(align(16)) const static unsigned short one_fourths[] = { 64, 64, 64, 64 }; 1324 __declspec(align(16)) const static unsigned short two_fourths[] = { 128, 128, 128, 128 }; 1325 __declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 }; 1326 1327 static 1328 void vertical_band_5_4_scale_mmx 1329 ( 1330 unsigned char *source, 1331 unsigned int src_pitch, 1332 unsigned char *dest, 1333 unsigned int dest_pitch, 1334 unsigned int dest_width 1335 ) 1336 { 1337 1338 __asm 1339 { 1340 1341 mov rsi, source // Get the source and destination pointer 1342 mov ecx, src_pitch // Get the pitch size 1343 1344 mov rdi, dest // tow lines below 1345 pxor mm7, mm7 // clear out mm7 1346 1347 mov edx, dest_pitch // Loop counter 1348 mov ebx, dest_width 1349 1350 vs_5_4_loop: 1351 1352 movd mm0, DWORD ptr [rsi] // src[0]; 1353 movd mm1, DWORD ptr [rsi+rcx] // src[1]; 1354 1355 movd mm2, DWORD ptr [rsi+rcx*2] 1356 lea rax, [rsi+rcx*2] // 1357 1358 punpcklbw mm1, mm7 1359 punpcklbw mm2, mm7 1360 1361 movq mm3, mm2 1362 pmullw mm1, three_fourths 1363 1364 pmullw mm2, one_fourths 1365 movd mm4, [rax+rcx] 1366 1367 pmullw mm3, two_fourths 1368 punpcklbw mm4, mm7 1369 1370 movq mm5, mm4 1371 pmullw mm4, two_fourths 1372 1373 paddw mm1, mm2 1374 movd mm6, [rax+rcx*2] 1375 1376 pmullw mm5, one_fourths 1377 paddw mm1, round_values; 1378 1379 paddw mm3, mm4 1380 psrlw mm1, 8 1381 1382 punpcklbw mm6, mm7 1383 paddw mm3, round_values 1384 1385 pmullw mm6, three_fourths 1386 psrlw mm3, 8 1387 1388 packuswb mm1, mm7 1389 packuswb mm3, mm7 1390 1391 movd DWORD PTR [rdi], mm0 1392 movd DWORD PTR [rdi+rdx], mm1 1393 1394 1395 paddw mm5, mm6 1396 movd DWORD PTR [rdi+rdx*2], mm3 1397 1398 lea rax, [rdi+rdx*2] 1399 paddw mm5, round_values 
1400 1401 psrlw mm5, 8 1402 add rdi, 4 1403 1404 packuswb mm5, mm7 1405 movd DWORD PTR [rax+rdx], mm5 1406 1407 add rsi, 4 1408 sub rbx, 4 1409 1410 jg vs_5_4_loop 1411 } 1412 } 1413 1414 1415 __declspec(align(16)) const static unsigned short const53_1[] = { 0, 85, 171, 0 }; 1416 __declspec(align(16)) const static unsigned short const53_2[] = {256, 171, 85, 0 }; 1417 1418 1419 static 1420 void horizontal_line_5_3_scale_mmx 1421 ( 1422 const unsigned char *source, 1423 unsigned int source_width, 1424 unsigned char *dest, 1425 unsigned int dest_width 1426 ) 1427 { 1428 __asm 1429 { 1430 1431 mov rsi, source ; 1432 mov rdi, dest ; 1433 1434 mov ecx, source_width ; 1435 movq mm5, const53_1 ; 1436 1437 pxor mm7, mm7 ; 1438 movq mm6, const53_2 ; 1439 1440 movq mm4, round_values ; 1441 lea rdx, [rsi+rcx-5] ; 1442 horizontal_line_5_3_loop: 1443 1444 movq mm0, QWORD PTR [rsi] ; 1445 00 01 02 03 04 05 06 07 1446 movq mm1, mm0 ; 1447 00 01 02 03 04 05 06 07 1448 1449 psllw mm0, 8 ; 1450 xx 00 xx 02 xx 04 xx 06 1451 psrlw mm1, 8 ; 1452 01 xx 03 xx 05 xx 07 xx 1453 1454 psrlw mm0, 8 ; 1455 00 xx 02 xx 04 xx 06 xx 1456 psllq mm1, 16 ; 1457 xx xx 01 xx 03 xx 05 xx 1458 1459 pmullw mm0, mm6 1460 1461 pmullw mm1, mm5 1462 add rsi, 5 1463 1464 add rdi, 3 1465 paddw mm1, mm0 1466 1467 paddw mm1, mm4 1468 psrlw mm1, 8 1469 1470 cmp rsi, rdx 1471 packuswb mm1, mm7 1472 1473 movd DWORD PTR [rdi-3], mm1 1474 jl horizontal_line_5_3_loop 1475 1476 //exit condition 1477 movq mm0, QWORD PTR [rsi] ; 1478 00 01 02 03 04 05 06 07 1479 movq mm1, mm0 ; 1480 00 01 02 03 04 05 06 07 1481 1482 psllw mm0, 8 ; 1483 xx 00 xx 02 xx 04 xx 06 1484 psrlw mm1, 8 ; 1485 01 xx 03 xx 05 xx 07 xx 1486 1487 psrlw mm0, 8 ; 1488 00 xx 02 xx 04 xx 06 xx 1489 psllq mm1, 16 ; 1490 xx xx 01 xx 03 xx 05 xx 1491 1492 pmullw mm0, mm6 1493 1494 pmullw mm1, mm5 1495 paddw mm1, mm0 1496 1497 paddw mm1, mm4 1498 psrlw mm1, 8 1499 1500 packuswb mm1, mm7 1501 movd rax, mm1 1502 1503 mov rdx, rax 1504 shr rdx, 16 1505 1506 mov 
WORD PTR[rdi], ax 1507 mov BYTE PTR[rdi+2], dl 1508 1509 } 1510 1511 } 1512 1513 __declspec(align(16)) const static unsigned short one_thirds[] = { 85, 85, 85, 85 }; 1514 __declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 }; 1515 1516 static 1517 void vertical_band_5_3_scale_mmx 1518 ( 1519 unsigned char *source, 1520 unsigned int src_pitch, 1521 unsigned char *dest, 1522 unsigned int dest_pitch, 1523 unsigned int dest_width 1524 ) 1525 { 1526 1527 __asm 1528 { 1529 1530 mov rsi, source // Get the source and destination pointer 1531 mov ecx, src_pitch // Get the pitch size 1532 1533 mov rdi, dest // tow lines below 1534 pxor mm7, mm7 // clear out mm7 1535 1536 mov edx, dest_pitch // Loop counter 1537 movq mm5, one_thirds 1538 1539 movq mm6, two_thirds 1540 mov ebx, dest_width; 1541 1542 vs_5_3_loop: 1543 1544 movd mm0, DWORD ptr [rsi] // src[0]; 1545 movd mm1, DWORD ptr [rsi+rcx] // src[1]; 1546 1547 movd mm2, DWORD ptr [rsi+rcx*2] 1548 lea rax, [rsi+rcx*2] // 1549 1550 punpcklbw mm1, mm7 1551 punpcklbw mm2, mm7 1552 1553 pmullw mm1, mm5 1554 pmullw mm2, mm6 1555 1556 movd mm3, DWORD ptr [rax+rcx] 1557 movd mm4, DWORD ptr [rax+rcx*2] 1558 1559 punpcklbw mm3, mm7 1560 punpcklbw mm4, mm7 1561 1562 pmullw mm3, mm6 1563 pmullw mm4, mm5 1564 1565 1566 movd DWORD PTR [rdi], mm0 1567 paddw mm1, mm2 1568 1569 paddw mm1, round_values 1570 psrlw mm1, 8 1571 1572 packuswb mm1, mm7 1573 paddw mm3, mm4 1574 1575 paddw mm3, round_values 1576 movd DWORD PTR [rdi+rdx], mm1 1577 1578 psrlw mm3, 8 1579 packuswb mm3, mm7 1580 1581 movd DWORD PTR [rdi+rdx*2], mm3 1582 1583 1584 add rdi, 4 1585 add rsi, 4 1586 1587 sub rbx, 4 1588 jg vs_5_3_loop 1589 } 1590 } 1591 1592 1593 1594 1595 /**************************************************************************** 1596 * 1597 * ROUTINE : horizontal_line_2_1_scale 1598 * 1599 * INPUTS : const unsigned char *source : 1600 * unsigned int source_width : 1601 * unsigned char *dest : 1602 * unsigned int 
dest_width : 1603 * 1604 * OUTPUTS : None. 1605 * 1606 * RETURNS : void 1607 * 1608 * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels. 1609 * 1610 * SPECIAL NOTES : None. 1611 * 1612 ****************************************************************************/ 1613 static 1614 void horizontal_line_2_1_scale_mmx 1615 ( 1616 const unsigned char *source, 1617 unsigned int source_width, 1618 unsigned char *dest, 1619 unsigned int dest_width 1620 ) 1621 { 1622 (void) dest_width; 1623 1624 __asm 1625 { 1626 mov rsi, source 1627 mov rdi, dest 1628 1629 pxor mm7, mm7 1630 mov ecx, dest_width 1631 1632 xor rdx, rdx 1633 hs_2_1_loop: 1634 1635 movq mm0, [rsi+rdx*2] 1636 psllw mm0, 8 1637 1638 psrlw mm0, 8 1639 packuswb mm0, mm7 1640 1641 movd DWORD Ptr [rdi+rdx], mm0; 1642 add rdx, 4 1643 1644 cmp rdx, rcx 1645 jl hs_2_1_loop 1646 1647 } 1648 } 1649 1650 1651 1652 static 1653 void vertical_band_2_1_scale_mmx 1654 ( 1655 unsigned char *source, 1656 unsigned int src_pitch, 1657 unsigned char *dest, 1658 unsigned int dest_pitch, 1659 unsigned int dest_width) 1660 { 1661 vpx_memcpy(dest, source, dest_width); 1662 } 1663 1664 1665 __declspec(align(16)) const static unsigned short three_sixteenths[] = { 48, 48, 48, 48 }; 1666 __declspec(align(16)) const static unsigned short ten_sixteenths[] = { 160, 160, 160, 160 }; 1667 1668 static 1669 void vertical_band_2_1_scale_i_mmx 1670 ( 1671 unsigned char *source, 1672 unsigned int src_pitch, 1673 unsigned char *dest, 1674 unsigned int dest_pitch, 1675 unsigned int dest_width 1676 ) 1677 { 1678 __asm 1679 { 1680 mov rsi, source 1681 mov rdi, dest 1682 1683 mov eax, src_pitch 1684 mov edx, dest_width 1685 1686 pxor mm7, mm7 1687 sub rsi, rax //back one line 1688 1689 1690 lea rcx, [rsi+rdx]; 1691 movq mm6, round_values; 1692 1693 movq mm5, three_sixteenths; 1694 movq mm4, ten_sixteenths; 1695 1696 vs_2_1_i_loop: 1697 movd mm0, [rsi] // 1698 movd mm1, [rsi+rax] // 1699 1700 movd mm2, [rsi+rax*2] // 1701 punpcklbw mm0, mm7 1702 
1703 pmullw mm0, mm5 1704 punpcklbw mm1, mm7 1705 1706 pmullw mm1, mm4 1707 punpcklbw mm2, mm7 1708 1709 pmullw mm2, mm5 1710 paddw mm0, round_values 1711 1712 paddw mm1, mm2 1713 paddw mm0, mm1 1714 1715 psrlw mm0, 8 1716 packuswb mm0, mm7 1717 1718 movd DWORD PTR [rdi], mm0 1719 add rsi, 4 1720 1721 add rdi, 4; 1722 cmp rsi, rcx 1723 jl vs_2_1_i_loop 1724 1725 } 1726 } 1727 1728 1729 1730 void 1731 register_mmxscalers(void) 1732 { 1733 vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx; 1734 vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx; 1735 vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx; 1736 vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx; 1737 vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx; 1738 vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx; 1739 vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx; 1740 vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx; 1741 vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx; 1742 1743 vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx; 1744 vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx; 1745 vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx; 1746 vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx; 1747 vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx; 1748 vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx; 1749 vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx; 1750 } 1751