1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 12 #include <math.h> 13 #include <stdlib.h> 14 #include "vpx_scale/yv12config.h" 15 #include "pragmas.h" 16 17 #define VP8_FILTER_WEIGHT 128 18 #define VP8_FILTER_SHIFT 7 19 20 21 22 /* static constants */ 23 __declspec(align(16)) 24 const static short Blur[48] = 25 { 26 27 16, 16, 16, 16, 16, 16, 16, 16, 28 16, 16, 16, 16, 16, 16, 16, 16, 29 64, 64, 64, 64, 64, 64, 64, 64, 30 16, 16, 16, 16, 16, 16, 16, 16, 31 16, 16, 16, 16, 16, 16, 16, 16, 32 0, 0, 0, 0, 0, 0, 0, 0, 33 34 }; 35 #define RD __declspec(align(16)) __int64 rd = 0x0040004000400040; 36 #define R4D2 __declspec(align(16)) __int64 rd42[2] = {0x0004000400040004,0x0004000400040004}; 37 38 #ifndef RELOCATEABLE 39 const static RD; 40 const static R4D2; 41 #endif 42 43 44 /* external references */ 45 extern double vp8_gaussian(double sigma, double mu, double x); 46 extern short vp8_rv[]; 47 extern int vp8_q2mbl(int x) ; 48 49 50 51 void vp8_post_proc_down_and_across_mmx 52 ( 53 unsigned char *src_ptr, 54 unsigned char *dst_ptr, 55 int src_pixels_per_line, 56 int dst_pixels_per_line, 57 int rows, 58 int cols, 59 int flimit 60 ) 61 { 62 #ifdef RELOCATEABLE 63 RD 64 R4D2 65 #endif 66 67 __asm 68 { 69 push ebx 70 lea ebx, Blur 71 movd mm2, flimit 72 punpcklwd mm2, mm2 73 punpckldq mm2, mm2 74 75 mov esi, src_ptr 76 mov edi, dst_ptr 77 78 mov ecx, DWORD PTR rows 79 mov eax, src_pixels_per_line ; 80 destination pitch? 81 pxor mm0, mm0 ; 82 mm0 = 00000000 83 84 nextrow: 85 86 xor edx, edx ; 87 88 clear out edx for use as loop counter 89 nextcol: 90 91 pxor mm7, mm7 ; 92 93 mm7 = 00000000 94 movq mm6, [ebx + 32 ] ; 95 mm6 = kernel 2 taps 96 movq mm3, [esi] ; 97 mm4 = r0 p0..p7 98 punpcklbw mm3, mm0 ; 99 mm3 = p0..p3 100 movq mm1, mm3 ; 101 mm1 = p0..p3 102 pmullw mm3, mm6 ; 103 mm3 *= kernel 2 modifiers 104 105 movq mm6, [ebx + 48] ; 106 mm6 = kernel 3 taps 107 movq mm5, [esi + eax] ; 108 mm4 = r1 p0..p7 109 punpcklbw mm5, mm0 ; 110 mm5 = r1 p0..p3 111 pmullw mm6, mm5 ; 112 mm6 *= p0..p3 * kernel 3 modifiers 113 paddusw mm3, mm6 ; 114 mm3 += mm6 115 116 ; 117 thresholding 118 movq mm7, mm1 ; 119 mm7 = r0 p0..p3 120 psubusw mm7, mm5 ; 121 mm7 = r0 p0..p3 - r1 p0..p3 122 psubusw mm5, mm1 ; 123 mm5 = r1 p0..p3 - r0 p0..p3 124 paddusw mm7, mm5 ; 125 mm7 = abs(r0 p0..p3 - r1 p0..p3) 126 pcmpgtw mm7, mm2 127 128 movq mm6, [ebx + 64 ] ; 129 mm6 = kernel 4 modifiers 130 movq mm5, [esi + 2*eax] ; 131 mm4 = r2 p0..p7 132 punpcklbw mm5, mm0 ; 133 mm5 = r2 p0..p3 134 pmullw mm6, mm5 ; 135 mm5 *= kernel 4 modifiers 136 paddusw mm3, mm6 ; 137 mm3 += mm5 138 139 ; 140 thresholding 141 movq mm6, mm1 ; 142 mm6 = r0 p0..p3 143 psubusw mm6, mm5 ; 144 mm6 = r0 p0..p3 - r2 p0..p3 145 psubusw mm5, mm1 ; 146 mm5 = r2 p0..p3 - r2 p0..p3 147 paddusw mm6, mm5 ; 148 mm6 = abs(r0 p0..p3 - r2 p0..p3) 149 pcmpgtw mm6, mm2 150 por mm7, mm6 ; 151 accumulate thresholds 152 153 154 neg eax 155 movq mm6, [ebx ] ; 156 kernel 0 taps 157 movq mm5, [esi+2*eax] ; 158 mm4 = r-2 p0..p7 159 punpcklbw mm5, mm0 ; 160 mm5 = r-2 p0..p3 161 pmullw mm6, mm5 ; 162 mm5 *= kernel 0 modifiers 163 paddusw mm3, mm6 ; 164 mm3 += mm5 165 166 ; 167 thresholding 168 movq mm6, mm1 ; 169 mm6 = r0 p0..p3 170 psubusw mm6, mm5 ; 171 mm6 = p0..p3 - r-2 p0..p3 172 psubusw mm5, mm1 ; 173 mm5 = r-2 p0..p3 - p0..p3 174 paddusw mm6, mm5 ; 175 mm6 = abs(r0 p0..p3 - r-2 p0..p3) 176 pcmpgtw mm6, mm2 177 por mm7, mm6 ; 178 accumulate thresholds 179 180 movq mm6, [ebx + 16] ; 181 kernel 1 taps 182 movq mm4, [esi+eax] ; 183 mm4 = r-1 p0..p7 184 punpcklbw mm4, mm0 ; 185 mm4 = r-1 p0..p3 186 pmullw mm6, mm4 ; 187 mm4 *= kernel 1 modifiers. 188 paddusw mm3, mm6 ; 189 mm3 += mm5 190 191 ; 192 thresholding 193 movq mm6, mm1 ; 194 mm6 = r0 p0..p3 195 psubusw mm6, mm4 ; 196 mm6 = p0..p3 - r-2 p0..p3 197 psubusw mm4, mm1 ; 198 mm5 = r-1 p0..p3 - p0..p3 199 paddusw mm6, mm4 ; 200 mm6 = abs(r0 p0..p3 - r-1 p0..p3) 201 pcmpgtw mm6, mm2 202 por mm7, mm6 ; 203 accumulate thresholds 204 205 206 paddusw mm3, rd ; 207 mm3 += round value 208 psraw mm3, VP8_FILTER_SHIFT ; 209 mm3 /= 128 210 211 pand mm1, mm7 ; 212 mm1 select vals > thresh from source 213 pandn mm7, mm3 ; 214 mm7 select vals < thresh from blurred result 215 paddusw mm1, mm7 ; 216 combination 217 218 packuswb mm1, mm0 ; 219 pack to bytes 220 221 movd [edi], mm1 ; 222 neg eax ; 223 pitch is positive 224 225 226 add esi, 4 227 add edi, 4 228 add edx, 4 229 230 cmp edx, cols 231 jl nextcol 232 // done with the all cols, start the across filtering in place 233 sub esi, edx 234 sub edi, edx 235 236 237 push eax 238 xor edx, edx 239 mov eax, [edi-4]; 240 241 acrossnextcol: 242 pxor mm7, mm7 ; 243 mm7 = 00000000 244 movq mm6, [ebx + 32 ] ; 245 movq mm4, [edi+edx] ; 246 mm4 = p0..p7 247 movq mm3, mm4 ; 248 mm3 = p0..p7 249 punpcklbw mm3, mm0 ; 250 mm3 = p0..p3 251 movq mm1, mm3 ; 252 mm1 = p0..p3 253 pmullw mm3, mm6 ; 254 mm3 *= kernel 2 modifiers 255 256 movq mm6, [ebx + 48] 257 psrlq mm4, 8 ; 258 mm4 = p1..p7 259 movq mm5, mm4 ; 260 mm5 = p1..p7 261 punpcklbw mm5, mm0 ; 262 mm5 = p1..p4 263 pmullw mm6, mm5 ; 264 mm6 *= p1..p4 * kernel 3 modifiers 265 paddusw mm3, mm6 ; 266 mm3 += mm6 267 268 ; 269 thresholding 270 movq mm7, mm1 ; 271 mm7 = p0..p3 272 psubusw mm7, mm5 ; 273 mm7 = p0..p3 - p1..p4 274 psubusw mm5, mm1 ; 275 mm5 = p1..p4 - p0..p3 276 paddusw mm7, mm5 ; 277 mm7 = abs(p0..p3 - p1..p4) 278 pcmpgtw mm7, mm2 279 280 movq mm6, [ebx + 64 ] 281 psrlq mm4, 8 ; 282 mm4 = p2..p7 283 movq mm5, mm4 ; 284 mm5 = p2..p7 285 punpcklbw mm5, mm0 ; 286 mm5 = p2..p5 287 pmullw mm6, mm5 ; 288 mm5 *= kernel 4 modifiers 289 paddusw mm3, mm6 ; 290 mm3 += mm5 291 292 ; 293 thresholding 294 movq mm6, mm1 ; 295 mm6 = p0..p3 296 psubusw mm6, mm5 ; 297 mm6 = p0..p3 - p1..p4 298 psubusw mm5, mm1 ; 299 mm5 = p1..p4 - p0..p3 300 paddusw mm6, mm5 ; 301 mm6 = abs(p0..p3 - p1..p4) 302 pcmpgtw mm6, mm2 303 por mm7, mm6 ; 304 accumulate thresholds 305 306 307 movq mm6, [ebx ] 308 movq mm4, [edi+edx-2] ; 309 mm4 = p-2..p5 310 movq mm5, mm4 ; 311 mm5 = p-2..p5 312 punpcklbw mm5, mm0 ; 313 mm5 = p-2..p1 314 pmullw mm6, mm5 ; 315 mm5 *= kernel 0 modifiers 316 paddusw mm3, mm6 ; 317 mm3 += mm5 318 319 ; 320 thresholding 321 movq mm6, mm1 ; 322 mm6 = p0..p3 323 psubusw mm6, mm5 ; 324 mm6 = p0..p3 - p1..p4 325 psubusw mm5, mm1 ; 326 mm5 = p1..p4 - p0..p3 327 paddusw mm6, mm5 ; 328 mm6 = abs(p0..p3 - p1..p4) 329 pcmpgtw mm6, mm2 330 por mm7, mm6 ; 331 accumulate thresholds 332 333 movq mm6, [ebx + 16] 334 psrlq mm4, 8 ; 335 mm4 = p-1..p5 336 punpcklbw mm4, mm0 ; 337 mm4 = p-1..p2 338 pmullw mm6, mm4 ; 339 mm4 *= kernel 1 modifiers. 340 paddusw mm3, mm6 ; 341 mm3 += mm5 342 343 ; 344 thresholding 345 movq mm6, mm1 ; 346 mm6 = p0..p3 347 psubusw mm6, mm4 ; 348 mm6 = p0..p3 - p1..p4 349 psubusw mm4, mm1 ; 350 mm5 = p1..p4 - p0..p3 351 paddusw mm6, mm4 ; 352 mm6 = abs(p0..p3 - p1..p4) 353 pcmpgtw mm6, mm2 354 por mm7, mm6 ; 355 accumulate thresholds 356 357 paddusw mm3, rd ; 358 mm3 += round value 359 psraw mm3, VP8_FILTER_SHIFT ; 360 mm3 /= 128 361 362 pand mm1, mm7 ; 363 mm1 select vals > thresh from source 364 pandn mm7, mm3 ; 365 mm7 select vals < thresh from blurred result 366 paddusw mm1, mm7 ; 367 combination 368 369 packuswb mm1, mm0 ; 370 pack to bytes 371 mov DWORD PTR [edi+edx-4], eax ; 372 store previous four bytes 373 movd eax, mm1 374 375 add edx, 4 376 cmp edx, cols 377 jl acrossnextcol; 378 379 mov DWORD PTR [edi+edx-4], eax 380 pop eax 381 382 // done with this rwo 383 add esi, eax ; 384 next line 385 mov eax, dst_pixels_per_line ; 386 destination pitch? 387 add edi, eax ; 388 next destination 389 mov eax, src_pixels_per_line ; 390 destination pitch? 391 392 dec ecx ; 393 decrement count 394 jnz nextrow ; 395 next row 396 pop ebx 397 398 } 399 } 400 401 402 403 void vp8_post_proc_down_and_across_xmm 404 ( 405 unsigned char *src_ptr, 406 unsigned char *dst_ptr, 407 int src_pixels_per_line, 408 int dst_pixels_per_line, 409 int rows, 410 int cols, 411 int flimit 412 ) 413 { 414 #ifdef RELOCATEABLE 415 R4D2 416 #endif 417 418 __asm 419 { 420 movd xmm2, flimit 421 punpcklwd xmm2, xmm2 422 punpckldq xmm2, xmm2 423 punpcklqdq xmm2, xmm2 424 425 mov esi, src_ptr 426 mov edi, dst_ptr 427 428 mov ecx, DWORD PTR rows 429 mov eax, src_pixels_per_line ; 430 destination pitch? 431 pxor xmm0, xmm0 ; 432 mm0 = 00000000 433 434 nextrow: 435 436 xor edx, edx ; 437 438 clear out edx for use as loop counter 439 nextcol: 440 movq xmm3, QWORD PTR [esi] ; 441 442 mm4 = r0 p0..p7 443 punpcklbw xmm3, xmm0 ; 444 mm3 = p0..p3 445 movdqa xmm1, xmm3 ; 446 mm1 = p0..p3 447 psllw xmm3, 2 ; 448 449 movq xmm5, QWORD PTR [esi + eax] ; 450 mm4 = r1 p0..p7 451 punpcklbw xmm5, xmm0 ; 452 mm5 = r1 p0..p3 453 paddusw xmm3, xmm5 ; 454 mm3 += mm6 455 456 ; 457 thresholding 458 movdqa xmm7, xmm1 ; 459 mm7 = r0 p0..p3 460 psubusw xmm7, xmm5 ; 461 mm7 = r0 p0..p3 - r1 p0..p3 462 psubusw xmm5, xmm1 ; 463 mm5 = r1 p0..p3 - r0 p0..p3 464 paddusw xmm7, xmm5 ; 465 mm7 = abs(r0 p0..p3 - r1 p0..p3) 466 pcmpgtw xmm7, xmm2 467 468 movq xmm5, QWORD PTR [esi + 2*eax] ; 469 mm4 = r2 p0..p7 470 punpcklbw xmm5, xmm0 ; 471 mm5 = r2 p0..p3 472 paddusw xmm3, xmm5 ; 473 mm3 += mm5 474 475 ; 476 thresholding 477 movdqa xmm6, xmm1 ; 478 mm6 = r0 p0..p3 479 psubusw xmm6, xmm5 ; 480 mm6 = r0 p0..p3 - r2 p0..p3 481 psubusw xmm5, xmm1 ; 482 mm5 = r2 p0..p3 - r2 p0..p3 483 paddusw xmm6, xmm5 ; 484 mm6 = abs(r0 p0..p3 - r2 p0..p3) 485 pcmpgtw xmm6, xmm2 486 por xmm7, xmm6 ; 487 accumulate thresholds 488 489 490 neg eax 491 movq xmm5, QWORD PTR [esi+2*eax] ; 492 mm4 = r-2 p0..p7 493 punpcklbw xmm5, xmm0 ; 494 mm5 = r-2 p0..p3 495 paddusw xmm3, xmm5 ; 496 mm3 += mm5 497 498 ; 499 thresholding 500 movdqa xmm6, xmm1 ; 501 mm6 = r0 p0..p3 502 psubusw xmm6, xmm5 ; 503 mm6 = p0..p3 - r-2 p0..p3 504 psubusw xmm5, xmm1 ; 505 mm5 = r-2 p0..p3 - p0..p3 506 paddusw xmm6, xmm5 ; 507 mm6 = abs(r0 p0..p3 - r-2 p0..p3) 508 pcmpgtw xmm6, xmm2 509 por xmm7, xmm6 ; 510 accumulate thresholds 511 512 movq xmm4, QWORD PTR [esi+eax] ; 513 mm4 = r-1 p0..p7 514 punpcklbw xmm4, xmm0 ; 515 mm4 = r-1 p0..p3 516 paddusw xmm3, xmm4 ; 517 mm3 += mm5 518 519 ; 520 thresholding 521 movdqa xmm6, xmm1 ; 522 mm6 = r0 p0..p3 523 psubusw xmm6, xmm4 ; 524 mm6 = p0..p3 - r-2 p0..p3 525 psubusw xmm4, xmm1 ; 526 mm5 = r-1 p0..p3 - p0..p3 527 paddusw xmm6, xmm4 ; 528 mm6 = abs(r0 p0..p3 - r-1 p0..p3) 529 pcmpgtw xmm6, xmm2 530 por xmm7, xmm6 ; 531 accumulate thresholds 532 533 534 paddusw xmm3, rd42 ; 535 mm3 += round value 536 psraw xmm3, 3 ; 537 mm3 /= 8 538 539 pand xmm1, xmm7 ; 540 mm1 select vals > thresh from source 541 pandn xmm7, xmm3 ; 542 mm7 select vals < thresh from blurred result 543 paddusw xmm1, xmm7 ; 544 combination 545 546 packuswb xmm1, xmm0 ; 547 pack to bytes 548 movq QWORD PTR [edi], xmm1 ; 549 550 neg eax ; 551 pitch is positive 552 add esi, 8 553 add edi, 8 554 555 add edx, 8 556 cmp edx, cols 557 558 jl nextcol 559 560 // done with the all cols, start the across filtering in place 561 sub esi, edx 562 sub edi, edx 563 564 xor edx, edx 565 movq mm0, QWORD PTR [edi-8]; 566 567 acrossnextcol: 568 movq xmm7, QWORD PTR [edi +edx -2] 569 movd xmm4, DWORD PTR [edi +edx +6] 570 571 pslldq xmm4, 8 572 por xmm4, xmm7 573 574 movdqa xmm3, xmm4 575 psrldq xmm3, 2 576 punpcklbw xmm3, xmm0 ; 577 mm3 = p0..p3 578 movdqa xmm1, xmm3 ; 579 mm1 = p0..p3 580 psllw xmm3, 2 581 582 583 movdqa xmm5, xmm4 584 psrldq xmm5, 3 585 punpcklbw xmm5, xmm0 ; 586 mm5 = p1..p4 587 paddusw xmm3, xmm5 ; 588 mm3 += mm6 589 590 ; 591 thresholding 592 movdqa xmm7, xmm1 ; 593 mm7 = p0..p3 594 psubusw xmm7, xmm5 ; 595 mm7 = p0..p3 - p1..p4 596 psubusw xmm5, xmm1 ; 597 mm5 = p1..p4 - p0..p3 598 paddusw xmm7, xmm5 ; 599 mm7 = abs(p0..p3 - p1..p4) 600 pcmpgtw xmm7, xmm2 601 602 movdqa xmm5, xmm4 603 psrldq xmm5, 4 604 punpcklbw xmm5, xmm0 ; 605 mm5 = p2..p5 606 paddusw xmm3, xmm5 ; 607 mm3 += mm5 608 609 ; 610 thresholding 611 movdqa xmm6, xmm1 ; 612 mm6 = p0..p3 613 psubusw xmm6, xmm5 ; 614 mm6 = p0..p3 - p1..p4 615 psubusw xmm5, xmm1 ; 616 mm5 = p1..p4 - p0..p3 617 paddusw xmm6, xmm5 ; 618 mm6 = abs(p0..p3 - p1..p4) 619 pcmpgtw xmm6, xmm2 620 por xmm7, xmm6 ; 621 accumulate thresholds 622 623 624 movdqa xmm5, xmm4 ; 625 mm5 = p-2..p5 626 punpcklbw xmm5, xmm0 ; 627 mm5 = p-2..p1 628 paddusw xmm3, xmm5 ; 629 mm3 += mm5 630 631 ; 632 thresholding 633 movdqa xmm6, xmm1 ; 634 mm6 = p0..p3 635 psubusw xmm6, xmm5 ; 636 mm6 = p0..p3 - p1..p4 637 psubusw xmm5, xmm1 ; 638 mm5 = p1..p4 - p0..p3 639 paddusw xmm6, xmm5 ; 640 mm6 = abs(p0..p3 - p1..p4) 641 pcmpgtw xmm6, xmm2 642 por xmm7, xmm6 ; 643 accumulate thresholds 644 645 psrldq xmm4, 1 ; 646 mm4 = p-1..p5 647 punpcklbw xmm4, xmm0 ; 648 mm4 = p-1..p2 649 paddusw xmm3, xmm4 ; 650 mm3 += mm5 651 652 ; 653 thresholding 654 movdqa xmm6, xmm1 ; 655 mm6 = p0..p3 656 psubusw xmm6, xmm4 ; 657 mm6 = p0..p3 - p1..p4 658 psubusw xmm4, xmm1 ; 659 mm5 = p1..p4 - p0..p3 660 paddusw xmm6, xmm4 ; 661 mm6 = abs(p0..p3 - p1..p4) 662 pcmpgtw xmm6, xmm2 663 por xmm7, xmm6 ; 664 accumulate thresholds 665 666 paddusw xmm3, rd42 ; 667 mm3 += round value 668 psraw xmm3, 3 ; 669 mm3 /= 8 670 671 pand xmm1, xmm7 ; 672 mm1 select vals > thresh from source 673 pandn xmm7, xmm3 ; 674 mm7 select vals < thresh from blurred result 675 paddusw xmm1, xmm7 ; 676 combination 677 678 packuswb xmm1, xmm0 ; 679 pack to bytes 680 movq QWORD PTR [edi+edx-8], mm0 ; 681 store previous four bytes 682 movdq2q mm0, xmm1 683 684 add edx, 8 685 cmp edx, cols 686 jl acrossnextcol; 687 688 // last 8 pixels 689 movq QWORD PTR [edi+edx-8], mm0 690 691 // done with this rwo 692 add esi, eax ; 693 next line 694 mov eax, dst_pixels_per_line ; 695 destination pitch? 696 add edi, eax ; 697 next destination 698 mov eax, src_pixels_per_line ; 699 destination pitch? 700 701 dec ecx ; 702 decrement count 703 jnz nextrow ; 704 next row 705 } 706 } 707 708 709 void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit) 710 { 711 int c, i; 712 __declspec(align(16)) 713 int flimit2[2]; 714 __declspec(align(16)) 715 unsigned char d[16][8]; 716 717 flimit = vp8_q2mbl(flimit); 718 719 for (i = 0; i < 2; i++) 720 flimit2[i] = flimit; 721 722 rows += 8; 723 724 for (c = 0; c < cols; c += 4) 725 { 726 unsigned char *s = &dst[c]; 727 728 __asm 729 { 730 mov esi, s ; 731 pxor mm0, mm0 ; 732 733 mov eax, pitch ; 734 neg eax // eax = -pitch 735 736 lea esi, [esi + eax*8]; // edi = s[-pitch*8] 737 neg eax 738 739 740 pxor mm5, mm5 741 pxor mm6, mm6 ; 742 743 pxor mm7, mm7 ; 744 mov edi, esi 745 746 mov ecx, 15 ; 747 748 loop_initvar: 749 movd mm1, DWORD PTR [edi]; 750 punpcklbw mm1, mm0 ; 751 752 paddw mm5, mm1 ; 753 pmullw mm1, mm1 ; 754 755 movq mm2, mm1 ; 756 punpcklwd mm1, mm0 ; 757 758 punpckhwd mm2, mm0 ; 759 paddd mm6, mm1 ; 760 761 paddd mm7, mm2 ; 762 lea edi, [edi+eax] ; 763 764 dec ecx 765 jne loop_initvar 766 //save the var and sum 767 xor edx, edx 768 loop_row: 769 movd mm1, DWORD PTR [esi] // [s-pitch*8] 770 movd mm2, DWORD PTR [edi] // [s+pitch*7] 771 772 punpcklbw mm1, mm0 773 punpcklbw mm2, mm0 774 775 paddw mm5, mm2 776 psubw mm5, mm1 777 778 pmullw mm2, mm2 779 movq mm4, mm2 780 781 punpcklwd mm2, mm0 782 punpckhwd mm4, mm0 783 784 paddd mm6, mm2 785 paddd mm7, mm4 786 787 pmullw mm1, mm1 788 movq mm2, mm1 789 790 punpcklwd mm1, mm0 791 psubd mm6, mm1 792 793 punpckhwd mm2, mm0 794 psubd mm7, mm2 795 796 797 movq mm3, mm6 798 pslld mm3, 4 799 800 psubd mm3, mm6 801 movq mm1, mm5 802 803 movq mm4, mm5 804 pmullw mm1, mm1 805 806 pmulhw mm4, mm4 807 movq mm2, mm1 808 809 punpcklwd mm1, mm4 810 punpckhwd mm2, mm4 811 812 movq mm4, mm7 813 pslld mm4, 4 814 815 psubd mm4, mm7 816 817 psubd mm3, mm1 818 psubd mm4, mm2 819 820 psubd mm3, flimit2 821 psubd mm4, flimit2 822 823 psrad mm3, 31 824 psrad mm4, 31 825 826 packssdw mm3, mm4 827 packsswb mm3, mm0 828 829 movd mm1, DWORD PTR [esi+eax*8] 830 831 movq mm2, mm1 832 punpcklbw mm1, mm0 833 834 paddw mm1, mm5 835 mov ecx, edx 836 837 and ecx, 127 838 movq mm4, vp8_rv[ecx*2] 839 840 paddw mm1, mm4 841 //paddw xmm1, eight8s 842 psraw mm1, 4 843 844 packuswb mm1, mm0 845 pand mm1, mm3 846 847 pandn mm3, mm2 848 por mm1, mm3 849 850 and ecx, 15 851 movd DWORD PTR d[ecx*4], mm1 852 853 mov ecx, edx 854 sub ecx, 8 855 856 and ecx, 15 857 movd mm1, DWORD PTR d[ecx*4] 858 859 movd [esi], mm1 860 lea esi, [esi+eax] 861 862 lea edi, [edi+eax] 863 add edx, 1 864 865 cmp edx, rows 866 jl loop_row 867 868 } 869 870 } 871 } 872 873 void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit) 874 { 875 int c, i; 876 __declspec(align(16)) 877 int flimit4[4]; 878 __declspec(align(16)) 879 unsigned char d[16][8]; 880 881 flimit = vp8_q2mbl(flimit); 882 883 for (i = 0; i < 4; i++) 884 flimit4[i] = flimit; 885 886 rows += 8; 887 888 for (c = 0; c < cols; c += 8) 889 { 890 unsigned char *s = &dst[c]; 891 892 __asm 893 { 894 mov esi, s ; 895 pxor xmm0, xmm0 ; 896 897 mov eax, pitch ; 898 neg eax // eax = -pitch 899 900 lea esi, [esi + eax*8]; // edi = s[-pitch*8] 901 neg eax 902 903 904 pxor xmm5, xmm5 905 pxor xmm6, xmm6 ; 906 907 pxor xmm7, xmm7 ; 908 mov edi, esi 909 910 mov ecx, 15 ; 911 912 loop_initvar: 913 movq xmm1, QWORD PTR [edi]; 914 punpcklbw xmm1, xmm0 ; 915 916 paddw xmm5, xmm1 ; 917 pmullw xmm1, xmm1 ; 918 919 movdqa xmm2, xmm1 ; 920 punpcklwd xmm1, xmm0 ; 921 922 punpckhwd xmm2, xmm0 ; 923 paddd xmm6, xmm1 ; 924 925 paddd xmm7, xmm2 ; 926 lea edi, [edi+eax] ; 927 928 dec ecx 929 jne loop_initvar 930 //save the var and sum 931 xor edx, edx 932 loop_row: 933 movq xmm1, QWORD PTR [esi] // [s-pitch*8] 934 movq xmm2, QWORD PTR [edi] // [s+pitch*7] 935 936 punpcklbw xmm1, xmm0 937 punpcklbw xmm2, xmm0 938 939 paddw xmm5, xmm2 940 psubw xmm5, xmm1 941 942 pmullw xmm2, xmm2 943 movdqa xmm4, xmm2 944 945 punpcklwd xmm2, xmm0 946 punpckhwd xmm4, xmm0 947 948 paddd xmm6, xmm2 949 paddd xmm7, xmm4 950 951 pmullw xmm1, xmm1 952 movdqa xmm2, xmm1 953 954 punpcklwd xmm1, xmm0 955 psubd xmm6, xmm1 956 957 punpckhwd xmm2, xmm0 958 psubd xmm7, xmm2 959 960 961 movdqa xmm3, xmm6 962 pslld xmm3, 4 963 964 psubd xmm3, xmm6 965 movdqa xmm1, xmm5 966 967 movdqa xmm4, xmm5 968 pmullw xmm1, xmm1 969 970 pmulhw xmm4, xmm4 971 movdqa xmm2, xmm1 972 973 punpcklwd xmm1, xmm4 974 punpckhwd xmm2, xmm4 975 976 movdqa xmm4, xmm7 977 pslld xmm4, 4 978 979 psubd xmm4, xmm7 980 981 psubd xmm3, xmm1 982 psubd xmm4, xmm2 983 984 psubd xmm3, flimit4 985 psubd xmm4, flimit4 986 987 psrad xmm3, 31 988 psrad xmm4, 31 989 990 packssdw xmm3, xmm4 991 packsswb xmm3, xmm0 992 993 movq xmm1, QWORD PTR [esi+eax*8] 994 995 movq xmm2, xmm1 996 punpcklbw xmm1, xmm0 997 998 paddw xmm1, xmm5 999 mov ecx, edx 1000 1001 and ecx, 127 1002 movdqu xmm4, vp8_rv[ecx*2] 1003 1004 paddw xmm1, xmm4 1005 //paddw xmm1, eight8s 1006 psraw xmm1, 4 1007 1008 packuswb xmm1, xmm0 1009 pand xmm1, xmm3 1010 1011 pandn xmm3, xmm2 1012 por xmm1, xmm3 1013 1014 and ecx, 15 1015 movq QWORD PTR d[ecx*8], xmm1 1016 1017 mov ecx, edx 1018 sub ecx, 8 1019 1020 and ecx, 15 1021 movq mm0, d[ecx*8] 1022 1023 movq [esi], mm0 1024 lea esi, [esi+eax] 1025 1026 lea edi, [edi+eax] 1027 add edx, 1 1028 1029 cmp edx, rows 1030 jl loop_row 1031 1032 } 1033 1034 } 1035 } 1036 #if 0 1037 /**************************************************************************** 1038 * 1039 * ROUTINE : plane_add_noise_wmt 1040 * 1041 * INPUTS : unsigned char *Start starting address of buffer to add gaussian 1042 * noise to 1043 * unsigned int Width width of plane 1044 * unsigned int Height height of plane 1045 * int Pitch distance between subsequent lines of frame 1046 * int q quantizer used to determine amount of noise 1047 * to add 1048 * 1049 * OUTPUTS : None. 1050 * 1051 * RETURNS : void. 1052 * 1053 * FUNCTION : adds gaussian noise to a plane of pixels 1054 * 1055 * SPECIAL NOTES : None. 1056 * 1057 ****************************************************************************/ 1058 void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) 1059 { 1060 unsigned int i; 1061 1062 __declspec(align(16)) unsigned char blackclamp[16]; 1063 __declspec(align(16)) unsigned char whiteclamp[16]; 1064 __declspec(align(16)) unsigned char bothclamp[16]; 1065 char char_dist[300]; 1066 char Rand[2048]; 1067 double sigma; 1068 // return; 1069 __asm emms 1070 sigma = a + .5 + .6 * (63 - q) / 63.0; 1071 1072 // set up a lookup table of 256 entries that matches 1073 // a gaussian distribution with sigma determined by q. 1074 // 1075 { 1076 double i; 1077 int next, j; 1078 1079 next = 0; 1080 1081 for (i = -32; i < 32; i++) 1082 { 1083 double g = 256 * vp8_gaussian(sigma, 0, 1.0 * i); 1084 int a = (int)(g + .5); 1085 1086 if (a) 1087 { 1088 for (j = 0; j < a; j++) 1089 { 1090 char_dist[next+j] = (char) i; 1091 } 1092 1093 next = next + j; 1094 } 1095 1096 } 1097 1098 for (next = next; next < 256; next++) 1099 char_dist[next] = 0; 1100 1101 } 1102 1103 for (i = 0; i < 2048; i++) 1104 { 1105 Rand[i] = char_dist[rand() & 0xff]; 1106 } 1107 1108 for (i = 0; i < 16; i++) 1109 { 1110 blackclamp[i] = -char_dist[0]; 1111 whiteclamp[i] = -char_dist[0]; 1112 bothclamp[i] = -2 * char_dist[0]; 1113 } 1114 1115 for (i = 0; i < Height; i++) 1116 { 1117 unsigned char *Pos = Start + i * Pitch; 1118 char *Ref = Rand + (rand() & 0xff); 1119 1120 __asm 1121 { 1122 mov ecx, [Width] 1123 mov esi, Pos 1124 mov edi, Ref 1125 xor eax, eax 1126 1127 nextset: 1128 movdqu xmm1, [esi+eax] // get the source 1129 1130 psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise 1131 paddusb xmm1, bothclamp 1132 psubusb xmm1, whiteclamp 1133 1134 movdqu xmm2, [edi+eax] // get the noise for this line 1135 paddb xmm1, xmm2 // add it in 1136 movdqu [esi+eax], xmm1 // store the result 1137 1138 add eax, 16 // move to the next line 1139 1140 cmp eax, ecx 1141 jl nextset 1142 1143 1144 } 1145 1146 } 1147 } 1148 #endif 1149 __declspec(align(16)) 1150 static const int four8s[4] = { 8, 8, 8, 8}; 1151 void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit) 1152 { 1153 int r, i; 1154 __declspec(align(16)) 1155 int flimit4[4]; 1156 unsigned char *s = src; 1157 int sumsq; 1158 int sum; 1159 1160 1161 flimit = vp8_q2mbl(flimit); 1162 flimit4[0] = 1163 flimit4[1] = 1164 flimit4[2] = 1165 flimit4[3] = flimit; 1166 1167 for (r = 0; r < rows; r++) 1168 { 1169 1170 1171 sumsq = 0; 1172 sum = 0; 1173 1174 for (i = -8; i <= 6; i++) 1175 { 1176 sumsq += s[i] * s[i]; 1177 sum += s[i]; 1178 } 1179 1180 __asm 1181 { 1182 mov eax, sumsq 1183 movd xmm7, eax 1184 1185 mov eax, sum 1186 movd xmm6, eax 1187 1188 mov esi, s 1189 xor ecx, ecx 1190 1191 mov edx, cols 1192 add edx, 8 1193 pxor mm0, mm0 1194 pxor mm1, mm1 1195 1196 pxor xmm0, xmm0 1197 nextcol4: 1198 1199 movd xmm1, DWORD PTR [esi+ecx-8] // -8 -7 -6 -5 1200 movd xmm2, DWORD PTR [esi+ecx+7] // +7 +8 +9 +10 1201 1202 punpcklbw xmm1, xmm0 // expanding 1203 punpcklbw xmm2, xmm0 // expanding 1204 1205 punpcklwd xmm1, xmm0 // expanding to dwords 1206 punpcklwd xmm2, xmm0 // expanding to dwords 1207 1208 psubd xmm2, xmm1 // 7--8 8--7 9--6 10--5 1209 paddd xmm1, xmm1 // -8*2 -7*2 -6*2 -5*2 1210 1211 paddd xmm1, xmm2 // 7+-8 8+-7 9+-6 10+-5 1212 pmaddwd xmm1, xmm2 // squared of 7+-8 8+-7 9+-6 10+-5 1213 1214 paddd xmm6, xmm2 1215 paddd xmm7, xmm1 1216 1217 pshufd xmm6, xmm6, 0 // duplicate the last ones 1218 pshufd xmm7, xmm7, 0 // duplicate the last ones 1219 1220 psrldq xmm1, 4 // 8--7 9--6 10--5 0000 1221 psrldq xmm2, 4 // 8--7 9--6 10--5 0000 1222 1223 pshufd xmm3, xmm1, 3 // 0000 8--7 8--7 8--7 squared 1224 pshufd xmm4, xmm2, 3 // 0000 8--7 8--7 8--7 squared 1225 1226 paddd xmm6, xmm4 1227 paddd xmm7, xmm3 1228 1229 pshufd xmm3, xmm1, 01011111b // 0000 0000 9--6 9--6 squared 1230 pshufd xmm4, xmm2, 01011111b // 0000 0000 9--6 9--6 squared 1231 1232 paddd xmm7, xmm3 1233 paddd xmm6, xmm4 1234 1235 pshufd xmm3, xmm1, 10111111b // 0000 0000 8--7 8--7 squared 1236 pshufd xmm4, xmm2, 10111111b // 0000 0000 8--7 8--7 squared 1237 1238 paddd xmm7, xmm3 1239 paddd xmm6, xmm4 1240 1241 movdqa xmm3, xmm6 1242 pmaddwd xmm3, xmm3 1243 1244 movdqa xmm5, xmm7 1245 pslld xmm5, 4 1246 1247 psubd xmm5, xmm7 1248 psubd xmm5, xmm3 1249 1250 psubd xmm5, flimit4 1251 psrad xmm5, 31 1252 1253 packssdw xmm5, xmm0 1254 packsswb xmm5, xmm0 1255 1256 movd xmm1, DWORD PTR [esi+ecx] 1257 movq xmm2, xmm1 1258 1259 punpcklbw xmm1, xmm0 1260 punpcklwd xmm1, xmm0 1261 1262 paddd xmm1, xmm6 1263 paddd xmm1, four8s 1264 1265 psrad xmm1, 4 1266 packssdw xmm1, xmm0 1267 1268 packuswb xmm1, xmm0 1269 pand xmm1, xmm5 1270 1271 pandn xmm5, xmm2 1272 por xmm5, xmm1 1273 1274 movd [esi+ecx-8], mm0 1275 movq mm0, mm1 1276 1277 movdq2q mm1, xmm5 1278 psrldq xmm7, 12 1279 1280 psrldq xmm6, 12 1281 add ecx, 4 1282 1283 cmp ecx, edx 1284 jl nextcol4 1285 1286 } 1287 s += pitch; 1288 } 1289 } 1290 1291 #if 0 1292 1293 /**************************************************************************** 1294 * 1295 * ROUTINE : plane_add_noise_mmx 1296 * 1297 * INPUTS : unsigned char *Start starting address of buffer to add gaussian 1298 * noise to 1299 * unsigned int Width width of plane 1300 * unsigned int Height height of plane 1301 * int Pitch distance between subsequent lines of frame 1302 * int q quantizer used to determine amount of noise 1303 * to add 1304 * 1305 * OUTPUTS : None. 1306 * 1307 * RETURNS : void. 1308 * 1309 * FUNCTION : adds gaussian noise to a plane of pixels 1310 * 1311 * SPECIAL NOTES : None. 1312 * 1313 ****************************************************************************/ 1314 void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) 1315 { 1316 unsigned int i; 1317 int Pitch4 = Pitch * 4; 1318 const int noise_amount = 2; 1319 const int noise_adder = 2 * noise_amount + 1; 1320 1321 __declspec(align(16)) unsigned char blackclamp[16]; 1322 __declspec(align(16)) unsigned char whiteclamp[16]; 1323 __declspec(align(16)) unsigned char bothclamp[16]; 1324 1325 char char_dist[300]; 1326 char Rand[2048]; 1327 1328 double sigma; 1329 __asm emms 1330 sigma = a + .5 + .6 * (63 - q) / 63.0; 1331 1332 // set up a lookup table of 256 entries that matches 1333 // a gaussian distribution with sigma determined by q. 1334 // 1335 { 1336 double i, sum = 0; 1337 int next, j; 1338 1339 next = 0; 1340 1341 for (i = -32; i < 32; i++) 1342 { 1343 int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i)); 1344 1345 if (a) 1346 { 1347 for (j = 0; j < a; j++) 1348 { 1349 char_dist[next+j] = (char) i; 1350 } 1351 1352 next = next + j; 1353 } 1354 1355 } 1356 1357 for (next = next; next < 256; next++) 1358 char_dist[next] = 0; 1359 1360 } 1361 1362 for (i = 0; i < 2048; i++) 1363 { 1364 Rand[i] = char_dist[rand() & 0xff]; 1365 } 1366 1367 for (i = 0; i < 16; i++) 1368 { 1369 blackclamp[i] = -char_dist[0]; 1370 whiteclamp[i] = -char_dist[0]; 1371 bothclamp[i] = -2 * char_dist[0]; 1372 } 1373 1374 for (i = 0; i < Height; i++) 1375 { 1376 unsigned char *Pos = Start + i * Pitch; 1377 char *Ref = Rand + (rand() & 0xff); 1378 1379 __asm 1380 { 1381 mov ecx, [Width] 1382 mov esi, Pos 1383 mov edi, Ref 1384 xor eax, eax 1385 1386 nextset: 1387 movq mm1, [esi+eax] // get the source 1388 1389 psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise 1390 paddusb mm1, bothclamp 1391 psubusb mm1, whiteclamp 1392 1393 movq mm2, [edi+eax] // get the noise for this line 1394 paddb mm1, mm2 // add it in 1395 movq [esi+eax], mm1 // store the result 1396 1397 add eax, 8 // move to the next line 1398 1399 cmp eax, ecx 1400 jl nextset 1401 1402 1403 } 1404 1405 } 1406 } 1407 #else 1408 extern char an[8][64][3072]; 1409 extern int cd[8][64]; 1410 1411 void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) 1412 { 1413 unsigned int i; 1414 __declspec(align(16)) unsigned char blackclamp[16]; 1415 __declspec(align(16)) unsigned char whiteclamp[16]; 1416 __declspec(align(16)) unsigned char bothclamp[16]; 1417 1418 1419 __asm emms 1420 1421 for (i = 0; i < 16; i++) 1422 { 1423 blackclamp[i] = -cd[a][q]; 1424 whiteclamp[i] = -cd[a][q]; 1425 bothclamp[i] = -2 * cd[a][q]; 1426 } 1427 1428 for (i = 0; i < Height; i++) 1429 { 1430 unsigned char *Pos = Start + i * Pitch; 1431 char *Ref = an[a][q] + (rand() & 0xff); 1432 1433 __asm 1434 { 1435 mov ecx, [Width] 1436 mov esi, Pos 1437 mov edi, Ref 1438 xor eax, eax 1439 1440 nextset: 1441 movq mm1, [esi+eax] // get the source 1442 1443 psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise 1444 paddusb mm1, bothclamp 1445 psubusb mm1, whiteclamp 1446 1447 movq mm2, [edi+eax] // get the noise for this line 1448 paddb mm1, mm2 // add it in 1449 movq [esi+eax], mm1 // store the result 1450 1451 add eax, 8 // move to the next line 1452 1453 cmp eax, ecx 1454 jl nextset 1455 } 1456 } 1457 } 1458 1459 1460 void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) 1461 { 1462 unsigned int i; 1463 1464 __declspec(align(16)) unsigned char blackclamp[16]; 1465 __declspec(align(16)) unsigned char whiteclamp[16]; 1466 __declspec(align(16)) unsigned char bothclamp[16]; 1467 1468 __asm emms 1469 1470 for (i = 0; i < 16; i++) 1471 { 1472 blackclamp[i] = -cd[a][q]; 1473 whiteclamp[i] = -cd[a][q]; 1474 bothclamp[i] = -2 * cd[a][q]; 1475 } 1476 1477 for (i = 0; i < Height; i++) 1478 { 1479 unsigned char *Pos = Start + i * Pitch; 1480 char *Ref = an[a][q] + (rand() & 0xff); 1481 1482 __asm 1483 { 1484 mov ecx, [Width] 1485 mov esi, Pos 1486 mov edi, Ref 1487 xor eax, eax 1488 1489 nextset: 1490 movdqu xmm1, [esi+eax] // get the source 1491 1492 psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise 1493 paddusb xmm1, bothclamp 1494 psubusb xmm1, whiteclamp 1495 1496 movdqu xmm2, [edi+eax] // get the noise for this line 1497 paddb xmm1, xmm2 // add it in 1498 movdqu [esi+eax], xmm1 // store the result 1499 1500 add eax, 16 // move to the next line 1501 1502 cmp eax, ecx 1503 jl nextset 1504 } 1505 } 1506 } 1507 1508 #endif 1509