/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
 *
 * For Intel x86 CPU and Microsoft Visual C++ compiler
 *
 * Last changed in libpng 1.2.19 August 18, 2007
 * For conditions of distribution and use, see copyright notice in png.h
 * Copyright (c) 1998-2007 Glenn Randers-Pehrson
 * Copyright (c) 1998, Intel Corporation
 *
 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
 * Interface to libpng contributed by Gilles Vollant, 1999
 *
 *
 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
 * in bad pixels at the beginning of some rows of some images, and also
 * (due to out-of-range memory reads and writes) caused heap corruption
 * when compiled with MSVC 6.0.  The error was fixed in version 1.0.4e.
 *
 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
 *
 * [runtime MMX configuration, GRR 20010102]
 *
 * [Copy 6 bytes per pixel, not 4, and use stride of 6, not 4, in the
 *  second loop of interlace processing of 48-bit pixels, GR-P 20070717]
 *
 * [move instances of uAll union into local, except for two constant
 *  instances, GR-P 20070805]
 */

#define PNG_INTERNAL
#include "png.h"

#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)

static int mmx_supported = 2;

int PNGAPI
png_mmx_support(void)
{
   int mmx_supported_local = 0;
   _asm {
      push ebx            //CPUID will trash these
      push ecx
      push edx

      pushfd              //Save Eflag to stack
      pop eax             //Get Eflag from stack into eax
      mov ecx, eax        //Make another copy of Eflag in ecx
      xor eax, 0x200000   //Toggle ID bit in Eflag [i.e. bit(21)]
      push eax            //Save modified Eflag back to stack

      popfd               //Restore modified value back to Eflag reg
      pushfd              //Save Eflag to stack
      pop eax             //Get Eflag from stack
      push ecx            //Save original Eflag to stack
      popfd               //Restore original Eflag
      xor eax, ecx        //Compare the new Eflag with the original Eflag
      jz NOT_SUPPORTED    //If the same, the CPUID instruction is not
                          //supported; skip the following instructions
                          //and jump to the NOT_SUPPORTED label

      xor eax, eax        //Set eax to zero

      _asm _emit 0x0f     //CPUID instruction (two-byte opcode)
      _asm _emit 0xa2

      cmp eax, 1          //make sure eax returned a non-zero value
      jl NOT_SUPPORTED    //If eax is zero, CPUID function 1 is
                          //unavailable and MMX is not supported

      xor eax, eax        //set eax to zero
      inc eax             //Now increment eax to 1.  This instruction is
                          //faster than the instruction "mov eax, 1"

      _asm _emit 0x0f     //CPUID instruction
      _asm _emit 0xa2

      and edx, 0x00800000 //mask out all bits but the MMX bit [bit(23)]
      cmp edx, 0          //zero = MMX not supported
      jz NOT_SUPPORTED    //non-zero = yes, MMX IS supported

      mov mmx_supported_local, 1 //set return value to 1

NOT_SUPPORTED:
      mov eax, mmx_supported_local //move return value to eax
      pop edx             //CPUID trashed these
      pop ecx
      pop ebx
   }

   //mmx_supported_local=0; // test code to force "don't support MMX"
   //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);

   mmx_supported = mmx_supported_local;
   return mmx_supported_local;
}

/* Combines the row recently read in with the previous row.
   This routine takes care of alpha and transparency if requested.
   This routine also handles the two methods of progressive display
   of interlaced images, depending on the mask value.
   The mask value describes which pixels are to be combined with
   the row.  The pattern always repeats every 8 pixels, so just 8
   bits are needed.  A one indicates the pixel is to be combined; a
   zero indicates the pixel is to be skipped.  This is in addition
   to any alpha or transparency value associated with the pixel.  If
   you want all pixels to be combined, pass 0xff (255) in mask. */

/* Use this routine for the x86 platform - it uses a faster MMX routine
   if the machine supports MMX. */
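/* Illustrative note (added for clarity; not in the original source): the
 * most significant bit of mask corresponds to the first pixel of each
 * 8-pixel group -- the sub-byte cases below start with m = 0x80 and shift
 * right once per pixel.  So, for example, mask == 0x80 would combine only
 * pixels 0, 8, 16, ... of the row, while mask == 0xff combines every
 * pixel. */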
void /* PRIVATE */
png_combine_row(png_structp png_ptr, png_bytep row, int mask)
{
#ifdef PNG_USE_LOCAL_ARRAYS
   PNG_CONST int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
#endif

   png_debug(1,"in png_combine_row_asm\n");

   if (mmx_supported == 2) {
#if !defined(PNG_1_0_X)
       /* this should have happened in png_init_mmx_flags() already */
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }

   if (mask == 0xff)
   {
      png_memcpy(row, png_ptr->row_buf + 1,
       (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,
       png_ptr->width));
   }
   /* GRR:  add "else if (mask == 0)" case?
    *       or does png_combine_row() not even get called in that case? */
   else
   {
      switch (png_ptr->row_info.pixel_depth)
      {
         case 24:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask2=0x0101010202020404,  //24bpp
                    mask1=0x0408080810101020,
                    mask0=0x2020404040808080;

            srcptr = png_ptr->row_buf + 1;
            dstptr = row;

            unmask = ~mask;
            len = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               _asm
               {
                  movd mm7, unmask    //load bit pattern
                  psubb mm6, mm6      //zero mm6
                  punpcklbw mm7, mm7
                  punpcklwd mm7, mm7
                  punpckldq mm7, mm7  //fill register with 8 masks

                  movq mm0, mask0
                  movq mm1, mask1
                  movq mm2, mask2

                  pand mm0, mm7
                  pand mm1, mm7
                  pand mm2, mm7

                  pcmpeqb mm0, mm6
                  pcmpeqb mm1, mm6
                  pcmpeqb mm2, mm6

                  mov ecx, len        //load length of line
                  mov esi, srcptr     //load source
                  mov ebx, dstptr     //load dest
                  cmp ecx, 0
                  jz mainloop24end

mainloop24:
                  movq mm4, [esi]
                  pand mm4, mm0
                  movq mm6, mm0
                  movq mm7, [ebx]
                  pandn mm6, mm7
                  por mm4, mm6
                  movq [ebx], mm4

                  movq mm5, [esi+8]
                  pand mm5, mm1
                  movq mm7, mm1
                  movq mm6, [ebx+8]
                  pandn mm7, mm6
                  por mm5, mm7
                  movq [ebx+8], mm5

                  movq mm6, [esi+16]
                  pand mm6, mm2
                  movq mm4, mm2
                  movq mm7, [ebx+16]
                  pandn mm4, mm7
                  por mm6, mm4
                  movq [ebx+16], mm6

                  add esi, 24         //inc by 24 bytes processed
                  add ebx, 24
                  sub ecx, 8          //dec by 8 pixels processed

                  ja mainloop24

mainloop24end:
                  mov ecx, diff
                  cmp ecx, 0
                  jz end24

                  mov edx, mask
                  sal edx, 24         //make low byte the high byte
secondloop24:
                  sal edx, 1          //move high bit to CF
                  jnc skip24          //if CF = 0
                  mov ax, [esi]
                  mov [ebx], ax
                  xor eax, eax
                  mov al, [esi+2]
                  mov [ebx+2], al
skip24:
                  add esi, 3
                  add ebx, 3

                  dec ecx
                  jnz secondloop24

end24:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
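               /* Note (added for clarity): offset_table[] holds the x
                * origin (in pixels) of each Adam7 pass within an 8-pixel
                * block and png_pass_inc[] the pixel spacing of that pass,
                * so scaled by pixel_bytes they give the byte offset and
                * stride used below. */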
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 24 bpp

         case 32:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask3=0x0101010102020202,  //32bpp
                    mask2=0x0404040408080808,
                    mask1=0x1010101020202020,
                    mask0=0x4040404080808080;

            srcptr = png_ptr->row_buf + 1;
            dstptr = row;

            unmask = ~mask;
            len = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               _asm
               {
                  movd mm7, unmask    //load bit pattern
                  psubb mm6, mm6      //zero mm6
                  punpcklbw mm7, mm7
                  punpcklwd mm7, mm7
                  punpckldq mm7, mm7  //fill register with 8 masks

                  movq mm0, mask0
                  movq mm1, mask1
                  movq mm2, mask2
                  movq mm3, mask3

                  pand mm0, mm7
                  pand mm1, mm7
                  pand mm2, mm7
                  pand mm3, mm7

                  pcmpeqb mm0, mm6
                  pcmpeqb mm1, mm6
                  pcmpeqb mm2, mm6
                  pcmpeqb mm3, mm6

                  mov ecx, len        //load length of line
                  mov esi, srcptr     //load source
                  mov ebx, dstptr     //load dest

                  cmp ecx, 0          //lcr
                  jz mainloop32end

mainloop32:
                  movq mm4, [esi]
                  pand mm4, mm0
                  movq mm6, mm0
                  movq mm7, [ebx]
                  pandn mm6, mm7
                  por mm4, mm6
                  movq [ebx], mm4

                  movq mm5, [esi+8]
                  pand mm5, mm1
                  movq mm7, mm1
                  movq mm6, [ebx+8]
                  pandn mm7, mm6
                  por mm5, mm7
                  movq [ebx+8], mm5

                  movq mm6, [esi+16]
                  pand mm6, mm2
                  movq mm4, mm2
                  movq mm7, [ebx+16]
                  pandn mm4, mm7
                  por mm6, mm4
                  movq [ebx+16], mm6

                  movq mm7, [esi+24]
                  pand mm7, mm3
                  movq mm5, mm3
                  movq mm4, [ebx+24]
                  pandn mm5, mm4
                  por mm7, mm5
                  movq [ebx+24], mm7

                  add esi, 32         //inc by 32 bytes processed
                  add ebx, 32
                  sub ecx, 8          //dec by 8 pixels processed

                  ja mainloop32

mainloop32end:
                  mov ecx, diff
                  cmp ecx, 0
                  jz end32

                  mov edx, mask
                  sal edx, 24         //make low byte the high byte
secondloop32:
                  sal edx, 1          //move high bit to CF
                  jnc skip32          //if CF = 0
                  mov eax, [esi]
                  mov [ebx], eax
skip32:
                  add esi, 4
                  add ebx, 4

                  dec ecx
                  jnz secondloop32

end32:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 32 bpp
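         /* Note (added for clarity): in the 8-bpp case below, each bit of
          * the 8-bit mask selects a whole byte.  mask0 = 0x0102040810204080
          * places a distinct bit in each byte, so the pand/pcmpeqb sequence
          * against the replicated ~mask expands the bit pattern into a byte
          * mask: 0xff where the pixel is combined (source byte taken), 0x00
          * where it is skipped (destination byte kept).  The wider-pixel
          * cases above use the same trick with several mask constants. */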
         case 8:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int m;
            int diff, unmask;

            __int64 mask0=0x0102040810204080;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               m = 0x80;
               unmask = ~mask;
               len = png_ptr->width &~7;   //reduce to multiple of 8
               diff = png_ptr->width & 7;  //amount lost

               _asm
               {
                  movd mm7, unmask    //load bit pattern
                  psubb mm6, mm6      //zero mm6
                  punpcklbw mm7, mm7
                  punpcklwd mm7, mm7
                  punpckldq mm7, mm7  //fill register with 8 masks

                  movq mm0, mask0

                  pand mm0, mm7       //nonzero if keep byte
                  pcmpeqb mm0, mm6    //zero bytes become 0xff, and vice versa

                  mov ecx, len        //load length of line (pixels)
                  mov esi, srcptr     //load source
                  mov ebx, dstptr     //load dest
                  cmp ecx, 0          //lcr
                  je mainloop8end

mainloop8:
                  movq mm4, [esi]
                  pand mm4, mm0
                  movq mm6, mm0
                  pandn mm6, [ebx]
                  por mm4, mm6
                  movq [ebx], mm4

                  add esi, 8          //inc by 8 bytes processed
                  add ebx, 8
                  sub ecx, 8          //dec by 8 pixels processed

                  ja mainloop8
mainloop8end:

                  mov ecx, diff
                  cmp ecx, 0
                  jz end8

                  mov edx, mask
                  sal edx, 24         //make low byte the high byte

secondloop8:
                  sal edx, 1          //move high bit to CF
                  jnc skip8           //if CF = 0
                  mov al, [esi]
                  mov [ebx], al
skip8:
                  inc esi
                  inc ebx

                  dec ecx
                  jnz secondloop8
end8:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 8 bpp

         case 1:
         {
            png_bytep sp;
            png_bytep dp;
            int s_inc, s_start, s_end;
            int m;
            int shift;
            png_uint_32 i;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }
            else
#endif
            {
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  int value;

                  value = (*sp >> shift) & 0x1;
                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;

               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }
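         /* The 2-bpp and 4-bpp cases below follow the same pattern as the
          * 1-bpp case above: with PNG_PACKSWAP the left-most pixel occupies
          * the low-order bits of each byte (shift counts ascend from 0),
          * otherwise the high-order bits (shift counts descend to 0). */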
         case 2:
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }
            else
#endif
            {
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  value = (*sp >> shift) & 0x3;
                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 4:
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }
            else
#endif
            {
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  value = (*sp >> shift) & 0xf;
                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 16:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;
            __int64 mask1=0x0101020204040808,
                    mask0=0x1010202040408080;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;

               unmask = ~mask;
               len = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;
               _asm
               {
                  movd mm7, unmask    //load bit pattern
                  psubb mm6, mm6      //zero mm6
                  punpcklbw mm7, mm7
                  punpcklwd mm7, mm7
                  punpckldq mm7, mm7  //fill register with 8 masks

                  movq mm0, mask0
                  movq mm1, mask1

                  pand mm0, mm7
                  pand mm1, mm7

                  pcmpeqb mm0, mm6
                  pcmpeqb mm1, mm6

                  mov ecx, len        //load length of line
                  mov esi, srcptr     //load source
                  mov ebx, dstptr     //load dest
                  cmp ecx, 0          //lcr
                  jz mainloop16end

mainloop16:
                  movq mm4, [esi]
                  pand mm4, mm0
                  movq mm6, mm0
                  movq mm7, [ebx]
                  pandn mm6, mm7
                  por mm4, mm6
                  movq [ebx], mm4

                  movq mm5, [esi+8]
                  pand mm5, mm1
                  movq mm7, mm1
                  movq mm6, [ebx+8]
                  pandn mm7, mm6
                  por mm5, mm7
                  movq [ebx+8], mm5

                  add esi, 16         //inc by 16 bytes processed
                  add ebx, 16
                  sub ecx, 8          //dec by 8 pixels processed

                  ja mainloop16

mainloop16end:
                  mov ecx, diff
                  cmp ecx, 0
                  jz end16

                  mov edx, mask
                  sal edx, 24         //make low byte the high byte
secondloop16:
                  sal edx, 1          //move high bit to CF
                  jnc skip16          //if CF = 0
                  mov ax, [esi]
                  mov [ebx], ax
skip16:
                  add esi, 2
                  add ebx, 2

                  dec ecx
                  jnz secondloop16
end16:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 16 bpp

         case 48:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask5=0x0101010101010202,
                    mask4=0x0202020204040404,
                    mask3=0x0404080808080808,
                    mask2=0x1010101010102020,
                    mask1=0x2020202040404040,
                    mask0=0x4040808080808080;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;

               unmask = ~mask;
               len = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;
               _asm
               {
                  movd mm7, unmask    //load bit pattern
                  psubb mm6, mm6      //zero mm6
                  punpcklbw mm7, mm7
                  punpcklwd mm7, mm7
                  punpckldq mm7, mm7  //fill register with 8 masks

                  movq mm0, mask0
                  movq mm1, mask1
                  movq mm2, mask2
                  movq mm3, mask3
                  movq mm4, mask4
                  movq mm5, mask5

                  pand mm0, mm7
                  pand mm1, mm7
                  pand mm2, mm7
                  pand mm3, mm7
                  pand mm4, mm7
                  pand mm5, mm7

                  pcmpeqb mm0, mm6
                  pcmpeqb mm1, mm6
                  pcmpeqb mm2, mm6
                  pcmpeqb mm3, mm6
                  pcmpeqb mm4, mm6
                  pcmpeqb mm5, mm6

                  mov ecx, len        //load length of line
                  mov esi, srcptr     //load source
                  mov ebx, dstptr     //load dest

                  cmp ecx, 0
                  jz mainloop48end

mainloop48:
                  movq mm7, [esi]
                  pand mm7, mm0
                  movq mm6, mm0
                  pandn mm6, [ebx]
                  por mm7, mm6
                  movq [ebx], mm7

                  movq mm6, [esi+8]
                  pand mm6, mm1
                  movq mm7, mm1
                  pandn mm7, [ebx+8]
                  por mm6, mm7
                  movq [ebx+8], mm6

                  movq mm6, [esi+16]
                  pand mm6, mm2
                  movq mm7, mm2
                  pandn mm7, [ebx+16]
                  por mm6, mm7
                  movq [ebx+16], mm6

                  movq mm7, [esi+24]
                  pand mm7, mm3
                  movq mm6, mm3
                  pandn mm6, [ebx+24]
                  por mm7, mm6
                  movq [ebx+24], mm7

                  movq mm6, [esi+32]
                  pand mm6, mm4
                  movq mm7, mm4
                  pandn mm7, [ebx+32]
                  por mm6, mm7
                  movq [ebx+32], mm6

                  movq mm7, [esi+40]
                  pand mm7, mm5
                  movq mm6, mm5
                  pandn mm6, [ebx+40]
                  por mm7, mm6
                  movq [ebx+40], mm7

                  add esi, 48         //inc by 48 bytes processed
                  add ebx, 48
                  sub ecx, 8          //dec by 8 pixels processed

                  ja mainloop48
mainloop48end:

                  mov ecx, diff
                  cmp ecx, 0
                  jz end48

                  mov edx, mask
                  sal edx, 24         //make low byte the high byte

secondloop48:
                  sal edx, 1          //move high bit to CF
                  jnc skip48          //if CF = 0
                  mov eax, [esi]
                  mov [ebx], eax
                  mov ax, [esi+4]     // These 2 lines added 20070717
                  mov [ebx+4], ax     // Glenn R-P
skip48:
                  add esi, 6          // Changed 4 to 6 on these 2
                  add ebx, 6          // lines.  Glenn R-P 20070717
                  dec ecx
                  jnz secondloop48

end48:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 48 bpp

         default:
         {
            png_bytep sptr;
            png_bytep dp;
            png_size_t pixel_bytes;
            int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
            unsigned int i;
            register int disp = png_pass_inc[png_ptr->pass]; // get the offset
            register unsigned int incr1, initial_val, final_val;

            pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
            sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
               pixel_bytes;
            dp = row + offset_table[png_ptr->pass]*pixel_bytes;
            initial_val = offset_table[png_ptr->pass]*pixel_bytes;
            final_val = png_ptr->width*pixel_bytes;
            incr1 = (disp)*pixel_bytes;
            for (i = initial_val; i < final_val; i += incr1)
            {
               png_memcpy(dp, sptr, pixel_bytes);
               sptr += incr1;
               dp += incr1;
            }
            break;
         }
      } /* end switch (png_ptr->row_info.pixel_depth) */
   } /* end if (non-trivial mask) */

} /* end png_combine_row() */
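/* Note (added for clarity): png_do_read_interlace() below expands a
 * sub-sampled Adam7 pass row in place, replicating each pixel
 * png_pass_inc[pass] times for a final width of
 * row_info->width * png_pass_inc[pass] pixels; it works from the end of
 * the row backward so the expansion can be done in the same buffer. */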
#if defined(PNG_READ_INTERLACING_SUPPORTED)

void /* PRIVATE */
png_do_read_interlace(png_structp png_ptr)
{
   png_row_infop row_info = &(png_ptr->row_info);
   png_bytep row = png_ptr->row_buf + 1;
   int pass = png_ptr->pass;
   png_uint_32 transformations = png_ptr->transformations;
#ifdef PNG_USE_LOCAL_ARRAYS
   PNG_CONST int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
#endif

   png_debug(1,"in png_do_read_interlace\n");

   if (mmx_supported == 2) {
#if !defined(PNG_1_0_X)
       /* this should have happened in png_init_mmx_flags() already */
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }

   if (row != NULL && row_info != NULL)
   {
      png_uint_32 final_width;

      final_width = row_info->width * png_pass_inc[pass];

      switch (row_info->pixel_depth)
      {
         case 1:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_byte v;
            png_uint_32 i;
            int j;

            sp = row + (png_size_t)((row_info->width - 1) >> 3);
            dp = row + (png_size_t)((final_width - 1) >> 3);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (int)((row_info->width + 7) & 7);
               dshift = (int)((final_width + 7) & 7);
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }
            else
#endif
            {
               sshift = 7 - (int)((row_info->width + 7) & 7);
               dshift = 7 - (int)((final_width + 7) & 7);
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }

            for (i = row_info->width; i; i--)
            {
               v = (png_byte)((*sp >> sshift) & 0x1);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

         case 2:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            sp = row + (png_size_t)((row_info->width - 1) >> 2);
            dp = row + (png_size_t)((final_width - 1) >> 2);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }
            else
#endif
            {
               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0x3);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

         case 4:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            sp = row + (png_size_t)((row_info->width - 1) >> 1);
            dp = row + (png_size_t)((final_width - 1) >> 1);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            else
#endif
            {
               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0xf);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

         default: // This is the place where the routine is modified
         {
            __int64 const4 = 0x0000000000FFFFFF;
            // __int64 const5 = 0x000000FFFFFF0000;  // unused...
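            // Note (added for clarity): const4 keeps only the low three
            // bytes of a quadword and const6 only the low byte; the 3-byte
            // (24-bit) pixel loops below use them to splice pixels across
            // quadword boundaries.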
            __int64 const6 = 0x00000000000000FF;
            png_bytep sptr, dp;
            png_uint_32 i;
            png_size_t pixel_bytes;
            int width = row_info->width;

            pixel_bytes = (row_info->pixel_depth >> 3);

            sptr = row + (width - 1) * pixel_bytes;
            dp = row + (final_width - 1) * pixel_bytes;
            // New code by Nirav Chhatrapati - Intel Corporation
            // sign fix by GRR
            // NOTE:  there is NO MMX code for 48-bit and 64-bit images

            // use MMX routine if machine supports it
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               if (pixel_bytes == 3)
               {
                  if (((pass == 4) || (pass == 5)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1) - 8;
                     if (width_mmx < 0)
                        width_mmx = 0;
                     width -= width_mmx;  // 8 or 9 pix, 24 or 27 bytes
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 3
                           sub edi, 9
loop_pass4:
                           movq mm0, [esi]     ; X X v2 v1 v0 v5 v4 v3
                           movq mm7, mm0       ; X X v2 v1 v0 v5 v4 v3
                           movq mm6, mm0       ; X X v2 v1 v0 v5 v4 v3
                           psllq mm0, 24       ; v1 v0 v5 v4 v3 0 0 0
                           pand mm7, const4    ; 0 0 0 0 0 v5 v4 v3
                           psrlq mm6, 24       ; 0 0 0 X X v2 v1 v0
                           por mm0, mm7        ; v1 v0 v5 v4 v3 v5 v4 v3
                           movq mm5, mm6       ; 0 0 0 X X v2 v1 v0
                           psllq mm6, 8        ; 0 0 X X v2 v1 v0 0
                           movq [edi], mm0     ; move quad to memory
                           psrlq mm5, 16       ; 0 0 0 0 0 X X v2
                           pand mm5, const6    ; 0 0 0 0 0 0 0 v2
                           por mm6, mm5        ; 0 0 X X v2 v1 v0 v2
                           movd [edi+8], mm6   ; move double to memory
                           sub esi, 6
                           sub edi, 12
                           sub ecx, 2
                           jnz loop_pass4
                           EMMS
                        }
                     }

                     sptr -= width_mmx*3;
                     dp -= width_mmx*6;
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        png_memcpy(v, sptr, 3);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           png_memcpy(dp, v, 3);
                           dp -= 3;
                        }
                        sptr -= 3;
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width
                        sub edi, 9          // (png_pass_inc[pass] - 1)*pixel_bytes
loop_pass2:
                        movd mm0, [esi]     ; X X X X X v2 v1 v0
                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
                        movq [edi+4], mm0   ; move to memory
                        psrlq mm0, 16       ; 0 0 v2 v1 v0 v2 v1 v0
                        movd [edi], mm0     ; move to memory
                        sub esi, 3
                        sub edi, 12
                        dec ecx
                        jnz loop_pass2
                        EMMS
                     }
                  }
                  else if (width) /* && ((pass == 0) || (pass == 1)) */
                  {
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width
                        sub edi, 21         // (png_pass_inc[pass] - 1)*pixel_bytes
loop_pass0:
                        movd mm0, [esi]     ; X X X X X v2 v1 v0
                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
                        movq mm3, mm0       ; v2 v1 v0 v2 v1 v0 v2 v1
                        psllq mm0, 16       ; v0 v2 v1 v0 v2 v1 0 0
                        movq mm4, mm3       ; v2 v1 v0 v2 v1 v0 v2 v1
                        punpckhdq mm3, mm0  ; v0 v2 v1 v0 v2 v1 v0 v2
                        movq [edi+16], mm4
                        psrlq mm0, 32       ; 0 0 0 0 v0 v2 v1 v0
                        movq [edi+8], mm3
                        punpckldq mm0, mm4  ; v1 v0 v2 v1 v0 v2 v1 v0
                        sub esi, 3
                        movq [edi], mm0
                        sub edi, 24
                        //sub esi, 3
                        dec ecx
                        jnz loop_pass0
                        EMMS
                     }
                  }
               } /* end of pixel_bytes == 3 */

               else if (pixel_bytes == 1)
               {
                  if (((pass == 4) || (pass == 5)) && width)
                  {
                     int width_mmx = ((width >> 3) << 3);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub edi, 15
                           sub esi, 7
loop1_pass4:
                           movq mm0, [esi]     ; v0 v1 v2 v3 v4 v5 v6 v7
                           movq mm1, mm0       ; v0 v1 v2 v3 v4 v5 v6 v7
                           punpcklbw mm0, mm0  ; v4 v4 v5 v5 v6 v6 v7 v7
                           //movq mm1, mm0     ; v0 v0 v1 v1 v2 v2 v3 v3
                           punpckhbw mm1, mm1  ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq [edi+8], mm1   ; move to memory v0 v1 v2 and v3
                           sub esi, 8
                           movq [edi], mm0     ; move to memory v4 v5 v6 and v7
                           //sub esi, 4
                           sub edi, 16
                           sub ecx, 8
                           jnz loop1_pass4
                           EMMS
                        }
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*2;
                     for (i = width; i; i--)
                     {
                        int j;

                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        sptr--;
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 2) << 2);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub edi, 15
                           sub esi, 3
loop1_pass2:
                           movd mm0, [esi]     ; X X X X v0 v1 v2 v3
                           punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq mm1, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
                           punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
                           punpckhwd mm1, mm1  ; v0 v0 v0 v0 v1 v1 v1 v1
                           movq [edi], mm0     ; move to memory v2 and v3
                           sub esi, 4
                           movq [edi+8], mm1   ; move to memory v1 and v0
                           sub edi, 16
                           sub ecx, 4
                           jnz loop1_pass2
                           EMMS
                        }
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*4;
                     for (i = width; i; i--)
                     {
                        int j;

                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        sptr--;
                     }
                  }
                  else if (width) /* && ((pass == 0) || (pass == 1)) */
                  {
                     int width_mmx = ((width >> 2) << 2);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub edi, 31
                           sub esi, 3
loop1_pass0:
                           movd mm0, [esi]     ; X X X X v0 v1 v2 v3
                           movq mm1, mm0       ; X X X X v0 v1 v2 v3
                           punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq mm2, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
                           punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
                           movq mm3, mm0       ; v2 v2 v2 v2 v3 v3 v3 v3
                           punpckldq mm0, mm0  ; v3 v3 v3 v3 v3 v3 v3 v3
                           punpckhdq mm3, mm3  ; v2 v2 v2 v2 v2 v2 v2 v2
                           movq [edi], mm0     ; move to memory v3
                           punpckhwd mm2, mm2  ; v0 v0 v0 v0 v1 v1 v1 v1
                           movq [edi+8], mm3   ; move to memory v2
                           movq mm4, mm2       ; v0 v0 v0 v0 v1 v1 v1 v1
                           punpckldq mm2, mm2  ; v1 v1 v1 v1 v1 v1 v1 v1
                           punpckhdq mm4, mm4  ; v0 v0 v0 v0 v0 v0 v0 v0
                           movq [edi+16], mm2  ; move to memory v1
                           movq [edi+24], mm4  ; move to memory v0
                           sub esi, 4
                           sub edi, 32
                           sub ecx, 4
                           jnz loop1_pass0
                           EMMS
                        }
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*8;
                     for (i = width; i; i--)
                     {
                        int j;

                        /* I simplified this part in version 1.0.4e
                         * here and in several other instances where
                         * pixel_bytes == 1 -- GR-P
                         *
                         * Original code:
                         *
                         * png_byte v[8];
                         * png_memcpy(v, sptr, pixel_bytes);
                         * for (j = 0; j < png_pass_inc[pass]; j++)
                         * {
                         *    png_memcpy(dp, v, pixel_bytes);
                         *    dp -= pixel_bytes;
                         * }
                         * sptr -= pixel_bytes;
                         *
                         * Replacement code is in the next three lines:
                         */

                        for (j = 0; j < png_pass_inc[pass]; j++)
                           *dp-- = *sptr;
                        sptr--;
                     }
                  }
               } /* end of pixel_bytes == 1 */

               else if (pixel_bytes == 2)
               {
                  if (((pass == 4) || (pass == 5)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 2
                           sub edi, 6
loop2_pass4:
                           movd mm0, [esi]     ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0  ; v1 v0 v1 v0 v3 v2 v3 v2
                           sub esi, 4
                           movq [edi], mm0
                           sub edi, 8
                           sub ecx, 2
                           jnz loop2_pass4
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*2 - 2);  // sign fixed
                     dp -= (width_mmx*4 - 2);    // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 2
                           sub edi, 14
loop2_pass2:
                           movd mm0, [esi]     ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0  ; v1 v0 v1 v0 v3 v2 v3 v2
                           movq mm1, mm0       ; v1 v0 v1 v0 v3 v2 v3 v2
                           punpckldq mm0, mm0  ; v3 v2 v3 v2 v3 v2 v3 v2
                           punpckhdq mm1, mm1  ; v1 v0 v1 v0 v1 v0 v1 v0
                           movq [edi], mm0
                           sub esi, 4
                           movq [edi + 8], mm1
                           //sub esi, 4
                           sub edi, 16
                           sub ecx, 2
                           jnz loop2_pass2
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*2 - 2);  // sign fixed
                     dp -= (width_mmx*8 - 2);    // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
                  else if (width) /* && ((pass == 0) || (pass == 1)) */
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 2
                           sub edi, 30
loop2_pass0:
                           movd mm0, [esi]     ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0  ; v1 v0 v1 v0 v3 v2 v3 v2
                           movq mm1, mm0       ; v1 v0 v1 v0 v3 v2 v3 v2
                           punpckldq mm0, mm0  ; v3 v2 v3 v2 v3 v2 v3 v2
                           punpckhdq mm1, mm1  ; v1 v0 v1 v0 v1 v0 v1 v0
                           movq [edi], mm0
                           movq [edi + 8], mm0
                           movq [edi + 16], mm1
                           movq [edi + 24], mm1
                           sub esi, 4
                           sub edi, 32
                           sub ecx, 2
                           jnz loop2_pass0
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*2 - 2);  // sign fixed
                     dp -= (width_mmx*16 - 2);   // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
               } /* end of pixel_bytes == 2 */
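               /* Note (added for clarity): the "sign fixed" adjustments
                * above and below back sptr/dp up past the pixels already
                * replicated by the MMX loops so the C cleanup loops can
                * handle the remaining width pixels; libpng 1.0.3a through
                * 1.0.4d got the sign of these offsets wrong (see the
                * header comment at the top of this file). */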
               else if (pixel_bytes == 4)
               {
                  if (((pass == 4) || (pass == 5)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 4
                           sub edi, 12
loop4_pass4:
                           movq mm0, [esi]     ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0       ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0  ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1  ; v3 v2 v1 v0 v3 v2 v1 v0
                           movq [edi], mm0
                           sub esi, 8
                           movq [edi + 8], mm1
                           sub edi, 16
                           sub ecx, 2
                           jnz loop4_pass4
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*4 - 4);  // sign fixed
                     dp -= (width_mmx*8 - 4);    // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 4
                           sub edi, 28
loop4_pass2:
                           movq mm0, [esi]     ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0       ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0  ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1  ; v3 v2 v1 v0 v3 v2 v1 v0
                           movq [edi], mm0
                           movq [edi + 8], mm0
                           movq [edi + 16], mm1
                           movq [edi + 24], mm1
                           sub esi, 8
                           sub edi, 32
                           sub ecx, 2
                           jnz loop4_pass2
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*4 - 4);  // sign fixed
                     dp -= (width_mmx*16 - 4);   // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
                  else if (width) /* && ((pass == 0) || (pass == 1)) */
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 4
                           sub edi, 60
loop4_pass0:
                           movq mm0, [esi]     ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0       ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0  ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1  ; v3 v2 v1 v0 v3 v2 v1 v0
                           movq [edi], mm0
                           movq [edi + 8], mm0
                           movq [edi + 16], mm0
                           movq [edi + 24], mm0
                           movq [edi + 32], mm1
                           movq [edi + 40], mm1
                           movq [edi + 48], mm1
                           sub esi, 8
                           movq [edi + 56], mm1
                           sub edi, 64
                           sub ecx, 2
                           jnz loop4_pass0
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*4 - 4);  // sign fixed
                     dp -= (width_mmx*32 - 4);   // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }

               } /* end of pixel_bytes == 4 */

               else if (pixel_bytes == 6)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, 6);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, 6);
                        dp -= 6;
                     }
                     sptr -= 6;
                  }
               } /* end of pixel_bytes == 6 */

               else
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
            } /* end of mmx_supported */

            else /* MMX not supported: use modified C code - takes advantage
                  * of inlining of memcpy for a constant */
            {
               if (pixel_bytes == 1)
               {
                  for (i = width; i; i--)
                  {
                     int j;
                     for (j = 0; j < png_pass_inc[pass]; j++)
                        *dp-- = *sptr;
                     sptr--;
                  }
               }
               else if (pixel_bytes == 3)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else if (pixel_bytes == 2)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else if (pixel_bytes == 4)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else if (pixel_bytes == 6)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }

            } /* end of MMX not supported */
            break;
         }
      } /* end switch (row_info->pixel_depth) */

      row_info->width = final_width;

      row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
   }

}

#endif /* PNG_READ_INTERLACING_SUPPORTED */


// These global constants are declared
// here to ensure alignment on 8-byte boundaries.
union uAll {
   __int64 use;
   double double_align;
   long long long_long_align;
};
static PNG_CONST union uAll LBCarryMask = {0x0101010101010101},
                            HBClearMask = {0x7f7f7f7f7f7f7f7f};
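/* For reference, a minimal C sketch of the Avg defilter that
 * png_read_filter_row_mmx_avg() below implements with MMX (function name
 * hypothetical; added for clarity and disabled so it does not affect the
 * build):  Raw(x) = Avg(x) + floor((Raw(x-bpp) + Prior(x)) / 2), where
 * Raw(x-bpp) reads as 0 for x < bpp.
 */
#if 0
static void
png_read_filter_row_c_avg(png_row_infop row_info, png_bytep row,
   png_bytep prev_row)
{
   png_uint_32 i;
   png_uint_32 istop = row_info->rowbytes;
   int bpp = (row_info->pixel_depth + 7) >> 3;

   for (i = 0; i < istop; i++)
   {
      int raw_prev = (i < (png_uint_32)bpp) ? 0 : row[i - bpp];

      /* the bytewise sum is taken mod 256, matching the MMX code below */
      row[i] = (png_byte)(row[i] + ((raw_prev + prev_row[i]) >> 1));
   }
}
#endif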
// Optimized code for PNG Average filter decoder
void /* PRIVATE */
png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
   png_bytep prev_row)
{
   // These variables are declared
   // here to ensure alignment on 8-byte boundaries.
   union uAll ActiveMask, ShiftBpp, ShiftRem;

   int bpp;
   png_uint_32 FullLength;
   png_uint_32 MMXLength;
   //png_uint_32 len;
   int diff;

   bpp = (row_info->pixel_depth + 7) >> 3;  // Get # bytes per pixel
   FullLength = row_info->rowbytes;         // # of bytes to filter
   _asm {
      // Init address pointers and offset
      mov edi, row           // edi ==> Avg(x)
      xor ebx, ebx           // ebx ==> x
      mov edx, edi
      mov esi, prev_row      // esi ==> Prior(x)
      sub edx, bpp           // edx ==> Raw(x-bpp)

      xor eax, eax
      // Compute the Raw value for the first bpp bytes
      // Raw(x) = Avg(x) + (Prior(x)/2)
davgrlp:
      mov al, [esi + ebx]    // Load al with Prior(x)
      inc ebx
      shr al, 1              // divide by 2
      add al, [edi+ebx-1]    // Add Avg(x); -1 to offset inc ebx
      cmp ebx, bpp
      mov [edi+ebx-1], al    // Write back Raw(x);
                             // mov does not affect flags; -1 to offset inc ebx
      jb davgrlp
      // get # of bytes to alignment
      mov diff, edi          // take start of row
      add diff, ebx          // add bpp
      add diff, 0xf          // add 7 + 8 to incr past alignment boundary
      and diff, 0xfffffff8   // mask to alignment boundary
      sub diff, edi          // subtract from start ==> value ebx at alignment
      jz davggo
      // fix alignment
      // Compute the Raw value for the bytes up to the alignment boundary
      // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
      xor ecx, ecx
davglp1:
      xor eax, eax
      mov cl, [esi + ebx]    // load cl with Prior(x)
      mov al, [edx + ebx]    // load al with Raw(x-bpp)
      add ax, cx
      inc ebx
      shr ax, 1              // divide by 2
      add al, [edi+ebx-1]    // Add Avg(x); -1 to offset inc ebx
      cmp ebx, diff          // Check if at alignment boundary
      mov [edi+ebx-1], al    // Write back Raw(x);
                             // mov does not affect flags; -1 to offset inc ebx
      jb davglp1             // Repeat until at alignment boundary
davggo:
      mov eax, FullLength
      mov ecx, eax
      sub eax, ebx           // subtract alignment fix
      and eax, 0x00000007    // calc bytes over mult of 8
      sub ecx, eax           // drop over bytes from original length
      mov MMXLength, ecx
   } // end _asm block
   // Now do the math for the rest of the row
   switch ( bpp )
   {
      case 3:
      {
         ActiveMask.use = 0x0000000000ffffff;
         ShiftBpp.use = 24;  // == 3 * 8
         ShiftRem.use = 40;  // == 64 - 24
         _asm {
            // Re-init address pointers and offset
            movq mm7, ActiveMask
            mov ebx, diff        // ebx ==> x = offset to alignment boundary
            movq mm5, LBCarryMask
            mov edi, row         // edi ==> Avg(x)
            movq mm4, HBClearMask
            mov esi, prev_row    // esi ==> Prior(x)
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
                                       // (we correct position in loop below)
davg3lp:
            movq mm0, [edi + ebx]  // Load mm0 with Avg(x)
            // Add (Prev_row/2) to Average
            movq mm3, mm5
            psrlq mm2, ShiftRem    // Correct position Raw(x-bpp) data
            movq mm1, [esi + ebx]  // Load mm1 with Prior(x)
            movq mm6, mm7
            pand mm3, mm1          // get lsb for each prev_row byte
            psrlq mm1, 1           // divide prev_row bytes by 2
            pand mm1, mm4          // clear invalid bit 7 of each byte
            paddb mm0, mm1         // add (Prev_row/2) to Avg for each byte
            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
            movq mm1, mm3          // now use mm1 for getting LBCarrys
            pand mm1, mm2          // get LBCarrys for each byte where both
                                   // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1           // divide raw bytes by 2
            pand mm2, mm4          // clear invalid bit 7 of each byte
            paddb mm2, mm1         // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm6          // Leave only Active Group 1 bytes to add to Avg
            paddb mm0, mm2         // add (Raw/2) + LBCarrys to Avg for each
                                   // Active byte
            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
            psllq mm6, ShiftBpp    // shift the mm6 mask to cover bytes 3-5
            movq mm2, mm0          // mov updated Raws to mm2
            psllq mm2, ShiftBpp    // shift data to position correctly
            movq mm1, mm3          // now use mm1 for getting LBCarrys
            pand mm1, mm2          // get LBCarrys for each byte where both
                                   // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1           // divide raw bytes by 2
            pand mm2, mm4          // clear invalid bit 7 of each byte
            paddb mm2, mm1         // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm6          // Leave only Active Group 2 bytes to add to Avg
            paddb mm0, mm2         // add (Raw/2) + LBCarrys to Avg for each
                                   // Active byte

            // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
            psllq mm6, ShiftBpp    // shift the mm6 mask to cover the last two
                                   // bytes
            movq mm2, mm0          // mov updated Raws to mm2
            psllq mm2, ShiftBpp    // shift data to position correctly
                                   // Data only needs to be shifted once here to
                                   // get the correct x-bpp offset.
            movq mm1, mm3          // now use mm1 for getting LBCarrys
            pand mm1, mm2          // get LBCarrys for each byte where both
                                   // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1           // divide raw bytes by 2
            pand mm2, mm4          // clear invalid bit 7 of each byte
            paddb mm2, mm1         // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm6          // Leave only Active Group 3 bytes to add to Avg
            add ebx, 8
            paddb mm0, mm2         // add (Raw/2) + LBCarrys to Avg for each
                                   // Active byte

            // Now ready to write back to memory
            movq [edi + ebx - 8], mm0
            // Move updated Raw(x) to use as Raw(x-bpp) for next loop
            cmp ebx, MMXLength
            movq mm2, mm0          // mov updated Raw(x) to mm2
            jb davg3lp
         } // end _asm block
      }
      break;

      case 6:
      case 4:
      case 7:
      case 5:
      {
         ActiveMask.use = 0xffffffffffffffff;  // use shift below to clear
                                               // appropriate inactive bytes
         ShiftBpp.use = bpp << 3;
         ShiftRem.use = 64 - ShiftBpp.use;
         _asm {
            movq mm4, HBClearMask
            // Re-init address pointers and offset
            mov ebx, diff        // ebx ==> x = offset to alignment boundary
            // Load ActiveMask and clear all bytes except for 1st active group
            movq mm7, ActiveMask
            mov edi, row         // edi ==> Avg(x)
            psrlq mm7, ShiftRem
            mov esi, prev_row    // esi ==> Prior(x)
            movq mm6, mm7
            movq mm5, LBCarryMask
            psllq mm6, ShiftBpp  // Create mask for 2nd active group
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
                                       // (we correct position in loop below)
davg4lp:
            movq mm0, [edi + ebx]
            psrlq mm2, ShiftRem  // shift data to position correctly
            movq mm1, [esi + ebx]
            // Add (Prev_row/2) to Average
            movq mm3, mm5
            pand mm3, mm1        // get lsb for each prev_row byte
            psrlq mm1, 1         // divide prev_row bytes by 2
            pand mm1, mm4        // clear invalid bit 7 of each byte
            paddb mm0, mm1       // add (Prev_row/2) to Avg for each byte
            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
            movq mm1, mm3        // now use mm1 for getting LBCarrys
            pand mm1, mm2        // get LBCarrys for each byte where both
                                 // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1         // divide raw bytes by 2
            pand mm2, mm4        // clear invalid bit 7 of each byte
            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm7        // Leave only Active Group 1 bytes to add to Avg
            paddb mm0, mm2       // add (Raw/2) + LBCarrys to Avg for each
                                 // Active byte
            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
            movq mm2, mm0        // mov updated Raws to mm2
            psllq mm2, ShiftBpp  // shift data to position correctly
            add ebx, 8
            movq mm1, mm3        // now use mm1 for getting LBCarrys
            pand mm1, mm2        // get LBCarrys for each byte where both
                                 // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1         // divide raw bytes by 2
            pand mm2, mm4        // clear invalid bit 7 of each byte
            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm6        // Leave only Active Group 2 bytes to add to Avg
            paddb mm0, mm2       // add (Raw/2) + LBCarrys to Avg for each
                                 // Active byte
            cmp ebx, MMXLength
            // Now ready to write back to memory
            movq [edi + ebx - 8], mm0
            // Prep Raw(x-bpp) for next loop
            movq mm2, mm0        // mov updated Raws to mm2
            jb davg4lp
         } // end _asm block
      }
      break;
      case 2:
      {
         ActiveMask.use = 0x000000000000ffff;
         ShiftBpp.use = 16;  // == 2 * 8   [BUGFIX]
         ShiftRem.use = 48;  // == 64 - 16 [BUGFIX]
         _asm {
            // Load ActiveMask
            movq mm7, ActiveMask
            // Re-init address pointers and offset
            mov ebx, diff        // ebx ==> x = offset to alignment boundary
            movq mm5, LBCarryMask
            mov edi, row         // edi ==> Avg(x)
            movq mm4, HBClearMask
            mov esi, prev_row    // esi ==> Prior(x)
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
                                       // (we correct position in loop below)
davg2lp:
            movq mm0, [edi + ebx]
            psrlq mm2, ShiftRem  // shift data to position correctly [BUGFIX]
            movq mm1, [esi + ebx]
            // Add (Prev_row/2) to Average
            movq mm3, mm5
            pand mm3, mm1        // get lsb for each prev_row byte
            psrlq mm1, 1         // divide prev_row bytes by 2
            pand mm1, mm4        // clear invalid bit 7 of each byte
            movq mm6, mm7
            paddb mm0, mm1       // add (Prev_row/2) to Avg for each byte
            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
            movq mm1, mm3        // now use mm1 for getting LBCarrys
            pand mm1, mm2        // get LBCarrys for each byte where both
                                 // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1         // divide raw bytes by 2
            pand mm2, mm4        // clear invalid bit 7 of each byte
            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm6        // Leave only Active Group 1 bytes to add to Avg
            paddb mm0, mm2       // add (Raw/2) + LBCarrys to Avg for each Active byte
            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
            psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 2 & 3
            movq mm2, mm0        // mov updated Raws to mm2
            psllq mm2, ShiftBpp  // shift data to position correctly
            movq mm1, mm3        // now use mm1 for getting LBCarrys
            pand mm1, mm2        // get LBCarrys for each byte where both
                                 // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1         // divide raw bytes by 2
            pand mm2, mm4        // clear invalid bit 7 of each byte
            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm6        // Leave only Active Group 2 bytes to add to Avg
            paddb mm0, mm2       // add (Raw/2) + LBCarrys to Avg for each Active byte

            // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
            psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 4 & 5
            movq mm2, mm0        // mov updated Raws to mm2
            psllq mm2, ShiftBpp  // shift data to position correctly
                                 // Data only needs to be shifted once here to
                                 // get the correct x-bpp offset.
            movq mm1, mm3        // now use mm1 for getting LBCarrys
            pand mm1, mm2        // get LBCarrys for each byte where both
                                 // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1         // divide raw bytes by 2
            pand mm2, mm4        // clear invalid bit 7 of each byte
            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm6        // Leave only Active Group 3 bytes to add to Avg
            paddb mm0, mm2       // add (Raw/2) + LBCarrys to Avg for each Active byte

            // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
            psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 6 & 7
            movq mm2, mm0        // mov updated Raws to mm2
            psllq mm2, ShiftBpp  // shift data to position correctly
                                 // Data only needs to be shifted once here to
                                 // get the correct x-bpp offset.
            add ebx, 8
            movq mm1, mm3        // now use mm1 for getting LBCarrys
            pand mm1, mm2        // get LBCarrys for each byte where both
                                 // lsb's were == 1 (Only valid for active group)
            psrlq mm2, 1         // divide raw bytes by 2
            pand mm2, mm4        // clear invalid bit 7 of each byte
            paddb mm2, mm1       // add LBCarrys to (Raw(x-bpp)/2) for each byte
            pand mm2, mm6        // Leave only Active Group 4 bytes to add to Avg
            paddb mm0, mm2       // add (Raw/2) + LBCarrys to Avg for each Active byte

            cmp ebx, MMXLength
            // Now ready to write back to memory
            movq [edi + ebx - 8], mm0
            // Prep Raw(x-bpp) for next loop
            movq mm2, mm0        // mov updated Raws to mm2
            jb davg2lp
         } // end _asm block
      }
      break;

      case 1:  // bpp == 1
      {
         _asm {
            // Re-init address pointers and offset
            mov ebx, diff          // ebx ==> x = offset to alignment boundary
            mov edi, row           // edi ==> Avg(x)
            cmp ebx, FullLength    // Test if offset at end of array
            jnb davg1end
            // Do Avg decode for remaining bytes
            mov esi, prev_row      // esi ==> Prior(x)
            mov edx, edi
            xor ecx, ecx           // zero ecx before using cl & cx in loop below
            sub edx, bpp           // edx ==> Raw(x-bpp)
davg1lp:
            // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
            xor eax, eax
            mov cl, [esi + ebx]    // load cl with Prior(x)
            mov al, [edx + ebx]    // load al with Raw(x-bpp)
            add ax, cx
            inc ebx
            shr ax, 1              // divide by 2
            add al, [edi+ebx-1]    // Add Avg(x); -1 to offset inc ebx
            cmp ebx, FullLength    // Check if at end of array
            mov [edi+ebx-1], al    // Write back Raw(x);
                                   // mov does not affect flags; -1 to offset inc ebx
            jb davg1lp
davg1end:
         } // end _asm block
      }
      return;

      case 8:  // bpp == 8
      {
         _asm {
            // Re-init address pointers and offset
            mov ebx, diff          // ebx ==> x = offset to alignment boundary
            movq mm5, LBCarryMask
            mov edi, row           // edi ==> Avg(x)
            movq mm4, HBClearMask
            mov esi, prev_row      // esi ==> Prior(x)
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
                                       // (NO NEED to correct position in loop below)
davg8lp:
            movq mm0, [edi + ebx]
            movq mm3, mm5
            movq mm1, [esi + ebx]
            add ebx, 8
      case 8:                    // bpp == 8
      {
         _asm {
            // Re-init address pointers and offset
            mov ebx, diff        // ebx ==> x = offset to alignment boundary
            movq mm5, LBCarryMask
            mov edi, row         // edi ==> Avg(x)
            movq mm4, HBClearMask
            mov esi, prev_row    // esi ==> Prior(x)
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
                                 // (NO NEED to correct position in loop below)
         davg8lp:
            movq mm0, [edi + ebx]
            movq mm3, mm5
            movq mm1, [esi + ebx]
            add ebx, 8
            pand mm3, mm1        // get lsb for each prev_row byte
            psrlq mm1, 1         // divide prev_row bytes by 2
            pand mm3, mm2        // get LBCarrys for each byte where both
                                 // lsb's were == 1
            psrlq mm2, 1         // divide raw bytes by 2
            pand mm1, mm4        // clear invalid bit 7 of each byte
            paddb mm0, mm3       // add LBCarrys to Avg for each byte
            pand mm2, mm4        // clear invalid bit 7 of each byte
            paddb mm0, mm1       // add (Prev_row/2) to Avg for each byte
            paddb mm0, mm2       // add (Raw/2) to Avg for each byte
            cmp ebx, MMXLength
            movq [edi + ebx - 8], mm0
            movq mm2, mm0        // reuse as Raw(x-bpp)
            jb davg8lp
         } // end _asm block
      }
      break;

      default:                   // bpp greater than 8
      {
         _asm {
            movq mm5, LBCarryMask
            // Re-init address pointers and offset
            mov ebx, diff        // ebx ==> x = offset to alignment boundary
            mov edi, row         // edi ==> Avg(x)
            movq mm4, HBClearMask
            mov edx, edi
            mov esi, prev_row    // esi ==> Prior(x)
            sub edx, bpp         // edx ==> Raw(x-bpp)
         davgAlp:
            movq mm0, [edi + ebx]
            movq mm3, mm5
            movq mm1, [esi + ebx]
            pand mm3, mm1        // get lsb for each prev_row byte
            movq mm2, [edx + ebx]
            psrlq mm1, 1         // divide prev_row bytes by 2
            pand mm3, mm2        // get LBCarrys for each byte where both
                                 // lsb's were == 1
            psrlq mm2, 1         // divide raw bytes by 2
            pand mm1, mm4        // clear invalid bit 7 of each byte
            paddb mm0, mm3       // add LBCarrys to Avg for each byte
            pand mm2, mm4        // clear invalid bit 7 of each byte
            paddb mm0, mm1       // add (Prev_row/2) to Avg for each byte
            add ebx, 8
            paddb mm0, mm2       // add (Raw/2) to Avg for each byte
            cmp ebx, MMXLength
            movq [edi + ebx - 8], mm0
            jb davgAlp
         } // end _asm block
      }
      break;
   } // end switch ( bpp )

   _asm {
      // MMX acceleration complete; now do clean-up
      // Check if any remaining bytes left to decode
      mov ebx, MMXLength         // ebx ==> x = offset bytes remaining after MMX
      mov edi, row               // edi ==> Avg(x)
      cmp ebx, FullLength        // Test if offset at end of array
      jnb davgend
      // Do Avg decode for remaining bytes
      mov esi, prev_row          // esi ==> Prior(x)
      mov edx, edi
      xor ecx, ecx               // zero ecx before using cl & cx in loop below
      sub edx, bpp               // edx ==> Raw(x-bpp)
   davglp2:
      // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
      xor eax, eax
      mov cl, [esi + ebx]        // load cl with Prior(x)
      mov al, [edx + ebx]        // load al with Raw(x-bpp)
      add ax, cx
      inc ebx
      shr ax, 1                  // divide by 2
      add al, [edi+ebx-1]        // Add Avg(x); -1 to offset inc ebx
      cmp ebx, FullLength        // Check if at end of array
      mov [edi+ebx-1], al        // Write back Raw(x);
                                 // mov does not affect flags; -1 to offset inc ebx
      jb davglp2
   davgend:
      emms                       // End MMX instructions; prep for possible FP instrs.
   } // end _asm block
}
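
// For reference, the AVG decode performed by the MMX routine above is
// equivalent to this scalar sketch (it mirrors the non-MMX fallback in
// png_read_filter_row at the end of this file):
//
//    png_uint_32 i;
//    png_bytep rp = row;
//    png_bytep pp = prev_row;
//    png_bytep lp = row;
//    png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
//
//    for (i = 0; i < bpp; i++)      // no Raw(x-bpp) for the first pixel
//    {
//       *rp = (png_byte)(((int)(*rp) + ((int)(*pp++) >> 1)) & 0xff);
//       rp++;
//    }
//    for (i = 0; i < row_info->rowbytes - bpp; i++)
//    {
//       *rp = (png_byte)(((int)(*rp) + ((int)(*pp++ + *lp++) >> 1)) & 0xff);
//       rp++;
//    }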

// Optimized code for PNG Paeth filter decoder
void /* PRIVATE */
png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
                              png_bytep prev_row)
{
   // These variables are declared here to
   // ensure alignment on 8-byte boundaries.
   union uAll ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;

   png_uint_32 FullLength;
   png_uint_32 MMXLength;
   //png_uint_32 len;
   int bpp;
   int diff;
   //int ptemp;
   int patemp, pbtemp, pctemp;

   bpp = (row_info->pixel_depth + 7) >> 3;  // Get # bytes per pixel
   FullLength = row_info->rowbytes;         // # of bytes to filter
   _asm
   {
      xor ebx, ebx               // ebx ==> x offset
      mov edi, row
      xor edx, edx               // edx ==> x-bpp offset
      mov esi, prev_row
      xor eax, eax

      // Compute the Raw value for the first bpp bytes
      // Note: the formula works out to be always
      //       Paeth(x) = Raw(x) + Prior(x)      where x < bpp
   dpthrlp:
      mov al, [edi + ebx]
      add al, [esi + ebx]
      inc ebx
      cmp ebx, bpp
      mov [edi + ebx - 1], al
      jb dpthrlp
      // get # of bytes to alignment
      mov diff, edi              // take start of row
      add diff, ebx              // add bpp
      xor ecx, ecx
      add diff, 0xf              // add 7 + 8 to incr past alignment boundary
      and diff, 0xfffffff8       // mask to alignment boundary
      sub diff, edi              // subtract from start ==> value ebx at alignment
      jz dpthgo
      // fix alignment
   dpthlp1:
      xor eax, eax
      // pav = p - a = (a + b - c) - a = b - c
      mov al, [esi + ebx]        // load Prior(x) into al
      mov cl, [esi + edx]        // load Prior(x-bpp) into cl
      sub eax, ecx               // subtract Prior(x-bpp)
      mov patemp, eax            // Save pav for later use
      xor eax, eax
      // pbv = p - b = (a + b - c) - b = a - c
      mov al, [edi + edx]        // load Raw(x-bpp) into al
      sub eax, ecx               // subtract Prior(x-bpp)
      mov ecx, eax
      // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
      add eax, patemp            // pcv = pav + pbv
      // pc = abs(pcv)
      test eax, 0x80000000
      jz dpthpca
      neg eax                    // reverse sign of neg values
   dpthpca:
      mov pctemp, eax            // save pc for later use
      // pb = abs(pbv)
      test ecx, 0x80000000
      jz dpthpba
      neg ecx                    // reverse sign of neg values
   dpthpba:
      mov pbtemp, ecx            // save pb for later use
      // pa = abs(pav)
      mov eax, patemp
      test eax, 0x80000000
      jz dpthpaa
      neg eax                    // reverse sign of neg values
   dpthpaa:
      mov patemp, eax            // save pa for later use
      // test if pa <= pb
      cmp eax, ecx
      jna dpthabb
      // pa > pb; now test if pb <= pc
      cmp ecx, pctemp
      jna dpthbbc
      // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
      mov cl, [esi + edx]        // load Prior(x-bpp) into cl
      jmp dpthpaeth
   dpthbbc:
      // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
      mov cl, [esi + ebx]        // load Prior(x) into cl
      jmp dpthpaeth
   dpthabb:
      // pa <= pb; now test if pa <= pc
      cmp eax, pctemp
      jna dpthabc
      // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
      mov cl, [esi + edx]        // load Prior(x-bpp) into cl
      jmp dpthpaeth
   dpthabc:
      // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
      mov cl, [edi + edx]        // load Raw(x-bpp) into cl
   dpthpaeth:
      inc ebx
      inc edx
      // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
      add [edi + ebx - 1], cl
      cmp ebx, diff
      jb dpthlp1
   dpthgo:
      mov ecx, FullLength
      mov eax, ecx
      sub eax, ebx               // subtract alignment fix
      and eax, 0x00000007        // calc bytes over mult of 8
      sub ecx, eax               // drop over bytes from original length
      mov MMXLength, ecx
   } // end _asm block
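
   // Scalar form of the Paeth predictor that every MMX case in the
   // switch below implements (a sketch mirroring the C fallback in
   // png_read_filter_row at the end of this file):
   //
   //    p  = b - c;               // pav
   //    pc = a - c;               // pbv
   //    pa = abs(p);
   //    pb = abs(pc);
   //    pc = abs(p + pc);         // pcv = pav + pbv
   //    pred = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
   //    Raw(x) = (png_byte)((Paeth(x) + pred) & 0xff);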
   // Now do the math for the rest of the row
   switch ( bpp )
   {
      case 3:
      {
         ActiveMask.use = 0x0000000000ffffff;
         ActiveMaskEnd.use = 0xffff000000000000;
         ShiftBpp.use = 24;       // == bpp(3) * 8
         ShiftRem.use = 40;       // == 64 - 24
         _asm
         {
            mov ebx, diff
            mov edi, row
            mov esi, prev_row
            pxor mm0, mm0
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm1, [edi+ebx-8]
         dpth3lp:
            psrlq mm1, ShiftRem  // shift last 3 bytes to 1st 3 bytes
            movq mm2, [esi + ebx]   // load b=Prior(x)
            punpcklbw mm1, mm0   // Unpack Low bytes of a
            movq mm3, [esi+ebx-8]   // Prep c=Prior(x-bpp) bytes
            punpcklbw mm2, mm0   // Unpack Low bytes of b
            psrlq mm3, ShiftRem  // shift last 3 bytes to 1st 3 bytes
            // pav = p - a = (a + b - c) - a = b - c
            movq mm4, mm2
            punpcklbw mm3, mm0   // Unpack Low bytes of c
            // pbv = p - b = (a + b - c) - b = a - c
            movq mm5, mm1
            psubw mm4, mm3
            pxor mm7, mm7
            // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
            movq mm6, mm4
            psubw mm5, mm3

            // pa = abs(p-a) = abs(pav)
            // pb = abs(p-b) = abs(pbv)
            // pc = abs(p-c) = abs(pcv)
            pcmpgtw mm0, mm4     // Create mask pav bytes < 0
            paddw mm6, mm5
            pand mm0, mm4        // Only pav bytes < 0 in mm0
            pcmpgtw mm7, mm5     // Create mask pbv bytes < 0
            psubw mm4, mm0
            pand mm7, mm5        // Only pbv bytes < 0 in mm7
            psubw mm4, mm0
            psubw mm5, mm7
            pxor mm0, mm0
            pcmpgtw mm0, mm6     // Create mask pcv bytes < 0
            pand mm0, mm6        // Only pcv bytes < 0 in mm0
            psubw mm5, mm7
            psubw mm6, mm0
            // test pa <= pb
            movq mm7, mm4
            psubw mm6, mm0
            pcmpgtw mm7, mm5     // pa > pb?
            movq mm0, mm7
            // use mm7 mask to merge pa & pb
            pand mm5, mm7
            // use mm0 mask copy to merge a & b
            pand mm2, mm0
            pandn mm7, mm4
            pandn mm0, mm1
            paddw mm7, mm5
            paddw mm0, mm2
            // test ((pa <= pb)? pa:pb) <= pc
            pcmpgtw mm7, mm6     // pab > pc?
            pxor mm1, mm1
            pand mm3, mm7
            pandn mm7, mm0
            paddw mm7, mm3
            pxor mm0, mm0
            packuswb mm7, mm1
            movq mm3, [esi + ebx]   // load c=Prior(x-bpp)
            pand mm7, ActiveMask
            movq mm2, mm3        // load b=Prior(x) step 1
            paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
            punpcklbw mm3, mm0   // Unpack Low bytes of c
            movq [edi + ebx], mm7   // write back updated value
            movq mm1, mm7        // Now mm1 will be used as Raw(x-bpp)
            // Now do Paeth for 2nd set of bytes (3-5)
            psrlq mm2, ShiftBpp  // load b=Prior(x) step 2
            punpcklbw mm1, mm0   // Unpack Low bytes of a
            pxor mm7, mm7
            punpcklbw mm2, mm0   // Unpack Low bytes of b
            // pbv = p - b = (a + b - c) - b = a - c
            movq mm5, mm1
            // pav = p - a = (a + b - c) - a = b - c
            movq mm4, mm2
            psubw mm5, mm3
            psubw mm4, mm3
            // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) =
            //       pav + pbv = pbv + pav
            movq mm6, mm5
            paddw mm6, mm4

            // pa = abs(p-a) = abs(pav)
            // pb = abs(p-b) = abs(pbv)
            // pc = abs(p-c) = abs(pcv)
            pcmpgtw mm0, mm5     // Create mask pbv bytes < 0
            pcmpgtw mm7, mm4     // Create mask pav bytes < 0
            pand mm0, mm5        // Only pbv bytes < 0 in mm0
            pand mm7, mm4        // Only pav bytes < 0 in mm7
            psubw mm5, mm0
            psubw mm4, mm7
            psubw mm5, mm0
            psubw mm4, mm7
            pxor mm0, mm0
            pcmpgtw mm0, mm6     // Create mask pcv bytes < 0
            pand mm0, mm6        // Only pcv bytes < 0 in mm0
            psubw mm6, mm0
            // test pa <= pb
            movq mm7, mm4
            psubw mm6, mm0
            pcmpgtw mm7, mm5     // pa > pb?
            movq mm0, mm7
            // use mm7 mask to merge pa & pb
            pand mm5, mm7
            // use mm0 mask copy to merge a & b
            pand mm2, mm0
            pandn mm7, mm4
            pandn mm0, mm1
            paddw mm7, mm5
            paddw mm0, mm2
            // test ((pa <= pb)? pa:pb) <= pc
            pcmpgtw mm7, mm6     // pab > pc?
            movq mm2, [esi + ebx]   // load b=Prior(x)
            pand mm3, mm7
            pandn mm7, mm0
            pxor mm1, mm1
            paddw mm7, mm3
            pxor mm0, mm0
            packuswb mm7, mm1
            movq mm3, mm2        // load c=Prior(x-bpp) step 1
            pand mm7, ActiveMask
            punpckhbw mm2, mm0   // Unpack High bytes of b
            psllq mm7, ShiftBpp  // Shift bytes to 2nd group of 3 bytes
            // pav = p - a = (a + b - c) - a = b - c
            movq mm4, mm2
            paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
            psllq mm3, ShiftBpp  // load c=Prior(x-bpp) step 2
            movq [edi + ebx], mm7   // write back updated value
            movq mm1, mm7
            punpckhbw mm3, mm0   // Unpack High bytes of c
            psllq mm1, ShiftBpp  // Shift bytes
                                 // Now mm1 will be used as Raw(x-bpp)
            // Now do Paeth for 3rd, and final, set of bytes (6-7)
            pxor mm7, mm7
            punpckhbw mm1, mm0   // Unpack High bytes of a
            psubw mm4, mm3
            // pbv = p - b = (a + b - c) - b = a - c
            movq mm5, mm1
            // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
            movq mm6, mm4
            psubw mm5, mm3
            pxor mm0, mm0
            paddw mm6, mm5

            // pa = abs(p-a) = abs(pav)
            // pb = abs(p-b) = abs(pbv)
            // pc = abs(p-c) = abs(pcv)
            pcmpgtw mm0, mm4     // Create mask pav bytes < 0
            pcmpgtw mm7, mm5     // Create mask pbv bytes < 0
            pand mm0, mm4        // Only pav bytes < 0 in mm0
            pand mm7, mm5        // Only pbv bytes < 0 in mm7
            psubw mm4, mm0
            psubw mm5, mm7
            psubw mm4, mm0
            psubw mm5, mm7
            pxor mm0, mm0
            pcmpgtw mm0, mm6     // Create mask pcv bytes < 0
            pand mm0, mm6        // Only pcv bytes < 0 in mm0
            psubw mm6, mm0
            // test pa <= pb
            movq mm7, mm4
            psubw mm6, mm0
            pcmpgtw mm7, mm5     // pa > pb?
            movq mm0, mm7
            // use mm0 mask copy to merge a & b
            pand mm2, mm0
            // use mm7 mask to merge pa & pb
            pand mm5, mm7
            pandn mm0, mm1
            pandn mm7, mm4
            paddw mm0, mm2
            paddw mm7, mm5
            // test ((pa <= pb)? pa:pb) <= pc
            pcmpgtw mm7, mm6     // pab > pc?
            pand mm3, mm7
            pandn mm7, mm0
            paddw mm7, mm3
            pxor mm1, mm1
            packuswb mm1, mm7
            // Step ebx to next set of 8 bytes and repeat loop til done
            add ebx, 8
            pand mm1, ActiveMaskEnd
            paddb mm1, [edi + ebx - 8]  // add Paeth predictor with Raw(x)

            cmp ebx, MMXLength
            pxor mm0, mm0        // pxor does not affect flags
            movq [edi + ebx - 8], mm1   // write back updated value
                                 // mm1 will be used as Raw(x-bpp) next loop
                                 // mm3 ready to be used as Prior(x-bpp) next loop
            jb dpth3lp
         } // end _asm block
      }
      break;
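      // All of the Paeth cases in this switch pick among a, b and c
      // without branching: pcmpgtw builds an all-ones/all-zeros word mask
      // and pand/pandn merge the two candidates through it.  The same
      // idiom written with MMX intrinsics (illustrative sketch only; the
      // decoder uses the inline-asm form above and below):
      //
      //    __m64 mask = _mm_cmpgt_pi16(pa, pb);               // pa > pb ?
      //    __m64 sel  = _mm_or_si64(_mm_and_si64(mask, b),    // then b
      //                             _mm_andnot_si64(mask, a)); // else a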
      case 6:
      case 7:
      case 5:
      {
         ActiveMask.use  = 0x00000000ffffffff;
         ActiveMask2.use = 0xffffffff00000000;
         ShiftBpp.use = bpp << 3;         // == bpp * 8
         ShiftRem.use = 64 - ShiftBpp.use;
         _asm
         {
            mov ebx, diff
            mov edi, row
            mov esi, prev_row
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm1, [edi+ebx-8]
            pxor mm0, mm0
         dpth6lp:
            // Must shift to position Raw(x-bpp) data
            psrlq mm1, ShiftRem
            // Do first set of 4 bytes
            movq mm3, [esi+ebx-8]   // read c=Prior(x-bpp) bytes
            punpcklbw mm1, mm0   // Unpack Low bytes of a
            movq mm2, [esi + ebx]   // load b=Prior(x)
            punpcklbw mm2, mm0   // Unpack Low bytes of b
            // Must shift to position Prior(x-bpp) data
            psrlq mm3, ShiftRem
            // pav = p - a = (a + b - c) - a = b - c
            movq mm4, mm2
            punpcklbw mm3, mm0   // Unpack Low bytes of c
            // pbv = p - b = (a + b - c) - b = a - c
            movq mm5, mm1
            psubw mm4, mm3
            pxor mm7, mm7
            // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
            movq mm6, mm4
            psubw mm5, mm3
            // pa = abs(p-a) = abs(pav)
            // pb = abs(p-b) = abs(pbv)
            // pc = abs(p-c) = abs(pcv)
            pcmpgtw mm0, mm4     // Create mask pav bytes < 0
            paddw mm6, mm5
            pand mm0, mm4        // Only pav bytes < 0 in mm0
            pcmpgtw mm7, mm5     // Create mask pbv bytes < 0
            psubw mm4, mm0
            pand mm7, mm5        // Only pbv bytes < 0 in mm7
            psubw mm4, mm0
            psubw mm5, mm7
            pxor mm0, mm0
            pcmpgtw mm0, mm6     // Create mask pcv bytes < 0
            pand mm0, mm6        // Only pcv bytes < 0 in mm0
            psubw mm5, mm7
            psubw mm6, mm0
            // test pa <= pb
            movq mm7, mm4
            psubw mm6, mm0
            pcmpgtw mm7, mm5     // pa > pb?
            movq mm0, mm7
            // use mm7 mask to merge pa & pb
            pand mm5, mm7
            // use mm0 mask copy to merge a & b
            pand mm2, mm0
            pandn mm7, mm4
            pandn mm0, mm1
            paddw mm7, mm5
            paddw mm0, mm2
            // test ((pa <= pb)? pa:pb) <= pc
            pcmpgtw mm7, mm6     // pab > pc?
            pxor mm1, mm1
            pand mm3, mm7
            pandn mm7, mm0
            paddw mm7, mm3
            pxor mm0, mm0
            packuswb mm7, mm1
            movq mm3, [esi + ebx - 8]   // load c=Prior(x-bpp)
            pand mm7, ActiveMask
            psrlq mm3, ShiftRem
            movq mm2, [esi + ebx]   // load b=Prior(x) step 1
            paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
            movq mm6, mm2
            movq [edi + ebx], mm7   // write back updated value
            movq mm1, [edi+ebx-8]
            psllq mm6, ShiftBpp
            movq mm5, mm7
            psrlq mm1, ShiftRem
            por mm3, mm6
            psllq mm5, ShiftBpp
            punpckhbw mm3, mm0   // Unpack High bytes of c
            por mm1, mm5
            // Do second set of 4 bytes
            punpckhbw mm2, mm0   // Unpack High bytes of b
            punpckhbw mm1, mm0   // Unpack High bytes of a
            // pav = p - a = (a + b - c) - a = b - c
            movq mm4, mm2
            // pbv = p - b = (a + b - c) - b = a - c
            movq mm5, mm1
            psubw mm4, mm3
            pxor mm7, mm7
            // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
            movq mm6, mm4
            psubw mm5, mm3
            // pa = abs(p-a) = abs(pav)
            // pb = abs(p-b) = abs(pbv)
            // pc = abs(p-c) = abs(pcv)
            pcmpgtw mm0, mm4     // Create mask pav bytes < 0
            paddw mm6, mm5
            pand mm0, mm4        // Only pav bytes < 0 in mm0
            pcmpgtw mm7, mm5     // Create mask pbv bytes < 0
            psubw mm4, mm0
            pand mm7, mm5        // Only pbv bytes < 0 in mm7
            psubw mm4, mm0
            psubw mm5, mm7
            pxor mm0, mm0
            pcmpgtw mm0, mm6     // Create mask pcv bytes < 0
            pand mm0, mm6        // Only pcv bytes < 0 in mm0
            psubw mm5, mm7
            psubw mm6, mm0
            // test pa <= pb
            movq mm7, mm4
            psubw mm6, mm0
            pcmpgtw mm7, mm5     // pa > pb?
            movq mm0, mm7
            // use mm7 mask to merge pa & pb
            pand mm5, mm7
            // use mm0 mask copy to merge a & b
            pand mm2, mm0
            pandn mm7, mm4
            pandn mm0, mm1
            paddw mm7, mm5
            paddw mm0, mm2
            // test ((pa <= pb)? pa:pb) <= pc
            pcmpgtw mm7, mm6     // pab > pc?
            pxor mm1, mm1
            pand mm3, mm7
            pandn mm7, mm0
            pxor mm1, mm1
            paddw mm7, mm3
            pxor mm0, mm0
            // Step ebx to next set of 8 bytes and repeat loop til done
            add ebx, 8
            packuswb mm1, mm7
            paddb mm1, [edi + ebx - 8]  // add Paeth predictor with Raw(x)
            cmp ebx, MMXLength
            movq [edi + ebx - 8], mm1   // write back updated value
                                 // mm1 will be used as Raw(x-bpp) next loop
            jb dpth6lp
         } // end _asm block
      }
      break;

      case 4:
      {
         ActiveMask.use = 0x00000000ffffffff;
         _asm {
            mov ebx, diff
            mov edi, row
            mov esi, prev_row
            pxor mm0, mm0
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm1, [edi+ebx-8]   // Only time should need to read
                                    // a=Raw(x-bpp) bytes
         dpth4lp:
            // Do first set of 4 bytes
            movq mm3, [esi+ebx-8]   // read c=Prior(x-bpp) bytes
            punpckhbw mm1, mm0   // Unpack High bytes of a
            movq mm2, [esi + ebx]   // load b=Prior(x)
            punpcklbw mm2, mm0   // Unpack Low bytes of b
            // pav = p - a = (a + b - c) - a = b - c
            movq mm4, mm2
            punpckhbw mm3, mm0   // Unpack High bytes of c
            // pbv = p - b = (a + b - c) - b = a - c
            movq mm5, mm1
            psubw mm4, mm3
            pxor mm7, mm7
            // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
            movq mm6, mm4
            psubw mm5, mm3
            // pa = abs(p-a) = abs(pav)
            // pb = abs(p-b) = abs(pbv)
            // pc = abs(p-c) = abs(pcv)
            pcmpgtw mm0, mm4     // Create mask pav bytes < 0
            paddw mm6, mm5
            pand mm0, mm4        // Only pav bytes < 0 in mm0
            pcmpgtw mm7, mm5     // Create mask pbv bytes < 0
            psubw mm4, mm0
            pand mm7, mm5        // Only pbv bytes < 0 in mm7
            psubw mm4, mm0
            psubw mm5, mm7
            pxor mm0, mm0
            pcmpgtw mm0, mm6     // Create mask pcv bytes < 0
            pand mm0, mm6        // Only pcv bytes < 0 in mm0
            psubw mm5, mm7
            psubw mm6, mm0
            // test pa <= pb
            movq mm7, mm4
            psubw mm6, mm0
            pcmpgtw mm7, mm5     // pa > pb?
            movq mm0, mm7
            // use mm7 mask to merge pa & pb
            pand mm5, mm7
            // use mm0 mask copy to merge a & b
            pand mm2, mm0
            pandn mm7, mm4
            pandn mm0, mm1
            paddw mm7, mm5
            paddw mm0, mm2
            // test ((pa <= pb)? pa:pb) <= pc
            pcmpgtw mm7, mm6     // pab > pc?
            pxor mm1, mm1
            pand mm3, mm7
            pandn mm7, mm0
            paddw mm7, mm3
            pxor mm0, mm0
            packuswb mm7, mm1
            movq mm3, [esi + ebx]   // load c=Prior(x-bpp)
            pand mm7, ActiveMask
            movq mm2, mm3        // load b=Prior(x) step 1
            paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
            punpcklbw mm3, mm0   // Unpack Low bytes of c
            movq [edi + ebx], mm7   // write back updated value
            movq mm1, mm7        // Now mm1 will be used as Raw(x-bpp)
            // Do second set of 4 bytes
            punpckhbw mm2, mm0   // Unpack High bytes of b
            punpcklbw mm1, mm0   // Unpack Low bytes of a
            // pav = p - a = (a + b - c) - a = b - c
            movq mm4, mm2
            // pbv = p - b = (a + b - c) - b = a - c
            movq mm5, mm1
            psubw mm4, mm3
            pxor mm7, mm7
            // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
            movq mm6, mm4
            psubw mm5, mm3
            // pa = abs(p-a) = abs(pav)
            // pb = abs(p-b) = abs(pbv)
            // pc = abs(p-c) = abs(pcv)
            pcmpgtw mm0, mm4     // Create mask pav bytes < 0
            paddw mm6, mm5
            pand mm0, mm4        // Only pav bytes < 0 in mm0
            pcmpgtw mm7, mm5     // Create mask pbv bytes < 0
            psubw mm4, mm0
            pand mm7, mm5        // Only pbv bytes < 0 in mm7
            psubw mm4, mm0
            psubw mm5, mm7
            pxor mm0, mm0
            pcmpgtw mm0, mm6     // Create mask pcv bytes < 0
            pand mm0, mm6        // Only pcv bytes < 0 in mm0
            psubw mm5, mm7
            psubw mm6, mm0
            // test pa <= pb
            movq mm7, mm4
            psubw mm6, mm0
            pcmpgtw mm7, mm5     // pa > pb?
            movq mm0, mm7
            // use mm7 mask to merge pa & pb
            pand mm5, mm7
            // use mm0 mask copy to merge a & b
            pand mm2, mm0
            pandn mm7, mm4
            pandn mm0, mm1
            paddw mm7, mm5
            paddw mm0, mm2
            // test ((pa <= pb)? pa:pb) <= pc
            pcmpgtw mm7, mm6     // pab > pc?
            pxor mm1, mm1
            pand mm3, mm7
            pandn mm7, mm0
            pxor mm1, mm1
            paddw mm7, mm3
            pxor mm0, mm0
            // Step ebx to next set of 8 bytes and repeat loop til done
            add ebx, 8
            packuswb mm1, mm7
            paddb mm1, [edi + ebx - 8]  // add Paeth predictor with Raw(x)
            cmp ebx, MMXLength
            movq [edi + ebx - 8], mm1   // write back updated value
                                 // mm1 will be used as Raw(x-bpp) next loop
            jb dpth4lp
         } // end _asm block
      }
      break;

      case 8:                    // bpp == 8
      {
         ActiveMask.use = 0x00000000ffffffff;
         _asm {
            mov ebx, diff
            mov edi, row
            mov esi, prev_row
            pxor mm0, mm0
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm1, [edi+ebx-8]   // Only time should need to read
                                    // a=Raw(x-bpp) bytes
         dpth8lp:
            // Do first set of 4 bytes
            movq mm3, [esi+ebx-8]   // read c=Prior(x-bpp) bytes
            punpcklbw mm1, mm0   // Unpack Low bytes of a
            movq mm2, [esi + ebx]   // load b=Prior(x)
            punpcklbw mm2, mm0   // Unpack Low bytes of b
            // pav = p - a = (a + b - c) - a = b - c
            movq mm4, mm2
            punpcklbw mm3, mm0   // Unpack Low bytes of c
            // pbv = p - b = (a + b - c) - b = a - c
            movq mm5, mm1
            psubw mm4, mm3
            pxor mm7, mm7
            // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
            movq mm6, mm4
            psubw mm5, mm3
            // pa = abs(p-a) = abs(pav)
            // pb = abs(p-b) = abs(pbv)
            // pc = abs(p-c) = abs(pcv)
            pcmpgtw mm0, mm4     // Create mask pav bytes < 0
            paddw mm6, mm5
            pand mm0, mm4        // Only pav bytes < 0 in mm0
            pcmpgtw mm7, mm5     // Create mask pbv bytes < 0
            psubw mm4, mm0
            pand mm7, mm5        // Only pbv bytes < 0 in mm7
            psubw mm4, mm0
            psubw mm5, mm7
            pxor mm0, mm0
            pcmpgtw mm0, mm6     // Create mask pcv bytes < 0
            pand mm0, mm6        // Only pcv bytes < 0 in mm0
            psubw mm5, mm7
            psubw mm6, mm0
            // test pa <= pb
            movq mm7, mm4
            psubw mm6, mm0
            pcmpgtw mm7, mm5     // pa > pb?
            movq mm0, mm7
            // use mm7 mask to merge pa & pb
            pand mm5, mm7
            // use mm0 mask copy to merge a & b
            pand mm2, mm0
            pandn mm7, mm4
            pandn mm0, mm1
            paddw mm7, mm5
            paddw mm0, mm2
            // test ((pa <= pb)? pa:pb) <= pc
            pcmpgtw mm7, mm6     // pab > pc?
            pxor mm1, mm1
            pand mm3, mm7
            pandn mm7, mm0
            paddw mm7, mm3
            pxor mm0, mm0
            packuswb mm7, mm1
            movq mm3, [esi+ebx-8]   // read c=Prior(x-bpp) bytes
            pand mm7, ActiveMask
            movq mm2, [esi + ebx]   // load b=Prior(x)
            paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
            punpckhbw mm3, mm0   // Unpack High bytes of c
            movq [edi + ebx], mm7   // write back updated value
            movq mm1, [edi+ebx-8]   // read a=Raw(x-bpp) bytes

            // Do second set of 4 bytes
            punpckhbw mm2, mm0   // Unpack High bytes of b
            punpckhbw mm1, mm0   // Unpack High bytes of a
            // pav = p - a = (a + b - c) - a = b - c
            movq mm4, mm2
            // pbv = p - b = (a + b - c) - b = a - c
            movq mm5, mm1
            psubw mm4, mm3
            pxor mm7, mm7
            // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
            movq mm6, mm4
            psubw mm5, mm3
            // pa = abs(p-a) = abs(pav)
            // pb = abs(p-b) = abs(pbv)
            // pc = abs(p-c) = abs(pcv)
            pcmpgtw mm0, mm4     // Create mask pav bytes < 0
            paddw mm6, mm5
            pand mm0, mm4        // Only pav bytes < 0 in mm0
            pcmpgtw mm7, mm5     // Create mask pbv bytes < 0
            psubw mm4, mm0
            pand mm7, mm5        // Only pbv bytes < 0 in mm7
            psubw mm4, mm0
            psubw mm5, mm7
            pxor mm0, mm0
            pcmpgtw mm0, mm6     // Create mask pcv bytes < 0
            pand mm0, mm6        // Only pcv bytes < 0 in mm0
            psubw mm5, mm7
            psubw mm6, mm0
            // test pa <= pb
            movq mm7, mm4
            psubw mm6, mm0
            pcmpgtw mm7, mm5     // pa > pb?
            movq mm0, mm7
            // use mm7 mask to merge pa & pb
            pand mm5, mm7
            // use mm0 mask copy to merge a & b
            pand mm2, mm0
            pandn mm7, mm4
            pandn mm0, mm1
            paddw mm7, mm5
            paddw mm0, mm2
            // test ((pa <= pb)? pa:pb) <= pc
            pcmpgtw mm7, mm6     // pab > pc?
            pxor mm1, mm1
            pand mm3, mm7
            pandn mm7, mm0
            pxor mm1, mm1
            paddw mm7, mm3
            pxor mm0, mm0
            // Step ebx to next set of 8 bytes and repeat loop til done
            add ebx, 8
            packuswb mm1, mm7
            paddb mm1, [edi + ebx - 8]  // add Paeth predictor with Raw(x)
            cmp ebx, MMXLength
            movq [edi + ebx - 8], mm1   // write back updated value
                                 // mm1 will be used as Raw(x-bpp) next loop
            jb dpth8lp
         } // end _asm block
      }
      break;

      case 1:                    // bpp = 1
      case 2:                    // bpp = 2
      default:                   // bpp > 8
      {
         _asm {
            mov ebx, diff
            cmp ebx, FullLength
            jnb dpthdend
            mov edi, row
            mov esi, prev_row
            // Do Paeth decode for remaining bytes
            mov edx, ebx
            xor ecx, ecx         // zero ecx before using cl & cx in loop below
            sub edx, bpp         // Set edx = ebx - bpp
         dpthdlp:
            xor eax, eax
            // pav = p - a = (a + b - c) - a = b - c
            mov al, [esi + ebx]  // load Prior(x) into al
            mov cl, [esi + edx]  // load Prior(x-bpp) into cl
            sub eax, ecx         // subtract Prior(x-bpp)
            mov patemp, eax      // Save pav for later use
            xor eax, eax
            // pbv = p - b = (a + b - c) - b = a - c
            mov al, [edi + edx]  // load Raw(x-bpp) into al
            sub eax, ecx         // subtract Prior(x-bpp)
            mov ecx, eax
            // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
            add eax, patemp      // pcv = pav + pbv
            // pc = abs(pcv)
            test eax, 0x80000000
            jz dpthdpca
            neg eax              // reverse sign of neg values
         dpthdpca:
            mov pctemp, eax      // save pc for later use
            // pb = abs(pbv)
            test ecx, 0x80000000
            jz dpthdpba
            neg ecx              // reverse sign of neg values
         dpthdpba:
            mov pbtemp, ecx      // save pb for later use
            // pa = abs(pav)
            mov eax, patemp
            test eax, 0x80000000
            jz dpthdpaa
            neg eax              // reverse sign of neg values
         dpthdpaa:
            mov patemp, eax      // save pa for later use
            // test if pa <= pb
            cmp eax, ecx
            jna dpthdabb
            // pa > pb; now test if pb <= pc
            cmp ecx, pctemp
            jna dpthdbbc
            // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
            mov cl, [esi + edx]  // load Prior(x-bpp) into cl
            jmp dpthdpaeth
         dpthdbbc:
            // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
            mov cl, [esi + ebx]  // load Prior(x) into cl
            jmp dpthdpaeth
         dpthdabb:
            // pa <= pb; now test if pa <= pc
            cmp eax, pctemp
            jna dpthdabc
            // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
            mov cl, [esi + edx]  // load Prior(x-bpp) into cl
            jmp dpthdpaeth
         dpthdabc:
            // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
            mov cl, [edi + edx]  // load Raw(x-bpp) into cl
         dpthdpaeth:
            inc ebx
            inc edx
            // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
            add [edi + ebx - 1], cl
            cmp ebx, FullLength
            jb dpthdlp
         dpthdend:
         } // end _asm block
      }
      return;                    // No need to go further with this one
   } // end switch ( bpp )

   _asm
   {
      // MMX acceleration complete; now do clean-up
      // Check if any remaining bytes left to decode
      mov ebx, MMXLength
      cmp ebx, FullLength
      jnb dpthend
      mov edi, row
      mov esi, prev_row
      // Do Paeth decode for remaining bytes
      mov edx, ebx
      xor ecx, ecx               // zero ecx before using cl & cx in loop below
      sub edx, bpp               // Set edx = ebx - bpp
   dpthlp2:
      xor eax, eax
      // pav = p - a = (a + b - c) - a = b - c
      mov al, [esi + ebx]        // load Prior(x) into al
      mov cl, [esi + edx]        // load Prior(x-bpp) into cl
      sub eax, ecx               // subtract Prior(x-bpp)
      mov patemp, eax            // Save pav for later use
      xor eax, eax
      // pbv = p - b = (a + b - c) - b = a - c
      mov al, [edi + edx]        // load Raw(x-bpp) into al
      sub eax, ecx               // subtract Prior(x-bpp)
      mov ecx, eax
      // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
      add eax, patemp            // pcv = pav + pbv
      // pc = abs(pcv)
      test eax, 0x80000000
      jz dpthpca2
      neg eax                    // reverse sign of neg values
   dpthpca2:
      mov pctemp, eax            // save pc for later use
      // pb = abs(pbv)
      test ecx, 0x80000000
      jz dpthpba2
      neg ecx                    // reverse sign of neg values
   dpthpba2:
      mov pbtemp, ecx            // save pb for later use
      // pa = abs(pav)
      mov eax, patemp
      test eax, 0x80000000
      jz dpthpaa2
      neg eax                    // reverse sign of neg values
   dpthpaa2:
      mov patemp, eax            // save pa for later use
      // test if pa <= pb
      cmp eax, ecx
      jna dpthabb2
      // pa > pb; now test if pb <= pc
      cmp ecx, pctemp
      jna dpthbbc2
      // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
      mov cl, [esi + edx]        // load Prior(x-bpp) into cl
      jmp dpthpaeth2
   dpthbbc2:
      // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
      mov cl, [esi + ebx]        // load Prior(x) into cl
      jmp dpthpaeth2
   dpthabb2:
      // pa <= pb; now test if pa <= pc
      cmp eax, pctemp
      jna dpthabc2
      // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
      mov cl, [esi + edx]        // load Prior(x-bpp) into cl
      jmp dpthpaeth2
   dpthabc2:
      // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
      mov cl, [edi + edx]        // load Raw(x-bpp) into cl
   dpthpaeth2:
      inc ebx
      inc edx
      // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
      add [edi + ebx - 1], cl
      cmp ebx, FullLength
      jb dpthlp2
   dpthend:
      emms                       // End MMX instructions; prep for possible FP instrs.
   } // end _asm block
}
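
// Each MMX decoder in this file splits a row into three spans: a few
// scalar bytes up to the next 8-byte boundary ("diff"), an MMX-processed
// middle ("MMXLength"), and a scalar tail up to FullLength.  The asm
// prologues compute, in effect (sketch of the pointer math only):
//
//    diff      = (((size_t)rp + 15) & ~7) - (size_t)rp;   // boundary
//    MMXLength = FullLength - ((FullLength - diff) & 7);  // 8-byte chunks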

// Optimized code for PNG Sub filter decoder
void /* PRIVATE */
png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
{
   // These variables are declared here to
   // ensure alignment on 8-byte boundaries.
   union uAll ActiveMask, ShiftBpp, ShiftRem;

   //int test;
   int bpp;
   png_uint_32 FullLength;
   png_uint_32 MMXLength;
   int diff;

   bpp = (row_info->pixel_depth + 7) >> 3;  // Get # bytes per pixel
   FullLength = row_info->rowbytes - bpp;   // # of bytes to filter
   _asm {
      mov edi, row
      mov esi, edi               // lp = row
      add edi, bpp               // rp = row + bpp
      xor eax, eax
      // get # of bytes to alignment
      mov diff, edi              // take start of row
      add diff, 0xf              // add 7 + 8 to incr past
                                 // alignment boundary
      xor ebx, ebx
      and diff, 0xfffffff8       // mask to alignment boundary
      sub diff, edi              // subtract from start ==> value
                                 // ebx at alignment
      jz dsubgo
      // fix alignment
   dsublp1:
      mov al, [esi+ebx]
      add [edi+ebx], al
      inc ebx
      cmp ebx, diff
      jb dsublp1
   dsubgo:
      mov ecx, FullLength
      mov edx, ecx
      sub edx, ebx               // subtract alignment fix
      and edx, 0x00000007        // calc bytes over mult of 8
      sub ecx, edx               // drop over bytes from length
      mov MMXLength, ecx
   } // end _asm block

   // Now do the math for the rest of the row
   switch ( bpp )
   {
      case 3:
      {
         ActiveMask.use = 0x0000ffffff000000;
         ShiftBpp.use = 24;       // == 3 * 8
         ShiftRem.use = 40;       // == 64 - 24
         _asm {
            mov edi, row
            movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
            mov esi, edi         // lp = row
            add edi, bpp         // rp = row + bpp
            movq mm6, mm7
            mov ebx, diff
            psllq mm6, ShiftBpp  // Move mask in mm6 to cover 3rd active
                                 // byte group
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm1, [edi+ebx-8]
         dsub3lp:
            psrlq mm1, ShiftRem  // Shift data for adding 1st bpp bytes
                                 // no need for mask; shift clears inactive bytes
            // Add 1st active group
            movq mm0, [edi+ebx]
            paddb mm0, mm1
            // Add 2nd active group
            movq mm1, mm0        // mov updated Raws to mm1
            psllq mm1, ShiftBpp  // shift data to position correctly
            pand mm1, mm7        // mask to use only 2nd active group
            paddb mm0, mm1
            // Add 3rd active group
            movq mm1, mm0        // mov updated Raws to mm1
            psllq mm1, ShiftBpp  // shift data to position correctly
            pand mm1, mm6        // mask to use only 3rd active group
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0   // Write updated Raws back to array
            // Prep for doing 1st add at top of loop
            movq mm1, mm0
            jb dsub3lp
         } // end _asm block
      }
      break;

      case 1:
      {
         // Placed here just in case this is a duplicate of the
         // non-MMX code for the SUB filter in png_read_filter_row below
         //
         //    png_bytep rp;
         //    png_bytep lp;
         //    png_uint_32 i;
         //    bpp = (row_info->pixel_depth + 7) >> 3;
         //    for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
         //         i < row_info->rowbytes; i++, rp++, lp++)
         //    {
         //       *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
         //    }
         _asm {
            mov ebx, diff
            mov edi, row
            cmp ebx, FullLength
            jnb dsub1end
            mov esi, edi         // lp = row
            xor eax, eax
            add edi, bpp         // rp = row + bpp
         dsub1lp:
            mov al, [esi+ebx]
            add [edi+ebx], al
            inc ebx
            cmp ebx, FullLength
            jb dsub1lp
         dsub1end:
         } // end _asm block
      }
      return;
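      // For bpp of 4..7 a single shift-and-add per group suffices: after
      // psrlq by ShiftRem the previous pixel's bytes occupy the low bpp
      // bytes, and because 2*bpp >= 8 the psllq by ShiftBpp clears every
      // byte that must not be touched, so no ActiveMask is needed (see
      // the comments in the case below).  Byte flow for bpp == 4 on one
      // 8-byte group x..x+7 (illustrative only; paddb adds per byte):
      //
      //    mm0 = paddb(mm0, mm1 >> 32);  // add Raw(x-4..x-1) to x..x+3
      //    mm0 = paddb(mm0, mm0 << 32);  // add new x..x+3 to x+4..x+7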
      case 6:
      case 7:
      case 4:
      case 5:
      {
         ShiftBpp.use = bpp << 3;
         ShiftRem.use = 64 - ShiftBpp.use;
         _asm {
            mov edi, row
            mov ebx, diff
            mov esi, edi         // lp = row
            add edi, bpp         // rp = row + bpp
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm1, [edi+ebx-8]
         dsub4lp:
            psrlq mm1, ShiftRem  // Shift data for adding 1st bpp bytes
                                 // no need for mask; shift clears inactive bytes
            movq mm0, [edi+ebx]
            paddb mm0, mm1
            // Add 2nd active group
            movq mm1, mm0        // mov updated Raws to mm1
            psllq mm1, ShiftBpp  // shift data to position correctly
                                 // there is no need for any mask
                                 // since shift clears inactive bits/bytes
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0
            movq mm1, mm0        // Prep for doing 1st add at top of loop
            jb dsub4lp
         } // end _asm block
      }
      break;

      case 2:
      {
         ActiveMask.use = 0x00000000ffff0000;
         ShiftBpp.use = 16;       // == 2 * 8
         ShiftRem.use = 48;       // == 64 - 16
         _asm {
            movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
            mov ebx, diff
            movq mm6, mm7
            mov edi, row
            psllq mm6, ShiftBpp  // Move mask in mm6 to cover 3rd active
                                 // byte group
            mov esi, edi         // lp = row
            movq mm5, mm6
            add edi, bpp         // rp = row + bpp
            psllq mm5, ShiftBpp  // Move mask in mm5 to cover 4th active
                                 // byte group
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm1, [edi+ebx-8]
         dsub2lp:
            // Add 1st active group
            psrlq mm1, ShiftRem  // Shift data for adding 1st bpp bytes
                                 // no need for mask; shift clears inactive
                                 // bytes
            movq mm0, [edi+ebx]
            paddb mm0, mm1
            // Add 2nd active group
            movq mm1, mm0        // mov updated Raws to mm1
            psllq mm1, ShiftBpp  // shift data to position correctly
            pand mm1, mm7        // mask to use only 2nd active group
            paddb mm0, mm1
            // Add 3rd active group
            movq mm1, mm0        // mov updated Raws to mm1
            psllq mm1, ShiftBpp  // shift data to position correctly
            pand mm1, mm6        // mask to use only 3rd active group
            paddb mm0, mm1
            // Add 4th active group
            movq mm1, mm0        // mov updated Raws to mm1
            psllq mm1, ShiftBpp  // shift data to position correctly
            pand mm1, mm5        // mask to use only 4th active group
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0   // Write updated Raws back to array
            movq mm1, mm0        // Prep for doing 1st add at top of loop
            jb dsub2lp
         } // end _asm block
      }
      break;

      case 8:
      {
         _asm {
            mov edi, row
            mov ebx, diff
            mov esi, edi         // lp = row
            add edi, bpp         // rp = row + bpp
            mov ecx, MMXLength
            movq mm7, [edi+ebx-8]   // PRIME the pump (load the first
                                    // Raw(x-bpp) data set)
            mov edx, ecx
            sub edx, ebx         // subtract alignment fix
            and edx, 0x0000003f  // calc bytes over mult of 64
            sub ecx, edx         // drop over bytes from length
            cmp ebx, ecx         // guard the unrolled loop: enter it only
            jnb dsub8lt64        // when at least 64 bytes remain
         dsub8lp:
            movq mm0, [edi+ebx]     // Load Sub(x) for 1st 8 bytes
            paddb mm0, mm7
            movq mm1, [edi+ebx+8]   // Load Sub(x) for 2nd 8 bytes
            movq [edi+ebx], mm0     // Write Raw(x) for 1st 8 bytes
                                    // Now mm0 will be used as Raw(x-bpp) for
                                    // the 2nd group of 8 bytes.  This will be
                                    // repeated for each group of 8 bytes with
                                    // the 8th group being used as the Raw(x-bpp)
                                    // for the 1st group of the next loop.
            paddb mm1, mm0
            movq mm2, [edi+ebx+16]  // Load Sub(x) for 3rd 8 bytes
            movq [edi+ebx+8], mm1   // Write Raw(x) for 2nd 8 bytes
            paddb mm2, mm1
            movq mm3, [edi+ebx+24]  // Load Sub(x) for 4th 8 bytes
            movq [edi+ebx+16], mm2  // Write Raw(x) for 3rd 8 bytes
            paddb mm3, mm2
            movq mm4, [edi+ebx+32]  // Load Sub(x) for 5th 8 bytes
            movq [edi+ebx+24], mm3  // Write Raw(x) for 4th 8 bytes
            paddb mm4, mm3
            movq mm5, [edi+ebx+40]  // Load Sub(x) for 6th 8 bytes
            movq [edi+ebx+32], mm4  // Write Raw(x) for 5th 8 bytes
            paddb mm5, mm4
            movq mm6, [edi+ebx+48]  // Load Sub(x) for 7th 8 bytes
            movq [edi+ebx+40], mm5  // Write Raw(x) for 6th 8 bytes
            paddb mm6, mm5
            movq mm7, [edi+ebx+56]  // Load Sub(x) for 8th 8 bytes
            movq [edi+ebx+48], mm6  // Write Raw(x) for 7th 8 bytes
            add ebx, 64
            paddb mm7, mm6
            cmp ebx, ecx
            movq [edi+ebx-8], mm7   // Write Raw(x) for 8th 8 bytes
            jb dsub8lp
         dsub8lt64:
            cmp ebx, MMXLength
            jnb dsub8lt8
         dsub8lpA:
            movq mm0, [edi+ebx]
            add ebx, 8
            paddb mm0, mm7
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0   // use -8 to offset early add to ebx
            movq mm7, mm0        // Move calculated Raw(x) data to mm7 to
                                 // be the new Raw(x-bpp) for the next loop
            jb dsub8lpA
         dsub8lt8:
         } // end _asm block
      }
      break;

      default:                   // bpp greater than 8 bytes
      {
         _asm {
            mov ebx, diff
            mov edi, row
            mov esi, edi         // lp = row
            add edi, bpp         // rp = row + bpp
         dsubAlp:
            movq mm0, [edi+ebx]
            movq mm1, [esi+ebx]
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0   // mov does not affect flags; -8 to offset
                                    // add ebx
            jb dsubAlp
         } // end _asm block
      }
      break;

   } // end switch ( bpp )

   _asm {
      mov ebx, MMXLength
      mov edi, row
      cmp ebx, FullLength
      jnb dsubend
      mov esi, edi               // lp = row
      xor eax, eax
      add edi, bpp               // rp = row + bpp
   dsublp2:
      mov al, [esi+ebx]
      add [edi+ebx], al
      inc ebx
      cmp ebx, FullLength
      jb dsublp2
   dsubend:
      emms                       // End MMX instructions; prep for possible FP instrs.
   } // end _asm block
}
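
// For reference, the non-MMX code for the UP filter is simply this
// (a scalar sketch mirroring the fallback in png_read_filter_row below):
//
//    png_uint_32 i;
//    png_bytep rp = row;
//    png_bytep pp = prev_row;
//
//    for (i = 0; i < row_info->rowbytes; i++, rp++, pp++)
//    {
//       *rp = (png_byte)(((int)(*rp) + (int)(*pp)) & 0xff);
//    }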

// Optimized code for PNG Up filter decoder
void /* PRIVATE */
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
                           png_bytep prev_row)
{
   png_uint_32 len;

   len = row_info->rowbytes;     // # of bytes to filter
   _asm {
      mov edi, row
      // get # of bytes to alignment
      mov ecx, edi
      xor ebx, ebx
      add ecx, 0x7
      xor eax, eax
      and ecx, 0xfffffff8
      mov esi, prev_row
      sub ecx, edi
      jz dupgo
      // fix alignment
   duplp1:
      mov al, [edi+ebx]
      add al, [esi+ebx]
      inc ebx
      cmp ebx, ecx
      mov [edi + ebx-1], al      // mov does not affect flags; -1 to offset inc ebx
      jb duplp1
   dupgo:
      mov ecx, len
      mov edx, ecx
      sub edx, ebx               // subtract alignment fix
      and edx, 0x0000003f        // calc bytes over mult of 64
      sub ecx, edx               // drop over bytes from length
      // Unrolled loop - use all MMX registers and interleave to reduce
      // number of branch instructions (loops) and reduce partial stalls
   duploop:
      movq mm1, [esi+ebx]
      movq mm0, [edi+ebx]
      movq mm3, [esi+ebx+8]
      paddb mm0, mm1
      movq mm2, [edi+ebx+8]
      movq [edi+ebx], mm0
      paddb mm2, mm3
      movq mm5, [esi+ebx+16]
      movq [edi+ebx+8], mm2
      movq mm4, [edi+ebx+16]
      movq mm7, [esi+ebx+24]
      paddb mm4, mm5
      movq mm6, [edi+ebx+24]
      movq [edi+ebx+16], mm4
      paddb mm6, mm7
      movq mm1, [esi+ebx+32]
      movq [edi+ebx+24], mm6
      movq mm0, [edi+ebx+32]
      movq mm3, [esi+ebx+40]
      paddb mm0, mm1
      movq mm2, [edi+ebx+40]
      movq [edi+ebx+32], mm0
      paddb mm2, mm3
      movq mm5, [esi+ebx+48]
      movq [edi+ebx+40], mm2
      movq mm4, [edi+ebx+48]
      movq mm7, [esi+ebx+56]
      paddb mm4, mm5
      movq mm6, [edi+ebx+56]
      movq [edi+ebx+48], mm4
      add ebx, 64
      paddb mm6, mm7
      cmp ebx, ecx
      movq [edi+ebx-8], mm6      // (+56) movq does not affect flags;
                                 // -8 to offset add ebx
      jb duploop

      cmp edx, 0                 // Test for bytes over mult of 64
      jz dupend

      // 2 lines added by lcreeve at netins.net
      // (mail 11 Jul 98 in png-implement list)
      cmp edx, 8                 // test for less than 8 bytes
      jb duplt8

      add ecx, edx
      and edx, 0x00000007        // calc bytes over mult of 8
      sub ecx, edx               // drop over bytes from length
      jz duplt8
      // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
   duplpA:
      movq mm1, [esi+ebx]
      movq mm0, [edi+ebx]
      add ebx, 8
      paddb mm0, mm1
      cmp ebx, ecx
      movq [edi+ebx-8], mm0      // movq does not affect flags; -8 to offset add ebx
      jb duplpA
      cmp edx, 0                 // Test for bytes over mult of 8
      jz dupend
   duplt8:
      xor eax, eax
      add ecx, edx               // move over byte count into counter
      // Loop using x86 registers to update remaining bytes
   duplp2:
      mov al, [edi + ebx]
      add al, [esi + ebx]
      inc ebx
      cmp ebx, ecx
      mov [edi + ebx-1], al      // mov does not affect flags; -1 to offset inc ebx
      jb duplp2
   dupend:
      // Conversion of filtered row completed
      emms                       // End MMX instructions; prep for possible FP instrs.
   } // end _asm block
}
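
// png_read_filter_row() below dispatches each row either to one of the
// MMX decoders above or to its C fallback.  For reference, the
// row-reading code elsewhere in libpng invokes it roughly like this
// (sketch only; the actual call site lives in pngread.c/pngpread.c):
//
//    png_read_filter_row(png_ptr, &(png_ptr->row_info),
//       png_ptr->row_buf + 1, png_ptr->prev_row + 1,
//       (int)(png_ptr->row_buf[0]));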

// Optimized png_read_filter_row routines
void /* PRIVATE */
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
   row, png_bytep prev_row, int filter)
{
#ifdef PNG_DEBUG
   char filnm[10];
#endif

   if (mmx_supported == 2) {
#if !defined(PNG_1_0_X)
       /* this should have happened in png_init_mmx_flags() already */
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }

#ifdef PNG_DEBUG
   png_debug(1, "in png_read_filter_row\n");
   switch (filter)
   {
      case 0: png_snprintf(filnm, 10, "none");
         break;
#if !defined(PNG_1_0_X)
      case 1: png_snprintf(filnm, 10, "sub-%s",
         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
         break;
      case 2: png_snprintf(filnm, 10, "up-%s",
         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
         break;
      case 3: png_snprintf(filnm, 10, "avg-%s",
         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
         break;
      case 4: png_snprintf(filnm, 10, "Paeth-%s",
         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX" : "x86");
         break;
#else
      case 1: png_snprintf(filnm, 10, "sub");
         break;
      case 2: png_snprintf(filnm, 10, "up");
         break;
      case 3: png_snprintf(filnm, 10, "avg");
         break;
      case 4: png_snprintf(filnm, 10, "Paeth");
         break;
#endif
      default: png_snprintf(filnm, 10, "unknw");
         break;
   }
   png_debug2(0, "row=%5d, %s, ", png_ptr->row_number, filnm);
   png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
      (int)((row_info->pixel_depth + 7) >> 3));
   png_debug1(0, "len=%8d, ", row_info->rowbytes);
#endif /* PNG_DEBUG */

   switch (filter)
   {
      case PNG_FILTER_VALUE_NONE:
         break;

      case PNG_FILTER_VALUE_SUB:
      {
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (mmx_supported)
#endif
         {
            png_read_filter_row_mmx_sub(row_info, row);
         }
         else
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_bytep rp = row + bpp;
            png_bytep lp = row;

            for (i = bpp; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
               rp++;
            }
         }
         break;
      }

      case PNG_FILTER_VALUE_UP:
      {
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (mmx_supported)
#endif
         {
            png_read_filter_row_mmx_up(row_info, row, prev_row);
         }
         else
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_bytep rp = row;
            png_bytep pp = prev_row;

            for (i = 0; i < istop; ++i)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }
         }
         break;
      }

      case PNG_FILTER_VALUE_AVG:
      {
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (mmx_supported)
#endif
         {
            png_read_filter_row_mmx_avg(row_info, row, prev_row);
         }
         else
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++) >> 1)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
               rp++;
            }
         }
         break;
      }

      case PNG_FILTER_VALUE_PAETH:
      {
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (mmx_supported)
#endif
         {
            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
         }
         else
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_bytep cp = prev_row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)   // use leftover rp,pp
            {
               int a, b, c, pa, pb, pc, p;

               a = *lp++;
               b = *pp++;
               c = *cp++;

               p = b - c;
               pc = a - c;

#ifdef PNG_USE_ABS
               pa = abs(p);
               pb = abs(pc);
               pc = abs(p + pc);
#else
               pa = p < 0 ? -p : p;
               pb = pc < 0 ? -pc : pc;
               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
#endif

               /*
                  if (pa <= pb && pa <= pc)
                     p = a;
                  else if (pb <= pc)
                     p = b;
                  else
                     p = c;
                */

               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;

               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
               rp++;
            }
         }
         break;
      }

      default:
         png_warning(png_ptr, "Ignoring bad row filter type");
         *row = 0;
         break;
   }
}

#endif /* PNG_MMX_CODE_SUPPORTED && PNG_USE_PNGVCRD */