Home | History | Annotate | Download | only in libpng-1.2.19
      1 
      2 /* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
      3  *
      4  * For Intel x86 CPU and Microsoft Visual C++ compiler
      5  *
      6  * Last changed in libpng 1.2.19 August 18, 2007
      7  * For conditions of distribution and use, see copyright notice in png.h
      8  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
      9  * Copyright (c) 1998, Intel Corporation
     10  *
     11  * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
     12  * Interface to libpng contributed by Gilles Vollant, 1999
     13  *
     14  *
     15  * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
     16  * a sign error in the post-MMX cleanup code for each pixel_depth resulted
     17  * in bad pixels at the beginning of some rows of some images, and also
     18  * (due to out-of-range memory reads and writes) caused heap corruption
     19  * when compiled with MSVC 6.0.  The error was fixed in version 1.0.4e.
     20  *
     21  * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
     22  *
     23  * [runtime MMX configuration, GRR 20010102]
     24  *
     25  * [Copy 6 bytes per pixel, not 4, and use stride of 6, not 4, in the
     26  *  second loop of interlace processing of 48-bit pixels, GR-P 20070717]
     27  *
     28  * [move instances of uAll union into local, except for two constant
     29  * instances, GR-P 20070805]
     30  */
     31 
     32 #define PNG_INTERNAL
     33 #include "png.h"
     34 
     35 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
     36 
     37 
     38 static int mmx_supported=2;  /* 2 = not probed yet; png_mmx_support() stores 0 or 1 */
     39 
     /* Probe the CPU at run time for MMX support.
      *
      * The check is done in three steps, all in inline assembly:
      *   1. Try to toggle the ID bit (mask 0x200000) in EFLAGS; if the bit
      *      cannot be changed, the CPUID instruction is not available.
      *   2. Execute CPUID with eax = 0 and require the value returned in
      *      eax to be at least 1, so that CPUID function 1 can be used.
      *   3. Execute CPUID with eax = 1 and test the bit selected by mask
      *      0x00800000 in edx.
      *
      * Returns 1 if all three steps succeed and 0 otherwise.  The result is
      * also stored in the file-scope variable mmx_supported, replacing its
      * initial value of 2.
      */
     40 int PNGAPI
     41 png_mmx_support(void)
     42 {
     43   int mmx_supported_local = 0;  /* stays 0 unless every check below passes */
     44   _asm {
     45     push ebx          //save registers that CPUID overwrites
     46     push ecx
     47     push edx
     48 
     49     pushfd            //push EFLAGS onto the stack
     50     pop eax           //eax = current EFLAGS
     51     mov ecx, eax      //ecx = copy of the original EFLAGS
     52     xor eax, 0x200000 //toggle the ID bit, i.e. bit 21 of EFLAGS
     53     push eax          //push the modified value
     54 
     55     popfd             //try to load the modified value into EFLAGS
     56     pushfd            //read EFLAGS back
     57     pop eax           //eax = EFLAGS after the attempted change
     58     push ecx          //push the original EFLAGS
     59     popfd             //restore the original EFLAGS
     60     xor eax, ecx      //eax = bits that differ from the original
     61     jz NOT_SUPPORTED  //no difference: the ID bit could not be toggled,
     62                       //so CPUID is not available; skip to the
     63                       //NOT_SUPPORTED label with the result still 0
     64 
     65     xor eax, eax      //eax = 0 selects CPUID function 0
     66 
     67     _asm _emit 0x0f   //CPUID, emitted as its two opcode bytes 0x0f 0xa2
     68     _asm _emit 0xa2
     69 
     70     cmp eax, 1        //function 0 result in eax must be at least 1
     71     jl NOT_SUPPORTED  //eax < 1 (signed): function 1 cannot be queried
     72 
     73     xor eax, eax      //eax = 0
     74     inc eax           //eax = 1 selects CPUID function 1 (the original
     75                       //author notes this is faster than "mov eax, 1")
     76 
     77     _asm _emit 0x0f   //CPUID, emitted as raw opcode bytes again
     78     _asm _emit 0xa2
     79 
     80     and edx, 0x00800000  //keep only the MMX feature bit (bit 23) of edx
     81     cmp edx, 0        //zero means the MMX bit is clear
     82     jz  NOT_SUPPORTED //bit clear: leave the result at 0
     83 
     84     mov  mmx_supported_local, 1  //all checks passed: MMX is supported
     85 
     86 NOT_SUPPORTED:
     87     mov  eax, mmx_supported_local  //load the result into eax (the C code
                                           //below also returns it explicitly)
     88     pop edx          //restore the registers saved on entry
     89     pop ecx
     90     pop ebx
     91   }
     92 
     93   //mmx_supported_local=0; // test code: force "MMX not supported"
     94   //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
     95 
     96   mmx_supported = mmx_supported_local;  /* cache the result at file scope */
     97   return mmx_supported_local;
     98 }
     99 
    100 /* Combines the row recently read in with the previous row.
    101    This routine takes care of alpha and transparency if requested.
    102    This routine also handles the two methods of progressive display
    103    of interlaced images, depending on the mask value.
    104    The mask value describes which pixels are to be combined with
    105    the row.  The pattern always repeats every 8 pixels, so just 8
    106    bits are needed.  A one indicates the pixel is to be combined; a
    107    zero indicates the pixel is to be skipped.  This is in addition
    108    to any alpha or transparency value associated with the pixel.  If
    109    you want all pixels to be combined, pass 0xff (255) in mask.  */
    110 
    111 /* Use this routine for x86 platform - uses faster MMX routine if machine
    112    supports MMX */
    113 
    114 void /* PRIVATE */
    115 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
    116 {
    117 #ifdef PNG_USE_LOCAL_ARRAYS
    118    PNG_CONST int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
    119 #endif
    120 
    121    png_debug(1,"in png_combine_row_asm\n");
    122 
    123    if (mmx_supported == 2) {
    124 #if !defined(PNG_1_0_X)
    125        /* this should have happened in png_init_mmx_flags() already */
    126        png_warning(png_ptr, "asm_flags may not have been initialized");
    127 #endif
    128        png_mmx_support();
    129    }
    130 
    131    if (mask == 0xff)
    132    {
    133       png_memcpy(row, png_ptr->row_buf + 1,
    134        (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,
    135        png_ptr->width));
    136    }
    137    /* GRR:  add "else if (mask == 0)" case?
    138     *       or does png_combine_row() not even get called in that case? */
    139    else
    140    {
    141       switch (png_ptr->row_info.pixel_depth)
    142       {
    143          case 24:
    144          {
    145             png_bytep srcptr;
    146             png_bytep dstptr;
    147             png_uint_32 len;
    148             int unmask, diff;
    149 
    150             __int64 mask2=0x0101010202020404,  //24bpp
    151                     mask1=0x0408080810101020,
    152                     mask0=0x2020404040808080;
    153 
    154             srcptr = png_ptr->row_buf + 1;
    155             dstptr = row;
    156 
    157             unmask = ~mask;
    158             len     = (png_ptr->width)&~7;
    159             diff = (png_ptr->width)&7;
    160 
    161 #if !defined(PNG_1_0_X)
    162             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
    163                 /* && mmx_supported */ )
    164 #else
    165             if (mmx_supported)
    166 #endif
    167             {
    168                _asm
    169                {
    170                   movd       mm7, unmask       //load bit pattern
    171                   psubb      mm6,mm6           //zero mm6
    172                   punpcklbw  mm7,mm7
    173                   punpcklwd  mm7,mm7
    174                   punpckldq  mm7,mm7           //fill register with 8 masks
    175 
    176                   movq       mm0,mask0
    177                   movq       mm1,mask1
    178                   movq       mm2,mask2
    179 
    180                   pand       mm0,mm7
    181                   pand       mm1,mm7
    182                   pand       mm2,mm7
    183 
    184                   pcmpeqb    mm0,mm6
    185                   pcmpeqb    mm1,mm6
    186                   pcmpeqb    mm2,mm6
    187 
    188                   mov        ecx,len           //load length of line
    189                   mov        esi,srcptr        //load source
    190                   mov        ebx,dstptr        //load dest
    191                   cmp        ecx,0
    192                   jz         mainloop24end
    193 
    194 mainloop24:
    195                   movq       mm4,[esi]
    196                   pand       mm4,mm0
    197                   movq       mm6,mm0
    198                   movq       mm7,[ebx]
    199                   pandn      mm6,mm7
    200                   por        mm4,mm6
    201                   movq       [ebx],mm4
    202 
    203 
    204                   movq       mm5,[esi+8]
    205                   pand       mm5,mm1
    206                   movq       mm7,mm1
    207                   movq       mm6,[ebx+8]
    208                   pandn      mm7,mm6
    209                   por        mm5,mm7
    210                   movq       [ebx+8],mm5
    211 
    212                   movq       mm6,[esi+16]
    213                   pand       mm6,mm2
    214                   movq       mm4,mm2
    215                   movq       mm7,[ebx+16]
    216                   pandn      mm4,mm7
    217                   por        mm6,mm4
    218                   movq       [ebx+16],mm6
    219 
    220                   add        esi,24            //inc by 24 bytes processed
    221                   add        ebx,24
    222                   sub        ecx,8             //dec by 8 pixels processed
    223 
    224                   ja         mainloop24
    225 
    226 mainloop24end:
    227                   mov        ecx,diff
    228                   cmp        ecx,0
    229                   jz         end24
    230 
    231                   mov        edx,mask
    232                   sal        edx,24            //make low byte the high byte
    233 secondloop24:
    234                   sal        edx,1             //move high bit to CF
    235                   jnc        skip24            //if CF = 0
    236                   mov        ax,[esi]
    237                   mov        [ebx],ax
    238                   xor        eax,eax
    239                   mov        al,[esi+2]
    240                   mov        [ebx+2],al
    241 skip24:
    242                   add        esi,3
    243                   add        ebx,3
    244 
    245                   dec        ecx
    246                   jnz        secondloop24
    247 
    248 end24:
    249                   emms
    250                }
    251             }
    252             else /* mmx not supported - use modified C routine */
    253             {
    254                register unsigned int incr1, initial_val, final_val;
    255                png_size_t pixel_bytes;
    256                png_uint_32 i;
    257                register int disp = png_pass_inc[png_ptr->pass];
    258                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
    259 
    260                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
    261                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
    262                   pixel_bytes;
    263                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
    264                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
    265                final_val = png_ptr->width*pixel_bytes;
    266                incr1 = (disp)*pixel_bytes;
    267                for (i = initial_val; i < final_val; i += incr1)
    268                {
    269                   png_memcpy(dstptr, srcptr, pixel_bytes);
    270                   srcptr += incr1;
    271                   dstptr += incr1;
    272                }
    273             } /* end of else */
    274 
    275             break;
    276          }       // end 24 bpp
    277 
    278          case 32:
    279          {
    280             png_bytep srcptr;
    281             png_bytep dstptr;
    282             png_uint_32 len;
    283             int unmask, diff;
    284 
    285             __int64 mask3=0x0101010102020202,  //32bpp
    286                     mask2=0x0404040408080808,
    287                     mask1=0x1010101020202020,
    288                     mask0=0x4040404080808080;
    289 
    290             srcptr = png_ptr->row_buf + 1;
    291             dstptr = row;
    292 
    293             unmask = ~mask;
    294             len     = (png_ptr->width)&~7;
    295             diff = (png_ptr->width)&7;
    296 
    297 #if !defined(PNG_1_0_X)
    298             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
    299                 /* && mmx_supported */ )
    300 #else
    301             if (mmx_supported)
    302 #endif
    303             {
    304                _asm
    305                {
    306                   movd       mm7, unmask       //load bit pattern
    307                   psubb      mm6,mm6           //zero mm6
    308                   punpcklbw  mm7,mm7
    309                   punpcklwd  mm7,mm7
    310                   punpckldq  mm7,mm7           //fill register with 8 masks
    311 
    312                   movq       mm0,mask0
    313                   movq       mm1,mask1
    314                   movq       mm2,mask2
    315                   movq       mm3,mask3
    316 
    317                   pand       mm0,mm7
    318                   pand       mm1,mm7
    319                   pand       mm2,mm7
    320                   pand       mm3,mm7
    321 
    322                   pcmpeqb    mm0,mm6
    323                   pcmpeqb    mm1,mm6
    324                   pcmpeqb    mm2,mm6
    325                   pcmpeqb    mm3,mm6
    326 
    327                   mov        ecx,len           //load length of line
    328                   mov        esi,srcptr        //load source
    329                   mov        ebx,dstptr        //load dest
    330 
    331                   cmp        ecx,0             //lcr
    332                   jz         mainloop32end
    333 
    334 mainloop32:
    335                   movq       mm4,[esi]
    336                   pand       mm4,mm0
    337                   movq       mm6,mm0
    338                   movq       mm7,[ebx]
    339                   pandn      mm6,mm7
    340                   por        mm4,mm6
    341                   movq       [ebx],mm4
    342 
    343                   movq       mm5,[esi+8]
    344                   pand       mm5,mm1
    345                   movq       mm7,mm1
    346                   movq       mm6,[ebx+8]
    347                   pandn      mm7,mm6
    348                   por        mm5,mm7
    349                   movq       [ebx+8],mm5
    350 
    351                   movq       mm6,[esi+16]
    352                   pand       mm6,mm2
    353                   movq       mm4,mm2
    354                   movq       mm7,[ebx+16]
    355                   pandn      mm4,mm7
    356                   por        mm6,mm4
    357                   movq       [ebx+16],mm6
    358 
    359                   movq       mm7,[esi+24]
    360                   pand       mm7,mm3
    361                   movq       mm5,mm3
    362                   movq       mm4,[ebx+24]
    363                   pandn      mm5,mm4
    364                   por        mm7,mm5
    365                   movq       [ebx+24],mm7
    366 
    367                   add        esi,32            //inc by 32 bytes processed
    368                   add        ebx,32
    369                   sub        ecx,8             //dec by 8 pixels processed
    370 
    371                   ja         mainloop32
    372 
    373 mainloop32end:
    374                   mov        ecx,diff
    375                   cmp        ecx,0
    376                   jz         end32
    377 
    378                   mov        edx,mask
    379                   sal        edx,24            //make low byte the high byte
    380 secondloop32:
    381                   sal        edx,1             //move high bit to CF
    382                   jnc        skip32            //if CF = 0
    383                   mov        eax,[esi]
    384                   mov        [ebx],eax
    385 skip32:
    386                   add        esi,4
    387                   add        ebx,4
    388 
    389                   dec        ecx
    390                   jnz        secondloop32
    391 
    392 end32:
    393                   emms
    394                }
    395             }
    396             else /* mmx _not supported - Use modified C routine */
    397             {
    398                register unsigned int incr1, initial_val, final_val;
    399                png_size_t pixel_bytes;
    400                png_uint_32 i;
    401                register int disp = png_pass_inc[png_ptr->pass];
    402                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
    403 
    404                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
    405                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
    406                   pixel_bytes;
    407                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
    408                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
    409                final_val = png_ptr->width*pixel_bytes;
    410                incr1 = (disp)*pixel_bytes;
    411                for (i = initial_val; i < final_val; i += incr1)
    412                {
    413                   png_memcpy(dstptr, srcptr, pixel_bytes);
    414                   srcptr += incr1;
    415                   dstptr += incr1;
    416                }
    417             } /* end of else */
    418 
    419             break;
    420          }       // end 32 bpp
    421 
    422          case 8:
    423          {
    424             png_bytep srcptr;
    425             png_bytep dstptr;
    426             png_uint_32 len;
    427             int m;
    428             int diff, unmask;
    429 
    430             __int64 mask0=0x0102040810204080;
    431 
    432 #if !defined(PNG_1_0_X)
    433             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
    434                 /* && mmx_supported */ )
    435 #else
    436             if (mmx_supported)
    437 #endif
    438             {
    439                srcptr = png_ptr->row_buf + 1;
    440                dstptr = row;
    441                m = 0x80;
    442                unmask = ~mask;
    443                len  = png_ptr->width &~7;  //reduce to multiple of 8
    444                diff = png_ptr->width & 7;  //amount lost
    445 
    446                _asm
    447                {
    448                   movd       mm7, unmask   //load bit pattern
    449                   psubb      mm6,mm6       //zero mm6
    450                   punpcklbw  mm7,mm7
    451                   punpcklwd  mm7,mm7
    452                   punpckldq  mm7,mm7       //fill register with 8 masks
    453 
    454                   movq       mm0,mask0
    455 
    456                   pand       mm0,mm7       //nonzero if keep byte
    457                   pcmpeqb    mm0,mm6       //zeros->1s, v versa
    458 
    459                   mov        ecx,len       //load length of line (pixels)
    460                   mov        esi,srcptr    //load source
    461                   mov        ebx,dstptr    //load dest
    462                   cmp        ecx,0         //lcr
    463                   je         mainloop8end
    464 
    465 mainloop8:
    466                   movq       mm4,[esi]
    467                   pand       mm4,mm0
    468                   movq       mm6,mm0
    469                   pandn      mm6,[ebx]
    470                   por        mm4,mm6
    471                   movq       [ebx],mm4
    472 
    473                   add        esi,8         //inc by 8 bytes processed
    474                   add        ebx,8
    475                   sub        ecx,8         //dec by 8 pixels processed
    476 
    477                   ja         mainloop8
    478 mainloop8end:
    479 
    480                   mov        ecx,diff
    481                   cmp        ecx,0
    482                   jz         end8
    483 
    484                   mov        edx,mask
    485                   sal        edx,24        //make low byte the high byte
    486 
    487 secondloop8:
    488                   sal        edx,1         //move high bit to CF
    489                   jnc        skip8         //if CF = 0
    490                   mov        al,[esi]
    491                   mov        [ebx],al
    492 skip8:
    493                   inc        esi
    494                   inc        ebx
    495 
    496                   dec        ecx
    497                   jnz        secondloop8
    498 end8:
    499                   emms
    500                }
    501             }
    502             else /* mmx not supported - use modified C routine */
    503             {
    504                register unsigned int incr1, initial_val, final_val;
    505                png_size_t pixel_bytes;
    506                png_uint_32 i;
    507                register int disp = png_pass_inc[png_ptr->pass];
    508                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
    509 
    510                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
    511                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
    512                   pixel_bytes;
    513                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
    514                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
    515                final_val = png_ptr->width*pixel_bytes;
    516                incr1 = (disp)*pixel_bytes;
    517                for (i = initial_val; i < final_val; i += incr1)
    518                {
    519                   png_memcpy(dstptr, srcptr, pixel_bytes);
    520                   srcptr += incr1;
    521                   dstptr += incr1;
    522                }
    523             } /* end of else */
    524 
    525             break;
    526          }       // end 8 bpp
    527 
    528          case 1:
    529          {
    530             png_bytep sp;
    531             png_bytep dp;
    532             int s_inc, s_start, s_end;
    533             int m;
    534             int shift;
    535             png_uint_32 i;
    536 
    537             sp = png_ptr->row_buf + 1;
    538             dp = row;
    539             m = 0x80;
    540 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
    541             if (png_ptr->transformations & PNG_PACKSWAP)
    542             {
    543                 s_start = 0;
    544                 s_end = 7;
    545                 s_inc = 1;
    546             }
    547             else
    548 #endif
    549             {
    550                 s_start = 7;
    551                 s_end = 0;
    552                 s_inc = -1;
    553             }
    554 
    555             shift = s_start;
    556 
    557             for (i = 0; i < png_ptr->width; i++)
    558             {
    559                if (m & mask)
    560                {
    561                   int value;
    562 
    563                   value = (*sp >> shift) & 0x1;
    564                   *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
    565                   *dp |= (png_byte)(value << shift);
    566                }
    567 
    568                if (shift == s_end)
    569                {
    570                   shift = s_start;
    571                   sp++;
    572                   dp++;
    573                }
    574                else
    575                   shift += s_inc;
    576 
    577                if (m == 1)
    578                   m = 0x80;
    579                else
    580                   m >>= 1;
    581             }
    582             break;
    583          }
    584 
    585          case 2:
    586          {
    587             png_bytep sp;
    588             png_bytep dp;
    589             int s_start, s_end, s_inc;
    590             int m;
    591             int shift;
    592             png_uint_32 i;
    593             int value;
    594 
    595             sp = png_ptr->row_buf + 1;
    596             dp = row;
    597             m = 0x80;
    598 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
    599             if (png_ptr->transformations & PNG_PACKSWAP)
    600             {
    601                s_start = 0;
    602                s_end = 6;
    603                s_inc = 2;
    604             }
    605             else
    606 #endif
    607             {
    608                s_start = 6;
    609                s_end = 0;
    610                s_inc = -2;
    611             }
    612 
    613             shift = s_start;
    614 
    615             for (i = 0; i < png_ptr->width; i++)
    616             {
    617                if (m & mask)
    618                {
    619                   value = (*sp >> shift) & 0x3;
    620                   *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
    621                   *dp |= (png_byte)(value << shift);
    622                }
    623 
    624                if (shift == s_end)
    625                {
    626                   shift = s_start;
    627                   sp++;
    628                   dp++;
    629                }
    630                else
    631                   shift += s_inc;
    632                if (m == 1)
    633                   m = 0x80;
    634                else
    635                   m >>= 1;
    636             }
    637             break;
    638          }
    639 
    640          case 4:
    641          {
    642             png_bytep sp;
    643             png_bytep dp;
    644             int s_start, s_end, s_inc;
    645             int m;
    646             int shift;
    647             png_uint_32 i;
    648             int value;
    649 
    650             sp = png_ptr->row_buf + 1;
    651             dp = row;
    652             m = 0x80;
    653 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
    654             if (png_ptr->transformations & PNG_PACKSWAP)
    655             {
    656                s_start = 0;
    657                s_end = 4;
    658                s_inc = 4;
    659             }
    660             else
    661 #endif
    662             {
    663                s_start = 4;
    664                s_end = 0;
    665                s_inc = -4;
    666             }
    667             shift = s_start;
    668 
    669             for (i = 0; i < png_ptr->width; i++)
    670             {
    671                if (m & mask)
    672                {
    673                   value = (*sp >> shift) & 0xf;
    674                   *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
    675                   *dp |= (png_byte)(value << shift);
    676                }
    677 
    678                if (shift == s_end)
    679                {
    680                   shift = s_start;
    681                   sp++;
    682                   dp++;
    683                }
    684                else
    685                   shift += s_inc;
    686                if (m == 1)
    687                   m = 0x80;
    688                else
    689                   m >>= 1;
    690             }
    691             break;
    692          }
    693 
    694          case 16:
    695          {
    696             png_bytep srcptr;
    697             png_bytep dstptr;
    698             png_uint_32 len;
    699             int unmask, diff;
    700             __int64 mask1=0x0101020204040808,
    701                     mask0=0x1010202040408080;
    702 
    703 #if !defined(PNG_1_0_X)
    704             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
    705                 /* && mmx_supported */ )
    706 #else
    707             if (mmx_supported)
    708 #endif
    709             {
    710                srcptr = png_ptr->row_buf + 1;
    711                dstptr = row;
    712 
    713                unmask = ~mask;
    714                len     = (png_ptr->width)&~7;
    715                diff = (png_ptr->width)&7;
    716                _asm
    717                {
    718                   movd       mm7, unmask       //load bit pattern
    719                   psubb      mm6,mm6           //zero mm6
    720                   punpcklbw  mm7,mm7
    721                   punpcklwd  mm7,mm7
    722                   punpckldq  mm7,mm7           //fill register with 8 masks
    723 
    724                   movq       mm0,mask0
    725                   movq       mm1,mask1
    726 
    727                   pand       mm0,mm7
    728                   pand       mm1,mm7
    729 
    730                   pcmpeqb    mm0,mm6
    731                   pcmpeqb    mm1,mm6
    732 
    733                   mov        ecx,len           //load length of line
    734                   mov        esi,srcptr        //load source
    735                   mov        ebx,dstptr        //load dest
    736                   cmp        ecx,0             //lcr
    737                   jz         mainloop16end
    738 
    739 mainloop16:
    740                   movq       mm4,[esi]
    741                   pand       mm4,mm0
    742                   movq       mm6,mm0
    743                   movq       mm7,[ebx]
    744                   pandn      mm6,mm7
    745                   por        mm4,mm6
    746                   movq       [ebx],mm4
    747 
    748                   movq       mm5,[esi+8]
    749                   pand       mm5,mm1
    750                   movq       mm7,mm1
    751                   movq       mm6,[ebx+8]
    752                   pandn      mm7,mm6
    753                   por        mm5,mm7
    754                   movq       [ebx+8],mm5
    755 
    756                   add        esi,16            //inc by 16 bytes processed
    757                   add        ebx,16
    758                   sub        ecx,8             //dec by 8 pixels processed
    759 
    760                   ja         mainloop16
    761 
    762 mainloop16end:
    763                   mov        ecx,diff
    764                   cmp        ecx,0
    765                   jz         end16
    766 
    767                   mov        edx,mask
    768                   sal        edx,24            //make low byte the high byte
    769 secondloop16:
    770                   sal        edx,1             //move high bit to CF
    771                   jnc        skip16            //if CF = 0
    772                   mov        ax,[esi]
    773                   mov        [ebx],ax
    774 skip16:
    775                   add        esi,2
    776                   add        ebx,2
    777 
    778                   dec        ecx
    779                   jnz        secondloop16
    780 end16:
    781                   emms
    782                }
    783             }
    784             else /* mmx not supported - use modified C routine */
    785             {
    786                register unsigned int incr1, initial_val, final_val;
    787                png_size_t pixel_bytes;
    788                png_uint_32 i;
    789                register int disp = png_pass_inc[png_ptr->pass];
    790                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
    791 
    792                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
    793                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
    794                   pixel_bytes;
    795                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
    796                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
    797                final_val = png_ptr->width*pixel_bytes;
    798                incr1 = (disp)*pixel_bytes;
    799                for (i = initial_val; i < final_val; i += incr1)
    800                {
    801                   png_memcpy(dstptr, srcptr, pixel_bytes);
    802                   srcptr += incr1;
    803                   dstptr += incr1;
    804                }
    805             } /* end of else */
    806 
    807             break;
    808          }       // end 16 bpp
    809 
    810          case 48:
    811          {
    812             png_bytep srcptr;
    813             png_bytep dstptr;
    814             png_uint_32 len;
    815             int unmask, diff;
    816 
    817             __int64 mask5=0x0101010101010202,
    818                     mask4=0x0202020204040404,
    819                     mask3=0x0404080808080808,
    820                     mask2=0x1010101010102020,
    821                     mask1=0x2020202040404040,
    822                     mask0=0x4040808080808080;
    823 
    824 #if !defined(PNG_1_0_X)
    825             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
    826                 /* && mmx_supported */ )
    827 #else
    828             if (mmx_supported)
    829 #endif
    830             {
    831                srcptr = png_ptr->row_buf + 1;
    832                dstptr = row;
    833 
    834                unmask = ~mask;
    835                len     = (png_ptr->width)&~7;
    836                diff = (png_ptr->width)&7;
    837                _asm
    838                {
    839                   movd       mm7, unmask       //load bit pattern
    840                   psubb      mm6,mm6           //zero mm6
    841                   punpcklbw  mm7,mm7
    842                   punpcklwd  mm7,mm7
    843                   punpckldq  mm7,mm7           //fill register with 8 masks
    844 
    845                   movq       mm0,mask0
    846                   movq       mm1,mask1
    847                   movq       mm2,mask2
    848                   movq       mm3,mask3
    849                   movq       mm4,mask4
    850                   movq       mm5,mask5
    851 
    852                   pand       mm0,mm7
    853                   pand       mm1,mm7
    854                   pand       mm2,mm7
    855                   pand       mm3,mm7
    856                   pand       mm4,mm7
    857                   pand       mm5,mm7
    858 
    859                   pcmpeqb    mm0,mm6
    860                   pcmpeqb    mm1,mm6
    861                   pcmpeqb    mm2,mm6
    862                   pcmpeqb    mm3,mm6
    863                   pcmpeqb    mm4,mm6
    864                   pcmpeqb    mm5,mm6
    865 
    866                   mov        ecx,len           //load length of line
    867                   mov        esi,srcptr        //load source
    868                   mov        ebx,dstptr        //load dest
    869 
    870                   cmp        ecx,0
    871                   jz         mainloop48end
    872 
    873 mainloop48:
    874                   movq       mm7,[esi]
    875                   pand       mm7,mm0
    876                   movq       mm6,mm0
    877                   pandn      mm6,[ebx]
    878                   por        mm7,mm6
    879                   movq       [ebx],mm7
    880 
    881                   movq       mm6,[esi+8]
    882                   pand       mm6,mm1
    883                   movq       mm7,mm1
    884                   pandn      mm7,[ebx+8]
    885                   por        mm6,mm7
    886                   movq       [ebx+8],mm6
    887 
    888                   movq       mm6,[esi+16]
    889                   pand       mm6,mm2
    890                   movq       mm7,mm2
    891                   pandn      mm7,[ebx+16]
    892                   por        mm6,mm7
    893                   movq       [ebx+16],mm6
    894 
    895                   movq       mm7,[esi+24]
    896                   pand       mm7,mm3
    897                   movq       mm6,mm3
    898                   pandn      mm6,[ebx+24]
    899                   por        mm7,mm6
    900                   movq       [ebx+24],mm7
    901 
    902                   movq       mm6,[esi+32]
    903                   pand       mm6,mm4
    904                   movq       mm7,mm4
    905                   pandn      mm7,[ebx+32]
    906                   por        mm6,mm7
    907                   movq       [ebx+32],mm6
    908 
    909                   movq       mm7,[esi+40]
    910                   pand       mm7,mm5
    911                   movq       mm6,mm5
    912                   pandn      mm6,[ebx+40]
    913                   por        mm7,mm6
    914                   movq       [ebx+40],mm7
    915 
    916                   add        esi,48            //inc by 48 bytes processed
    917                   add        ebx,48
    918                   sub        ecx,8             //dec by 8 pixels processed
    919 
    920                   ja         mainloop48
    921 mainloop48end:
    922 
    923                   mov        ecx,diff
    924                   cmp        ecx,0
    925                   jz         end48
    926 
    927                   mov        edx,mask
    928                   sal        edx,24            //make low byte the high byte
    929 
    930 secondloop48:
    931                   sal        edx,1             //move high bit to CF
    932                   jnc        skip48            //if CF = 0
    933                   mov        eax,[esi]
    934                   mov        [ebx],eax
    935                   mov        ax,[esi+4]       // These 2 lines added 20070717
    936                   mov        [ebx+4],ax       // Glenn R-P
    937 skip48:
    938                   add        esi,6            // Changed 4 to 6 on these 2
    939                   add        ebx,6            // lines.  Glenn R-P 20070717
    940 
    941                   dec        ecx
    942                   jnz        secondloop48
    943 
    944 end48:
    945                   emms
    946                }
    947             }
    948             else /* mmx _not supported - Use modified C routine */
    949             {
    950                register unsigned int incr1, initial_val, final_val;
    951                png_size_t pixel_bytes;
    952                png_uint_32 i;
    953                register int disp = png_pass_inc[png_ptr->pass];
    954                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
    955 
    956                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
    957                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
    958                   pixel_bytes;
    959                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
    960                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
    961                final_val = png_ptr->width*pixel_bytes;
    962                incr1 = (disp)*pixel_bytes;
    963                for (i = initial_val; i < final_val; i += incr1)
    964                {
    965                   png_memcpy(dstptr, srcptr, pixel_bytes);
    966                   srcptr += incr1;
    967                   dstptr += incr1;
    968                }
    969             } /* end of else */
    970 
    971             break;
    972          }       // end 48 bpp
    973 
    974          default:
    975          {
    976             png_bytep sptr;
    977             png_bytep dp;
    978             png_size_t pixel_bytes;
    979             int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
    980             unsigned int i;
    981             register int disp = png_pass_inc[png_ptr->pass];  // get the offset
    982             register unsigned int incr1, initial_val, final_val;
    983 
    984             pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
    985             sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
    986                pixel_bytes;
    987             dp = row + offset_table[png_ptr->pass]*pixel_bytes;
    988             initial_val = offset_table[png_ptr->pass]*pixel_bytes;
    989             final_val = png_ptr->width*pixel_bytes;
    990             incr1 = (disp)*pixel_bytes;
    991             for (i = initial_val; i < final_val; i += incr1)
    992             {
    993                png_memcpy(dp, sptr, pixel_bytes);
    994                sptr += incr1;
    995                dp += incr1;
    996             }
    997             break;
    998          }
    999       } /* end switch (png_ptr->row_info.pixel_depth) */
   1000    } /* end if (non-trivial mask) */
   1001 
   1002 } /* end png_combine_row() */
   1003 
   1004 
   1005 #if defined(PNG_READ_INTERLACING_SUPPORTED)
   1006 
   1007 void /* PRIVATE */
   1008 png_do_read_interlace(png_structp png_ptr)
   1009 {
   1010    png_row_infop row_info = &(png_ptr->row_info);
   1011    png_bytep row = png_ptr->row_buf + 1;
   1012    int pass = png_ptr->pass;
   1013    png_uint_32 transformations = png_ptr->transformations;
   1014 #ifdef PNG_USE_LOCAL_ARRAYS
   1015    PNG_CONST int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
   1016 #endif
   1017 
   1018    png_debug(1,"in png_do_read_interlace\n");
   1019 
   1020    if (mmx_supported == 2) {
   1021 #if !defined(PNG_1_0_X)
   1022        /* this should have happened in png_init_mmx_flags() already */
   1023        png_warning(png_ptr, "asm_flags may not have been initialized");
   1024 #endif
   1025        png_mmx_support();
   1026    }
   1027 
   1028    if (row != NULL && row_info != NULL)
   1029    {
   1030       png_uint_32 final_width;
   1031 
   1032       final_width = row_info->width * png_pass_inc[pass];
   1033 
   1034       switch (row_info->pixel_depth)
   1035       {
   1036          case 1:
   1037          {
   1038             png_bytep sp, dp;
   1039             int sshift, dshift;
   1040             int s_start, s_end, s_inc;
   1041             png_byte v;
   1042             png_uint_32 i;
   1043             int j;
   1044 
   1045             sp = row + (png_size_t)((row_info->width - 1) >> 3);
   1046             dp = row + (png_size_t)((final_width - 1) >> 3);
   1047 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
   1048             if (transformations & PNG_PACKSWAP)
   1049             {
   1050                sshift = (int)((row_info->width + 7) & 7);
   1051                dshift = (int)((final_width + 7) & 7);
   1052                s_start = 7;
   1053                s_end = 0;
   1054                s_inc = -1;
   1055             }
   1056             else
   1057 #endif
   1058             {
   1059                sshift = 7 - (int)((row_info->width + 7) & 7);
   1060                dshift = 7 - (int)((final_width + 7) & 7);
   1061                s_start = 0;
   1062                s_end = 7;
   1063                s_inc = 1;
   1064             }
   1065 
   1066             for (i = row_info->width; i; i--)
   1067             {
   1068                v = (png_byte)((*sp >> sshift) & 0x1);
   1069                for (j = 0; j < png_pass_inc[pass]; j++)
   1070                {
   1071                   *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
   1072                   *dp |= (png_byte)(v << dshift);
   1073                   if (dshift == s_end)
   1074                   {
   1075                      dshift = s_start;
   1076                      dp--;
   1077                   }
   1078                   else
   1079                      dshift += s_inc;
   1080                }
   1081                if (sshift == s_end)
   1082                {
   1083                   sshift = s_start;
   1084                   sp--;
   1085                }
   1086                else
   1087                   sshift += s_inc;
   1088             }
   1089             break;
   1090          }
   1091 
   1092          case 2:
   1093          {
   1094             png_bytep sp, dp;
   1095             int sshift, dshift;
   1096             int s_start, s_end, s_inc;
   1097             png_uint_32 i;
   1098 
   1099             sp = row + (png_size_t)((row_info->width - 1) >> 2);
   1100             dp = row + (png_size_t)((final_width - 1) >> 2);
   1101 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
   1102             if (transformations & PNG_PACKSWAP)
   1103             {
   1104                sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
   1105                dshift = (png_size_t)(((final_width + 3) & 3) << 1);
   1106                s_start = 6;
   1107                s_end = 0;
   1108                s_inc = -2;
   1109             }
   1110             else
   1111 #endif
   1112             {
   1113                sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
   1114                dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
   1115                s_start = 0;
   1116                s_end = 6;
   1117                s_inc = 2;
   1118             }
   1119 
   1120             for (i = row_info->width; i; i--)
   1121             {
   1122                png_byte v;
   1123                int j;
   1124 
   1125                v = (png_byte)((*sp >> sshift) & 0x3);
   1126                for (j = 0; j < png_pass_inc[pass]; j++)
   1127                {
   1128                   *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
   1129                   *dp |= (png_byte)(v << dshift);
   1130                   if (dshift == s_end)
   1131                   {
   1132                      dshift = s_start;
   1133                      dp--;
   1134                   }
   1135                   else
   1136                      dshift += s_inc;
   1137                }
   1138                if (sshift == s_end)
   1139                {
   1140                   sshift = s_start;
   1141                   sp--;
   1142                }
   1143                else
   1144                   sshift += s_inc;
   1145             }
   1146             break;
   1147          }
   1148 
   1149          case 4:
   1150          {
   1151             png_bytep sp, dp;
   1152             int sshift, dshift;
   1153             int s_start, s_end, s_inc;
   1154             png_uint_32 i;
   1155 
   1156             sp = row + (png_size_t)((row_info->width - 1) >> 1);
   1157             dp = row + (png_size_t)((final_width - 1) >> 1);
   1158 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
   1159             if (transformations & PNG_PACKSWAP)
   1160             {
   1161                sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
   1162                dshift = (png_size_t)(((final_width + 1) & 1) << 2);
   1163                s_start = 4;
   1164                s_end = 0;
   1165                s_inc = -4;
   1166             }
   1167             else
   1168 #endif
   1169             {
   1170                sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
   1171                dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
   1172                s_start = 0;
   1173                s_end = 4;
   1174                s_inc = 4;
   1175             }
   1176 
   1177             for (i = row_info->width; i; i--)
   1178             {
   1179                png_byte v;
   1180                int j;
   1181 
   1182                v = (png_byte)((*sp >> sshift) & 0xf);
   1183                for (j = 0; j < png_pass_inc[pass]; j++)
   1184                {
   1185                   *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
   1186                   *dp |= (png_byte)(v << dshift);
   1187                   if (dshift == s_end)
   1188                   {
   1189                      dshift = s_start;
   1190                      dp--;
   1191                   }
   1192                   else
   1193                      dshift += s_inc;
   1194                }
   1195                if (sshift == s_end)
   1196                {
   1197                   sshift = s_start;
   1198                   sp--;
   1199                }
   1200                else
   1201                   sshift += s_inc;
   1202             }
   1203             break;
   1204          }
   1205 
   1206          default:         // This is the place where the routine is modified
   1207          {
   1208             __int64 const4 = 0x0000000000FFFFFF;
   1209             // __int64 const5 = 0x000000FFFFFF0000;  // unused...
   1210             __int64 const6 = 0x00000000000000FF;
   1211             png_bytep sptr, dp;
   1212             png_uint_32 i;
   1213             png_size_t pixel_bytes;
   1214             int width = row_info->width;
   1215 
   1216             pixel_bytes = (row_info->pixel_depth >> 3);
   1217 
   1218             sptr = row + (width - 1) * pixel_bytes;
   1219             dp = row + (final_width - 1) * pixel_bytes;
   1220             // New code by Nirav Chhatrapati - Intel Corporation
   1221             // sign fix by GRR
   1222             // NOTE:  there is NO MMX code for 48-bit and 64-bit images
   1223 
   1224             // use MMX routine if machine supports it
   1225 #if !defined(PNG_1_0_X)
   1226             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
   1227                 /* && mmx_supported */ )
   1228 #else
   1229             if (mmx_supported)
   1230 #endif
   1231             {
   1232                if (pixel_bytes == 3)
   1233                {
   1234                   if (((pass == 4) || (pass == 5)) && width)
   1235                   {
   1236                      int width_mmx = ((width >> 1) << 1) - 8;
   1237                      if (width_mmx < 0)
   1238                          width_mmx = 0;
   1239                      width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
   1240                      if (width_mmx)
   1241                      {
   1242                         _asm
   1243                         {
   1244                            mov esi, sptr
   1245                            mov edi, dp
   1246                            mov ecx, width_mmx
   1247                            sub esi, 3
   1248                            sub edi, 9
   1249 loop_pass4:
   1250                            movq mm0, [esi]     ; X X v2 v1 v0 v5 v4 v3
   1251                            movq mm7, mm0       ; X X v2 v1 v0 v5 v4 v3
   1252                            movq mm6, mm0       ; X X v2 v1 v0 v5 v4 v3
   1253                            psllq mm0, 24       ; v1 v0 v5 v4 v3 0 0 0
   1254                            pand mm7, const4    ; 0 0 0 0 0 v5 v4 v3
   1255                            psrlq mm6, 24       ; 0 0 0 X X v2 v1 v0
   1256                            por mm0, mm7        ; v1 v0 v5 v4 v3 v5 v4 v3
   1257                            movq mm5, mm6       ; 0 0 0 X X v2 v1 v0
   1258                            psllq mm6, 8        ; 0 0 X X v2 v1 v0 0
   1259                            movq [edi], mm0     ; move quad to memory
   1260                            psrlq mm5, 16       ; 0 0 0 0 0 X X v2
   1261                            pand mm5, const6    ; 0 0 0 0 0 0 0 v2
   1262                            por mm6, mm5        ; 0 0 X X v2 v1 v0 v2
   1263                            movd [edi+8], mm6   ; move double to memory
   1264                            sub esi, 6
   1265                            sub edi, 12
   1266                            sub ecx, 2
   1267                            jnz loop_pass4
   1268                            EMMS
   1269                         }
   1270                      }
   1271 
   1272                      sptr -= width_mmx*3;
   1273                      dp -= width_mmx*6;
   1274                      for (i = width; i; i--)
   1275                      {
   1276                         png_byte v[8];
   1277                         int j;
   1278 
   1279                         png_memcpy(v, sptr, 3);
   1280                         for (j = 0; j < png_pass_inc[pass]; j++)
   1281                         {
   1282                            png_memcpy(dp, v, 3);
   1283                            dp -= 3;
   1284                         }
   1285                         sptr -= 3;
   1286                      }
   1287                   }
   1288                   else if (((pass == 2) || (pass == 3)) && width)
   1289                   {
   1290                      _asm
   1291                      {
   1292                         mov esi, sptr
   1293                         mov edi, dp
   1294                         mov ecx, width
   1295                         sub edi, 9   // (png_pass_inc[pass] - 1)*pixel_bytes
   1296 loop_pass2:
   1297                         movd mm0, [esi]     ; X X X X X v2 v1 v0
   1298                         pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
   1299                         movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
   1300                         psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
   1301                         movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
   1302                         psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
   1303                         psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
   1304                         por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
   1305                         por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
   1306                         movq [edi+4], mm0   ; move to memory
   1307                         psrlq mm0, 16       ; 0 0 v2 v1 v0 v2 v1 v0
   1308                         movd [edi], mm0     ; move to memory
   1309                         sub esi, 3
   1310                         sub edi, 12
   1311                         dec ecx
   1312                         jnz loop_pass2
   1313                         EMMS
   1314                      }
   1315                   }
   1316                   else if (width) /* && ((pass == 0) || (pass == 1))) */
   1317                   {
   1318                      _asm
   1319                      {
   1320                         mov esi, sptr
   1321                         mov edi, dp
   1322                         mov ecx, width
   1323                         sub edi, 21   // (png_pass_inc[pass] - 1)*pixel_bytes
   1324 loop_pass0:
   1325                         movd mm0, [esi]     ; X X X X X v2 v1 v0
   1326                         pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
   1327                         movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
   1328                         psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
   1329                         movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
   1330                         psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
   1331                         psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
   1332                         por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
   1333                         por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
   1334                         movq mm3, mm0       ; v2 v1 v0 v2 v1 v0 v2 v1
   1335                         psllq mm0, 16       ; v0 v2 v1 v0 v2 v1 0 0
   1336                         movq mm4, mm3       ; v2 v1 v0 v2 v1 v0 v2 v1
   1337                         punpckhdq mm3, mm0  ; v0 v2 v1 v0 v2 v1 v0 v2
   1338                         movq [edi+16] , mm4
   1339                         psrlq mm0, 32       ; 0 0 0 0 v0 v2 v1 v0
   1340                         movq [edi+8] , mm3
   1341                         punpckldq mm0, mm4  ; v1 v0 v2 v1 v0 v2 v1 v0
   1342                         sub esi, 3
   1343                         movq [edi], mm0
   1344                         sub edi, 24
   1345                         //sub esi, 3
   1346                         dec ecx
   1347                         jnz loop_pass0
   1348                         EMMS
   1349                      }
   1350                   }
   1351                } /* end of pixel_bytes == 3 */
   1352 
   1353                else if (pixel_bytes == 1)
   1354                {
   1355                   if (((pass == 4) || (pass == 5)) && width)
   1356                   {
   1357                      int width_mmx = ((width >> 3) << 3);
   1358                      width -= width_mmx;
   1359                      if (width_mmx)
   1360                      {
   1361                         _asm
   1362                         {
   1363                            mov esi, sptr
   1364                            mov edi, dp
   1365                            mov ecx, width_mmx
   1366                            sub edi, 15
   1367                            sub esi, 7
   1368 loop1_pass4:
   1369                            movq mm0, [esi]     ; v0 v1 v2 v3 v4 v5 v6 v7
   1370                            movq mm1, mm0       ; v0 v1 v2 v3 v4 v5 v6 v7
   1371                            punpcklbw mm0, mm0  ; v4 v4 v5 v5 v6 v6 v7 v7
   1372                            //movq mm1, mm0     ; v0 v0 v1 v1 v2 v2 v3 v3
   1373                            punpckhbw mm1, mm1  ;v0 v0 v1 v1 v2 v2 v3 v3
   1374                            movq [edi+8], mm1   ; move to memory v0 v1 v2 and v3
   1375                            sub esi, 8
   1376                            movq [edi], mm0     ; move to memory v4 v5 v6 and v7
   1377                            //sub esi, 4
   1378                            sub edi, 16
   1379                            sub ecx, 8
   1380                            jnz loop1_pass4
   1381                            EMMS
   1382                         }
   1383                      }
   1384 
   1385                      sptr -= width_mmx;
   1386                      dp -= width_mmx*2;
   1387                      for (i = width; i; i--)
   1388                      {
   1389                         int j;
   1390 
   1391                         for (j = 0; j < png_pass_inc[pass]; j++)
   1392                         {
   1393                            *dp-- = *sptr;
   1394                         }
   1395                         sptr --;
   1396                      }
   1397                   }
   1398                   else if (((pass == 2) || (pass == 3)) && width)
   1399                   {
   1400                      int width_mmx = ((width >> 2) << 2);
   1401                      width -= width_mmx;
   1402                      if (width_mmx)
   1403                      {
   1404                         _asm
   1405                         {
   1406                            mov esi, sptr
   1407                            mov edi, dp
   1408                            mov ecx, width_mmx
   1409                            sub edi, 15
   1410                            sub esi, 3
   1411 loop1_pass2:
   1412                            movd mm0, [esi]     ; X X X X v0 v1 v2 v3
   1413                            punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
   1414                            movq mm1, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
   1415                            punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
   1416                            punpckhwd mm1, mm1  ; v0 v0 v0 v0 v1 v1 v1 v1
   1417                            movq [edi], mm0     ; move to memory v2 and v3
   1418                            sub esi, 4
   1419                            movq [edi+8], mm1   ; move to memory v1     and v0
   1420                            sub edi, 16
   1421                            sub ecx, 4
   1422                            jnz loop1_pass2
   1423                            EMMS
   1424                         }
   1425                      }
   1426 
   1427                      sptr -= width_mmx;
   1428                      dp -= width_mmx*4;
   1429                      for (i = width; i; i--)
   1430                      {
   1431                         int j;
   1432 
   1433                         for (j = 0; j < png_pass_inc[pass]; j++)
   1434                         {
   1435                            *dp-- = *sptr;
   1436                         }
   1437                         sptr --;
   1438                      }
   1439                   }
   1440                   else if (width) /* && ((pass == 0) || (pass == 1))) */
   1441                   {
   1442                      int width_mmx = ((width >> 2) << 2);
   1443                      width -= width_mmx;
   1444                      if (width_mmx)
   1445                      {
   1446                         _asm
   1447                         {
   1448                            mov esi, sptr
   1449                            mov edi, dp
   1450                            mov ecx, width_mmx
   1451                            sub edi, 31
   1452                            sub esi, 3
   1453 loop1_pass0:
   1454                            movd mm0, [esi]     ; X X X X v0 v1 v2 v3
   1455                            movq mm1, mm0       ; X X X X v0 v1 v2 v3
   1456                            punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
   1457                            movq mm2, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
   1458                            punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
   1459                            movq mm3, mm0       ; v2 v2 v2 v2 v3 v3 v3 v3
   1460                            punpckldq mm0, mm0  ; v3 v3 v3 v3 v3 v3 v3 v3
   1461                            punpckhdq mm3, mm3  ; v2 v2 v2 v2 v2 v2 v2 v2
   1462                            movq [edi], mm0     ; move to memory v3
   1463                            punpckhwd mm2, mm2  ; v0 v0 v0 v0 v1 v1 v1 v1
   1464                            movq [edi+8], mm3   ; move to memory v2
   1465                            movq mm4, mm2       ; v0 v0 v0 v0 v1 v1 v1 v1
   1466                            punpckldq mm2, mm2  ; v1 v1 v1 v1 v1 v1 v1 v1
   1467                            punpckhdq mm4, mm4  ; v0 v0 v0 v0 v0 v0 v0 v0
   1468                            movq [edi+16], mm2  ; move to memory v1
   1469                            movq [edi+24], mm4  ; move to memory v0
   1470                            sub esi, 4
   1471                            sub edi, 32
   1472                            sub ecx, 4
   1473                            jnz loop1_pass0
   1474                            EMMS
   1475                         }
   1476                      }
   1477 
   1478                      sptr -= width_mmx;
   1479                      dp -= width_mmx*8;
   1480                      for (i = width; i; i--)
   1481                      {
   1482                         int j;
   1483 
   1484                        /* I simplified this part in version 1.0.4e
   1485                         * here and in several other instances where
   1486                         * pixel_bytes == 1  -- GR-P
   1487                         *
   1488                         * Original code:
   1489                         *
   1490                         * png_byte v[8];
   1491                         * png_memcpy(v, sptr, pixel_bytes);
   1492                         * for (j = 0; j < png_pass_inc[pass]; j++)
   1493                         * {
   1494                         *    png_memcpy(dp, v, pixel_bytes);
   1495                         *    dp -= pixel_bytes;
   1496                         * }
   1497                         * sptr -= pixel_bytes;
   1498                         *
   1499                         * Replacement code is in the next three lines:
   1500                         */
   1501 
   1502                         for (j = 0; j < png_pass_inc[pass]; j++)
   1503                            *dp-- = *sptr;
   1504                         sptr--;
   1505                      }
   1506                   }
   1507                } /* end of pixel_bytes == 1 */
   1508 
   1509                else if (pixel_bytes == 2)
   1510                {
   1511                   if (((pass == 4) || (pass == 5)) && width)
   1512                   {
   1513                      int width_mmx = ((width >> 1) << 1) ;
   1514                      width -= width_mmx;
   1515                      if (width_mmx)
   1516                      {
   1517                         _asm
   1518                         {
   1519                            mov esi, sptr
   1520                            mov edi, dp
   1521                            mov ecx, width_mmx
   1522                            sub esi, 2
   1523                            sub edi, 6
   1524 loop2_pass4:
   1525                            movd mm0, [esi]        ; X X X X v1 v0 v3 v2
   1526                            punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
   1527                            sub esi, 4
   1528                            movq [edi], mm0
   1529                            sub edi, 8
   1530                            sub ecx, 2
   1531                            jnz loop2_pass4
   1532                            EMMS
   1533                         }
   1534                      }
   1535 
   1536                      sptr -= (width_mmx*2 - 2);            // sign fixed
   1537                      dp -= (width_mmx*4 - 2);            // sign fixed
   1538                      for (i = width; i; i--)
   1539                      {
   1540                         png_byte v[8];
   1541                         int j;
   1542                         sptr -= 2;
   1543                         png_memcpy(v, sptr, 2);
   1544                         for (j = 0; j < png_pass_inc[pass]; j++)
   1545                         {
   1546                            dp -= 2;
   1547                            png_memcpy(dp, v, 2);
   1548                         }
   1549                      }
   1550                   }
   1551                   else if (((pass == 2) || (pass == 3)) && width)
   1552                   {
   1553                      int width_mmx = ((width >> 1) << 1) ;
   1554                      width -= width_mmx;
   1555                      if (width_mmx)
   1556                      {
   1557                         _asm
   1558                         {
   1559                            mov esi, sptr
   1560                            mov edi, dp
   1561                            mov ecx, width_mmx
   1562                            sub esi, 2
   1563                            sub edi, 14
   1564 loop2_pass2:
   1565                            movd mm0, [esi]        ; X X X X v1 v0 v3 v2
   1566                            punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
   1567                            movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
   1568                            punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
   1569                            punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
   1570                            movq [edi], mm0
   1571                            sub esi, 4
   1572                            movq [edi + 8], mm1
   1573                            //sub esi, 4
   1574                            sub edi, 16
   1575                            sub ecx, 2
   1576                            jnz loop2_pass2
   1577                            EMMS
   1578                         }
   1579                      }
   1580 
   1581                      sptr -= (width_mmx*2 - 2);            // sign fixed
   1582                      dp -= (width_mmx*8 - 2);            // sign fixed
   1583                      for (i = width; i; i--)
   1584                      {
   1585                         png_byte v[8];
   1586                         int j;
   1587                         sptr -= 2;
   1588                         png_memcpy(v, sptr, 2);
   1589                         for (j = 0; j < png_pass_inc[pass]; j++)
   1590                         {
   1591                            dp -= 2;
   1592                            png_memcpy(dp, v, 2);
   1593                         }
   1594                      }
   1595                   }
   1596                   else if (width) /* && ((pass == 0) || (pass == 1))) */
   1597                   {
   1598                      int width_mmx = ((width >> 1) << 1);
   1599                      width -= width_mmx;
   1600                      if (width_mmx)
   1601                      {
   1602                         _asm
   1603                         {
   1604                            mov esi, sptr
   1605                            mov edi, dp
   1606                            mov ecx, width_mmx
   1607                            sub esi, 2
   1608                            sub edi, 30
   1609 loop2_pass0:
   1610                            movd mm0, [esi]        ; X X X X v1 v0 v3 v2
   1611                            punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
   1612                            movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
   1613                            punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
   1614                            punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
   1615                            movq [edi], mm0
   1616                            movq [edi + 8], mm0
   1617                            movq [edi + 16], mm1
   1618                            movq [edi + 24], mm1
   1619                            sub esi, 4
   1620                            sub edi, 32
   1621                            sub ecx, 2
   1622                            jnz loop2_pass0
   1623                            EMMS
   1624                         }
   1625                      }
   1626 
   1627                      sptr -= (width_mmx*2 - 2);            // sign fixed
   1628                      dp -= (width_mmx*16 - 2);            // sign fixed
   1629                      for (i = width; i; i--)
   1630                      {
   1631                         png_byte v[8];
   1632                         int j;
   1633                         sptr -= 2;
   1634                         png_memcpy(v, sptr, 2);
   1635                         for (j = 0; j < png_pass_inc[pass]; j++)
   1636                         {
   1637                            dp -= 2;
   1638                            png_memcpy(dp, v, 2);
   1639                         }
   1640                      }
   1641                   }
   1642                } /* end of pixel_bytes == 2 */
   1643 
   1644                else if (pixel_bytes == 4)
   1645                {
   1646                   if (((pass == 4) || (pass == 5)) && width)
   1647                   {
   1648                      int width_mmx = ((width >> 1) << 1) ;
   1649                      width -= width_mmx;
   1650                      if (width_mmx)
   1651                      {
   1652                         _asm
   1653                         {
   1654                            mov esi, sptr
   1655                            mov edi, dp
   1656                            mov ecx, width_mmx
   1657                            sub esi, 4
   1658                            sub edi, 12
   1659 loop4_pass4:
   1660                            movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
   1661                            movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
   1662                            punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
   1663                            punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
   1664                            movq [edi], mm0
   1665                            sub esi, 8
   1666                            movq [edi + 8], mm1
   1667                            sub edi, 16
   1668                            sub ecx, 2
   1669                            jnz loop4_pass4
   1670                            EMMS
   1671                         }
   1672                      }
   1673 
   1674                      sptr -= (width_mmx*4 - 4);          // sign fixed
   1675                      dp -= (width_mmx*8 - 4);            // sign fixed
   1676                      for (i = width; i; i--)
   1677                      {
   1678                         png_byte v[8];
   1679                         int j;
   1680                         sptr -= 4;
   1681                         png_memcpy(v, sptr, 4);
   1682                         for (j = 0; j < png_pass_inc[pass]; j++)
   1683                         {
   1684                            dp -= 4;
   1685                            png_memcpy(dp, v, 4);
   1686                         }
   1687                      }
   1688                   }
   1689                   else if (((pass == 2) || (pass == 3)) && width)
   1690                   {
   1691                      int width_mmx = ((width >> 1) << 1) ;
   1692                      width -= width_mmx;
   1693                      if (width_mmx)
   1694                      {
   1695                         _asm
   1696                         {
   1697                            mov esi, sptr
   1698                            mov edi, dp
   1699                            mov ecx, width_mmx
   1700                            sub esi, 4
   1701                            sub edi, 28
   1702 loop4_pass2:
   1703                            movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
   1704                            movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
   1705                            punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
   1706                            punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
   1707                            movq [edi], mm0
   1708                            movq [edi + 8], mm0
   1709                            movq [edi+16], mm1
   1710                            movq [edi + 24], mm1
   1711                            sub esi, 8
   1712                            sub edi, 32
   1713                            sub ecx, 2
   1714                            jnz loop4_pass2
   1715                            EMMS
   1716                         }
   1717                      }
   1718 
   1719                      sptr -= (width_mmx*4 - 4);            // sign fixed
   1720                      dp -= (width_mmx*16 - 4);            // sign fixed
   1721                      for (i = width; i; i--)
   1722                      {
   1723                         png_byte v[8];
   1724                         int j;
   1725                         sptr -= 4;
   1726                         png_memcpy(v, sptr, 4);
   1727                         for (j = 0; j < png_pass_inc[pass]; j++)
   1728                         {
   1729                            dp -= 4;
   1730                            png_memcpy(dp, v, 4);
   1731                         }
   1732                      }
   1733                   }
   1734                   else if (width) /* && ((pass == 0) || (pass == 1))) */
   1735                   {
   1736                      int width_mmx = ((width >> 1) << 1) ;
   1737                      width -= width_mmx;
   1738                      if (width_mmx)
   1739                      {
   1740                         _asm
   1741                         {
   1742                            mov esi, sptr
   1743                            mov edi, dp
   1744                            mov ecx, width_mmx
   1745                            sub esi, 4
   1746                            sub edi, 60
   1747 loop4_pass0:
   1748                            movq mm0, [esi]        ; v3 v2 v1 v0 v7 v6 v5 v4
   1749                            movq mm1, mm0          ; v3 v2 v1 v0 v7 v6 v5 v4
   1750                            punpckldq mm0, mm0     ; v7 v6 v5 v4 v7 v6 v5 v4
   1751                            punpckhdq mm1, mm1     ; v3 v2 v1 v0 v3 v2 v1 v0
   1752                            movq [edi], mm0
   1753                            movq [edi + 8], mm0
   1754                            movq [edi + 16], mm0
   1755                            movq [edi + 24], mm0
   1756                            movq [edi+32], mm1
   1757                            movq [edi + 40], mm1
   1758                            movq [edi+ 48], mm1
   1759                            sub esi, 8
   1760                            movq [edi + 56], mm1
   1761                            sub edi, 64
   1762                            sub ecx, 2
   1763                            jnz loop4_pass0
   1764                            EMMS
   1765                         }
   1766                      }
   1767 
   1768                      sptr -= (width_mmx*4 - 4);            // sign fixed
   1769                      dp -= (width_mmx*32 - 4);            // sign fixed
   1770                      for (i = width; i; i--)
   1771                      {
   1772                         png_byte v[8];
   1773                         int j;
   1774                         sptr -= 4;
   1775                         png_memcpy(v, sptr, 4);
   1776                         for (j = 0; j < png_pass_inc[pass]; j++)
   1777                         {
   1778                            dp -= 4;
   1779                            png_memcpy(dp, v, 4);
   1780                         }
   1781                      }
   1782                   }
   1783 
   1784                } /* end of pixel_bytes == 4 */
   1785 
   1786                else if (pixel_bytes == 6)
   1787                {
   1788                   for (i = width; i; i--)
   1789                   {
   1790                      png_byte v[8];
   1791                      int j;
   1792                      png_memcpy(v, sptr, 6);
   1793                      for (j = 0; j < png_pass_inc[pass]; j++)
   1794                      {
   1795                         png_memcpy(dp, v, 6);
   1796                         dp -= 6;
   1797                      }
   1798                      sptr -= 6;
   1799                   }
   1800                } /* end of pixel_bytes == 6 */
   1801 
   1802                else
   1803                {
   1804                   for (i = width; i; i--)
   1805                   {
   1806                      png_byte v[8];
   1807                      int j;
   1808                      png_memcpy(v, sptr, pixel_bytes);
   1809                      for (j = 0; j < png_pass_inc[pass]; j++)
   1810                      {
   1811                         png_memcpy(dp, v, pixel_bytes);
   1812                         dp -= pixel_bytes;
   1813                      }
   1814                      sptr-= pixel_bytes;
   1815                   }
   1816                }
   1817             } /* end of mmx_supported */
   1818 
   1819             else /* MMX not supported:  use modified C code - takes advantage
   1820                   * of inlining of memcpy for a constant */
   1821             {
   1822                if (pixel_bytes == 1)
   1823                {
   1824                   for (i = width; i; i--)
   1825                   {
   1826                      int j;
   1827                      for (j = 0; j < png_pass_inc[pass]; j++)
   1828                         *dp-- = *sptr;
   1829                      sptr--;
   1830                   }
   1831                }
   1832                else if (pixel_bytes == 3)
   1833                {
   1834                   for (i = width; i; i--)
   1835                   {
   1836                      png_byte v[8];
   1837                      int j;
   1838                      png_memcpy(v, sptr, pixel_bytes);
   1839                      for (j = 0; j < png_pass_inc[pass]; j++)
   1840                      {
   1841                         png_memcpy(dp, v, pixel_bytes);
   1842                         dp -= pixel_bytes;
   1843                      }
   1844                      sptr -= pixel_bytes;
   1845                   }
   1846                }
   1847                else if (pixel_bytes == 2)
   1848                {
   1849                   for (i = width; i; i--)
   1850                   {
   1851                      png_byte v[8];
   1852                      int j;
   1853                      png_memcpy(v, sptr, pixel_bytes);
   1854                      for (j = 0; j < png_pass_inc[pass]; j++)
   1855                      {
   1856                         png_memcpy(dp, v, pixel_bytes);
   1857                         dp -= pixel_bytes;
   1858                      }
   1859                      sptr -= pixel_bytes;
   1860                   }
   1861                }
   1862                else if (pixel_bytes == 4)
   1863                {
   1864                   for (i = width; i; i--)
   1865                   {
   1866                      png_byte v[8];
   1867                      int j;
   1868                      png_memcpy(v, sptr, pixel_bytes);
   1869                      for (j = 0; j < png_pass_inc[pass]; j++)
   1870                      {
   1871                         png_memcpy(dp, v, pixel_bytes);
   1872                         dp -= pixel_bytes;
   1873                      }
   1874                      sptr -= pixel_bytes;
   1875                   }
   1876                }
   1877                else if (pixel_bytes == 6)
   1878                {
   1879                   for (i = width; i; i--)
   1880                   {
   1881                      png_byte v[8];
   1882                      int j;
   1883                      png_memcpy(v, sptr, pixel_bytes);
   1884                      for (j = 0; j < png_pass_inc[pass]; j++)
   1885                      {
   1886                         png_memcpy(dp, v, pixel_bytes);
   1887                         dp -= pixel_bytes;
   1888                      }
   1889                      sptr -= pixel_bytes;
   1890                   }
   1891                }
   1892                else
   1893                {
   1894                   for (i = width; i; i--)
   1895                   {
   1896                      png_byte v[8];
   1897                      int j;
   1898                      png_memcpy(v, sptr, pixel_bytes);
   1899                      for (j = 0; j < png_pass_inc[pass]; j++)
   1900                      {
   1901                         png_memcpy(dp, v, pixel_bytes);
   1902                         dp -= pixel_bytes;
   1903                      }
   1904                      sptr -= pixel_bytes;
   1905                   }
   1906                }
   1907 
   1908             } /* end of MMX not supported */
   1909             break;
   1910          }
   1911       } /* end switch (row_info->pixel_depth) */
   1912 
   1913       row_info->width = final_width;
   1914 
   1915       row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
   1916    }
   1917 
   1918 }
   1919 
   1920 #endif /* PNG_READ_INTERLACING_SUPPORTED */
   1921 
   1922 
   1923 // uAll: 64-bit cell for MMX operands; the double / long long members
   1924 // are there to get 8-byte alignment.  Masks: 0x01 and 0x7f in every byte.
   1925   union uAll {
   1926      __int64 use;               // first member: receives the {..} value
   1927      double  double_align;      // alignment helper, per its name
   1928      long long long_long_align; // alignment helper, per its name
   1929   };
   1930   static PNG_CONST union uAll LBCarryMask = {0x0101010101010101};
   1931   static PNG_CONST union uAll HBClearMask = {0x7f7f7f7f7f7f7f7f};
   1932 
   1933 // Optimized code for PNG Average filter decoder
   1934 void /* PRIVATE */
   1935 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
   1936                             , png_bytep prev_row)
   1937 {
   1938   // These variables are declared
   1939   // here to ensure alignment on 8-byte boundaries.
   1940   union uAll ActiveMask, ShiftBpp, ShiftRem;
   1941 
   1942    int bpp;
   1943    png_uint_32 FullLength;
   1944    png_uint_32 MMXLength;
   1945    //png_uint_32 len;
   1946    int diff;
   1947 
   1948    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
   1949    FullLength  = row_info->rowbytes; // # of bytes to filter
   1950    _asm {
   1951          // Init address pointers and offset
   1952          mov edi, row          // edi ==> Avg(x)
   1953          xor ebx, ebx          // ebx ==> x
   1954          mov edx, edi
   1955          mov esi, prev_row           // esi ==> Prior(x)
   1956          sub edx, bpp          // edx ==> Raw(x-bpp)
   1957 
   1958          xor eax, eax
   1959          // Compute the Raw value for the first bpp bytes
   1960          //    Raw(x) = Avg(x) + (Prior(x)/2)
   1961 davgrlp:
   1962          mov al, [esi + ebx]   // Load al with Prior(x)
   1963          inc ebx
   1964          shr al, 1             // divide by 2
   1965          add al, [edi+ebx-1]   // Add Avg(x); -1 to offset inc ebx
   1966          cmp ebx, bpp
   1967          mov [edi+ebx-1], al    // Write back Raw(x);
   1968                             // mov does not affect flags; -1 to offset inc ebx
   1969          jb davgrlp
   1970          // get # of bytes to alignment
   1971          mov diff, edi         // take start of row
   1972          add diff, ebx         // add bpp
   1973          add diff, 0xf         // add 7 + 8 to incr past alignment boundary
   1974          and diff, 0xfffffff8  // mask to alignment boundary
   1975          sub diff, edi         // subtract from start ==> value ebx at alignment
   1976          jz davggo
   1977          // fix alignment
   1978          // Compute the Raw value for the bytes upto the alignment boundary
   1979          //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
   1980          xor ecx, ecx
   1981 davglp1:
   1982          xor eax, eax
   1983          mov cl, [esi + ebx]        // load cl with Prior(x)
   1984          mov al, [edx + ebx]  // load al with Raw(x-bpp)
   1985          add ax, cx
   1986          inc ebx
   1987          shr ax, 1            // divide by 2
   1988          add al, [edi+ebx-1]  // Add Avg(x); -1 to offset inc ebx
   1989          cmp ebx, diff              // Check if at alignment boundary
   1990          mov [edi+ebx-1], al        // Write back Raw(x);
   1991                             // mov does not affect flags; -1 to offset inc ebx
   1992          jb davglp1               // Repeat until at alignment boundary
   1993 davggo:
   1994          mov eax, FullLength
   1995          mov ecx, eax
   1996          sub eax, ebx          // subtract alignment fix
   1997          and eax, 0x00000007   // calc bytes over mult of 8
   1998          sub ecx, eax          // drop over bytes from original length
   1999          mov MMXLength, ecx
   2000    } // end _asm block
   2001    // Now do the math for the rest of the row
   2002    switch ( bpp )
   2003    {
   2004       case 3:
   2005       {
   2006          ActiveMask.use  = 0x0000000000ffffff;
   2007          ShiftBpp.use = 24;    // == 3 * 8
   2008          ShiftRem.use = 40;    // == 64 - 24
   2009          _asm {
   2010             // Re-init address pointers and offset
   2011             movq mm7, ActiveMask
   2012             mov ebx, diff      // ebx ==> x = offset to alignment boundary
   2013             movq mm5, LBCarryMask
   2014             mov edi, row       // edi ==> Avg(x)
   2015             movq mm4, HBClearMask
   2016             mov esi, prev_row        // esi ==> Prior(x)
   2017             // PRIME the pump (load the first Raw(x-bpp) data set
   2018             movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
   2019                                // (we correct position in loop below)
   2020 davg3lp:
   2021             movq mm0, [edi + ebx]      // Load mm0 with Avg(x)
   2022             // Add (Prev_row/2) to Average
   2023             movq mm3, mm5
   2024             psrlq mm2, ShiftRem      // Correct position Raw(x-bpp) data
   2025             movq mm1, [esi + ebx]    // Load mm1 with Prior(x)
   2026             movq mm6, mm7
   2027             pand mm3, mm1      // get lsb for each prev_row byte
   2028             psrlq mm1, 1       // divide prev_row bytes by 2
   2029             pand  mm1, mm4     // clear invalid bit 7 of each byte
   2030             paddb mm0, mm1     // add (Prev_row/2) to Avg for each byte
   2031             // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
   2032             movq mm1, mm3      // now use mm1 for getting LBCarrys
   2033             pand mm1, mm2      // get LBCarrys for each byte where both
   2034                                // lsb's were == 1 (Only valid for active group)
   2035             psrlq mm2, 1       // divide raw bytes by 2
   2036             pand  mm2, mm4     // clear invalid bit 7 of each byte
   2037             paddb mm2, mm1     // add LBCarrys to (Raw(x-bpp)/2) for each byte
   2038             pand mm2, mm6      // Leave only Active Group 1 bytes to add to Avg
   2039             paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active
   2040                                //  byte
   2041             // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
   2042             psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 3-5
   2043             movq mm2, mm0        // mov updated Raws to mm2
   2044             psllq mm2, ShiftBpp  // shift data to position correctly
   2045             movq mm1, mm3        // now use mm1 for getting LBCarrys
   2046             pand mm1, mm2      // get LBCarrys for each byte where both
   2047                                // lsb's were == 1 (Only valid for active group)
   2048             psrlq mm2, 1       // divide raw bytes by 2
   2049             pand  mm2, mm4     // clear invalid bit 7 of each byte
   2050             paddb mm2, mm1     // add LBCarrys to (Raw(x-bpp)/2) for each byte
   2051             pand mm2, mm6      // Leave only Active Group 2 bytes to add to Avg
   2052             paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active
   2053                                //  byte
   2054 
   2055             // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
   2056             psllq mm6, ShiftBpp  // shift the mm6 mask to cover the last two
   2057                                  // bytes
   2058             movq mm2, mm0        // mov updated Raws to mm2
   2059             psllq mm2, ShiftBpp  // shift data to position correctly
   2060                               // Data only needs to be shifted once here to
   2061                               // get the correct x-bpp offset.
   2062             movq mm1, mm3     // now use mm1 for getting LBCarrys
   2063             pand mm1, mm2     // get LBCarrys for each byte where both
   2064                               // lsb's were == 1 (Only valid for active group)
   2065             psrlq mm2, 1      // divide raw bytes by 2
   2066             pand  mm2, mm4    // clear invalid bit 7 of each byte
   2067             paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
   2068             pand mm2, mm6     // Leave only Active Group 2 bytes to add to Avg
   2069             add ebx, 8
   2070             paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
   2071                               // byte
   2072 
   2073             // Now ready to write back to memory
   2074             movq [edi + ebx - 8], mm0
   2075             // Move updated Raw(x) to use as Raw(x-bpp) for next loop
   2076             cmp ebx, MMXLength
   2077             movq mm2, mm0     // mov updated Raw(x) to mm2
   2078             jb davg3lp
   2079          } // end _asm block
   2080       }
   2081       break;
   2082 
   2083       case 6:
   2084       case 4:
   2085       case 7:
   2086       case 5:
   2087       {
   2088          ActiveMask.use  = 0xffffffffffffffff;  // use shift below to clear
   2089                                                 // appropriate inactive bytes
   2090          ShiftBpp.use = bpp << 3;
   2091          ShiftRem.use = 64 - ShiftBpp.use;
   2092          _asm {
   2093             movq mm4, HBClearMask
   2094             // Re-init address pointers and offset
   2095             mov ebx, diff       // ebx ==> x = offset to alignment boundary
   2096             // Load ActiveMask and clear all bytes except for 1st active group
   2097             movq mm7, ActiveMask
   2098             mov edi, row         // edi ==> Avg(x)
   2099             psrlq mm7, ShiftRem
   2100             mov esi, prev_row    // esi ==> Prior(x)
   2101             movq mm6, mm7
   2102             movq mm5, LBCarryMask
   2103             psllq mm6, ShiftBpp  // Create mask for 2nd active group
   2104             // PRIME the pump (load the first Raw(x-bpp) data set
   2105             movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
   2106                                  // (we correct position in loop below)
   2107 davg4lp:
   2108             movq mm0, [edi + ebx]
   2109             psrlq mm2, ShiftRem  // shift data to position correctly
   2110             movq mm1, [esi + ebx]
   2111             // Add (Prev_row/2) to Average
   2112             movq mm3, mm5
   2113             pand mm3, mm1     // get lsb for each prev_row byte
   2114             psrlq mm1, 1      // divide prev_row bytes by 2
   2115             pand  mm1, mm4    // clear invalid bit 7 of each byte
   2116             paddb mm0, mm1    // add (Prev_row/2) to Avg for each byte
   2117             // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
   2118             movq mm1, mm3     // now use mm1 for getting LBCarrys
   2119             pand mm1, mm2     // get LBCarrys for each byte where both
   2120                               // lsb's were == 1 (Only valid for active group)
   2121             psrlq mm2, 1      // divide raw bytes by 2
   2122             pand  mm2, mm4    // clear invalid bit 7 of each byte
   2123             paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
   2124             pand mm2, mm7     // Leave only Active Group 1 bytes to add to Avg
   2125             paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
   2126                               // byte
   2127             // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
   2128             movq mm2, mm0     // mov updated Raws to mm2
   2129             psllq mm2, ShiftBpp // shift data to position correctly
   2130             add ebx, 8
   2131             movq mm1, mm3     // now use mm1 for getting LBCarrys
   2132             pand mm1, mm2     // get LBCarrys for each byte where both
   2133                               // lsb's were == 1 (Only valid for active group)
   2134             psrlq mm2, 1      // divide raw bytes by 2
   2135             pand  mm2, mm4    // clear invalid bit 7 of each byte
   2136             paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
   2137             pand mm2, mm6     // Leave only Active Group 2 bytes to add to Avg
   2138             paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
   2139                               // byte
   2140             cmp ebx, MMXLength
   2141             // Now ready to write back to memory
   2142             movq [edi + ebx - 8], mm0
   2143             // Prep Raw(x-bpp) for next loop
   2144             movq mm2, mm0     // mov updated Raws to mm2
   2145             jb davg4lp
   2146          } // end _asm block
   2147       }
   2148       break;
   2149       case 2:
   2150       {
   2151          ActiveMask.use  = 0x000000000000ffff;
   2152          ShiftBpp.use = 16;   // == 2 * 8     [BUGFIX]
   2153          ShiftRem.use = 48;   // == 64 - 16   [BUGFIX]
   2154          _asm {
   2155             // Load ActiveMask
   2156             movq mm7, ActiveMask
   2157             // Re-init address pointers and offset
   2158             mov ebx, diff     // ebx ==> x = offset to alignment boundary
   2159             movq mm5, LBCarryMask
   2160             mov edi, row      // edi ==> Avg(x)
   2161             movq mm4, HBClearMask
   2162             mov esi, prev_row  // esi ==> Prior(x)
   2163             // PRIME the pump (load the first Raw(x-bpp) data set
   2164             movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
   2165                               // (we correct position in loop below)
   2166 davg2lp:
   2167             movq mm0, [edi + ebx]
   2168             psrlq mm2, ShiftRem  // shift data to position correctly   [BUGFIX]
   2169             movq mm1, [esi + ebx]
   2170             // Add (Prev_row/2) to Average
   2171             movq mm3, mm5
   2172             pand mm3, mm1     // get lsb for each prev_row byte
   2173             psrlq mm1, 1      // divide prev_row bytes by 2
   2174             pand  mm1, mm4    // clear invalid bit 7 of each byte
   2175             movq mm6, mm7
   2176             paddb mm0, mm1    // add (Prev_row/2) to Avg for each byte
   2177             // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
   2178             movq mm1, mm3     // now use mm1 for getting LBCarrys
   2179             pand mm1, mm2     // get LBCarrys for each byte where both
   2180                               // lsb's were == 1 (Only valid for active group)
   2181             psrlq mm2, 1      // divide raw bytes by 2
   2182             pand  mm2, mm4    // clear invalid bit 7 of each byte
   2183             paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
   2184             pand mm2, mm6     // Leave only Active Group 1 bytes to add to Avg
   2185             paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
   2186             // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
   2187             psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
   2188             movq mm2, mm0       // mov updated Raws to mm2
   2189             psllq mm2, ShiftBpp // shift data to position correctly
   2190             movq mm1, mm3       // now use mm1 for getting LBCarrys
   2191             pand mm1, mm2       // get LBCarrys for each byte where both
   2192                                 // lsb's were == 1 (Only valid for active group)
   2193             psrlq mm2, 1        // divide raw bytes by 2
   2194             pand  mm2, mm4      // clear invalid bit 7 of each byte
   2195             paddb mm2, mm1      // add LBCarrys to (Raw(x-bpp)/2) for each byte
   2196             pand mm2, mm6       // Leave only Active Group 2 bytes to add to Avg
   2197             paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
   2198 
   2199             // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
   2200             psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
   2201             movq mm2, mm0       // mov updated Raws to mm2
   2202             psllq mm2, ShiftBpp // shift data to position correctly
   2203                                 // Data only needs to be shifted once here to
   2204                                 // get the correct x-bpp offset.
   2205             movq mm1, mm3       // now use mm1 for getting LBCarrys
   2206             pand mm1, mm2       // get LBCarrys for each byte where both
   2207                                 // lsb's were == 1 (Only valid for active group)
   2208             psrlq mm2, 1        // divide raw bytes by 2
   2209             pand  mm2, mm4      // clear invalid bit 7 of each byte
   2210             paddb mm2, mm1      // add LBCarrys to (Raw(x-bpp)/2) for each byte
   2211             pand mm2, mm6       // Leave only Active Group 2 bytes to add to Avg
   2212             paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
   2213 
   2214             // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
   2215             psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 6 & 7
   2216             movq mm2, mm0        // mov updated Raws to mm2
   2217             psllq mm2, ShiftBpp  // shift data to position correctly
   2218                                  // Data only needs to be shifted once here to
   2219                                  // get the correct x-bpp offset.
   2220             add ebx, 8
   2221             movq mm1, mm3    // now use mm1 for getting LBCarrys
   2222             pand mm1, mm2    // get LBCarrys for each byte where both
   2223                              // lsb's were == 1 (Only valid for active group)
   2224             psrlq mm2, 1     // divide raw bytes by 2
   2225             pand  mm2, mm4   // clear invalid bit 7 of each byte
   2226             paddb mm2, mm1   // add LBCarrys to (Raw(x-bpp)/2) for each byte
   2227             pand mm2, mm6    // Leave only Active Group 2 bytes to add to Avg
   2228             paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
   2229 
   2230             cmp ebx, MMXLength
   2231             // Now ready to write back to memory
   2232             movq [edi + ebx - 8], mm0
   2233             // Prep Raw(x-bpp) for next loop
   2234             movq mm2, mm0    // mov updated Raws to mm2
   2235             jb davg2lp
   2236         } // end _asm block
   2237       }
   2238       break;
   2239 
   2240       case 1:                 // bpp == 1
   2241       {
   2242          _asm {
   2243             // Re-init address pointers and offset
   2244             mov ebx, diff     // ebx ==> x = offset to alignment boundary
   2245             mov edi, row      // edi ==> Avg(x)
   2246             cmp ebx, FullLength  // Test if offset at end of array
   2247             jnb davg1end
   2248             // Do Avg decode for remaining bytes
   2249             mov esi, prev_row    // esi ==> Prior(x)
   2250             mov edx, edi
   2251             xor ecx, ecx         // zero ecx before using cl & cx in loop below
   2252             sub edx, bpp         // edx ==> Raw(x-bpp)
   2253 davg1lp:
   2254             // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
   2255             xor eax, eax
   2256             mov cl, [esi + ebx]  // load cl with Prior(x)
   2257             mov al, [edx + ebx]  // load al with Raw(x-bpp)
   2258             add ax, cx
   2259             inc ebx
   2260             shr ax, 1            // divide by 2
   2261             add al, [edi+ebx-1]  // Add Avg(x); -1 to offset inc ebx
   2262             cmp ebx, FullLength  // Check if at end of array
   2263             mov [edi+ebx-1], al  // Write back Raw(x);
   2264                          // mov does not affect flags; -1 to offset inc ebx
   2265             jb davg1lp
   2266 davg1end:
   2267          } // end _asm block
   2268       }
   2269       return;
   2270 
   2271       case 8:             // bpp == 8
   2272       {
   2273          _asm {
   2274             // Re-init address pointers and offset
   2275             mov ebx, diff           // ebx ==> x = offset to alignment boundary
   2276             movq mm5, LBCarryMask
   2277             mov edi, row            // edi ==> Avg(x)
   2278             movq mm4, HBClearMask
   2279             mov esi, prev_row       // esi ==> Prior(x)
   2280             // PRIME the pump (load the first Raw(x-bpp) data set
   2281             movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
   2282                                 // (NO NEED to correct position in loop below)
   2283 davg8lp:
   2284             movq mm0, [edi + ebx]
   2285             movq mm3, mm5
   2286             movq mm1, [esi + ebx]
   2287             add ebx, 8
   2288             pand mm3, mm1       // get lsb for each prev_row byte
   2289             psrlq mm1, 1        // divide prev_row bytes by 2
   2290             pand mm3, mm2       // get LBCarrys for each byte where both
   2291                                 // lsb's were == 1
   2292             psrlq mm2, 1        // divide raw bytes by 2
   2293             pand  mm1, mm4      // clear invalid bit 7 of each byte
   2294             paddb mm0, mm3      // add LBCarrys to Avg for each byte
   2295             pand  mm2, mm4      // clear invalid bit 7 of each byte
   2296             paddb mm0, mm1      // add (Prev_row/2) to Avg for each byte
   2297             paddb mm0, mm2      // add (Raw/2) to Avg for each byte
   2298             cmp ebx, MMXLength
   2299             movq [edi + ebx - 8], mm0
   2300             movq mm2, mm0       // reuse as Raw(x-bpp)
   2301             jb davg8lp
   2302         } // end _asm block
   2303       }
   2304       break;
   2305       default:                  // bpp greater than 8
   2306       {
   2307         _asm {
   2308             movq mm5, LBCarryMask
   2309             // Re-init address pointers and offset
   2310             mov ebx, diff       // ebx ==> x = offset to alignment boundary
   2311             mov edi, row        // edi ==> Avg(x)
   2312             movq mm4, HBClearMask
   2313             mov edx, edi
   2314             mov esi, prev_row   // esi ==> Prior(x)
   2315             sub edx, bpp        // edx ==> Raw(x-bpp)
   2316 davgAlp:
   2317             movq mm0, [edi + ebx]
   2318             movq mm3, mm5
   2319             movq mm1, [esi + ebx]
   2320             pand mm3, mm1       // get lsb for each prev_row byte
   2321             movq mm2, [edx + ebx]
   2322             psrlq mm1, 1        // divide prev_row bytes by 2
   2323             pand mm3, mm2       // get LBCarrys for each byte where both
   2324                                 // lsb's were == 1
   2325             psrlq mm2, 1        // divide raw bytes by 2
   2326             pand  mm1, mm4      // clear invalid bit 7 of each byte
   2327             paddb mm0, mm3      // add LBCarrys to Avg for each byte
   2328             pand  mm2, mm4      // clear invalid bit 7 of each byte
   2329             paddb mm0, mm1      // add (Prev_row/2) to Avg for each byte
   2330             add ebx, 8
   2331             paddb mm0, mm2      // add (Raw/2) to Avg for each byte
   2332             cmp ebx, MMXLength
   2333             movq [edi + ebx - 8], mm0
   2334             jb davgAlp
   2335         } // end _asm block
   2336       }
   2337       break;
   2338    }                         // end switch ( bpp )
   2339 
   2340    _asm {
   2341          // MMX acceleration complete now do clean-up
   2342          // Check if any remaining bytes left to decode
   2343          mov ebx, MMXLength    // ebx ==> x = offset bytes remaining after MMX
   2344          mov edi, row          // edi ==> Avg(x)
   2345          cmp ebx, FullLength   // Test if offset at end of array
   2346          jnb davgend
   2347          // Do Avg decode for remaining bytes
   2348          mov esi, prev_row     // esi ==> Prior(x)
   2349          mov edx, edi
   2350          xor ecx, ecx          // zero ecx before using cl & cx in loop below
   2351          sub edx, bpp          // edx ==> Raw(x-bpp)
   2352 davglp2:
   2353          // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
   2354          xor eax, eax
   2355          mov cl, [esi + ebx]   // load cl with Prior(x)
   2356          mov al, [edx + ebx]   // load al with Raw(x-bpp)
   2357          add ax, cx
   2358          inc ebx
   2359          shr ax, 1              // divide by 2
   2360          add al, [edi+ebx-1]    // Add Avg(x); -1 to offset inc ebx
   2361          cmp ebx, FullLength    // Check if at end of array
   2362          mov [edi+ebx-1], al    // Write back Raw(x);
   2363                           // mov does not affect flags; -1 to offset inc ebx
   2364          jb davglp2
   2365 davgend:
   2366          emms             // End MMX instructions; prep for possible FP instrs.
   2367    } // end _asm block
   2368 }
   2369 
   2370 // Optimized code for PNG Paeth filter decoder
   2371 void /* PRIVATE */
   2372 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
   2373                               png_bytep prev_row)
   2374 {
   2375   // These variables are declared
   2376   // here to ensure alignment on 8-byte boundaries.
   2377   union uAll  ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
   2378 
   2379    png_uint_32 FullLength;
   2380    png_uint_32 MMXLength;
   2381    //png_uint_32 len;
   2382    int bpp;
   2383    int diff;
   2384    //int ptemp;
   2385    int patemp, pbtemp, pctemp;
   2386 
   2387    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
   2388    FullLength  = row_info->rowbytes; // # of bytes to filter
   2389    _asm
   2390    {
   2391          xor ebx, ebx        // ebx ==> x offset
   2392          mov edi, row
   2393          xor edx, edx        // edx ==> x-bpp offset
   2394          mov esi, prev_row
   2395          xor eax, eax
   2396 
   2397          // Compute the Raw value for the first bpp bytes
   2398          // Note: the formula works out to be always
   2399          //   Paeth(x) = Raw(x) + Prior(x)      where x < bpp
   2400 dpthrlp:
   2401          mov al, [edi + ebx]
   2402          add al, [esi + ebx]
   2403          inc ebx
   2404          cmp ebx, bpp
   2405          mov [edi + ebx - 1], al
   2406          jb dpthrlp
   2407          // get # of bytes to alignment
   2408          mov diff, edi         // take start of row
   2409          add diff, ebx         // add bpp
   2410          xor ecx, ecx
   2411          add diff, 0xf         // add 7 + 8 to incr past alignment boundary
   2412          and diff, 0xfffffff8  // mask to alignment boundary
   2413          sub diff, edi         // subtract from start ==> value ebx at alignment
   2414          jz dpthgo
   2415          // fix alignment
   2416 dpthlp1:
   2417          xor eax, eax
   2418          // pav = p - a = (a + b - c) - a = b - c
   2419          mov al, [esi + ebx]   // load Prior(x) into al
   2420          mov cl, [esi + edx]   // load Prior(x-bpp) into cl
   2421          sub eax, ecx          // subtract Prior(x-bpp)
   2422          mov patemp, eax       // Save pav for later use
   2423          xor eax, eax
   2424          // pbv = p - b = (a + b - c) - b = a - c
   2425          mov al, [edi + edx]   // load Raw(x-bpp) into al
   2426          sub eax, ecx          // subtract Prior(x-bpp)
   2427          mov ecx, eax
   2428          // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   2429          add eax, patemp       // pcv = pav + pbv
   2430          // pc = abs(pcv)
   2431          test eax, 0x80000000
   2432          jz dpthpca
   2433          neg eax               // reverse sign of neg values
   2434 dpthpca:
   2435          mov pctemp, eax       // save pc for later use
   2436          // pb = abs(pbv)
   2437          test ecx, 0x80000000
   2438          jz dpthpba
   2439          neg ecx               // reverse sign of neg values
   2440 dpthpba:
   2441          mov pbtemp, ecx       // save pb for later use
   2442          // pa = abs(pav)
   2443          mov eax, patemp
   2444          test eax, 0x80000000
   2445          jz dpthpaa
   2446          neg eax               // reverse sign of neg values
   2447 dpthpaa:
   2448          mov patemp, eax       // save pa for later use
   2449          // test if pa <= pb
   2450          cmp eax, ecx
   2451          jna dpthabb
   2452          // pa > pb; now test if pb <= pc
   2453          cmp ecx, pctemp
   2454          jna dpthbbc
   2455          // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
   2456          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
   2457          jmp dpthpaeth
   2458 dpthbbc:
   2459          // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
   2460          mov cl, [esi + ebx]   // load Prior(x) into cl
   2461          jmp dpthpaeth
   2462 dpthabb:
   2463          // pa <= pb; now test if pa <= pc
   2464          cmp eax, pctemp
   2465          jna dpthabc
   2466          // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
   2467          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
   2468          jmp dpthpaeth
   2469 dpthabc:
   2470          // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
   2471          mov cl, [edi + edx]  // load Raw(x-bpp) into cl
   2472 dpthpaeth:
   2473          inc ebx
   2474          inc edx
   2475          // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
   2476          add [edi + ebx - 1], cl
   2477          cmp ebx, diff
   2478          jb dpthlp1
   2479 dpthgo:
   2480          mov ecx, FullLength
   2481          mov eax, ecx
   2482          sub eax, ebx          // subtract alignment fix
   2483          and eax, 0x00000007   // calc bytes over mult of 8
   2484          sub ecx, eax          // drop over bytes from original length
   2485          mov MMXLength, ecx
   2486    } // end _asm block
   2487    // Now do the math for the rest of the row
   2488    switch ( bpp )
   2489    {
   2490       case 3:
   2491       {
   2492          ActiveMask.use = 0x0000000000ffffff;
   2493          ActiveMaskEnd.use = 0xffff000000000000;
   2494          ShiftBpp.use = 24;    // == bpp(3) * 8
   2495          ShiftRem.use = 40;    // == 64 - 24
   2496          _asm
   2497          {
   2498             mov ebx, diff
   2499             mov edi, row
   2500             mov esi, prev_row
   2501             pxor mm0, mm0
   2502             // PRIME the pump (load the first Raw(x-bpp) data set
   2503             movq mm1, [edi+ebx-8]
   2504 dpth3lp:
   2505             psrlq mm1, ShiftRem     // shift last 3 bytes to 1st 3 bytes
   2506             movq mm2, [esi + ebx]   // load b=Prior(x)
   2507             punpcklbw mm1, mm0      // Unpack High bytes of a
   2508             movq mm3, [esi+ebx-8]   // Prep c=Prior(x-bpp) bytes
   2509             punpcklbw mm2, mm0      // Unpack High bytes of b
   2510             psrlq mm3, ShiftRem     // shift last 3 bytes to 1st 3 bytes
   2511             // pav = p - a = (a + b - c) - a = b - c
   2512             movq mm4, mm2
   2513             punpcklbw mm3, mm0      // Unpack High bytes of c
   2514             // pbv = p - b = (a + b - c) - b = a - c
   2515             movq mm5, mm1
   2516             psubw mm4, mm3
   2517             pxor mm7, mm7
   2518             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   2519             movq mm6, mm4
   2520             psubw mm5, mm3
   2521 
   2522             // pa = abs(p-a) = abs(pav)
   2523             // pb = abs(p-b) = abs(pbv)
   2524             // pc = abs(p-c) = abs(pcv)
   2525             pcmpgtw mm0, mm4    // Create mask pav bytes < 0
   2526             paddw mm6, mm5
   2527             pand mm0, mm4       // Only pav bytes < 0 in mm7
   2528             pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
   2529             psubw mm4, mm0
   2530             pand mm7, mm5       // Only pbv bytes < 0 in mm0
   2531             psubw mm4, mm0
   2532             psubw mm5, mm7
   2533             pxor mm0, mm0
   2534             pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
   2535             pand mm0, mm6       // Only pav bytes < 0 in mm7
   2536             psubw mm5, mm7
   2537             psubw mm6, mm0
   2538             //  test pa <= pb
   2539             movq mm7, mm4
   2540             psubw mm6, mm0
   2541             pcmpgtw mm7, mm5    // pa > pb?
   2542             movq mm0, mm7
   2543             // use mm7 mask to merge pa & pb
   2544             pand mm5, mm7
   2545             // use mm0 mask copy to merge a & b
   2546             pand mm2, mm0
   2547             pandn mm7, mm4
   2548             pandn mm0, mm1
   2549             paddw mm7, mm5
   2550             paddw mm0, mm2
   2551             //  test  ((pa <= pb)? pa:pb) <= pc
   2552             pcmpgtw mm7, mm6       // pab > pc?
   2553             pxor mm1, mm1
   2554             pand mm3, mm7
   2555             pandn mm7, mm0
   2556             paddw mm7, mm3
   2557             pxor mm0, mm0
   2558             packuswb mm7, mm1
   2559             movq mm3, [esi + ebx]   // load c=Prior(x-bpp)
   2560             pand mm7, ActiveMask
   2561             movq mm2, mm3           // load b=Prior(x) step 1
   2562             paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
   2563             punpcklbw mm3, mm0      // Unpack High bytes of c
   2564             movq [edi + ebx], mm7   // write back updated value
   2565             movq mm1, mm7           // Now mm1 will be used as Raw(x-bpp)
   2566             // Now do Paeth for 2nd set of bytes (3-5)
   2567             psrlq mm2, ShiftBpp     // load b=Prior(x) step 2
   2568             punpcklbw mm1, mm0      // Unpack High bytes of a
   2569             pxor mm7, mm7
   2570             punpcklbw mm2, mm0      // Unpack High bytes of b
   2571             // pbv = p - b = (a + b - c) - b = a - c
   2572             movq mm5, mm1
   2573             // pav = p - a = (a + b - c) - a = b - c
   2574             movq mm4, mm2
   2575             psubw mm5, mm3
   2576             psubw mm4, mm3
   2577             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
   2578             //       pav + pbv = pbv + pav
   2579             movq mm6, mm5
   2580             paddw mm6, mm4
   2581 
   2582             // pa = abs(p-a) = abs(pav)
   2583             // pb = abs(p-b) = abs(pbv)
   2584             // pc = abs(p-c) = abs(pcv)
   2585             pcmpgtw mm0, mm5       // Create mask pbv bytes < 0
   2586             pcmpgtw mm7, mm4       // Create mask pav bytes < 0
   2587             pand mm0, mm5          // Only pbv bytes < 0 in mm0
   2588             pand mm7, mm4          // Only pav bytes < 0 in mm7
   2589             psubw mm5, mm0
   2590             psubw mm4, mm7
   2591             psubw mm5, mm0
   2592             psubw mm4, mm7
   2593             pxor mm0, mm0
   2594             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
   2595             pand mm0, mm6          // Only pav bytes < 0 in mm7
   2596             psubw mm6, mm0
   2597             //  test pa <= pb
   2598             movq mm7, mm4
   2599             psubw mm6, mm0
   2600             pcmpgtw mm7, mm5       // pa > pb?
   2601             movq mm0, mm7
   2602             // use mm7 mask to merge pa & pb
   2603             pand mm5, mm7
   2604             // use mm0 mask copy to merge a & b
   2605             pand mm2, mm0
   2606             pandn mm7, mm4
   2607             pandn mm0, mm1
   2608             paddw mm7, mm5
   2609             paddw mm0, mm2
   2610             //  test  ((pa <= pb)? pa:pb) <= pc
   2611             pcmpgtw mm7, mm6       // pab > pc?
   2612             movq mm2, [esi + ebx]  // load b=Prior(x)
   2613             pand mm3, mm7
   2614             pandn mm7, mm0
   2615             pxor mm1, mm1
   2616             paddw mm7, mm3
   2617             pxor mm0, mm0
   2618             packuswb mm7, mm1
   2619             movq mm3, mm2           // load c=Prior(x-bpp) step 1
   2620             pand mm7, ActiveMask
   2621             punpckhbw mm2, mm0      // Unpack High bytes of b
   2622             psllq mm7, ShiftBpp     // Shift bytes to 2nd group of 3 bytes
   2623              // pav = p - a = (a + b - c) - a = b - c
   2624             movq mm4, mm2
   2625             paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
   2626             psllq mm3, ShiftBpp     // load c=Prior(x-bpp) step 2
   2627             movq [edi + ebx], mm7   // write back updated value
   2628             movq mm1, mm7
   2629             punpckhbw mm3, mm0      // Unpack High bytes of c
   2630             psllq mm1, ShiftBpp     // Shift bytes
   2631                                     // Now mm1 will be used as Raw(x-bpp)
   2632             // Now do Paeth for 3rd, and final, set of bytes (6-7)
   2633             pxor mm7, mm7
   2634             punpckhbw mm1, mm0      // Unpack High bytes of a
   2635             psubw mm4, mm3
   2636             // pbv = p - b = (a + b - c) - b = a - c
   2637             movq mm5, mm1
   2638             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   2639             movq mm6, mm4
   2640             psubw mm5, mm3
   2641             pxor mm0, mm0
   2642             paddw mm6, mm5
   2643 
   2644             // pa = abs(p-a) = abs(pav)
   2645             // pb = abs(p-b) = abs(pbv)
   2646             // pc = abs(p-c) = abs(pcv)
   2647             pcmpgtw mm0, mm4    // Create mask pav bytes < 0
   2648             pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
   2649             pand mm0, mm4       // Only pav bytes < 0 in mm7
   2650             pand mm7, mm5       // Only pbv bytes < 0 in mm0
   2651             psubw mm4, mm0
   2652             psubw mm5, mm7
   2653             psubw mm4, mm0
   2654             psubw mm5, mm7
   2655             pxor mm0, mm0
   2656             pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
   2657             pand mm0, mm6       // Only pav bytes < 0 in mm7
   2658             psubw mm6, mm0
   2659             //  test pa <= pb
   2660             movq mm7, mm4
   2661             psubw mm6, mm0
   2662             pcmpgtw mm7, mm5    // pa > pb?
   2663             movq mm0, mm7
   2664             // use mm0 mask copy to merge a & b
   2665             pand mm2, mm0
   2666             // use mm7 mask to merge pa & pb
   2667             pand mm5, mm7
   2668             pandn mm0, mm1
   2669             pandn mm7, mm4
   2670             paddw mm0, mm2
   2671             paddw mm7, mm5
   2672             //  test  ((pa <= pb)? pa:pb) <= pc
   2673             pcmpgtw mm7, mm6    // pab > pc?
   2674             pand mm3, mm7
   2675             pandn mm7, mm0
   2676             paddw mm7, mm3
   2677             pxor mm1, mm1
   2678             packuswb mm1, mm7
   2679             // Step ebx to next set of 8 bytes and repeat loop til done
   2680             add ebx, 8
   2681             pand mm1, ActiveMaskEnd
   2682             paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
   2683 
   2684             cmp ebx, MMXLength
   2685             pxor mm0, mm0              // pxor does not affect flags
   2686             movq [edi + ebx - 8], mm1  // write back updated value
   2687                                  // mm1 will be used as Raw(x-bpp) next loop
   2688                            // mm3 ready to be used as Prior(x-bpp) next loop
   2689             jb dpth3lp
   2690          } // end _asm block
   2691       }
   2692       break;
   2693 
   2694       case 6:
   2695       case 7:
   2696       case 5:
   2697       {
   2698          ActiveMask.use  = 0x00000000ffffffff;
   2699          ActiveMask2.use = 0xffffffff00000000;
   2700          ShiftBpp.use = bpp << 3;    // == bpp * 8
   2701          ShiftRem.use = 64 - ShiftBpp.use;
   2702          _asm
   2703          {
   2704             mov ebx, diff
   2705             mov edi, row
   2706             mov esi, prev_row
   2707             // PRIME the pump (load the first Raw(x-bpp) data set
   2708             movq mm1, [edi+ebx-8]
   2709             pxor mm0, mm0
   2710 dpth6lp:
   2711             // Must shift to position Raw(x-bpp) data
   2712             psrlq mm1, ShiftRem
   2713             // Do first set of 4 bytes
   2714             movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
   2715             punpcklbw mm1, mm0      // Unpack Low bytes of a
   2716             movq mm2, [esi + ebx]   // load b=Prior(x)
   2717             punpcklbw mm2, mm0      // Unpack Low bytes of b
   2718             // Must shift to position Prior(x-bpp) data
   2719             psrlq mm3, ShiftRem
   2720             // pav = p - a = (a + b - c) - a = b - c
   2721             movq mm4, mm2
   2722             punpcklbw mm3, mm0      // Unpack Low bytes of c
   2723             // pbv = p - b = (a + b - c) - b = a - c
   2724             movq mm5, mm1
   2725             psubw mm4, mm3
   2726             pxor mm7, mm7
   2727             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   2728             movq mm6, mm4
   2729             psubw mm5, mm3
   2730             // pa = abs(p-a) = abs(pav)
   2731             // pb = abs(p-b) = abs(pbv)
   2732             // pc = abs(p-c) = abs(pcv)
   2733             pcmpgtw mm0, mm4    // Create mask pav bytes < 0
   2734             paddw mm6, mm5
   2735             pand mm0, mm4       // Only pav bytes < 0 in mm7
   2736             pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
   2737             psubw mm4, mm0
   2738             pand mm7, mm5       // Only pbv bytes < 0 in mm0
   2739             psubw mm4, mm0
   2740             psubw mm5, mm7
   2741             pxor mm0, mm0
   2742             pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
   2743             pand mm0, mm6       // Only pav bytes < 0 in mm7
   2744             psubw mm5, mm7
   2745             psubw mm6, mm0
   2746             //  test pa <= pb
   2747             movq mm7, mm4
   2748             psubw mm6, mm0
   2749             pcmpgtw mm7, mm5    // pa > pb?
   2750             movq mm0, mm7
   2751             // use mm7 mask to merge pa & pb
   2752             pand mm5, mm7
   2753             // use mm0 mask copy to merge a & b
   2754             pand mm2, mm0
   2755             pandn mm7, mm4
   2756             pandn mm0, mm1
   2757             paddw mm7, mm5
   2758             paddw mm0, mm2
   2759             //  test  ((pa <= pb)? pa:pb) <= pc
   2760             pcmpgtw mm7, mm6    // pab > pc?
   2761             pxor mm1, mm1
   2762             pand mm3, mm7
   2763             pandn mm7, mm0
   2764             paddw mm7, mm3
   2765             pxor mm0, mm0
   2766             packuswb mm7, mm1
   2767             movq mm3, [esi + ebx - 8]  // load c=Prior(x-bpp)
   2768             pand mm7, ActiveMask
   2769             psrlq mm3, ShiftRem
   2770             movq mm2, [esi + ebx]      // load b=Prior(x) step 1
   2771             paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
   2772             movq mm6, mm2
   2773             movq [edi + ebx], mm7      // write back updated value
   2774             movq mm1, [edi+ebx-8]
   2775             psllq mm6, ShiftBpp
   2776             movq mm5, mm7
   2777             psrlq mm1, ShiftRem
   2778             por mm3, mm6
   2779             psllq mm5, ShiftBpp
   2780             punpckhbw mm3, mm0         // Unpack High bytes of c
   2781             por mm1, mm5
   2782             // Do second set of 4 bytes
   2783             punpckhbw mm2, mm0         // Unpack High bytes of b
   2784             punpckhbw mm1, mm0         // Unpack High bytes of a
   2785             // pav = p - a = (a + b - c) - a = b - c
   2786             movq mm4, mm2
   2787             // pbv = p - b = (a + b - c) - b = a - c
   2788             movq mm5, mm1
   2789             psubw mm4, mm3
   2790             pxor mm7, mm7
   2791             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   2792             movq mm6, mm4
   2793             psubw mm5, mm3
   2794             // pa = abs(p-a) = abs(pav)
   2795             // pb = abs(p-b) = abs(pbv)
   2796             // pc = abs(p-c) = abs(pcv)
   2797             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
   2798             paddw mm6, mm5
   2799             pand mm0, mm4          // Only pav bytes < 0 in mm7
   2800             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
   2801             psubw mm4, mm0
   2802             pand mm7, mm5          // Only pbv bytes < 0 in mm0
   2803             psubw mm4, mm0
   2804             psubw mm5, mm7
   2805             pxor mm0, mm0
   2806             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
   2807             pand mm0, mm6          // Only pav bytes < 0 in mm7
   2808             psubw mm5, mm7
   2809             psubw mm6, mm0
   2810             //  test pa <= pb
   2811             movq mm7, mm4
   2812             psubw mm6, mm0
   2813             pcmpgtw mm7, mm5       // pa > pb?
   2814             movq mm0, mm7
   2815             // use mm7 mask to merge pa & pb
   2816             pand mm5, mm7
   2817             // use mm0 mask copy to merge a & b
   2818             pand mm2, mm0
   2819             pandn mm7, mm4
   2820             pandn mm0, mm1
   2821             paddw mm7, mm5
   2822             paddw mm0, mm2
   2823             //  test  ((pa <= pb)? pa:pb) <= pc
   2824             pcmpgtw mm7, mm6           // pab > pc?
   2825             pxor mm1, mm1
   2826             pand mm3, mm7
   2827             pandn mm7, mm0
   2828             pxor mm1, mm1
   2829             paddw mm7, mm3
   2830             pxor mm0, mm0
   2831             // Step ebx to next set of 8 bytes and repeat loop until done
   2832             add ebx, 8
   2833             packuswb mm1, mm7
   2834             paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
   2835             cmp ebx, MMXLength
   2836             movq [edi + ebx - 8], mm1      // write back updated value
   2837                                 // mm1 will be used as Raw(x-bpp) next loop
   2838             jb dpth6lp
   2839          } // end _asm block
   2840       }
   2841       break;
   2842 
   2843       case 4:
   2844       {
   2845          ActiveMask.use  = 0x00000000ffffffff;
   2846          _asm {
   2847             mov ebx, diff
   2848             mov edi, row
   2849             mov esi, prev_row
   2850             pxor mm0, mm0
   2851             // PRIME the pump (load the first Raw(x-bpp) data set
   2852             movq mm1, [edi+ebx-8]    // Only time should need to read
   2853                                      //  a=Raw(x-bpp) bytes
   2854 dpth4lp:
   2855             // Do first set of 4 bytes
   2856             movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
   2857             punpckhbw mm1, mm0       // Unpack High bytes of a
   2858             movq mm2, [esi + ebx]    // load b=Prior(x)
   2859             punpcklbw mm2, mm0       // Unpack Low bytes of b
   2860             // pav = p - a = (a + b - c) - a = b - c
   2861             movq mm4, mm2
   2862             punpckhbw mm3, mm0       // Unpack High bytes of c
   2863             // pbv = p - b = (a + b - c) - b = a - c
   2864             movq mm5, mm1
   2865             psubw mm4, mm3
   2866             pxor mm7, mm7
   2867             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   2868             movq mm6, mm4
   2869             psubw mm5, mm3
   2870             // pa = abs(p-a) = abs(pav)
   2871             // pb = abs(p-b) = abs(pbv)
   2872             // pc = abs(p-c) = abs(pcv)
   2873             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
   2874             paddw mm6, mm5
   2875             pand mm0, mm4          // Only pav bytes < 0 in mm7
   2876             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
   2877             psubw mm4, mm0
   2878             pand mm7, mm5          // Only pbv bytes < 0 in mm0
   2879             psubw mm4, mm0
   2880             psubw mm5, mm7
   2881             pxor mm0, mm0
   2882             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
   2883             pand mm0, mm6          // Only pav bytes < 0 in mm7
   2884             psubw mm5, mm7
   2885             psubw mm6, mm0
   2886             //  test pa <= pb
   2887             movq mm7, mm4
   2888             psubw mm6, mm0
   2889             pcmpgtw mm7, mm5       // pa > pb?
   2890             movq mm0, mm7
   2891             // use mm7 mask to merge pa & pb
   2892             pand mm5, mm7
   2893             // use mm0 mask copy to merge a & b
   2894             pand mm2, mm0
   2895             pandn mm7, mm4
   2896             pandn mm0, mm1
   2897             paddw mm7, mm5
   2898             paddw mm0, mm2
   2899             //  test  ((pa <= pb)? pa:pb) <= pc
   2900             pcmpgtw mm7, mm6       // pab > pc?
   2901             pxor mm1, mm1
   2902             pand mm3, mm7
   2903             pandn mm7, mm0
   2904             paddw mm7, mm3
   2905             pxor mm0, mm0
   2906             packuswb mm7, mm1
   2907             movq mm3, [esi + ebx]      // load c=Prior(x-bpp)
   2908             pand mm7, ActiveMask
   2909             movq mm2, mm3              // load b=Prior(x) step 1
   2910             paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
   2911             punpcklbw mm3, mm0         // Unpack Low bytes of c
   2912             movq [edi + ebx], mm7      // write back updated value
   2913             movq mm1, mm7              // Now mm1 will be used as Raw(x-bpp)
   2914             // Do second set of 4 bytes
   2915             punpckhbw mm2, mm0         // Unpack High bytes of b
   2916             punpcklbw mm1, mm0         // Unpack Low bytes of a
   2917             // pav = p - a = (a + b - c) - a = b - c
   2918             movq mm4, mm2
   2919             // pbv = p - b = (a + b - c) - b = a - c
   2920             movq mm5, mm1
   2921             psubw mm4, mm3
   2922             pxor mm7, mm7
   2923             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   2924             movq mm6, mm4
   2925             psubw mm5, mm3
   2926             // pa = abs(p-a) = abs(pav)
   2927             // pb = abs(p-b) = abs(pbv)
   2928             // pc = abs(p-c) = abs(pcv)
   2929             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
   2930             paddw mm6, mm5
   2931             pand mm0, mm4          // Only pav bytes < 0 in mm7
   2932             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
   2933             psubw mm4, mm0
   2934             pand mm7, mm5          // Only pbv bytes < 0 in mm0
   2935             psubw mm4, mm0
   2936             psubw mm5, mm7
   2937             pxor mm0, mm0
   2938             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
   2939             pand mm0, mm6          // Only pav bytes < 0 in mm7
   2940             psubw mm5, mm7
   2941             psubw mm6, mm0
   2942             //  test pa <= pb
   2943             movq mm7, mm4
   2944             psubw mm6, mm0
   2945             pcmpgtw mm7, mm5       // pa > pb?
   2946             movq mm0, mm7
   2947             // use mm7 mask to merge pa & pb
   2948             pand mm5, mm7
   2949             // use mm0 mask copy to merge a & b
   2950             pand mm2, mm0
   2951             pandn mm7, mm4
   2952             pandn mm0, mm1
   2953             paddw mm7, mm5
   2954             paddw mm0, mm2
   2955             //  test  ((pa <= pb)? pa:pb) <= pc
   2956             pcmpgtw mm7, mm6       // pab > pc?
   2957             pxor mm1, mm1
   2958             pand mm3, mm7
   2959             pandn mm7, mm0
   2960             pxor mm1, mm1
   2961             paddw mm7, mm3
   2962             pxor mm0, mm0
   2963             // Step ebx to next set of 8 bytes and repeat loop until done
   2964             add ebx, 8
   2965             packuswb mm1, mm7
   2966             paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
   2967             cmp ebx, MMXLength
   2968             movq [edi + ebx - 8], mm1      // write back updated value
   2969                                 // mm1 will be used as Raw(x-bpp) next loop
   2970             jb dpth4lp
   2971          } // end _asm block
   2972       }
   2973       break;
   2974       case 8:                          // bpp == 8
   2975       {
   2976          ActiveMask.use  = 0x00000000ffffffff;
   2977          _asm {
   2978             mov ebx, diff
   2979             mov edi, row
   2980             mov esi, prev_row
   2981             pxor mm0, mm0
   2982             // PRIME the pump (load the first Raw(x-bpp) data set
   2983             movq mm1, [edi+ebx-8]      // Only time should need to read
   2984                                        //  a=Raw(x-bpp) bytes
   2985 dpth8lp:
   2986             // Do first set of 4 bytes
   2987             movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
   2988             punpcklbw mm1, mm0         // Unpack Low bytes of a
   2989             movq mm2, [esi + ebx]      // load b=Prior(x)
   2990             punpcklbw mm2, mm0         // Unpack Low bytes of b
   2991             // pav = p - a = (a + b - c) - a = b - c
   2992             movq mm4, mm2
   2993             punpcklbw mm3, mm0         // Unpack Low bytes of c
   2994             // pbv = p - b = (a + b - c) - b = a - c
   2995             movq mm5, mm1
   2996             psubw mm4, mm3
   2997             pxor mm7, mm7
   2998             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   2999             movq mm6, mm4
   3000             psubw mm5, mm3
   3001             // pa = abs(p-a) = abs(pav)
   3002             // pb = abs(p-b) = abs(pbv)
   3003             // pc = abs(p-c) = abs(pcv)
   3004             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
   3005             paddw mm6, mm5
   3006             pand mm0, mm4          // Only pav bytes < 0 in mm7
   3007             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
   3008             psubw mm4, mm0
   3009             pand mm7, mm5          // Only pbv bytes < 0 in mm0
   3010             psubw mm4, mm0
   3011             psubw mm5, mm7
   3012             pxor mm0, mm0
   3013             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
   3014             pand mm0, mm6          // Only pav bytes < 0 in mm7
   3015             psubw mm5, mm7
   3016             psubw mm6, mm0
   3017             //  test pa <= pb
   3018             movq mm7, mm4
   3019             psubw mm6, mm0
   3020             pcmpgtw mm7, mm5       // pa > pb?
   3021             movq mm0, mm7
   3022             // use mm7 mask to merge pa & pb
   3023             pand mm5, mm7
   3024             // use mm0 mask copy to merge a & b
   3025             pand mm2, mm0
   3026             pandn mm7, mm4
   3027             pandn mm0, mm1
   3028             paddw mm7, mm5
   3029             paddw mm0, mm2
   3030             //  test  ((pa <= pb)? pa:pb) <= pc
   3031             pcmpgtw mm7, mm6       // pab > pc?
   3032             pxor mm1, mm1
   3033             pand mm3, mm7
   3034             pandn mm7, mm0
   3035             paddw mm7, mm3
   3036             pxor mm0, mm0
   3037             packuswb mm7, mm1
   3038             movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
   3039             pand mm7, ActiveMask
   3040             movq mm2, [esi + ebx]    // load b=Prior(x)
   3041             paddb mm7, [edi + ebx]   // add Paeth predictor with Raw(x)
   3042             punpckhbw mm3, mm0       // Unpack High bytes of c
   3043             movq [edi + ebx], mm7    // write back updated value
   3044             movq mm1, [edi+ebx-8]    // read a=Raw(x-bpp) bytes
   3045 
   3046             // Do second set of 4 bytes
   3047             punpckhbw mm2, mm0       // Unpack High bytes of b
   3048             punpckhbw mm1, mm0       // Unpack High bytes of a
   3049             // pav = p - a = (a + b - c) - a = b - c
   3050             movq mm4, mm2
   3051             // pbv = p - b = (a + b - c) - b = a - c
   3052             movq mm5, mm1
   3053             psubw mm4, mm3
   3054             pxor mm7, mm7
   3055             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   3056             movq mm6, mm4
   3057             psubw mm5, mm3
   3058             // pa = abs(p-a) = abs(pav)
   3059             // pb = abs(p-b) = abs(pbv)
   3060             // pc = abs(p-c) = abs(pcv)
   3061             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
   3062             paddw mm6, mm5
   3063             pand mm0, mm4          // Only pav bytes < 0 in mm7
   3064             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
   3065             psubw mm4, mm0
   3066             pand mm7, mm5          // Only pbv bytes < 0 in mm0
   3067             psubw mm4, mm0
   3068             psubw mm5, mm7
   3069             pxor mm0, mm0
   3070             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
   3071             pand mm0, mm6          // Only pav bytes < 0 in mm7
   3072             psubw mm5, mm7
   3073             psubw mm6, mm0
   3074             //  test pa <= pb
   3075             movq mm7, mm4
   3076             psubw mm6, mm0
   3077             pcmpgtw mm7, mm5       // pa > pb?
   3078             movq mm0, mm7
   3079             // use mm7 mask to merge pa & pb
   3080             pand mm5, mm7
   3081             // use mm0 mask copy to merge a & b
   3082             pand mm2, mm0
   3083             pandn mm7, mm4
   3084             pandn mm0, mm1
   3085             paddw mm7, mm5
   3086             paddw mm0, mm2
   3087             //  test  ((pa <= pb)? pa:pb) <= pc
   3088             pcmpgtw mm7, mm6       // pab > pc?
   3089             pxor mm1, mm1
   3090             pand mm3, mm7
   3091             pandn mm7, mm0
   3092             pxor mm1, mm1
   3093             paddw mm7, mm3
   3094             pxor mm0, mm0
   3095             // Step ebx to next set of 8 bytes and repeat loop until done
   3096             add ebx, 8
   3097             packuswb mm1, mm7
   3098             paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
   3099             cmp ebx, MMXLength
   3100             movq [edi + ebx - 8], mm1      // write back updated value
   3101                             // mm1 will be used as Raw(x-bpp) next loop
   3102             jb dpth8lp
   3103          } // end _asm block
   3104       }
   3105       break;
   3106 
   3107       case 1:                // bpp = 1
   3108       case 2:                // bpp = 2
   3109       default:               // bpp > 8
   3110       {
   3111          _asm {
   3112             mov ebx, diff
   3113             cmp ebx, FullLength
   3114             jnb dpthdend
   3115             mov edi, row
   3116             mov esi, prev_row
   3117             // Do Paeth decode for remaining bytes
   3118             mov edx, ebx
   3119             xor ecx, ecx        // zero ecx before using cl & cx in loop below
   3120             sub edx, bpp        // Set edx = ebx - bpp
   3121 dpthdlp:
   3122             xor eax, eax
   3123             // pav = p - a = (a + b - c) - a = b - c
   3124             mov al, [esi + ebx]        // load Prior(x) into al
   3125             mov cl, [esi + edx]        // load Prior(x-bpp) into cl
   3126             sub eax, ecx                 // subtract Prior(x-bpp)
   3127             mov patemp, eax                 // Save pav for later use
   3128             xor eax, eax
   3129             // pbv = p - b = (a + b - c) - b = a - c
   3130             mov al, [edi + edx]        // load Raw(x-bpp) into al
   3131             sub eax, ecx                 // subtract Prior(x-bpp)
   3132             mov ecx, eax
   3133             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   3134             add eax, patemp                 // pcv = pav + pbv
   3135             // pc = abs(pcv)
   3136             test eax, 0x80000000
   3137             jz dpthdpca
   3138             neg eax                     // reverse sign of neg values
   3139 dpthdpca:
   3140             mov pctemp, eax             // save pc for later use
   3141             // pb = abs(pbv)
   3142             test ecx, 0x80000000
   3143             jz dpthdpba
   3144             neg ecx                     // reverse sign of neg values
   3145 dpthdpba:
   3146             mov pbtemp, ecx             // save pb for later use
   3147             // pa = abs(pav)
   3148             mov eax, patemp
   3149             test eax, 0x80000000
   3150             jz dpthdpaa
   3151             neg eax                     // reverse sign of neg values
   3152 dpthdpaa:
   3153             mov patemp, eax             // save pa for later use
   3154             // test if pa <= pb
   3155             cmp eax, ecx
   3156             jna dpthdabb
   3157             // pa > pb; now test if pb <= pc
   3158             cmp ecx, pctemp
   3159             jna dpthdbbc
   3160             // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
   3161             mov cl, [esi + edx]  // load Prior(x-bpp) into cl
   3162             jmp dpthdpaeth
   3163 dpthdbbc:
   3164             // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
   3165             mov cl, [esi + ebx]        // load Prior(x) into cl
   3166             jmp dpthdpaeth
   3167 dpthdabb:
   3168             // pa <= pb; now test if pa <= pc
   3169             cmp eax, pctemp
   3170             jna dpthdabc
   3171             // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
   3172             mov cl, [esi + edx]  // load Prior(x-bpp) into cl
   3173             jmp dpthdpaeth
   3174 dpthdabc:
   3175             // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
   3176             mov cl, [edi + edx]  // load Raw(x-bpp) into cl
   3177 dpthdpaeth:
   3178             inc ebx
   3179             inc edx
   3180             // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
   3181             add [edi + ebx - 1], cl
   3182             cmp ebx, FullLength
   3183             jb dpthdlp
   3184 dpthdend:
   3185          } // end _asm block
   3186       }
   3187       return;                   // No need to go further with this one
   3188    }                         // end switch ( bpp )
   3189    _asm
   3190    {
   3191          // MMX acceleration complete now do clean-up
   3192          // Check if any remaining bytes left to decode
   3193          mov ebx, MMXLength
   3194          cmp ebx, FullLength
   3195          jnb dpthend
   3196          mov edi, row
   3197          mov esi, prev_row
   3198          // Do Paeth decode for remaining bytes
   3199          mov edx, ebx
   3200          xor ecx, ecx         // zero ecx before using cl & cx in loop below
   3201          sub edx, bpp         // Set edx = ebx - bpp
   3202 dpthlp2:
   3203          xor eax, eax
   3204          // pav = p - a = (a + b - c) - a = b - c
   3205          mov al, [esi + ebx]  // load Prior(x) into al
   3206          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
   3207          sub eax, ecx         // subtract Prior(x-bpp)
   3208          mov patemp, eax      // Save pav for later use
   3209          xor eax, eax
   3210          // pbv = p - b = (a + b - c) - b = a - c
   3211          mov al, [edi + edx]  // load Raw(x-bpp) into al
   3212          sub eax, ecx         // subtract Prior(x-bpp)
   3213          mov ecx, eax
   3214          // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   3215          add eax, patemp      // pcv = pav + pbv
   3216          // pc = abs(pcv)
   3217          test eax, 0x80000000
   3218          jz dpthpca2
   3219          neg eax              // reverse sign of neg values
   3220 dpthpca2:
   3221          mov pctemp, eax      // save pc for later use
   3222          // pb = abs(pbv)
   3223          test ecx, 0x80000000
   3224          jz dpthpba2
   3225          neg ecx              // reverse sign of neg values
   3226 dpthpba2:
   3227          mov pbtemp, ecx      // save pb for later use
   3228          // pa = abs(pav)
   3229          mov eax, patemp
   3230          test eax, 0x80000000
   3231          jz dpthpaa2
   3232          neg eax              // reverse sign of neg values
   3233 dpthpaa2:
   3234          mov patemp, eax      // save pa for later use
   3235          // test if pa <= pb
   3236          cmp eax, ecx
   3237          jna dpthabb2
   3238          // pa > pb; now test if pb <= pc
   3239          cmp ecx, pctemp
   3240          jna dpthbbc2
   3241          // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
   3242          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
   3243          jmp dpthpaeth2
   3244 dpthbbc2:
   3245          // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
   3246          mov cl, [esi + ebx]        // load Prior(x) into cl
   3247          jmp dpthpaeth2
   3248 dpthabb2:
   3249          // pa <= pb; now test if pa <= pc
   3250          cmp eax, pctemp
   3251          jna dpthabc2
   3252          // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
   3253          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
   3254          jmp dpthpaeth2
   3255 dpthabc2:
   3256          // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
   3257          mov cl, [edi + edx]  // load Raw(x-bpp) into cl
   3258 dpthpaeth2:
   3259          inc ebx
   3260          inc edx
   3261          // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
   3262          add [edi + ebx - 1], cl
   3263          cmp ebx, FullLength
   3264          jb dpthlp2
   3265 dpthend:
   3266          emms             // End MMX instructions; prep for possible FP instrs.
   3267    } // end _asm block
   3268 }
   3269 
   3270 // Optimized code for PNG Sub filter decoder
   3271 void /* PRIVATE */
   3272 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
   3273 {
   3274   // These variables are declared
   3275   // here to ensure alignment on 8-byte boundaries.
   3276   union uAll ActiveMask, ShiftBpp, ShiftRem;
   3277 
   3278    //int test;
   3279    int bpp;
   3280    png_uint_32 FullLength;
   3281    png_uint_32 MMXLength;
   3282    int diff;
   3283 
   3284    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
   3285    FullLength  = row_info->rowbytes - bpp; // # of bytes to filter
   3286    _asm {
   3287         mov edi, row
   3288         mov esi, edi               // lp = row
   3289         add edi, bpp               // rp = row + bpp
   3290         xor eax, eax
   3291         // get # of bytes to alignment
   3292         mov diff, edi               // take start of row
   3293         add diff, 0xf               // add 7 + 8 to incr past
   3294                                         // alignment boundary
   3295         xor ebx, ebx
   3296         and diff, 0xfffffff8        // mask to alignment boundary
   3297         sub diff, edi               // subtract from start ==> value
   3298                                         //  ebx at alignment
   3299         jz dsubgo
   3300         // fix alignment
   3301 dsublp1:
   3302         mov al, [esi+ebx]
   3303         add [edi+ebx], al
   3304         inc ebx
   3305         cmp ebx, diff
   3306         jb dsublp1
   3307 dsubgo:
   3308         mov ecx, FullLength
   3309         mov edx, ecx
   3310         sub edx, ebx                  // subtract alignment fix
   3311         and edx, 0x00000007           // calc bytes over mult of 8
   3312         sub ecx, edx                  // drop over bytes from length
   3313         mov MMXLength, ecx
   3314    } // end _asm block
   3315 
   3316    // Now do the math for the rest of the row
   3317    switch ( bpp )
   3318    {
   3319         case 3:
   3320         {
   3321          ActiveMask.use  = 0x0000ffffff000000;
   3322          ShiftBpp.use = 24;       // == 3 * 8
   3323          ShiftRem.use  = 40;      // == 64 - 24
   3324          _asm {
   3325             mov edi, row
   3326             movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
   3327             mov esi, edi              // lp = row
   3328             add edi, bpp          // rp = row + bpp
   3329             movq mm6, mm7
   3330             mov ebx, diff
   3331             psllq mm6, ShiftBpp   // Move mask in mm6 to cover 3rd active
   3332                                   // byte group
   3333             // PRIME the pump (load the first Raw(x-bpp) data set
   3334             movq mm1, [edi+ebx-8]
   3335 dsub3lp:
   3336             psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes
   3337                           // no need for mask; shift clears inactive bytes
   3338             // Add 1st active group
   3339             movq mm0, [edi+ebx]
   3340             paddb mm0, mm1
   3341             // Add 2nd active group
   3342             movq mm1, mm0         // mov updated Raws to mm1
   3343             psllq mm1, ShiftBpp   // shift data to position correctly
   3344             pand mm1, mm7         // mask to use only 2nd active group
   3345             paddb mm0, mm1
   3346             // Add 3rd active group
   3347             movq mm1, mm0         // mov updated Raws to mm1
   3348             psllq mm1, ShiftBpp   // shift data to position correctly
   3349             pand mm1, mm6         // mask to use only 3rd active group
   3350             add ebx, 8
   3351             paddb mm0, mm1
   3352             cmp ebx, MMXLength
   3353             movq [edi+ebx-8], mm0     // Write updated Raws back to array
   3354             // Prep for doing 1st add at top of loop
   3355             movq mm1, mm0
   3356             jb dsub3lp
   3357          } // end _asm block
   3358       }
   3359       break;
   3360 
   3361       case 1:
   3362       {
   3363          // Placed here just in case this is a duplicate of the
   3364          // non-MMX code for the SUB filter in png_read_filter_row below
   3365          //
   3366          //         png_bytep rp;
   3367          //         png_bytep lp;
   3368          //         png_uint_32 i;
   3369          //         bpp = (row_info->pixel_depth + 7) >> 3;
   3370          //         for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
   3371          //            i < row_info->rowbytes; i++, rp++, lp++)
   3372          //      {
   3373          //            *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
   3374          //      }
   3375          _asm {
   3376             mov ebx, diff
   3377             mov edi, row
   3378             cmp ebx, FullLength
   3379             jnb dsub1end
   3380             mov esi, edi          // lp = row
   3381             xor eax, eax
   3382             add edi, bpp      // rp = row + bpp
   3383 dsub1lp:
   3384             mov al, [esi+ebx]
   3385             add [edi+ebx], al
   3386             inc ebx
   3387             cmp ebx, FullLength
   3388             jb dsub1lp
   3389 dsub1end:
   3390          } // end _asm block
   3391       }
   3392       return;
   3393 
   3394       case 6:
   3395       case 7:
   3396       case 4:
   3397       case 5:
   3398       {
   3399          ShiftBpp.use = bpp << 3;
   3400          ShiftRem.use = 64 - ShiftBpp.use;
   3401          _asm {
   3402             mov edi, row
   3403             mov ebx, diff
   3404             mov esi, edi               // lp = row
   3405             add edi, bpp           // rp = row + bpp
   3406             // PRIME the pump (load the first Raw(x-bpp) data set
   3407             movq mm1, [edi+ebx-8]
   3408 dsub4lp:
   3409             psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
   3410                           // no need for mask; shift clears inactive bytes
   3411             movq mm0, [edi+ebx]
   3412             paddb mm0, mm1
   3413             // Add 2nd active group
   3414             movq mm1, mm0          // mov updated Raws to mm1
   3415             psllq mm1, ShiftBpp    // shift data to position correctly
   3416                                    // there is no need for any mask
   3417                                    // since shift clears inactive bits/bytes
   3418             add ebx, 8
   3419             paddb mm0, mm1
   3420             cmp ebx, MMXLength
   3421             movq [edi+ebx-8], mm0
   3422             movq mm1, mm0          // Prep for doing 1st add at top of loop
   3423             jb dsub4lp
   3424          } // end _asm block
   3425       }
   3426       break;
   3427 
   3428       case 2:
   3429       {
   3430          ActiveMask.use  = 0x00000000ffff0000;
   3431          ShiftBpp.use = 16;       // == 2 * 8
   3432          ShiftRem.use = 48;       // == 64 - 16
   3433          _asm {
   3434             movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
   3435             mov ebx, diff
   3436             movq mm6, mm7
   3437             mov edi, row
   3438             psllq mm6, ShiftBpp     // Move mask in mm6 to cover 3rd active
   3439                                     //  byte group
   3440             mov esi, edi            // lp = row
   3441             movq mm5, mm6
   3442             add edi, bpp            // rp = row + bpp
   3443             psllq mm5, ShiftBpp     // Move mask in mm5 to cover 4th active
   3444                                     //  byte group
   3445             // PRIME the pump (load the first Raw(x-bpp) data set
   3446             movq mm1, [edi+ebx-8]
   3447 dsub2lp:
   3448             // Add 1st active group
   3449             psrlq mm1, ShiftRem     // Shift data for adding 1st bpp bytes
   3450                                     // no need for mask; shift clears inactive
   3451                                     //  bytes
   3452             movq mm0, [edi+ebx]
   3453             paddb mm0, mm1
   3454             // Add 2nd active group
   3455             movq mm1, mm0           // mov updated Raws to mm1
   3456             psllq mm1, ShiftBpp     // shift data to position correctly
   3457             pand mm1, mm7           // mask to use only 2nd active group
   3458             paddb mm0, mm1
   3459             // Add 3rd active group
   3460             movq mm1, mm0           // mov updated Raws to mm1
   3461             psllq mm1, ShiftBpp     // shift data to position correctly
   3462             pand mm1, mm6           // mask to use only 3rd active group
   3463             paddb mm0, mm1
   3464             // Add 4th active group
   3465             movq mm1, mm0           // mov updated Raws to mm1
   3466             psllq mm1, ShiftBpp     // shift data to position correctly
   3467             pand mm1, mm5           // mask to use only 4th active group
   3468             add ebx, 8
   3469             paddb mm0, mm1
   3470             cmp ebx, MMXLength
   3471             movq [edi+ebx-8], mm0   // Write updated Raws back to array
   3472             movq mm1, mm0           // Prep for doing 1st add at top of loop
   3473             jb dsub2lp
   3474          } // end _asm block
   3475       }
   3476       break;
   3477       case 8:
   3478       {
   3479          _asm {
   3480             mov edi, row
   3481             mov ebx, diff
   3482             mov esi, edi            // lp = row
   3483             add edi, bpp            // rp = row + bpp
   3484             mov ecx, MMXLength
   3485             movq mm7, [edi+ebx-8]   // PRIME the pump (load the first
   3486                                     // Raw(x-bpp) data set
   3487             and ecx, 0x0000003f     // calc bytes over mult of 64
   3488 dsub8lp:
   3489             movq mm0, [edi+ebx]     // Load Sub(x) for 1st 8 bytes
   3490             paddb mm0, mm7
   3491             movq mm1, [edi+ebx+8]   // Load Sub(x) for 2nd 8 bytes
   3492             movq [edi+ebx], mm0    // Write Raw(x) for 1st 8 bytes
   3493                                    // Now mm0 will be used as Raw(x-bpp) for
   3494                                    // the 2nd group of 8 bytes.  This will be
   3495                                    // repeated for each group of 8 bytes with
   3496                                    // the 8th group being used as the Raw(x-bpp)
   3497                                    // for the 1st group of the next loop.
   3498             paddb mm1, mm0
   3499             movq mm2, [edi+ebx+16]  // Load Sub(x) for 3rd 8 bytes
   3500             movq [edi+ebx+8], mm1   // Write Raw(x) for 2nd 8 bytes
   3501             paddb mm2, mm1
   3502             movq mm3, [edi+ebx+24]  // Load Sub(x) for 4th 8 bytes
   3503             movq [edi+ebx+16], mm2  // Write Raw(x) for 3rd 8 bytes
   3504             paddb mm3, mm2
   3505             movq mm4, [edi+ebx+32]  // Load Sub(x) for 5th 8 bytes
   3506             movq [edi+ebx+24], mm3  // Write Raw(x) for 4th 8 bytes
   3507             paddb mm4, mm3
   3508             movq mm5, [edi+ebx+40]  // Load Sub(x) for 6th 8 bytes
   3509             movq [edi+ebx+32], mm4  // Write Raw(x) for 5th 8 bytes
   3510             paddb mm5, mm4
   3511             movq mm6, [edi+ebx+48]  // Load Sub(x) for 7th 8 bytes
   3512             movq [edi+ebx+40], mm5  // Write Raw(x) for 6th 8 bytes
   3513             paddb mm6, mm5
   3514             movq mm7, [edi+ebx+56]  // Load Sub(x) for 8th 8 bytes
   3515             movq [edi+ebx+48], mm6  // Write Raw(x) for 7th 8 bytes
   3516             add ebx, 64
   3517             paddb mm7, mm6
   3518             cmp ebx, ecx
   3519             movq [edi+ebx-8], mm7   // Write Raw(x) for 8th 8 bytes
   3520             jb dsub8lp
   3521             cmp ebx, MMXLength
   3522             jnb dsub8lt8
   3523 dsub8lpA:
   3524             movq mm0, [edi+ebx]
   3525             add ebx, 8
   3526             paddb mm0, mm7
   3527             cmp ebx, MMXLength
   3528             movq [edi+ebx-8], mm0   // use -8 to offset early add to ebx
   3529             movq mm7, mm0           // Move calculated Raw(x) data to mm1 to
   3530                                     // be the new Raw(x-bpp) for the next loop
   3531             jb dsub8lpA
   3532 dsub8lt8:
   3533          } // end _asm block
   3534       }
   3535       break;
   3536 
   3537       default:                // bpp greater than 8 bytes
   3538       {
   3539          _asm {
   3540             mov ebx, diff
   3541             mov edi, row
   3542             mov esi, edi           // lp = row
   3543             add edi, bpp           // rp = row + bpp
   3544 dsubAlp:
   3545             movq mm0, [edi+ebx]
   3546             movq mm1, [esi+ebx]
   3547             add ebx, 8
   3548             paddb mm0, mm1
   3549             cmp ebx, MMXLength
   3550             movq [edi+ebx-8], mm0  // mov does not affect flags; -8 to offset
   3551                                    //  add ebx
   3552             jb dsubAlp
   3553          } // end _asm block
   3554       }
   3555       break;
   3556 
   3557    } // end switch ( bpp )
   3558 
   3559    _asm {
   3560         mov ebx, MMXLength
   3561         mov edi, row
   3562         cmp ebx, FullLength
   3563         jnb dsubend
   3564         mov esi, edi               // lp = row
   3565         xor eax, eax
   3566         add edi, bpp               // rp = row + bpp
   3567 dsublp2:
   3568         mov al, [esi+ebx]
   3569         add [edi+ebx], al
   3570         inc ebx
   3571         cmp ebx, FullLength
   3572         jb dsublp2
   3573 dsubend:
   3574         emms             // End MMX instructions; prep for possible FP instrs.
   3575    } // end _asm block
   3576 }
   3577 
   3578 // Optimized code for PNG Up filter decoder
        //
        // Adds prev_row[i] to row[i], in place, for the row_info->rowbytes bytes
        // of the row.  Every addition is byte-wide (add al / paddb), so each
        // sum wraps modulo 256.
        //
        //   row_info - only rowbytes is read
        //   row      - filtered bytes on entry; overwritten with the sums
        //   prev_row - bytes added into row; only read
        //
        // Stages: duplp1 (single bytes until row+ebx is 8-byte aligned),
        // duploop (64 bytes per pass), duplpA (8 bytes per pass), duplp2
        // (single bytes for what is left), then emms.
        //
        // Registers: edi = row, esi = prev_row, ebx = byte offset into both
        // rows, ecx = offset at which the current loop stops, edx = bytes left
        // over for the following stage, al = scratch for the byte loops.
        //
        // NOTE(review): duplp1 never compares against len, and duploop always
        // makes at least one 64-byte pass because its limit is tested only
        // after the body has run.  For a short row both can therefore touch
        // bytes beyond rowbytes.  Presumably callers only send long rows here
        // and/or the row buffers are padded -- confirm before relying on it.
   3579 void /* PRIVATE */
   3580 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
   3581    png_bytep prev_row)
   3582 {
   3583    png_uint_32 len;
   3584    len  = row_info->rowbytes;       // # of bytes to filter
   3585    _asm {
   3586       mov edi, row
   3587       // get # of bytes to alignment
              // ecx = ((row + 7) & ~7) - row, i.e. 0..7 bytes up to the next
              // 8-byte boundary of row; ebx (the running offset) and eax are
              // cleared and esi is loaded in between these steps.
   3588       mov ecx, edi
   3589       xor ebx, ebx
   3590       add ecx, 0x7
   3591       xor eax, eax
   3592       and ecx, 0xfffffff8
   3593       mov esi, prev_row
   3594       sub ecx, edi
              // row is already 8-byte aligned: skip the byte-wise lead-in
   3595       jz dupgo
   3596       // fix alignment
              // one byte per pass until ebx reaches the pad count in ecx
   3597 duplp1:
   3598       mov al, [edi+ebx]
   3599       add al, [esi+ebx]
   3600       inc ebx
   3601       cmp ebx, ecx
   3602       mov [edi + ebx-1], al  // mov does not affect flags; -1 to offset inc ebx
   3603       jb duplp1
   3604 dupgo:
              // edx = (len - bytes done so far) mod 64: the tail handled after
              // duploop.  ecx = len - edx: the offset at which duploop stops.
   3605       mov ecx, len
   3606       mov edx, ecx
   3607       sub edx, ebx                  // subtract alignment fix
   3608       and edx, 0x0000003f           // calc bytes over mult of 64
   3609       sub ecx, edx                  // drop over bytes from length
   3610       // Unrolled loop - use all MMX registers and interleave to reduce
   3611       // number of branch instructions (loops) and reduce partial stalls
              // Eight 8-byte groups per pass (offsets 0..56 from ebx).  For each
              // group an odd register (mm1/mm3/mm5/mm7) is loaded from prev_row,
              // an even register (mm0/mm2/mm4/mm6) from row, the pair is added
              // with paddb and the even register is stored back to row.
   3612 duploop:
   3613       movq mm1, [esi+ebx]
   3614       movq mm0, [edi+ebx]
   3615       movq mm3, [esi+ebx+8]
   3616       paddb mm0, mm1
   3617       movq mm2, [edi+ebx+8]
   3618       movq [edi+ebx], mm0
   3619       paddb mm2, mm3
   3620       movq mm5, [esi+ebx+16]
   3621       movq [edi+ebx+8], mm2
   3622       movq mm4, [edi+ebx+16]
   3623       movq mm7, [esi+ebx+24]
   3624       paddb mm4, mm5
   3625       movq mm6, [edi+ebx+24]
   3626       movq [edi+ebx+16], mm4
   3627       paddb mm6, mm7
   3628       movq mm1, [esi+ebx+32]
   3629       movq [edi+ebx+24], mm6
   3630       movq mm0, [edi+ebx+32]
   3631       movq mm3, [esi+ebx+40]
   3632       paddb mm0, mm1
   3633       movq mm2, [edi+ebx+40]
   3634       movq [edi+ebx+32], mm0
   3635       paddb mm2, mm3
   3636       movq mm5, [esi+ebx+48]
   3637       movq [edi+ebx+40], mm2
   3638       movq mm4, [edi+ebx+48]
   3639       movq mm7, [esi+ebx+56]
   3640       paddb mm4, mm5
   3641       movq mm6, [edi+ebx+56]
   3642       movq [edi+ebx+48], mm4
   3643       add ebx, 64
   3644       paddb mm6, mm7
   3645       cmp ebx, ecx
   3646       movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
   3647                                      // -8 to offset add ebx
   3648       jb duploop
   3649 
   3650       cmp edx, 0                     // Test for bytes over mult of 64
   3651       jz dupend
   3652 
   3653 
   3654       // 2 lines added by lcreeve at netins.net
   3655       // (mail 11 Jul 98 in png-implement list)
   3656       cmp edx, 8 //test for less than 8 bytes
   3657       jb duplt8
   3658 
   3659 
              // ecx + edx is len again; edx is reduced to the tail modulo 8 and
              // ecx becomes len - edx, the offset at which duplpA stops.
   3660       add ecx, edx
   3661       and edx, 0x00000007           // calc bytes over mult of 8
   3662       sub ecx, edx                  // drop over bytes from length
   3663       jz duplt8
   3664       // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
   3665 duplpA:
   3666       movq mm1, [esi+ebx]
   3667       movq mm0, [edi+ebx]
   3668       add ebx, 8
   3669       paddb mm0, mm1
   3670       cmp ebx, ecx
   3671       movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
   3672       jb duplpA
   3673       cmp edx, 0            // Test for bytes over mult of 8
   3674       jz dupend
   3675 duplt8:
              // On every path into here ecx + edx equals len, so after the add
              // below duplp2 stops at the end of the row.
   3676       xor eax, eax
   3677       add ecx, edx          // move over byte count into counter
   3678       // Loop using x86 registers to update remaining bytes
   3679 duplp2:
   3680       mov al, [edi + ebx]
   3681       add al, [esi + ebx]
   3682       inc ebx
   3683       cmp ebx, ecx
   3684       mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
   3685       jb duplp2
   3686 dupend:
   3687       // Conversion of filtered row completed
   3688       emms          // End MMX instructions; prep for possible FP instrs.
   3689    } // end _asm block
   3690 }
   3691 
   3692 
   3693 // Optimized png_read_filter_row routines
   3694 void /* PRIVATE */
   3695 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
   3696    row, png_bytep prev_row, int filter)
   3697 {
   3698 #ifdef PNG_DEBUG
   3699    char filnm[10];
   3700 #endif
   3701 
   3702    if (mmx_supported == 2) {
   3703 #if !defined(PNG_1_0_X)
   3704        /* this should have happened in png_init_mmx_flags() already */
   3705        png_warning(png_ptr, "asm_flags may not have been initialized");
   3706 #endif
   3707        png_mmx_support();
   3708    }
   3709 
   3710 #ifdef PNG_DEBUG
   3711    png_debug(1, "in png_read_filter_row\n");
   3712    switch (filter)
   3713    {
   3714       case 0: png_snprintf(filnm, 10, "none");
   3715          break;
   3716 #if !defined(PNG_1_0_X)
   3717       case 1: png_snprintf(filnm, 10, "sub-%s",
   3718         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
   3719          break;
   3720       case 2: png_snprintf(filnm, 10, "up-%s",
   3721         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
   3722          break;
   3723       case 3: png_snprintf(filnm, 10, "avg-%s",
   3724         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
   3725          break;
   3726       case 4: png_snprintf(filnm, 10, "Paeth-%s",
   3727         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
   3728          break;
   3729 #else
   3730       case 1: png_snprintf(filnm, 10, "sub");
   3731          break;
   3732       case 2: png_snprintf(filnm, 10, "up");
   3733          break;
   3734       case 3: png_snprintf(filnm, 10, "avg");
   3735          break;
   3736       case 4: png_snprintf(filnm, 10, "Paeth");
   3737          break;
   3738 #endif
   3739       default: png_snprintf(filnm, 10, "unknw");
   3740          break;
   3741    }
   3742    png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
   3743    png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
   3744       (int)((row_info->pixel_depth + 7) >> 3));
   3745    png_debug1(0,"len=%8d, ", row_info->rowbytes);
   3746 #endif /* PNG_DEBUG */
   3747 
   3748    switch (filter)
   3749    {
   3750       case PNG_FILTER_VALUE_NONE:
   3751          break;
   3752 
   3753       case PNG_FILTER_VALUE_SUB:
   3754       {
   3755 #if !defined(PNG_1_0_X)
   3756          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
   3757              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
   3758              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
   3759 #else
   3760          if (mmx_supported)
   3761 #endif
   3762          {
   3763             png_read_filter_row_mmx_sub(row_info, row);
   3764          }
   3765          else
   3766          {
   3767             png_uint_32 i;
   3768             png_uint_32 istop = row_info->rowbytes;
   3769             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
   3770             png_bytep rp = row + bpp;
   3771             png_bytep lp = row;
   3772 
   3773             for (i = bpp; i < istop; i++)
   3774             {
   3775                *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
   3776                rp++;
   3777             }
   3778          }
   3779          break;
   3780       }
   3781 
   3782       case PNG_FILTER_VALUE_UP:
   3783       {
   3784 #if !defined(PNG_1_0_X)
   3785          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
   3786              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
   3787              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
   3788 #else
   3789          if (mmx_supported)
   3790 #endif
   3791          {
   3792             png_read_filter_row_mmx_up(row_info, row, prev_row);
   3793          }
   3794          else
   3795          {
   3796             png_uint_32 i;
   3797             png_uint_32 istop = row_info->rowbytes;
   3798             png_bytep rp = row;
   3799             png_bytep pp = prev_row;
   3800 
   3801             for (i = 0; i < istop; ++i)
   3802             {
   3803                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
   3804                rp++;
   3805             }
   3806          }
   3807          break;
   3808       }
   3809 
   3810       case PNG_FILTER_VALUE_AVG:
   3811       {
   3812 #if !defined(PNG_1_0_X)
   3813          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
   3814              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
   3815              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
   3816 #else
   3817          if (mmx_supported)
   3818 #endif
   3819          {
   3820             png_read_filter_row_mmx_avg(row_info, row, prev_row);
   3821          }
   3822          else
   3823          {
   3824             png_uint_32 i;
   3825             png_bytep rp = row;
   3826             png_bytep pp = prev_row;
   3827             png_bytep lp = row;
   3828             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
   3829             png_uint_32 istop = row_info->rowbytes - bpp;
   3830 
   3831             for (i = 0; i < bpp; i++)
   3832             {
   3833                *rp = (png_byte)(((int)(*rp) +
   3834                   ((int)(*pp++) >> 1)) & 0xff);
   3835                rp++;
   3836             }
   3837 
   3838             for (i = 0; i < istop; i++)
   3839             {
   3840                *rp = (png_byte)(((int)(*rp) +
   3841                   ((int)(*pp++ + *lp++) >> 1)) & 0xff);
   3842                rp++;
   3843             }
   3844          }
   3845          break;
   3846       }
   3847 
   3848       case PNG_FILTER_VALUE_PAETH:
   3849       {
   3850 #if !defined(PNG_1_0_X)
   3851          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
   3852              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
   3853              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
   3854 #else
   3855          if (mmx_supported)
   3856 #endif
   3857          {
   3858             png_read_filter_row_mmx_paeth(row_info, row, prev_row);
   3859          }
   3860          else
   3861          {
   3862             png_uint_32 i;
   3863             png_bytep rp = row;
   3864             png_bytep pp = prev_row;
   3865             png_bytep lp = row;
   3866             png_bytep cp = prev_row;
   3867             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
   3868             png_uint_32 istop=row_info->rowbytes - bpp;
   3869 
   3870             for (i = 0; i < bpp; i++)
   3871             {
   3872                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
   3873                rp++;
   3874             }
   3875 
   3876             for (i = 0; i < istop; i++)   // use leftover rp,pp
   3877             {
   3878                int a, b, c, pa, pb, pc, p;
   3879 
   3880                a = *lp++;
   3881                b = *pp++;
   3882                c = *cp++;
   3883 
   3884                p = b - c;
   3885                pc = a - c;
   3886 
   3887 #ifdef PNG_USE_ABS
   3888                pa = abs(p);
   3889                pb = abs(pc);
   3890                pc = abs(p + pc);
   3891 #else
   3892                pa = p < 0 ? -p : p;
   3893                pb = pc < 0 ? -pc : pc;
   3894                pc = (p + pc) < 0 ? -(p + pc) : p + pc;
   3895 #endif
   3896 
   3897                /*
   3898                   if (pa <= pb && pa <= pc)
   3899                      p = a;
   3900                   else if (pb <= pc)
   3901                      p = b;
   3902                   else
   3903                      p = c;
   3904                 */
   3905 
   3906                p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
   3907 
   3908                *rp = (png_byte)(((int)(*rp) + p) & 0xff);
   3909                rp++;
   3910             }
   3911          }
   3912          break;
   3913       }
   3914 
   3915       default:
   3916          png_warning(png_ptr, "Ignoring bad row filter type");
   3917          *row=0;
   3918          break;
   3919    }
   3920 }
   3921 
   3922 #endif /* PNG_MMX_CODE_SUPPORTED && PNG_USE_PNGVCRD */
   3923