Home | History | Annotate | Download | only in libpng-1.2.19
      1 
      2 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
      3  *
      4  * For Intel/AMD x86 or x86-64 CPU (Pentium-MMX or later) and GNU C compiler.
      5  *
      6  * Last changed in libpng 1.2.19 August 18, 2007
      7  * For conditions of distribution and use, see copyright notice in png.h
      8  * Copyright (c) 1998 Intel Corporation
      9  * Copyright (c) 1999-2002,2007 Greg Roelofs
     10  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
     11  *
     12  * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
     13  * Interface to libpng contributed by Gilles Vollant, 1999.
     14  * GNU C port by Greg Roelofs, 1999-2001.
     15  *
     16  * References:
     17  *
     18  *     http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
     19  *     http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
     20  *       [Intel's performance analysis of the MMX vs. non-MMX code;
     21  *        moved/deleted as of 2006, but text and some graphs still
     22  *        available via WayBack Machine at archive.org]
     23  *
     24  *     http://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html
     25  *     http://sam.zoy.org/blog/2007-04-13-shlib-with-non-pic-code-have-inline-assembly-and-pic-mix-well
     26  *     http://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html
     27  *     http://gcc.gnu.org/onlinedocs/gcc/Variable-Attributes.html
     28  *     http://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html
     29  *     AMD64 Architecture Programmer's Manual, volumes 1 and 5
     30  *       [http://www.amd.com/us-en/Processors/TechnicalResources/0,,30_182_739_7044,00.html]
     31  *     Intel 64 and IA-32 Software Developer's Manuals
     32  *       [http://developer.intel.com/products/processor/manuals/]
     33  *
     34  * png_read_filter_row_mmx_*() were converted in place with intel2gas 1.3.1:
     35  *
     36  *     intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
     37  *
     38  * and then cleaned up by hand.  See http://hermes.terminal.at/intel2gas/ .
     39  *
     40  * NOTE:  A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
     41  * is required to assemble the newer asm instructions such as movq.  (Version
     42  * 2.5.2l.15 is definitely too old.)  See ftp://ftp.gnu.org/pub/gnu/binutils/ .
     43  */
     44 
     45 /*
     46  * PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
     47  * ===========================
     48  *
     49  * 19991006:
     50  *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
     51  *
     52  * 19991007:
     53  *  - additional optimizations (possible or definite):
     54  *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
     55  *     - write MMX code for 48-bit case (pixel_bytes == 6)
     56  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
     57  *        why subtract 8 from width_mmx in the pass 4/5 case?
     58  *        (only width_mmx case) (near line 2335)
     59  *     x [DONE] replace pixel_bytes within each block with the true
     60  *        constant value (or are compilers smart enough to do that?)
     61  *     - rewrite all MMX interlacing code so it's aligned with
     62  *        the *beginning* of the row buffer, not the end.  This
     63  *        would not only allow one to eliminate half of the memory
     64  *        writes for odd passes (that is, pass == odd), it may also
     65  *        eliminate some unaligned-data-access exceptions (assuming
     66  *        there's a penalty for not aligning 64-bit accesses on
     67  *        64-bit boundaries).  The only catch is that the "leftover"
     68  *        pixel(s) at the end of the row would have to be saved,
     69  *        but there are enough unused MMX registers in every case,
     70  *        so this is not a problem.  A further benefit is that the
     71  *        post-MMX cleanup code (C code) in at least some of the
     72  *        cases could be done within the assembler block.
     73  *  x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
     74  *     inconsistent, and don't match the MMX Programmer's Reference
     75  *     Manual conventions anyway.  They should be changed to
     76  *     "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
     77  *     was lowest in memory (i.e., corresponding to a left pixel)
     78  *     and b7 is the byte that was highest (i.e., a right pixel).
     79  *
     80  * 19991016:
     81  *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
     82  *     want globals prefixed by underscores when referencing them--
     83  *     i.e., if the variable is const4, then refer to it as const4,
     84  *     not _const4.  This seems to be a djgpp-specific requirement.
     85  *     Also, such variables apparently *must* be declared outside
     86  *     of functions; neither static nor automatic variables work if
     87  *     defined within the scope of a single function, but both
     88  *     static and truly global (multi-module) variables work fine.
     89  *
     90  * 19991017:
     91  *  - replaced pixel_bytes in each png_memcpy() call with constant value for
     92  *     inlining (png_do_read_interlace() "non-MMX/modified C code" block)
     93  *
     94  * 19991023:
     95  *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
     96  *  - switched from string-concatenation-with-macros to cleaner method of
     97  *     renaming global variables for djgpp--i.e., always use prefixes in
     98  *     inlined assembler code (== strings) and conditionally rename the
     99  *     variables, not the other way around.  Hence _const4, _mask8_0, etc.
    100  *
    101  * 19991024:
    102  *  - fixed mmxsupport()/png_do_read_interlace() first-row bug
    103  *     This one was severely weird:  even though mmxsupport() doesn't touch
    104  *     ebx (where "row" pointer was stored), it nevertheless managed to zero
    105  *     the register (even in static/non-fPIC code--see below), which in turn
    106  *     caused png_do_read_interlace() to return prematurely on the first row of
    107  *     interlaced images (i.e., without expanding the interlaced pixels).
    108  *     Inspection of the generated assembly code didn't turn up any clues,
    109  *     although it did point at a minor optimization (i.e., get rid of
    110  *     mmx_supported_local variable and just use eax).  Possibly the CPUID
    111  *     instruction is more destructive than it looks?  (Not yet checked.)
    112  *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
    113  *     listings...  Apparently register spillage has to do with ebx, since
    114  *     it's used to index the global offset table.  Commenting it out of the
    115  *     input-reg lists in png_combine_row() eliminated compiler barfage, so
    116  *     ifdef'd with __PIC__ macro:  if defined, use a global for unmask
    117  *
    118  * 19991107:
    119  *  - verified CPUID clobberage:  12-char string constant ("GenuineIntel",
    120  *     "AuthenticAMD", etc.) placed in ebx:ecx:edx.  Still need to polish.
    121  *
    122  * 19991120:
    123  *  - made "diff" variable (now "_dif") global to simplify conversion of
    124  *     filtering routines (running out of regs, sigh).  "diff" is still used
    125  *     in interlacing routines, however.
    126  *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
    127  *     macro determines which is used); original not yet tested.
    128  *
    129  * 20000213:
    130  *  - when compiling with gcc, be sure to use  -fomit-frame-pointer
    131  *
    132  * 20000319:
    133  *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
    134  *     pass == 4 or 5, that caused visible corruption of interlaced images
    135  *
    136  * 20000623:
    137  *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
    138  *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
    139  *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
    140  *     Chuck Wilson supplied a patch involving dummy output registers.  See
    141  *     http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
    142  *     for the original (anonymous) SourceForge bug report.
    143  *
    144  * 20000706:
    145  *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
    146  *       pnggccrd.c: In function `png_combine_row':
    147  *       pnggccrd.c:525: more than 10 operands in `asm'
    148  *       pnggccrd.c:669: more than 10 operands in `asm'
    149  *       pnggccrd.c:828: more than 10 operands in `asm'
    150  *       pnggccrd.c:994: more than 10 operands in `asm'
    151  *       pnggccrd.c:1177: more than 10 operands in `asm'
    152  *     They are all the same problem and can be worked around by using the
    153  *     global _unmask variable unconditionally, not just in the -fPIC case.
    154  *     Reportedly earlier versions of gcc also have the problem with more than
    155  *     10 operands; they just don't report it.  Much strangeness ensues, etc.
    156  *
    157  * 20000729:
    158  *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
    159  *     MMX routine); began converting png_read_filter_row_mmx_sub()
    160  *  - to finish remaining sections:
    161  *     - clean up indentation and comments
    162  *     - preload local variables
    163  *     - add output and input regs (order of former determines numerical
    164  *        mapping of latter)
    165  *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
    166  *     - remove "$" from addressing of Shift and Mask variables [20000823]
    167  *
    168  * 20000731:
    169  *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
    170  *
    171  * 20000822:
    172  *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
    173  *     shared-library (-fPIC) version!  Code works just fine as part of static
    174  *     library.  Should have tested that sooner.
    175  *     ebx is getting clobbered again (explicitly this time); need to save it
    176  *     on stack or rewrite asm code to avoid using it altogether.  Blargh!
    177  *
    178  * 20000823:
    179  *  - first section was trickiest; all remaining sections have ebx -> edx now.
    180  *     (-fPIC works again.)  Also added missing underscores to various Shift*
    181  *     and *Mask* globals and got rid of leading "$" signs.
    182  *
    183  * 20000826:
    184  *  - added visual separators to help navigate microscopic printed copies
    185  *     (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
    186  *     on png_read_filter_row_mmx_avg()
    187  *
    188  * 20000828:
    189  *  - finished png_read_filter_row_mmx_avg():  only Paeth left! (930 lines...)
    190  *     What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
    191  *     cleaned up/shortened in either routine, but functionality is complete
    192  *     and seems to be working fine.
    193  *
    194  * 20000829:
    195  *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if register is listed
    196  *     as an input reg (with dummy output variables, etc.), then it *cannot*
    197  *     also appear in the clobber list or gcc 2.95.2 will barf.  The solution
    198  *     is simple enough...
    199  *
    200  * 20000914:
    201  *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
    202  *     correctly (but 48-bit RGB just fine)
    203  *
    204  * 20000916:
    205  *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
    206  *     - "_ShiftBpp.use = 24;"      should have been   "_ShiftBpp.use = 16;"
    207  *     - "_ShiftRem.use = 40;"      should have been   "_ShiftRem.use = 48;"
    208  *     - "psllq _ShiftRem, %%mm2"   should have been   "psrlq _ShiftRem, %%mm2"
    209  *
    210  * 20010101:
    211  *  - added new png_init_mmx_flags() function (here only because it needs to
    212  *     call mmxsupport(), which should probably become global png_mmxsupport());
    213  *     modified other MMX routines to run conditionally (png_ptr->asm_flags)
    214  *
    215  * 20010103:
    216  *  - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
    217  *     and made it public; moved png_init_mmx_flags() to png.c as internal func
    218  *
    219  * 20010104:
    220  *  - removed dependency on png_read_filter_row_c() (C code already duplicated
    221  *     within MMX version of png_read_filter_row()) so no longer necessary to
    222  *     compile it into pngrutil.o
    223  *
    224  * 20010310:
    225  *  - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
    226  *
    227  * 20010808:
    228  *  - added PNG_THREAD_UNSAFE_OK around code using global variables [GR-P]
    229  *
    230  * 20011124:
    231  *  - fixed missing save of Eflag in png_mmx_support() [Maxim Sobolev]
    232  *
    233  * 20020304:
    234  *  - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
    235  *
    236  * 20020407:
    237  *  - fixed insufficient preservation of ebx register [Sami Farin]
    238  *
    239  * 20040724:
    240  *  - more tinkering with clobber list at lines 4529 and 5033 to get it to
    241  *     compile with gcc 3.4 [GR-P]
    242  *
    243  * 20040809:
    244  *  - added "rim" definitions for CONST4 and CONST6 [GR-P]
    245  *
    246  * 20060303:
    247  *  - added "OS2" to list of systems that don't need leading underscores [GR-P]
    248  *
    249  * 20060320:
    250  *  - made PIC-compliant [Christian Aichinger]
    251  *
    252  * 20070313:
    253  *  - finally applied Giuseppe Ghibò's 64-bit patch of 20060803 (completely
    254  *     overlooked Dylan Alex Simon's similar patch of 20060414, oops...)
    255  *
    256  * 20070524:
    257  *  - fixed link failure caused by asm-only variables being optimized out
    258  *     (identified by Dimitri of Trolltech) with __attribute__((used)), which
    259  *     also gets rid of warnings => nuked ugly png_squelch_warnings() hack
    260  *  - dropped redundant ifdef
    261  *  - moved png_mmx_support() back up where originally intended (as in
    262  *     pngvcrd.c), using __attribute__((noinline)) in extra prototype
    263  *
    264  * 20070527:
    265  *  - revised png_combine_row() to reuse mask in lieu of external _unmask
    266  *  - moved 32-bit (RGBA) case to top of png_combine_row():  most common
    267  *  - just about ready to give up on x86-64 -fPIC mode; can't even access 16
    268  *     _mask*_* constants without triggering link error on shared library:
    269  *       /usr/bin/ld: pnggccrd.pic.o: relocation R_X86_64_32S against `a local
    270  *         symbol' can not be used when making a shared object; recompile with
    271  *         -fPIC
    272  *       pnggccrd.pic.o: could not read symbols: Bad value
    273  *       ("objdump -x pnggccrd.pic.o | grep rodata" to verify)
    274  *     [might be able to work around by doing within assembly code whatever
    275  *     -fPIC does, but given problems to date, seems like long shot...]
    276  *     [relevant ifdefs:  __x86_64__ && __PIC__ => C code only]
    277  *  - changed #if 0 to #ifdef PNG_CLOBBER_MMX_REGS_SUPPORTED in case gcc ever
    278  *     supports MMX regs (%mm0, etc.) in clobber list (not supported by gcc
    279  *     2.7.2.3, 2.91.66 (egcs 1.1.2), 3.x, or 4.1.2)
    280  *
    281  * 20070603:
    282  *  - revised png_combine_row() to use @GOTPCREL(%%rip) addressing on _c64
    283  *     struct of _mask*_* constants for x86-64 -fPIC; see sam.zoy.org link
    284  *     above for details
    285  *  - moved _const4 and _const6 into _c64 struct, renamed to _amask5_3_0 and
    286  *     _amask7_1_0, respectively
    287  *  - can't figure out how to use _c64._mask*_* vars within asm code, so still
    288  *     need single variables for non-x86-64/-fPIC half :-(
    289  *  - replaced various __PIC__ ifdefs with *_GOT_ebx macros
    290  *  - moved _LBCarryMask and _HBClearMask into _c64 struct
    291  *  - conditionally replaced _p*temp variables with %r11d-%r13d (via p*_TEMP
    292  *     and CLOBBER_r1*d macros)
    293  *
    294  * 20070604:
    295  *  - replaced all _ActiveMask and _ActiveMaskEnd with new _amask*_*_* consts
    296  *     (_amask naming convention:  numbers of 00-bytes, ff-bytes, 00-bytes)
    297  *    - _ActiveMask     // (10) // avg/paeth/sub; read-only; consts; movq/pand
    298  *       0x0000000000ffffffLL (bpp 3, avg)      _amask5_3_0
    299  *       0xffffffffffffffffLL (bpp 4, 6, avg)   _amask0_8_0
    300  *       0x000000000000ffffLL (bpp 2, avg)      _amask6_2_0
    301  *       0x0000000000ffffffLL (bpp 3, paeth)    _amask5_3_0
    302  *       0x00000000ffffffffLL (bpp 6, paeth)    _amask4_4_0
    303  *       0x00000000ffffffffLL (bpp 4, paeth)    _amask4_4_0
    304  *       0x00000000ffffffffLL (bpp 8, paeth)    _amask4_4_0
    305  *       0x0000ffffff000000LL (bpp 3, sub)      _amask2_3_3
    306  *       0x00000000ffff0000LL (bpp 2, sub)      _amask4_2_2
    307  *    - _ActiveMaskEnd  // (1)  // paeth only; read-only; const; pand
    308  *       0xffff000000000000LL (bpp 3, paeth)    _amask0_2_6
    309  *  - changed all "#if defined(__x86_64__) // later // && defined(__PIC__)"
    310  *     lines to "#ifdef PNG_x86_64_USE_GOTPCREL" for easier/safer testing
    311  *
    312  * 20070605:
    313  *  - merged PNG_x86_64_USE_GOTPCREL, non-PNG_x86_64_USE_GOTPCREL code via
    314  *     *MASK* and LOAD/RESTORE macros
    315  *
    316  * 20070607:
    317  *  - replaced all constant instances of _ShiftBpp, _ShiftRem with immediates
    318  *     (still have two shared cases in avg, sub routines)
    319  *
    320  * 20070609:
    321  *  - replaced remaining instances of _ShiftBpp, _ShiftRem with immediates
    322  *     (split sub and avg 4/6-bpp cases into separate blocks)
    323  *  - fixed paeth bug due to clobbered r11/r12/r13 regs
    324  *
    325  * 20070610:
    326  *  - made global "_dif" variable (avg/paeth/sub routines) local again (now
    327  *     "diff"--see 19991120 entry above), using register constraints
    328  *  - note that %ebp in clobber list doesn't actually work, at least for 32-bit
    329  *     version and gcc 4.1.2; must save and restore manually.  (Seems to work
    330  *     OK for 64-bit version and gcc 3.4.3, but gcc may not be using ebp/rbp
    331  *     in that case.)
    332  *  - started replacing direct _MMXLength accesses with register constraints
    333  *
    334  * 20070612:
    335  *  - continued replacing direct _MMXLength accesses with register constraints
    336  *
    337  * 20070613:
    338  *  - finished replacing direct _MMXLength accesses with register constraints;
    339  *     switched to local variable (and renamed back to MMXLength)
    340  *
    341  * 20070614:
    342  *  - fixed sub bpp = 1 bug
    343  *  - started replacing direct _FullLength accesses with register constraints
    344  *
    345  * 20070615:
    346  *  - fixed 64-bit paeth bpp 3 crash bug (misplaced LOAD_GOT_rbp)
    347  *  - fixed 64-bit paeth bpp 1/2 and cleanup-block crash bugs (misplaced
    348  *     RESTORE_r11_r12_r13)
    349  *  - slightly optimized avg/paeth cleanup blocks and paeth bpp 1/2 block
    350  *     (save/restore ebx only if needed)
    351  *  - continued replacing direct _FullLength accesses with register constraints
    352  *
    353  * 20070616:
    354  *  - finished replacing direct _FullLength accesses with register constraints
    355  *     (*ugly* conditional clobber-separator macros for avg and paeth, sigh)
    356  *
    357  * 20070618:
    358  *  - fixed misplaced PNG_THREAD_UNSAFE_OK endif (was missing LOAD_GOT_rbp/
    359  *     RESTORE_rbp in 32-bit thread-safe case)
    360  *  - changed all "ifdef *" to "if defined(*)" [GR-P]
    361  *
    362  * 20070619:
    363  *  - rearranged most bitdepth-related case statements to put most frequent
    364  *     cases at top (24-bit, 32-bit, 8-bit, rest)
    365  *
    366  * 20070623:
    367  *  - cleaned up png_debug() warnings/formatting
    368  *  - removed PNG_MMX_CODE_SUPPORTED ifdefs and added outer __GNUC__ ifdef
    369  *     (module no longer used by non-x86/non-GCC builds as of libpng 1.2.19)
    370  *  - removed single libpng-1.2.x PNG_DEBUG dependency on 1.0.x png_struct
    371  *     member (row_buf_size)
    372  *  - rearranged pass-related if-blocks in png_do_read_interlace() to put most
    373  *     frequent cases (4, 5) at top [GR-P suggestion]
    374  *
    375  * 20070624-29:
    376  *  - fixed 64-bit crash bug:  pointers -> rsi/rdi, not esi/edi (switched to
    377  *     %0/%1/%2/%3/%4 notation; eliminated size suffixes from relevant add/
    378  *     inc/sub/mov instructions; changed dummy vars to pointers)
    379  *     - png_combine_row()
    380  *     - png_do_read_interlace()
    381  *     - png_read_filter_row_mmx_avg()
    382  *     - png_read_filter_row_mmx_paeth()
    383  *     - png_read_filter_row_mmx_sub()
    384  *     - png_read_filter_row_mmx_up()
    385  *  - NOTE:  this fix makes use of the fact that modifying a 32-bit reg (e.g.,
    386  *     %%ebx) clears the top half of its corresponding 64-bit reg (%%rbx), so
    387  *     it's safe to mix 32-bit operations with 64-bit base/index addressing
    388  *     (see new PSI/PAX/PBX/PDX/PBP/etc. "pointer-register" macros); applies
    389  *     also to clobber lists
    390  *
    391  * 20070630:
    392  *  - cleaned up formatting, macros, minor png_read_filter_row_mmx_sub() 8-bpp
    393  *     register-usage inefficiency
    394  *  - fixed 32-bit png_do_read_interlace() bug (was using pointer size for
    395  *     64-bit dummy values)
    396  *
    397  * 20070703:
    398  *  - added check for (manual) PIC macro to fix OpenBSD crash bug
    399  *
    400  * 20070717:
    401  *  - fixed 48-bit png_combine_row() bug (was acting like 32-bit):  copy 6
    402  *     bytes per pixel, not 4, and use stride of 6, not 4, in the second loop
    403  *     of interlace processing of 48-bit pixels [GR-P]
    404  *
    405  * 20070722:
    406  *  - fixed 64-bit png_uint_32 bug with MMXLength/FullLength temp vars
    407  *
    408  * [still broken:  tops of all row-filter blocks (input/output constraints);
    409  *  shows up on 64-bit dynamic (-fPIC) version with -O2, especially if debug-
    410  *  printfs enabled, but at right edge of odd-width images even if disabled]
    411  *
    412  *
    413  * STILL TO DO:
    414  *  - fix final thread-unsafe code using stack vars and pointer? (paeth top,
    415  *     default, bottom only:  default, bottom already 5 reg constraints; could
    416  *     replace bpp with pointer and group bpp/patemp/pbtemp/pctemp in array)
    417  *  - fix ebp/no-reg-constraint inefficiency (avg/paeth/sub top)
    418  *  - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
    419  *  - write MMX code for 48-bit case (pixel_bytes == 6)
    420  *  - figure out what's up with 24-bit case (pixel_bytes == 3):
    421  *     why subtract 8 from width_mmx in the pass 4/5 case?  due to
    422  *     odd number of bytes? (only width_mmx case) (near line 2335)
    423  *  - rewrite all MMX interlacing code so it's aligned with beginning
    424  *     of the row buffer, not the end (see 19991007 for details)
    425  *  - add error messages to any remaining bogus default cases
    426  *  - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
    427  *  - try =r, etc., as reg constraints?  (would gcc use 64-bit ones on x86-64?)
    428  *  - need full, non-graphical, CRC-based test suite...  maybe autogenerate
    429  *     random data of various height/width/depth, compute CRCs, write (C
    430  *     funcs), read (asm/MMX), recompute CRCs, and compare?
    431  *  - write true x86-64 version using 128-bit "media instructions", %xmm0-15,
    432  *     and extra general-purpose registers
    433  */
    434 
    435 #if defined(__GNUC__)
    436 
    437 #define PNG_INTERNAL
    438 #include "png.h"
    439 
    440 
    441 /* for some inexplicable reason, gcc 3.3.5 on OpenBSD (and elsewhere?) does
    442  * *not* define __PIC__ when the -fPIC option is used, so we have to rely on
    443  * makefiles and whatnot to define the PIC macro explicitly */
    444 #if defined(PIC) && !defined(__PIC__)   // (this can/should move to pngconf.h)
    445 #  define __PIC__
    446 #endif
    447 
    448 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
    449 
    450 /* if you want/need full thread-safety on x86-64 even when linking statically,
    451  * comment out the "&& defined(__PIC__)" part here: */
    452 #if defined(__x86_64__) && defined(__PIC__)
    453 #  define PNG_x86_64_USE_GOTPCREL            // GOTPCREL => full thread-safety
    454 #  define PNG_CLOBBER_x86_64_REGS_SUPPORTED  // works as of gcc 3.4.3 ...
    455 #endif
    456 
    457 int PNGAPI png_mmx_support(void);
    458 
    459 #if defined(PNG_USE_LOCAL_ARRAYS)
    460 static PNG_CONST int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
    461 static PNG_CONST int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
    462 static PNG_CONST int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
    463 #endif
    464 
    465 /* djgpp, Win32, Cygwin, and OS2 add their own underscores to global variables,
    466  * so define them without: */
    467 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) || \
    468     defined(__OS2__)
    469 #  define _mmx_supported  mmx_supported
    470 #  define _mask8_0        mask8_0
    471 #  define _mask16_1       mask16_1
    472 #  define _mask16_0       mask16_0
    473 #  define _mask24_2       mask24_2
    474 #  define _mask24_1       mask24_1
    475 #  define _mask24_0       mask24_0
    476 #  define _mask32_3       mask32_3
    477 #  define _mask32_2       mask32_2
    478 #  define _mask32_1       mask32_1
    479 #  define _mask32_0       mask32_0
    480 #  define _mask48_5       mask48_5
    481 #  define _mask48_4       mask48_4
    482 #  define _mask48_3       mask48_3
    483 #  define _mask48_2       mask48_2
    484 #  define _mask48_1       mask48_1
    485 #  define _mask48_0       mask48_0
    486 #  define _amask5_3_0     amask5_3_0
    487 #  define _amask7_1_0     amask7_1_0
    488 #  define _LBCarryMask    LBCarryMask
    489 #  define _HBClearMask    HBClearMask
    490 #  define _amask0_8_0     amask0_8_0
    491 #  define _amask6_2_0     amask6_2_0
    492 #  define _amask4_4_0     amask4_4_0
    493 #  define _amask0_2_6     amask0_2_6
    494 #  define _amask2_3_3     amask2_3_3
    495 #  define _amask4_2_2     amask4_2_2
    496 #  if defined(PNG_THREAD_UNSAFE_OK)
    497 #    define _patemp       patemp
    498 #    define _pbtemp       pbtemp
    499 #    define _pctemp       pctemp
    500 #  endif
    501 #endif // djgpp, Win32, Cygwin, OS2
    502 
    503 
    504 /* These constants are used in the inlined MMX assembly code. */
    505 
    506 typedef unsigned long long  ull;
    507 
    508 #if defined(PNG_x86_64_USE_GOTPCREL)
    509 static PNG_CONST struct {
    510     //ull _mask_array[26];
    511 
    512     // png_combine_row() constants:
    513     ull _mask8_0;
    514     ull _mask16_0, _mask16_1;
    515     ull _mask24_0, _mask24_1, _mask24_2;
    516     ull _mask32_0, _mask32_1, _mask32_2, _mask32_3;
    517     ull _mask48_0, _mask48_1, _mask48_2, _mask48_3, _mask48_4, _mask48_5;
    518 
    519     // png_do_read_interlace() constants:
    520     ull _amask5_3_0, _amask7_1_0;  // was _const4 and _const6, respectively
    521 
    522     // png_read_filter_row_mmx_avg() constants (also uses _amask5_3_0):
    523     ull _LBCarryMask, _HBClearMask;
    524     ull _amask0_8_0, _amask6_2_0;  // was ActiveMask for bpp 4/6 and 2 cases
    525 
    526     // png_read_filter_row_mmx_paeth() constants (also uses _amask5_3_0):
    527     ull _amask4_4_0, _amask0_2_6;  // was ActiveMask{,End} for bpp 6/4/8 and 3
    528 
    529     // png_read_filter_row_mmx_sub() constants:
    530     ull _amask2_3_3, _amask4_2_2;  // was ActiveMask for bpp 3 and 2 cases
    531 
    532 } _c64 __attribute__((used, aligned(8))) = {
    533 
    534     // png_combine_row() constants:
    535     0x0102040810204080LL, // _mask8_0      offset 0
    536 
    537     0x1010202040408080LL, // _mask16_0     offset 8
    538     0x0101020204040808LL, // _mask16_1     offset 16
    539 
    540     0x2020404040808080LL, // _mask24_0     offset 24
    541     0x0408080810101020LL, // _mask24_1     offset 32
    542     0x0101010202020404LL, // _mask24_2     offset 40
    543 
    544     0x4040404080808080LL, // _mask32_0     offset 48
    545     0x1010101020202020LL, // _mask32_1     offset 56
    546     0x0404040408080808LL, // _mask32_2     offset 64
    547     0x0101010102020202LL, // _mask32_3     offset 72
    548 
    549     0x4040808080808080LL, // _mask48_0     offset 80
    550     0x2020202040404040LL, // _mask48_1     offset 88
    551     0x1010101010102020LL, // _mask48_2     offset 96
    552     0x0404080808080808LL, // _mask48_3     offset 104
    553     0x0202020204040404LL, // _mask48_4     offset 112
    554     0x0101010101010202LL, // _mask48_5     offset 120
    555 
    556     // png_do_read_interlace() constants:
    557     0x0000000000FFFFFFLL, // _amask5_3_0   offset 128  (bpp 3, avg/paeth) const4
    558     0x00000000000000FFLL, // _amask7_1_0   offset 136                     const6
    559 
    560     // png_read_filter_row_mmx_avg() constants:
    561     0x0101010101010101LL, // _LBCarryMask  offset 144
    562     0x7F7F7F7F7F7F7F7FLL, // _HBClearMask  offset 152
    563     0xFFFFFFFFFFFFFFFFLL, // _amask0_8_0   offset 160  (bpp 4/6, avg)
    564     0x000000000000FFFFLL, // _amask6_2_0   offset 168  (bpp 2,   avg)
    565 
    566     // png_read_filter_row_mmx_paeth() constants:
    567     0x00000000FFFFFFFFLL, // _amask4_4_0   offset 176  (bpp 6/4/8, paeth)
    568     0xFFFF000000000000LL, // _amask0_2_6   offset 184  (bpp 3, paeth)   A.M.End
    569 
    570     // png_read_filter_row_mmx_sub() constants:
    571     0x0000FFFFFF000000LL, // _amask2_3_3   offset 192  (bpp 3, sub)
    572     0x00000000FFFF0000LL, // _amask4_2_2   offset 200  (bpp 2, sub)
    573 
    574 };
    575 
     576 #define MASK8_0        "(%%rbp)"      // GOTPCREL build:  each mask is an
     577 #define MASK16_0       "8(%%rbp)"     //  asm memory operand at a fixed
     578 #define MASK16_1       "16(%%rbp)"    //  byte offset from rbp, matching
     579 #define MASK24_0       "24(%%rbp)"    //  the "offset" notes in the
     580 #define MASK24_1       "32(%%rbp)"    //  constant table above; rbp must
     581 #define MASK24_2       "40(%%rbp)"    //  first be loaded (LOAD_GOT_rbp)
     582 #define MASK32_0       "48(%%rbp)"
     583 #define MASK32_1       "56(%%rbp)"
     584 #define MASK32_2       "64(%%rbp)"
     585 #define MASK32_3       "72(%%rbp)"
     586 #define MASK48_0       "80(%%rbp)"
     587 #define MASK48_1       "88(%%rbp)"
     588 #define MASK48_2       "96(%%rbp)"
     589 #define MASK48_3       "104(%%rbp)"
     590 #define MASK48_4       "112(%%rbp)"
     591 #define MASK48_5       "120(%%rbp)"
     592 #define AMASK5_3_0     "128(%%rbp)"
     593 #define AMASK7_1_0     "136(%%rbp)"
     594 #define LB_CARRY_MASK  "144(%%rbp)"
     595 #define HB_CLEAR_MASK  "152(%%rbp)"
     596 #define AMASK0_8_0     "160(%%rbp)"
     597 #define AMASK6_2_0     "168(%%rbp)"
     598 #define AMASK4_4_0     "176(%%rbp)"
     599 #define AMASK0_2_6     "184(%%rbp)"
     600 #define AMASK2_3_3     "192(%%rbp)"
     601 #define AMASK4_2_2     "200(%%rbp)"
    602 
    603 #else // !PNG_x86_64_USE_GOTPCREL
    604 
    605 static PNG_CONST ull _mask8_0  __attribute__((used, aligned(8))) = 0x0102040810204080LL;
    606 
    607 static PNG_CONST ull _mask16_1 __attribute__((used, aligned(8))) = 0x0101020204040808LL;
    608 static PNG_CONST ull _mask16_0 __attribute__((used, aligned(8))) = 0x1010202040408080LL;
    609 
    610 static PNG_CONST ull _mask24_2 __attribute__((used, aligned(8))) = 0x0101010202020404LL;
    611 static PNG_CONST ull _mask24_1 __attribute__((used, aligned(8))) = 0x0408080810101020LL;
    612 static PNG_CONST ull _mask24_0 __attribute__((used, aligned(8))) = 0x2020404040808080LL;
    613 
    614 static PNG_CONST ull _mask32_3 __attribute__((used, aligned(8))) = 0x0101010102020202LL;
    615 static PNG_CONST ull _mask32_2 __attribute__((used, aligned(8))) = 0x0404040408080808LL;
    616 static PNG_CONST ull _mask32_1 __attribute__((used, aligned(8))) = 0x1010101020202020LL;
    617 static PNG_CONST ull _mask32_0 __attribute__((used, aligned(8))) = 0x4040404080808080LL;
    618 
    619 static PNG_CONST ull _mask48_5 __attribute__((used, aligned(8))) = 0x0101010101010202LL;
    620 static PNG_CONST ull _mask48_4 __attribute__((used, aligned(8))) = 0x0202020204040404LL;
    621 static PNG_CONST ull _mask48_3 __attribute__((used, aligned(8))) = 0x0404080808080808LL;
    622 static PNG_CONST ull _mask48_2 __attribute__((used, aligned(8))) = 0x1010101010102020LL;
    623 static PNG_CONST ull _mask48_1 __attribute__((used, aligned(8))) = 0x2020202040404040LL;
    624 static PNG_CONST ull _mask48_0 __attribute__((used, aligned(8))) = 0x4040808080808080LL;
    625 
    626 // png_do_read_interlace() constants:
    627 static PNG_CONST ull _amask5_3_0  __attribute__((aligned(8))) = 0x0000000000FFFFFFLL;  // was _const4
    628 static PNG_CONST ull _amask7_1_0  __attribute__((aligned(8))) = 0x00000000000000FFLL;  // was _const6
    629 
    630 // png_read_filter_row_mmx_avg() constants:
    631 static PNG_CONST ull _LBCarryMask __attribute__((used, aligned(8))) = 0x0101010101010101LL;
    632 static PNG_CONST ull _HBClearMask __attribute__((used, aligned(8))) = 0x7f7f7f7f7f7f7f7fLL;
    633 static PNG_CONST ull _amask0_8_0  __attribute__((used, aligned(8))) = 0xFFFFFFFFFFFFFFFFLL;
    634 static PNG_CONST ull _amask6_2_0  __attribute__((used, aligned(8))) = 0x000000000000FFFFLL;
    635 
    636 // png_read_filter_row_mmx_paeth() constants:
    637 static PNG_CONST ull _amask4_4_0  __attribute__((used, aligned(8))) = 0x00000000FFFFFFFFLL;
    638 static PNG_CONST ull _amask0_2_6  __attribute__((used, aligned(8))) = 0xFFFF000000000000LL;
    639 
    640 // png_read_filter_row_mmx_sub() constants:
    641 static PNG_CONST ull _amask2_3_3  __attribute__((used, aligned(8))) = 0x0000FFFFFF000000LL;
    642 static PNG_CONST ull _amask4_2_2  __attribute__((used, aligned(8))) = 0x00000000FFFF0000LL;
    643 
     644 #define MASK8_0        "_mask8_0"      // non-GOTPCREL build:  each mask
     645 #define MASK16_0       "_mask16_0"     //  is the bare symbol name of one
     646 #define MASK16_1       "_mask16_1"     //  of the static constants above,
     647 #define MASK24_0       "_mask24_0"     //  pasted into asm strings as a
     648 #define MASK24_1       "_mask24_1"     //  memory operand
     649 #define MASK24_2       "_mask24_2"
     650 #define MASK32_0       "_mask32_0"
     651 #define MASK32_1       "_mask32_1"
     652 #define MASK32_2       "_mask32_2"
     653 #define MASK32_3       "_mask32_3"
     654 #define MASK48_0       "_mask48_0"
     655 #define MASK48_1       "_mask48_1"
     656 #define MASK48_2       "_mask48_2"
     657 #define MASK48_3       "_mask48_3"
     658 #define MASK48_4       "_mask48_4"
     659 #define MASK48_5       "_mask48_5"
     660 #define AMASK5_3_0     "_amask5_3_0"
     661 #define AMASK7_1_0     "_amask7_1_0"
     662 #define LB_CARRY_MASK  "_LBCarryMask"
     663 #define HB_CLEAR_MASK  "_HBClearMask"
     664 #define AMASK0_8_0     "_amask0_8_0"
     665 #define AMASK6_2_0     "_amask6_2_0"
     666 #define AMASK4_4_0     "_amask4_4_0"
     667 #define AMASK0_2_6     "_amask0_2_6"
     668 #define AMASK2_3_3     "_amask2_3_3"
     669 #define AMASK4_2_2     "_amask4_2_2"
    670 
    671 #endif // ?PNG_x86_64_USE_GOTPCREL
    672 
    673 
     674 #if defined(PNG_HAVE_MMX_READ_FILTER_ROW) || defined(PNG_HAVE_MMX_COMBINE_ROW)
     675 // Storage for the Paeth temporaries (pa/pb/pc) plus the rbp/GOT helpers.
     676 // this block is specific to png_read_filter_row_mmx_paeth() except for
     677 // LOAD_GOT_rbp and RESTORE_rbp, which are also used in png_combine_row()
     678 #if defined(PNG_x86_64_USE_GOTPCREL)
     679 #  define pa_TEMP                "%%r11d"  // GOTPCREL build:  the temps live
     680 #  define pb_TEMP                "%%r12d"  //  in the low 32 bits of r11, r12
     681 #  define pc_TEMP                "%%r13d"  //  and r13 (no static storage)
     682 #  if defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED)  // works as of gcc 3.4.3 ...
     683 #    define SAVE_r11_r12_r13               // regs are listed as clobbers, so
     684 #    define RESTORE_r11_r12_r13            //  no explicit push/pop is emitted
     685 #    define _CLOBBER_r11_r12_r13 ,"%r11", "%r12", "%r13"  // leading-comma form
     686 #    define CLOBBER_r11_r12_r13  "%r11", "%r12", "%r13"
     687 #  else // !PNG_CLOBBER_x86_64_REGS_SUPPORTED
     688 #    define SAVE_r11_r12_r13     "pushq %%r11  \n\t" \
     689                                  "pushq %%r12  \n\t" \
     690                                  "pushq %%r13  \n\t"  // "normally 0-extended"
     691 #    define RESTORE_r11_r12_r13  "popq  %%r13  \n\t" \
     692                                  "popq  %%r12  \n\t" \
     693                                  "popq  %%r11  \n\t"
     694 #    define _CLOBBER_r11_r12_r13           // saved/restored by hand => not
     695 #    define CLOBBER_r11_r12_r13            //  listed as clobbered
     696 #  endif
     697 #  define LOAD_GOT_rbp           "pushq %%rbp                        \n\t" \
     698                                  "movq  _c64@GOTPCREL(%%rip), %%rbp  \n\t"  // rbp = address of _c64, fetched via the GOT
     699 #  define RESTORE_rbp            "popq  %%rbp                        \n\t"
     700 #else // 32-bit and/or non-PIC
     701 #  if defined(PNG_THREAD_UNSAFE_OK)
     702      // These variables are used in png_read_filter_row_mmx_paeth() and would be
     703      //   local variables if not for gcc-inline-assembly addressing limitations
     704      //   (some apparently related to ELF format, others to CPU type).
     705      //
     706      // WARNING: Their presence defeats the thread-safety of libpng.
     707      static int                     _patemp  __attribute__((used));
     708      static int                     _pbtemp  __attribute__((used));
     709      static int                     _pctemp  __attribute__((used));
     710 #    define pa_TEMP                "_patemp"
     711 #    define pb_TEMP                "_pbtemp"  // temp variables for
     712 #    define pc_TEMP                "_pctemp"  //  Paeth routine
     713 #    define SAVE_r11_r12_r13
     714 #    define RESTORE_r11_r12_r13
     715 #    define _CLOBBER_r11_r12_r13   // not using regs => not clobbering
     716 #    define CLOBBER_r11_r12_r13
     717 #  endif // PNG_THREAD_UNSAFE_OK
         // NOTE(review): when PNG_THREAD_UNSAFE_OK is not defined, pa_TEMP/pb_TEMP/
         //   pc_TEMP and the *_r11_r12_r13 macros stay undefined on this path;
         //   presumably the MMX Paeth code is not compiled then -- confirm.
     718 #  define LOAD_GOT_rbp             // constants are addressed by symbol name
     719 #  define RESTORE_rbp              //  here, so rbp is left alone
     720 #endif
     721 // Per-ABI helpers:  ebp and r15 handling, FullLength stash, pointer-width regs.
     722 #if defined(__x86_64__)
     723 #  define SAVE_ebp                                 // 64-bit:  no push/pop; ebp
     724 #  define RESTORE_ebp                              //  is listed as a clobber
     725 #  define _CLOBBER_ebp         ,"%ebp"             // leading-comma form
     726 #  define CLOBBER_ebp          "%ebp"
     727 #  define SAVE_FullLength      "movl %%eax, %%r15d  \n\t"  // park eax in r15d
     728 #  define RESTORE_FullLength   "movl %%r15d, "     // may go into eax or ecx
     729 #  if defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED)   // works as of gcc 3.4.3 ...
     730 #    define SAVE_r15
     731 #    define RESTORE_r15
     732 #    define _CLOBBER_r15       ,"%r15"
     733 #  else
     734 #    define SAVE_r15           "pushq %%r15  \n\t"
     735 #    define RESTORE_r15        "popq  %%r15  \n\t"
     736 #    define _CLOBBER_r15
     737 #  endif
     738 #  define PBP                  "%%rbp"             // regs used for 64-bit
     739 #  define PAX                  "%%rax"             //  pointers or in
     740 #  define PBX                  "%%rbx"             //  combination with
     741 #  define PCX                  "%%rcx"             //  64-bit pointer-regs
     742 #  define PDX                  "%%rdx"             //  (base/index pairs,
     743 #  define PSI                  "%%rsi"             //  add/sub/mov pairs)
     744 #  define CLEAR_BOTTOM_3_BITS  "and  $0xfffffffffffffff8, "  // user appends the register operand
     745 #else
     746 #  define SAVE_ebp             "pushl %%ebp \n\t"  // clobber list doesn't work
     747 #  define RESTORE_ebp          "popl  %%ebp \n\t"  //  for %ebp on 32-bit; not
     748 #  define _CLOBBER_ebp                             //  clear why not
     749 #  define CLOBBER_ebp
     750 #  define SAVE_FullLength      "pushl %%eax \n\t"
     751 #  define RESTORE_FullLength   "popl "             // eax (avg) or ecx (paeth)
     752 #  define SAVE_r15
     753 #  define RESTORE_r15
     754 #  define _CLOBBER_r15
     755 #  define PBP                  "%%ebp"             // regs used for or in
     756 #  define PAX                  "%%eax"             //  combination with
     757 #  define PBX                  "%%ebx"             //  "normal," 32-bit
     758 #  define PCX                  "%%ecx"             //  pointers
     759 #  define PDX                  "%%edx"
     760 #  define PSI                  "%%esi"
     761 #  define CLEAR_BOTTOM_3_BITS  "and  $0xfffffff8, "  // user appends the register operand
     762 #endif // ?__x86_64__
     763 // Separator glue:  supplies the ':' or ',' required between the optional CLOBBER_* pieces.
     764 // CLOB_COMMA_ebx_ebp:  need comma ONLY if both CLOBBER_ebp and CLOBBER_GOT_ebx
     765 //                      have values, i.e., only if __x86_64__ AND !__PIC__
     766 #if defined(__x86_64__) && !defined(__PIC__)
     767 #  define CLOB_COMMA_ebx_ebp    , // clobbering both ebp and ebx => need comma
     768 #else
     769 #  define CLOB_COMMA_ebx_ebp
     770 #endif
     771 
     772 // CLOB_COMMA_ebX_r1X:  need comma UNLESS both CLOBBER_ebp and CLOBBER_GOT_ebx
     773 //                   are empty OR CLOBBER_r11_r12_r13 is empty--i.e., NO comma
     774 //                   if (!__x86_64__ AND __PIC__) OR !(PNG_x86_64_USE_GOTPCREL
     775 //                   AND PNG_CLOBBER_x86_64_REGS_SUPPORTED)   (double sigh...)
     776 #if (!defined(__x86_64__) && defined(__PIC__)) || \
     777     !defined(PNG_x86_64_USE_GOTPCREL) || \
     778     !defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED)
     779 #  define CLOB_COMMA_ebX_r1X
     780 #else
     781 #  define CLOB_COMMA_ebX_r1X    , // clobbering (ebp OR ebx) AND r11_r12_r13
     782 #endif
     783 
     784 // CLOB_COLON_ebx_ebp:  need colon unless CLOBBER_ebp and CLOBBER_GOT_ebx are
     785 //                      BOTH empty--i.e., NO colon if (!__x86_64__ AND __PIC__)
     786 // CLOB_COLON_ebx_ebp_r1X:  if, in addition, CLOBBER_r11_r12_r13 is empty, then
     787 //                          no colon for Paeth blocks, either--i.e., NO colon
     788 //                          if !(PNG_x86_64_USE_GOTPCREL AND
     789 //                               PNG_CLOBBER_x86_64_REGS_SUPPORTED)
     790 #if (!defined(__x86_64__) && defined(__PIC__))
     791 #  define CLOB_COLON_ebx_ebp
     792 #  if !(defined(PNG_x86_64_USE_GOTPCREL) && \
     793         defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED))
     794 #    define CLOB_COLON_ebx_ebp_r1X
     795 #  else
     796 #    define CLOB_COLON_ebx_ebp_r1X  : // clobbering ebp OR ebx OR r11_r12_r13
     797 #  endif
     798 #else
     799 #  define CLOB_COLON_ebx_ebp        : // clobbering ebp OR ebx
     800 #  define CLOB_COLON_ebx_ebp_r1X    : // clobbering ebp OR ebx OR r11_r12_r13
     801 #endif
     802 
     803 #endif // PNG_HAVE_MMX_READ_FILTER_ROW || PNG_HAVE_MMX_COMBINE_ROW
    804 
     805 #if defined(__PIC__)  // macros to save, restore index to Global Offset Table
     806 #  if defined(__x86_64__)
     807 #    define SAVE_GOT_ebx     "pushq %%rbx \n\t"  // 64-bit:  the full rbx is
     808 #    define RESTORE_GOT_ebx  "popq  %%rbx \n\t"  //  pushed and popped
     809 #  else
     810 #    define SAVE_GOT_ebx     "pushl %%ebx \n\t"
     811 #    define RESTORE_GOT_ebx  "popl  %%ebx \n\t"
     812 #  endif
     813 #  define _CLOBBER_GOT_ebx   // explicitly saved, restored => not clobbered
     814 #  define CLOBBER_GOT_ebx
     815 #else
     816 #  define SAVE_GOT_ebx       // non-PIC:  nothing is pushed or popped;
     817 #  define RESTORE_GOT_ebx    //  ebx is named in the clobber list instead
     818 #  define _CLOBBER_GOT_ebx   ,"%ebx"             // leading-comma form
     819 #  define CLOBBER_GOT_ebx    "%ebx"
     820 #endif // ?__PIC__
    821 
     822 #if defined(PNG_HAVE_MMX_COMBINE_ROW) || defined(PNG_HAVE_MMX_READ_INTERLACE)
     823 #  define BPP2  2
     824 #  define BPP3  3  // bytes per pixel (a.k.a. pixel_bytes)
     825 #  define BPP4  4  // (defined only to help avoid cut-and-paste errors)
     826 #  define BPP6  6
     827 #  define BPP8  8
     828 #endif // PNG_HAVE_MMX_COMBINE_ROW || PNG_HAVE_MMX_READ_INTERLACE
    829 
    830 
    831 
     832 static int _mmx_supported = 2; // 0: no MMX; 1: MMX supported; 2: not tested (until png_mmx_support() runs)
    833 
    834 /*===========================================================================*/
    835 /*                                                                           */
    836 /*                      P N G _ M M X _ S U P P O R T                        */
    837 /*                                                                           */
    838 /*===========================================================================*/
    839 
     840 // GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
     841 //             (2) all instructions compile with gcc 2.7.2.3 and later
     842 //           x (3) the function is moved down here to prevent gcc from
     843 //           x      inlining it in multiple places and then barfing be-
     844 //           x      cause the ".NOT_SUPPORTED" label is multiply defined
     845 //                  [need to retest with gcc 2.7.2.3]
     846 
     847 // GRR 20070524:  This declaration apparently is compatible with but supersedes
     848 //   the one in png.h; in any case, the generated object file is slightly
     849 //   smaller.  It is unnecessary with gcc 4.1.2, but gcc 2.x apparently
     850 //   replicated the ".NOT_SUPPORTED" label in each location the function was
     851 //   inlined, leading to compilation errors due to the "multiply defined"
     852 //   label.  Old workaround was to leave the function at the end of this
     853 //   file; new one (still testing) is to use a gcc-specific function attribute
     854 //   to prevent local inlining.
     855 int PNGAPI
     856 png_mmx_support(void) __attribute__((noinline));
     857 // png_mmx_support():  probe the CPU for MMX and record the answer.
         //   Returns 1 if the EFLAGS ID bit can be toggled (CPUID available), CPUID
         //   function 0 returns eax >= 1, and function 1 sets bit 23 of edx; else 0.
         //   The result is also stored in the file-scope _mmx_supported.  When
         //   PNG_MMX_CODE_SUPPORTED is not defined, no asm runs and the result is 0.
     858 int PNGAPI
     859 png_mmx_support(void)
     860 {
     861 #if defined(PNG_MMX_CODE_SUPPORTED)  // superfluous, but what the heck
     862     int result;
     863     __asm__ __volatile__ (
     864 #if defined(__x86_64__)
     865         "pushq %%rbx          \n\t"  // rbx gets clobbered by CPUID instruction
     866         "pushq %%rcx          \n\t"  // so does rcx...
     867         "pushq %%rdx          \n\t"  // ...and rdx (but rcx & rdx safe on Linux)
     868         "pushfq               \n\t"  // save Eflag to stack
     869         "popq %%rax           \n\t"  // get Eflag from stack into rax
     870         "movq %%rax, %%rcx    \n\t"  // make another copy of Eflag in rcx
     871         "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
     872         "pushq %%rax          \n\t"  // save modified Eflag back to stack
     873         "popfq                \n\t"  // restore modified value to Eflag reg
     874         "pushfq               \n\t"  // save Eflag to stack
     875         "popq %%rax           \n\t"  // get Eflag from stack
     876         "pushq %%rcx          \n\t"  // save original Eflag to stack
     877         "popfq                \n\t"  // restore original Eflag
     878 #else
     879         "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
     880         "pushl %%ecx          \n\t"  // so does ecx...
     881         "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
     882         "pushfl               \n\t"  // save Eflag to stack
     883         "popl %%eax           \n\t"  // get Eflag from stack into eax
     884         "movl %%eax, %%ecx    \n\t"  // make another copy of Eflag in ecx
     885         "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
     886         "pushl %%eax          \n\t"  // save modified Eflag back to stack
     887         "popfl                \n\t"  // restore modified value to Eflag reg
     888         "pushfl               \n\t"  // save Eflag to stack
     889         "popl %%eax           \n\t"  // get Eflag from stack
     890         "pushl %%ecx          \n\t"  // save original Eflag to stack
     891         "popfl                \n\t"  // restore original Eflag
     892 #endif
     893         "xorl %%ecx, %%eax    \n\t"  // compare new Eflag with original Eflag
     894         "jz 0f                \n\t"  // if same, CPUID instr. is not supported
     895 
     896         "xorl %%eax, %%eax    \n\t"  // set eax to zero
     897 //      ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode)
     898         "cpuid                \n\t"  // get the CPU identification info
     899         "cmpl $1, %%eax       \n\t"  // make sure eax returned a non-zero value
     900         "jl 0f                \n\t"  // if eax is zero, MMX is not supported
     901 
     902         "xorl %%eax, %%eax    \n\t"  // set eax to zero and...
     903         "incl %%eax           \n\t"  // ...increment eax to 1.  This pair is
     904                                      // faster than the instruction "mov eax, 1"
     905         "cpuid                \n\t"  // get the CPU identification info again
     906         "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
     907         "cmpl $0, %%edx       \n\t"  // 0 = MMX not supported
     908         "jz 0f                \n\t"  // non-zero = yes, MMX IS supported
     909 
     910         "movl $1, %%eax       \n\t"  // set return value to 1
     911         "jmp  1f              \n\t"  // DONE:  have MMX support
     912 
     913     "0:                       \n\t"  // .NOT_SUPPORTED: target label for jump instructions
     914         "movl $0, %%eax       \n\t"  // set return value to 0
     915     "1:                       \n\t"  // .RETURN: target label for jump instructions
     916 #if defined(__x86_64__)
     917         "popq %%rdx           \n\t"  // restore rdx
     918         "popq %%rcx           \n\t"  // restore rcx
     919         "popq %%rbx           \n\t"  // restore rbx
     920 #else
     921         "popl %%edx           \n\t"  // restore edx
     922         "popl %%ecx           \n\t"  // restore ecx
     923         "popl %%ebx           \n\t"  // restore ebx
     924 #endif
     925 
     926 //      "ret                  \n\t"  // DONE:  no MMX support
     927                                      // (fall through to standard C "ret")
     928 
     929         : "=a" (result)              // output list
     930 
     931         :                            // any variables used on input (none)
     932         // NOTE(review): the pushes/pops above move the stack pointer without
         //   gcc's knowledge; on x86-64 this presumably relies on nothing live
         //   sitting in the red zone below rsp -- confirm (or use -mno-red-zone).
     933                                      // no clobber list
     934 //      , "%ebx", "%ecx", "%edx"     // GRR:  we handle these manually
     935 //      , "memory"   // if write to a variable gcc thought was in a reg
     936 //      , "cc"       // "condition codes" (flag bits)
     937     );
     938     _mmx_supported = result;
     939 #else
     940     _mmx_supported = 0;
     941 #endif /* PNG_MMX_CODE_SUPPORTED */
     942 
     943     return _mmx_supported;
     944 }
    945 
    946 
    947 /*===========================================================================*/
    948 /*                                                                           */
    949 /*                       P N G _ C O M B I N E _ R O W                       */
    950 /*                                                                           */
    951 /*===========================================================================*/
    952 
    953 #if defined(PNG_HAVE_MMX_COMBINE_ROW)
    954 
    955 /* Combines the row recently read in with the previous row.
    956    This routine takes care of alpha and transparency if requested.
    957    This routine also handles the two methods of progressive display
    958    of interlaced images, depending on the mask value.
    959    The mask value describes which pixels are to be combined with
    960    the row.  The pattern always repeats every 8 pixels, so just 8
    961    bits are needed.  A one indicates the pixel is to be combined; a
    962    zero indicates the pixel is to be skipped.  This is in addition
    963    to any alpha or transparency value associated with the pixel.
    964    If you want all pixels to be combined, pass 0xff (255) in mask. */
    965 
    966 /* Use this routine for the x86 platform - it uses a faster MMX routine
    967    if the machine supports MMX. */
    968 
    969 void /* PRIVATE */
    970 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
    971 {
    972    int dummy_value_a;    // fix 'forbidden register spilled' error
    973    int dummy_value_c;
    974    int dummy_value_d;
    975    png_bytep dummy_value_S;
    976    png_bytep dummy_value_D;
    977 
    978    png_debug(1, "in png_combine_row (pnggccrd.c)\n");
    979 
    980    if (_mmx_supported == 2) {
    981 #if !defined(PNG_1_0_X)
    982        /* this should have happened in png_init_mmx_flags() already */
    983        png_warning(png_ptr, "asm_flags may not have been initialized");
    984 #endif
    985        png_mmx_support();
    986    }
    987 
    988    if (mask == 0xff)
    989    {
    990       png_debug(2,"mask == 0xff:  doing single png_memcpy()\n");
    991       png_memcpy(row, png_ptr->row_buf + 1,
    992        (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
    993    }
    994    else   /* (png_combine_row() is never called with mask == 0) */
    995    {
    996       switch (png_ptr->row_info.pixel_depth)
    997       {
    998          case 24:       /* png_ptr->row_info.pixel_depth */
    999          {
   1000             png_bytep srcptr;
   1001             png_bytep dstptr;
   1002 
   1003 #if !defined(PNG_1_0_X)
   1004             if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
   1005 #else
   1006             if (_mmx_supported)
   1007 #endif
   1008             {
   1009                png_uint_32 len;
   1010                int diff;
   1011 
   1012                srcptr = png_ptr->row_buf + 1;
   1013                dstptr = row;
   1014                len  = png_ptr->width & ~7;          // reduce to multiple of 8
   1015                diff = (int) (png_ptr->width & 7);   // amount lost
   1016 
   1017                __asm__ __volatile__ (
   1018                   "not       %%edx            \n\t" // mask => unmask
   1019                   "movd      %%edx, %%mm7     \n\t" // load bit pattern
   1020                   "not       %%edx            \n\t" // unmask => mask for later
   1021                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
   1022                   "punpcklbw %%mm7, %%mm7     \n\t"
   1023                   "punpcklwd %%mm7, %%mm7     \n\t"
   1024                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
   1025 
   1026                   LOAD_GOT_rbp
   1027                   "movq   " MASK24_0 ", %%mm0 \n\t" // _mask24_0 -> mm0
   1028                   "movq   " MASK24_1 ", %%mm1 \n\t" // _mask24_1 -> mm1
   1029                   "movq   " MASK24_2 ", %%mm2 \n\t" // _mask24_2 -> mm2
   1030                   RESTORE_rbp
   1031 
   1032                   "pand      %%mm7, %%mm0     \n\t"
   1033                   "pand      %%mm7, %%mm1     \n\t"
   1034                   "pand      %%mm7, %%mm2     \n\t"
   1035 
   1036                   "pcmpeqb   %%mm6, %%mm0     \n\t"
   1037                   "pcmpeqb   %%mm6, %%mm1     \n\t"
   1038                   "pcmpeqb   %%mm6, %%mm2     \n\t"
   1039 
   1040 // preload        "movl      len, %%ecx       \n\t" // load length of line
   1041 // preload        "movl      srcptr, %3       \n\t" // load source
   1042 // preload        "movl      dstptr, %4       \n\t" // load dest
   1043 
   1044                   "cmpl      $0, %%ecx        \n\t"
   1045                   "jz        mainloop24end    \n\t"
   1046 
   1047                 "mainloop24:                  \n\t"
   1048                   "movq      (%3), %%mm4      \n\t"
   1049                   "pand      %%mm0, %%mm4     \n\t"
   1050                   "movq      %%mm0, %%mm6     \n\t"
   1051                   "movq      (%4), %%mm7      \n\t"
   1052                   "pandn     %%mm7, %%mm6     \n\t"
   1053                   "por       %%mm6, %%mm4     \n\t"
   1054                   "movq      %%mm4, (%4)      \n\t"
   1055 
   1056                   "movq      8(%3), %%mm5     \n\t"
   1057                   "pand      %%mm1, %%mm5     \n\t"
   1058                   "movq      %%mm1, %%mm7     \n\t"
   1059                   "movq      8(%4), %%mm6     \n\t"
   1060                   "pandn     %%mm6, %%mm7     \n\t"
   1061                   "por       %%mm7, %%mm5     \n\t"
   1062                   "movq      %%mm5, 8(%4)     \n\t"
   1063 
   1064                   "movq      16(%3), %%mm6    \n\t"
   1065                   "pand      %%mm2, %%mm6     \n\t"
   1066                   "movq      %%mm2, %%mm4     \n\t"
   1067                   "movq      16(%4), %%mm7    \n\t"
   1068                   "pandn     %%mm7, %%mm4     \n\t"
   1069                   "por       %%mm4, %%mm6     \n\t"
   1070                   "movq      %%mm6, 16(%4)    \n\t"
   1071 
   1072                   "add       $24, %3          \n\t" // inc by 24 bytes processed
   1073                   "add       $24, %4          \n\t"
   1074                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
   1075 
   1076                   "ja        mainloop24       \n\t"
   1077 
   1078                 "mainloop24end:               \n\t"
   1079 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
   1080                   "movl      %%eax, %%ecx     \n\t"
   1081                   "cmpl      $0, %%ecx        \n\t"
   1082                   "jz        end24            \n\t"
   1083 // preload        "movl      mask, %%edx      \n\t"
   1084                   "sall      $24, %%edx       \n\t" // make low byte, high byte
   1085 
   1086                 "secondloop24:                \n\t"
   1087                   "sall      %%edx            \n\t" // move high bit to CF
   1088                   "jnc       skip24           \n\t" // if CF = 0
   1089                   "movw      (%3), %%ax       \n\t"
   1090                   "movw      %%ax, (%4)       \n\t"
   1091                   "xorl      %%eax, %%eax     \n\t"
   1092                   "movb      2(%3), %%al      \n\t"
   1093                   "movb      %%al, 2(%4)      \n\t"
   1094 
   1095                 "skip24:                      \n\t"
   1096                   "add       $3, %3           \n\t"
   1097                   "add       $3, %4           \n\t"
   1098                   "decl      %%ecx            \n\t"
   1099                   "jnz       secondloop24     \n\t"
   1100 
   1101                 "end24:                       \n\t"
   1102                   "EMMS                       \n\t" // DONE
   1103 
   1104                   : "=a" (dummy_value_a),           // output regs (dummy)
   1105                     "=d" (dummy_value_d),
   1106                     "=c" (dummy_value_c),
   1107                     "=S" (dummy_value_S),
   1108                     "=D" (dummy_value_D)
   1109 
   1110                   : "0" (diff),        // eax       // input regs
   1111                     "1" (mask),        // edx
   1112                     "2" (len),         // ecx
   1113 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
   1114                     "3" (srcptr),      // esi/rsi
   1115                     "4" (dstptr)       // edi/rdi
   1116 
   1117 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
   1118                   : "%mm0", "%mm1", "%mm2"          // clobber list
   1119                   , "%mm4", "%mm5", "%mm6", "%mm7"
   1120 #endif
   1121                );
   1122             }
   1123             else /* not _mmx_supported - use modified C routine */
   1124             {
   1125                register png_uint_32 i;
   1126                png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
   1127                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
   1128                register int stride = BPP3 * png_pass_inc[png_ptr->pass];
   1129                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
   1130                register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
   1131                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
   1132                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
   1133                int diff = (int) (png_ptr->width & 7); /* amount lost */
   1134                register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */
   1135 
   1136                srcptr = png_ptr->row_buf + 1 + initial_val;
   1137                dstptr = row + initial_val;
   1138 
   1139                for (i = initial_val; i < final_val; i += stride)
   1140                {
   1141                   png_memcpy(dstptr, srcptr, rep_bytes);
   1142                   srcptr += stride;
   1143                   dstptr += stride;
   1144                }
   1145                if (diff)  /* number of leftover pixels:  3 for pngtest */
   1146                {
   1147                   final_val += diff*BPP3;
   1148                   for (; i < final_val; i += stride)
   1149                   {
   1150                      if (rep_bytes > (int)(final_val-i))
   1151                         rep_bytes = (int)(final_val-i);
   1152                      png_memcpy(dstptr, srcptr, rep_bytes);
   1153                      srcptr += stride;
   1154                      dstptr += stride;
   1155                   }
   1156                }
   1157             } /* end of else (_mmx_supported) */
   1158 
   1159             break;
   1160          }       /* end 24 bpp */
   1161 
   1162          // formerly claimed to be most common case (combining 32-bit RGBA),
   1163          // but almost certainly less common than 24-bit RGB case
   1164          case 32:       /* png_ptr->row_info.pixel_depth */
   1165          {
   1166             png_bytep srcptr;
   1167             png_bytep dstptr;
   1168 
   1169 #if !defined(PNG_1_0_X)
   1170             if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
   1171 #else
   1172             if (_mmx_supported)
   1173 #endif
   1174             {
   1175                png_uint_32 len;
   1176                int diff;
   1177 
   1178                srcptr = png_ptr->row_buf + 1;
   1179                dstptr = row;
   1180                len  = png_ptr->width & ~7;          // reduce to multiple of 8
   1181                diff = (int) (png_ptr->width & 7);   // amount lost
   1182 
   1183                __asm__ __volatile__ (
   1184                   "not       %%edx            \n\t" // mask => unmask
   1185                   "movd      %%edx, %%mm7     \n\t" // load bit pattern
   1186                   "not       %%edx            \n\t" // unmask => mask for later
   1187                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
   1188                   "punpcklbw %%mm7, %%mm7     \n\t"
   1189                   "punpcklwd %%mm7, %%mm7     \n\t"
   1190                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
   1191 
   1192                   LOAD_GOT_rbp
   1193                   "movq   " MASK32_0 ", %%mm0 \n\t" // _mask32_0
   1194                   "movq   " MASK32_1 ", %%mm1 \n\t" // _mask32_1
   1195                   "movq   " MASK32_2 ", %%mm2 \n\t" // _mask32_2
   1196                   "movq   " MASK32_3 ", %%mm3 \n\t" // _mask32_3
   1197                   RESTORE_rbp
   1198 
   1199                   "pand      %%mm7, %%mm0     \n\t"
   1200                   "pand      %%mm7, %%mm1     \n\t"
   1201                   "pand      %%mm7, %%mm2     \n\t"
   1202                   "pand      %%mm7, %%mm3     \n\t"
   1203 
   1204                   "pcmpeqb   %%mm6, %%mm0     \n\t"
   1205                   "pcmpeqb   %%mm6, %%mm1     \n\t"
   1206                   "pcmpeqb   %%mm6, %%mm2     \n\t"
   1207                   "pcmpeqb   %%mm6, %%mm3     \n\t"
   1208 
   1209 // preload        "movl      len, %%ecx       \n\t" // load length of line
   1210 // preload        "movl      srcptr, %3       \n\t" // load source
   1211 // preload        "movl      dstptr, %4       \n\t" // load dest
   1212 
   1213                   "cmpl      $0, %%ecx        \n\t" // lcr
   1214                   "jz        mainloop32end    \n\t"
   1215 
   1216                 "mainloop32:                  \n\t"
   1217                   "movq      (%3), %%mm4      \n\t"
   1218                   "pand      %%mm0, %%mm4     \n\t"
   1219                   "movq      %%mm0, %%mm6     \n\t"
   1220                   "movq      (%4), %%mm7      \n\t"
   1221                   "pandn     %%mm7, %%mm6     \n\t"
   1222                   "por       %%mm6, %%mm4     \n\t"
   1223                   "movq      %%mm4, (%4)      \n\t"
   1224 
   1225                   "movq      8(%3), %%mm5     \n\t"
   1226                   "pand      %%mm1, %%mm5     \n\t"
   1227                   "movq      %%mm1, %%mm7     \n\t"
   1228                   "movq      8(%4), %%mm6     \n\t"
   1229                   "pandn     %%mm6, %%mm7     \n\t"
   1230                   "por       %%mm7, %%mm5     \n\t"
   1231                   "movq      %%mm5, 8(%4)     \n\t"
   1232 
   1233                   "movq      16(%3), %%mm6    \n\t"
   1234                   "pand      %%mm2, %%mm6     \n\t"
   1235                   "movq      %%mm2, %%mm4     \n\t"
   1236                   "movq      16(%4), %%mm7    \n\t"
   1237                   "pandn     %%mm7, %%mm4     \n\t"
   1238                   "por       %%mm4, %%mm6     \n\t"
   1239                   "movq      %%mm6, 16(%4)    \n\t"
   1240 
   1241                   "movq      24(%3), %%mm7    \n\t"
   1242                   "pand      %%mm3, %%mm7     \n\t"
   1243                   "movq      %%mm3, %%mm5     \n\t"
   1244                   "movq      24(%4), %%mm4    \n\t"
   1245                   "pandn     %%mm4, %%mm5     \n\t"
   1246                   "por       %%mm5, %%mm7     \n\t"
   1247                   "movq      %%mm7, 24(%4)    \n\t"
   1248 
   1249                   "add       $32, %3          \n\t" // inc by 32 bytes processed
   1250                   "add       $32, %4          \n\t"
   1251                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
   1252                   "ja        mainloop32       \n\t"
   1253 
   1254                 "mainloop32end:               \n\t"
   1255 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
   1256                   "movl      %%eax, %%ecx     \n\t"
   1257                   "cmpl      $0, %%ecx        \n\t"
   1258                   "jz        end32            \n\t"
   1259 // preload        "movl      mask, %%edx      \n\t"
   1260                   "sall      $24, %%edx       \n\t" // low byte => high byte
   1261 
   1262                 "secondloop32:                \n\t"
   1263                   "sall      %%edx            \n\t" // move high bit to CF
   1264                   "jnc       skip32           \n\t" // if CF = 0
   1265                   "movl      (%3), %%eax      \n\t"
   1266                   "movl      %%eax, (%4)      \n\t"
   1267 
   1268                 "skip32:                      \n\t"
   1269                   "add       $4, %3           \n\t"
   1270                   "add       $4, %4           \n\t"
   1271                   "decl      %%ecx            \n\t"
   1272                   "jnz       secondloop32     \n\t"
   1273 
   1274                 "end32:                       \n\t"
   1275                   "EMMS                       \n\t" // DONE
   1276 
   1277                   : "=a" (dummy_value_a),           // output regs (dummy)
   1278                     "=d" (dummy_value_d),
   1279                     "=c" (dummy_value_c),
   1280                     "=S" (dummy_value_S),
   1281                     "=D" (dummy_value_D)
   1282 
   1283                   : "0" (diff),        // eax       // input regs
   1284                     "1" (mask),        // edx
   1285                     "2" (len),         // ecx
   1286 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
   1287                     "3" (srcptr),      // esi/rsi
   1288                     "4" (dstptr)       // edi/rdi
   1289 
   1290 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
   1291                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
   1292                   , "%mm4", "%mm5", "%mm6", "%mm7"
   1293 #endif
   1294                );
   1295             }
   1296             else /* not _mmx_supported - use modified C routine */
   1297             {
   1298                register png_uint_32 i;
   1299                png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
   1300                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
   1301                register int stride = BPP4 * png_pass_inc[png_ptr->pass];
   1302                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
   1303                register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
   1304                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
   1305                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
   1306                int diff = (int) (png_ptr->width & 7); /* amount lost */
   1307                register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */
   1308 
   1309                srcptr = png_ptr->row_buf + 1 + initial_val;
   1310                dstptr = row + initial_val;
   1311 
   1312                for (i = initial_val; i < final_val; i += stride)
   1313                {
   1314                   png_memcpy(dstptr, srcptr, rep_bytes);
   1315                   srcptr += stride;
   1316                   dstptr += stride;
   1317                }
   1318                if (diff)  /* number of leftover pixels:  3 for pngtest */
   1319                {
   1320                   final_val += diff*BPP4;
   1321                   for (; i < final_val; i += stride)
   1322                   {
   1323                      if (rep_bytes > (int)(final_val-i))
   1324                         rep_bytes = (int)(final_val-i);
   1325                      png_memcpy(dstptr, srcptr, rep_bytes);
   1326                      srcptr += stride;
   1327                      dstptr += stride;
   1328                   }
   1329                }
   1330             } /* end of else (_mmx_supported) */
   1331 
   1332             break;
   1333          }       /* end 32 bpp */
   1334 
   1335          case 8:        /* png_ptr->row_info.pixel_depth */
   1336          {
   1337             png_bytep srcptr;
   1338             png_bytep dstptr;
   1339 
   1340 #if !defined(PNG_1_0_X)
   1341             if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
   1342 #else
   1343             if (_mmx_supported)
   1344 #endif
   1345             {
   1346                png_uint_32 len;
   1347                int diff;
   1348 
   1349                srcptr = png_ptr->row_buf + 1;
   1350                dstptr = row;
   1351                len  = png_ptr->width & ~7;          // reduce to multiple of 8
   1352                diff = (int) (png_ptr->width & 7);   // amount lost
   1353 
   1354                __asm__ __volatile__ (
   1355                   "not       %%edx            \n\t" // mask => unmask
   1356                   "movd      %%edx, %%mm7     \n\t" // load bit pattern
   1357                   "not       %%edx            \n\t" // unmask => mask for later
   1358                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
   1359                   "punpcklbw %%mm7, %%mm7     \n\t"
   1360                   "punpcklwd %%mm7, %%mm7     \n\t"
   1361                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
   1362 
   1363                   LOAD_GOT_rbp
   1364                   "movq   " MASK8_0 ", %%mm0  \n\t" // _mask8_0 -> mm0
   1365                   RESTORE_rbp
   1366 
   1367                   "pand      %%mm7, %%mm0     \n\t" // nonzero if keep byte
   1368                   "pcmpeqb   %%mm6, %%mm0     \n\t" // zeros->1s, v versa
   1369 
   1370 // preload        "movl      len, %%ecx       \n\t" // load length of line
   1371 // preload        "movl      srcptr, %3       \n\t" // load source
   1372 // preload        "movl      dstptr, %4       \n\t" // load dest
   1373 
   1374                   "cmpl      $0, %%ecx        \n\t" // len == 0 ?
   1375                   "je        mainloop8end     \n\t"
   1376 
   1377                 "mainloop8:                   \n\t"
   1378                   "movq      (%3), %%mm4      \n\t" // *srcptr
   1379                   "pand      %%mm0, %%mm4     \n\t"
   1380                   "movq      %%mm0, %%mm6     \n\t"
   1381                   "pandn     (%4), %%mm6      \n\t" // *dstptr
   1382                   "por       %%mm6, %%mm4     \n\t"
   1383                   "movq      %%mm4, (%4)      \n\t"
   1384                   "add       $8, %3           \n\t" // inc by 8 bytes processed
   1385                   "add       $8, %4           \n\t"
   1386                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
   1387                   "ja        mainloop8        \n\t"
   1388 
   1389                 "mainloop8end:                \n\t"
   1390 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
   1391                   "movl      %%eax, %%ecx     \n\t"
   1392                   "cmpl      $0, %%ecx        \n\t"
   1393                   "jz        end8             \n\t"
   1394 // preload        "movl      mask, %%edx      \n\t"
   1395                   "sall      $24, %%edx       \n\t" // make low byte, high byte
   1396 
   1397                 "secondloop8:                 \n\t"
   1398                   "sall      %%edx            \n\t" // move high bit to CF
   1399                   "jnc       skip8            \n\t" // if CF = 0
   1400                   "movb      (%3), %%al       \n\t"
   1401                   "movb      %%al, (%4)       \n\t"
   1402 
   1403                 "skip8:                       \n\t"
   1404                   "inc       %3               \n\t"
   1405                   "inc       %4               \n\t"
   1406                   "decl      %%ecx            \n\t"
   1407                   "jnz       secondloop8      \n\t"
   1408 
   1409                 "end8:                        \n\t"
   1410                   "EMMS                       \n\t" // DONE
   1411 
   1412                   : "=a" (dummy_value_a),           // output regs (dummy)
   1413                     "=d" (dummy_value_d),
   1414                     "=c" (dummy_value_c),
   1415                     "=S" (dummy_value_S),
   1416                     "=D" (dummy_value_D)
   1417 
   1418                   : "0" (diff),        // eax       // input regs
   1419                     "1" (mask),        // edx
   1420                     "2" (len),         // ecx
   1421 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
   1422                     "3" (srcptr),      // esi/rsi
   1423                     "4" (dstptr)       // edi/rdi
   1424 
   1425 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
   1426                   : "%mm0", "%mm4", "%mm6", "%mm7"  // clobber list
   1427 #endif
   1428                );
   1429             }
   1430             else /* not _mmx_supported - use modified C routine */
   1431             {
   1432                register png_uint_32 i;
   1433                png_uint_32 initial_val = png_pass_start[png_ptr->pass];
   1434                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
   1435                register int stride = png_pass_inc[png_ptr->pass];
   1436                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
   1437                register int rep_bytes = png_pass_width[png_ptr->pass];
   1438                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
   1439                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
   1440                int diff = (int) (png_ptr->width & 7); /* amount lost */
   1441                register png_uint_32 final_val = len;  /* GRR bugfix */
   1442 
   1443                srcptr = png_ptr->row_buf + 1 + initial_val;
   1444                dstptr = row + initial_val;
   1445 
   1446                for (i = initial_val; i < final_val; i += stride)
   1447                {
   1448                   png_memcpy(dstptr, srcptr, rep_bytes);
   1449                   srcptr += stride;
   1450                   dstptr += stride;
   1451                }
   1452                if (diff)  /* number of leftover pixels:  3 for pngtest */
   1453                {
   1454                   final_val += diff /* *BPP1 */ ;
   1455                   for (; i < final_val; i += stride)
   1456                   {
   1457                      if (rep_bytes > (int)(final_val-i))
   1458                         rep_bytes = (int)(final_val-i);
   1459                      png_memcpy(dstptr, srcptr, rep_bytes);
   1460                      srcptr += stride;
   1461                      dstptr += stride;
   1462                   }
   1463                }
   1464 
   1465             } /* end of else (_mmx_supported) */
   1466 
   1467             break;
   1468          }       /* end 8 bpp */
   1469 
   1470          case 1:        /* png_ptr->row_info.pixel_depth */
   1471          {
   1472             png_bytep sp;
   1473             png_bytep dp;
   1474             int s_inc, s_start, s_end;
   1475             int m;
   1476             int shift;
   1477             png_uint_32 i;
   1478 
   1479             sp = png_ptr->row_buf + 1;
   1480             dp = row;
   1481             m = 0x80;
   1482 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
   1483             if (png_ptr->transformations & PNG_PACKSWAP)
   1484             {
   1485                s_start = 0;
   1486                s_end = 7;
   1487                s_inc = 1;
   1488             }
   1489             else
   1490 #endif
   1491             {
   1492                s_start = 7;
   1493                s_end = 0;
   1494                s_inc = -1;
   1495             }
   1496 
   1497             shift = s_start;
   1498 
   1499             for (i = 0; i < png_ptr->width; i++)
   1500             {
   1501                if (m & mask)
   1502                {
   1503                   int value;
   1504 
   1505                   value = (*sp >> shift) & 0x1;
   1506                   *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
   1507                   *dp |= (png_byte)(value << shift);
   1508                }
   1509 
   1510                if (shift == s_end)
   1511                {
   1512                   shift = s_start;
   1513                   sp++;
   1514                   dp++;
   1515                }
   1516                else
   1517                   shift += s_inc;
   1518 
   1519                if (m == 1)
   1520                   m = 0x80;
   1521                else
   1522                   m >>= 1;
   1523             }
   1524             break;
   1525          }       /* end 1 bpp */
   1526 
   1527          case 2:        /* png_ptr->row_info.pixel_depth */
   1528          {
   1529             png_bytep sp;
   1530             png_bytep dp;
   1531             int s_start, s_end, s_inc;
   1532             int m;
   1533             int shift;
   1534             png_uint_32 i;
   1535             int value;
   1536 
   1537             sp = png_ptr->row_buf + 1;
   1538             dp = row;
   1539             m = 0x80;
   1540 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
   1541             if (png_ptr->transformations & PNG_PACKSWAP)
   1542             {
   1543                s_start = 0;
   1544                s_end = 6;
   1545                s_inc = 2;
   1546             }
   1547             else
   1548 #endif
   1549             {
   1550                s_start = 6;
   1551                s_end = 0;
   1552                s_inc = -2;
   1553             }
   1554 
   1555             shift = s_start;
   1556 
   1557             for (i = 0; i < png_ptr->width; i++)
   1558             {
   1559                if (m & mask)
   1560                {
   1561                   value = (*sp >> shift) & 0x3;
   1562                   *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
   1563                   *dp |= (png_byte)(value << shift);
   1564                }
   1565 
   1566                if (shift == s_end)
   1567                {
   1568                   shift = s_start;
   1569                   sp++;
   1570                   dp++;
   1571                }
   1572                else
   1573                   shift += s_inc;
   1574                if (m == 1)
   1575                   m = 0x80;
   1576                else
   1577                   m >>= 1;
   1578             }
   1579             break;
   1580          }       /* end 2 bpp */
   1581 
   1582          case 4:        /* png_ptr->row_info.pixel_depth */
   1583          {
   1584             png_bytep sp;
   1585             png_bytep dp;
   1586             int s_start, s_end, s_inc;
   1587             int m;
   1588             int shift;
   1589             png_uint_32 i;
   1590             int value;
   1591 
   1592             sp = png_ptr->row_buf + 1;
   1593             dp = row;
   1594             m = 0x80;
   1595 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
   1596             if (png_ptr->transformations & PNG_PACKSWAP)
   1597             {
   1598                s_start = 0;
   1599                s_end = 4;
   1600                s_inc = 4;
   1601             }
   1602             else
   1603 #endif
   1604             {
   1605                s_start = 4;
   1606                s_end = 0;
   1607                s_inc = -4;
   1608             }
   1609 
   1610             shift = s_start;
   1611 
   1612             for (i = 0; i < png_ptr->width; i++)
   1613             {
   1614                if (m & mask)
   1615                {
   1616                   value = (*sp >> shift) & 0xf;
   1617                   *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
   1618                   *dp |= (png_byte)(value << shift);
   1619                }
   1620 
   1621                if (shift == s_end)
   1622                {
   1623                   shift = s_start;
   1624                   sp++;
   1625                   dp++;
   1626                }
   1627                else
   1628                   shift += s_inc;
   1629                if (m == 1)
   1630                   m = 0x80;
   1631                else
   1632                   m >>= 1;
   1633             }
   1634             break;
   1635          }       /* end 4 bpp */
   1636 
   1637          case 16:       /* png_ptr->row_info.pixel_depth */
   1638          {
   1639             png_bytep srcptr;
   1640             png_bytep dstptr;
   1641 
   1642 #if !defined(PNG_1_0_X)
   1643             if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
   1644 #else
   1645             if (_mmx_supported)
   1646 #endif
   1647             {
   1648                png_uint_32 len;
   1649                int diff;
   1650 
   1651                srcptr = png_ptr->row_buf + 1;
   1652                dstptr = row;
   1653                len  = png_ptr->width & ~7;          // reduce to multiple of 8
   1654                diff = (int) (png_ptr->width & 7);   // amount lost
   1655 
   1656                __asm__ __volatile__ (
   1657                   "not       %%edx            \n\t" // mask => unmask
   1658                   "movd      %%edx, %%mm7     \n\t" // load bit pattern
   1659                   "not       %%edx            \n\t" // unmask => mask for later
   1660                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
   1661                   "punpcklbw %%mm7, %%mm7     \n\t"
   1662                   "punpcklwd %%mm7, %%mm7     \n\t"
   1663                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
   1664 
   1665                   LOAD_GOT_rbp
   1666                   "movq   " MASK16_0 ", %%mm0 \n\t" // _mask16_0 -> mm0
   1667                   "movq   " MASK16_1 ", %%mm1 \n\t" // _mask16_1 -> mm1
   1668                   RESTORE_rbp
   1669 
   1670                   "pand      %%mm7, %%mm0     \n\t"
   1671                   "pand      %%mm7, %%mm1     \n\t"
   1672 
   1673                   "pcmpeqb   %%mm6, %%mm0     \n\t"
   1674                   "pcmpeqb   %%mm6, %%mm1     \n\t"
   1675 
   1676 // preload        "movl      len, %%ecx       \n\t" // load length of line
   1677 // preload        "movl      srcptr, %3       \n\t" // load source
   1678 // preload        "movl      dstptr, %4       \n\t" // load dest
   1679 
   1680                   "cmpl      $0, %%ecx        \n\t"
   1681                   "jz        mainloop16end    \n\t"
   1682 
   1683                 "mainloop16:                  \n\t"
   1684                   "movq      (%3), %%mm4      \n\t"
   1685                   "pand      %%mm0, %%mm4     \n\t"
   1686                   "movq      %%mm0, %%mm6     \n\t"
   1687                   "movq      (%4), %%mm7      \n\t"
   1688                   "pandn     %%mm7, %%mm6     \n\t"
   1689                   "por       %%mm6, %%mm4     \n\t"
   1690                   "movq      %%mm4, (%4)      \n\t"
   1691 
   1692                   "movq      8(%3), %%mm5     \n\t"
   1693                   "pand      %%mm1, %%mm5     \n\t"
   1694                   "movq      %%mm1, %%mm7     \n\t"
   1695                   "movq      8(%4), %%mm6     \n\t"
   1696                   "pandn     %%mm6, %%mm7     \n\t"
   1697                   "por       %%mm7, %%mm5     \n\t"
   1698                   "movq      %%mm5, 8(%4)     \n\t"
   1699 
   1700                   "add       $16, %3          \n\t" // inc by 16 bytes processed
   1701                   "add       $16, %4          \n\t"
   1702                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
   1703                   "ja        mainloop16       \n\t"
   1704 
   1705                 "mainloop16end:               \n\t"
   1706 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
   1707                   "movl      %%eax, %%ecx     \n\t"
   1708                   "cmpl      $0, %%ecx        \n\t"
   1709                   "jz        end16            \n\t"
   1710 // preload        "movl      mask, %%edx      \n\t"
   1711                   "sall      $24, %%edx       \n\t" // make low byte, high byte
   1712 
   1713                 "secondloop16:                \n\t"
   1714                   "sall      %%edx            \n\t" // move high bit to CF
   1715                   "jnc       skip16           \n\t" // if CF = 0
   1716                   "movw      (%3), %%ax       \n\t"
   1717                   "movw      %%ax, (%4)       \n\t"
   1718 
   1719                 "skip16:                      \n\t"
   1720                   "add       $2, %3           \n\t"
   1721                   "add       $2, %4           \n\t"
   1722                   "decl      %%ecx            \n\t"
   1723                   "jnz       secondloop16     \n\t"
   1724 
   1725                 "end16:                       \n\t"
   1726                   "EMMS                       \n\t" // DONE
   1727 
   1728                   : "=a" (dummy_value_a),           // output regs (dummy)
   1729                     "=d" (dummy_value_d),
   1730                     "=c" (dummy_value_c),
   1731                     "=S" (dummy_value_S),
   1732                     "=D" (dummy_value_D)
   1733 
   1734                   : "0" (diff),        // eax       // input regs
   1735                     "1" (mask),        // edx
   1736                     "2" (len),         // ecx
   1737 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
   1738                     "3" (srcptr),      // esi/rsi
   1739                     "4" (dstptr)       // edi/rdi
   1740 
   1741 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
   1742                   : "%mm0", "%mm1", "%mm4"          // clobber list
   1743                   , "%mm5", "%mm6", "%mm7"
   1744 #endif
   1745                );
   1746             }
   1747             else /* not _mmx_supported - use modified C routine */
   1748             {
   1749                register png_uint_32 i;
   1750                png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
   1751                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
   1752                register int stride = BPP2 * png_pass_inc[png_ptr->pass];
   1753                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
   1754                register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
   1755                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
   1756                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
   1757                int diff = (int) (png_ptr->width & 7); /* amount lost */
   1758                register png_uint_32 final_val = BPP2 * len;   /* GRR bugfix */
   1759 
   1760                srcptr = png_ptr->row_buf + 1 + initial_val;
   1761                dstptr = row + initial_val;
   1762 
   1763                for (i = initial_val; i < final_val; i += stride)
   1764                {
   1765                   png_memcpy(dstptr, srcptr, rep_bytes);
   1766                   srcptr += stride;
   1767                   dstptr += stride;
   1768                }
   1769                if (diff)  /* number of leftover pixels:  3 for pngtest */
   1770                {
   1771                   final_val += diff*BPP2;
   1772                   for (; i < final_val; i += stride)
   1773                   {
   1774                      if (rep_bytes > (int)(final_val-i))
   1775                         rep_bytes = (int)(final_val-i);
   1776                      png_memcpy(dstptr, srcptr, rep_bytes);
   1777                      srcptr += stride;
   1778                      dstptr += stride;
   1779                   }
   1780                }
   1781             } /* end of else (_mmx_supported) */
   1782 
   1783             break;
   1784          }       /* end 16 bpp */
   1785 
   1786          case 48:       /* png_ptr->row_info.pixel_depth */
   1787          {
   1788             png_bytep srcptr;
   1789             png_bytep dstptr;
   1790 
   1791 #if !defined(PNG_1_0_X)
   1792             if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
   1793 #else
   1794             if (_mmx_supported)
   1795 #endif
   1796             {
   1797                png_uint_32 len;
   1798                int diff;
   1799 
   1800                srcptr = png_ptr->row_buf + 1;
   1801                dstptr = row;
   1802                len  = png_ptr->width & ~7;          // reduce to multiple of 8
   1803                diff = (int) (png_ptr->width & 7);   // amount lost
   1804 
   1805                __asm__ __volatile__ (
   1806                   "not       %%edx            \n\t" // mask => unmask
   1807                   "movd      %%edx, %%mm7     \n\t" // load bit pattern
   1808                   "not       %%edx            \n\t" // unmask => mask for later
   1809                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
   1810                   "punpcklbw %%mm7, %%mm7     \n\t"
   1811                   "punpcklwd %%mm7, %%mm7     \n\t"
   1812                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
   1813 
   1814                   LOAD_GOT_rbp
   1815                   "movq   " MASK48_0 ", %%mm0 \n\t" // _mask48_0 -> mm0
   1816                   "movq   " MASK48_1 ", %%mm1 \n\t" // _mask48_1 -> mm1
   1817                   "movq   " MASK48_2 ", %%mm2 \n\t" // _mask48_2 -> mm2
   1818                   "movq   " MASK48_3 ", %%mm3 \n\t" // _mask48_3 -> mm3
   1819                   "movq   " MASK48_4 ", %%mm4 \n\t" // _mask48_4 -> mm4
   1820                   "movq   " MASK48_5 ", %%mm5 \n\t" // _mask48_5 -> mm5
   1821                   RESTORE_rbp
   1822 
   1823                   "pand      %%mm7, %%mm0     \n\t"
   1824                   "pand      %%mm7, %%mm1     \n\t"
   1825                   "pand      %%mm7, %%mm2     \n\t"
   1826                   "pand      %%mm7, %%mm3     \n\t"
   1827                   "pand      %%mm7, %%mm4     \n\t"
   1828                   "pand      %%mm7, %%mm5     \n\t"
   1829 
   1830                   "pcmpeqb   %%mm6, %%mm0     \n\t"
   1831                   "pcmpeqb   %%mm6, %%mm1     \n\t"
   1832                   "pcmpeqb   %%mm6, %%mm2     \n\t"
   1833                   "pcmpeqb   %%mm6, %%mm3     \n\t"
   1834                   "pcmpeqb   %%mm6, %%mm4     \n\t"
   1835                   "pcmpeqb   %%mm6, %%mm5     \n\t"
   1836 
   1837 // preload        "movl      len, %%ecx       \n\t" // load length of line
   1838 // preload        "movl      srcptr, %3       \n\t" // load source
   1839 // preload        "movl      dstptr, %4       \n\t" // load dest
   1840 
   1841                   "cmpl      $0, %%ecx        \n\t"
   1842                   "jz        mainloop48end    \n\t"
   1843 
   1844                 "mainloop48:                  \n\t"
   1845                   "movq      (%3), %%mm7      \n\t"
   1846                   "pand      %%mm0, %%mm7     \n\t"
   1847                   "movq      %%mm0, %%mm6     \n\t"
   1848                   "pandn     (%4), %%mm6      \n\t"
   1849                   "por       %%mm6, %%mm7     \n\t"
   1850                   "movq      %%mm7, (%4)      \n\t"
   1851 
   1852                   "movq      8(%3), %%mm6     \n\t"
   1853                   "pand      %%mm1, %%mm6     \n\t"
   1854                   "movq      %%mm1, %%mm7     \n\t"
   1855                   "pandn     8(%4), %%mm7     \n\t"
   1856                   "por       %%mm7, %%mm6     \n\t"
   1857                   "movq      %%mm6, 8(%4)     \n\t"
   1858 
   1859                   "movq      16(%3), %%mm6    \n\t"
   1860                   "pand      %%mm2, %%mm6     \n\t"
   1861                   "movq      %%mm2, %%mm7     \n\t"
   1862                   "pandn     16(%4), %%mm7    \n\t"
   1863                   "por       %%mm7, %%mm6     \n\t"
   1864                   "movq      %%mm6, 16(%4)    \n\t"
   1865 
   1866                   "movq      24(%3), %%mm7    \n\t"
   1867                   "pand      %%mm3, %%mm7     \n\t"
   1868                   "movq      %%mm3, %%mm6     \n\t"
   1869                   "pandn     24(%4), %%mm6    \n\t"
   1870                   "por       %%mm6, %%mm7     \n\t"
   1871                   "movq      %%mm7, 24(%4)    \n\t"
   1872 
   1873                   "movq      32(%3), %%mm6    \n\t"
   1874                   "pand      %%mm4, %%mm6     \n\t"
   1875                   "movq      %%mm4, %%mm7     \n\t"
   1876                   "pandn     32(%4), %%mm7    \n\t"
   1877                   "por       %%mm7, %%mm6     \n\t"
   1878                   "movq      %%mm6, 32(%4)    \n\t"
   1879 
   1880                   "movq      40(%3), %%mm7    \n\t"
   1881                   "pand      %%mm5, %%mm7     \n\t"
   1882                   "movq      %%mm5, %%mm6     \n\t"
   1883                   "pandn     40(%4), %%mm6    \n\t"
   1884                   "por       %%mm6, %%mm7     \n\t"
   1885                   "movq      %%mm7, 40(%4)    \n\t"
   1886 
   1887                   "add       $48, %3          \n\t" // inc by 48 bytes processed
   1888                   "add       $48, %4          \n\t"
   1889                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
   1890 
   1891                   "ja        mainloop48       \n\t"
   1892 
   1893                 "mainloop48end:               \n\t"
   1894 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
   1895                   "movl      %%eax, %%ecx     \n\t"
   1896                   "cmpl      $0, %%ecx        \n\t"
   1897                   "jz        end48            \n\t"
   1898 // preload        "movl      mask, %%edx      \n\t"
   1899                   "sall      $24, %%edx       \n\t" // make low byte, high byte
   1900 
   1901                 "secondloop48:                \n\t"
   1902                   "sall      %%edx            \n\t" // move high bit to CF
   1903                   "jnc       skip48           \n\t" // if CF = 0
   1904                   "movl      (%3), %%eax      \n\t"
   1905                   "movl      %%eax, (%4)      \n\t"
   1906                   "movw      4(%3), %%ax      \n\t" // GR-P bugfix 20070717
   1907                   "movw      %%ax, 4(%4)      \n\t" // GR-P bugfix 20070717
   1908 
   1909                 "skip48:                      \n\t"
   1910                   "add       $6, %3           \n\t" // GR-P bugfix 20070717
   1911                   "add       $6, %4           \n\t" // GR-P bugfix 20070717
   1912                   "decl      %%ecx            \n\t"
   1913                   "jnz       secondloop48     \n\t"
   1914 
   1915                 "end48:                       \n\t"
   1916                   "EMMS                       \n\t" // DONE
   1917 
   1918                   : "=a" (dummy_value_a),           // output regs (dummy)
   1919                     "=d" (dummy_value_d),
   1920                     "=c" (dummy_value_c),
   1921                     "=S" (dummy_value_S),
   1922                     "=D" (dummy_value_D)
   1923 
   1924                   : "0" (diff),        // eax       // input regs
   1925                     "1" (mask),        // edx
   1926                     "2" (len),         // ecx
   1927 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
   1928                     "3" (srcptr),      // esi/rsi
   1929                     "4" (dstptr)       // edi/rdi
   1930 
   1931 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
   1932                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
   1933                   , "%mm4", "%mm5", "%mm6", "%mm7"
   1934 #endif
   1935                );
   1936             }
   1937             else /* not _mmx_supported - use modified C routine */
   1938             {
   1939                register png_uint_32 i;
   1940                png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
   1941                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
   1942                register int stride = BPP6 * png_pass_inc[png_ptr->pass];
   1943                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
   1944                register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
   1945                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
   1946                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
   1947                int diff = (int) (png_ptr->width & 7); /* amount lost */
   1948                register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */
   1949 
   1950                srcptr = png_ptr->row_buf + 1 + initial_val;
   1951                dstptr = row + initial_val;
   1952 
   1953                for (i = initial_val; i < final_val; i += stride)
   1954                {
   1955                   png_memcpy(dstptr, srcptr, rep_bytes);
   1956                   srcptr += stride;
   1957                   dstptr += stride;
   1958                }
   1959                if (diff)  /* number of leftover pixels:  3 for pngtest */
   1960                {
   1961                   final_val += diff*BPP6;
   1962                   for (; i < final_val; i += stride)
   1963                   {
   1964                      if (rep_bytes > (int)(final_val-i))
   1965                         rep_bytes = (int)(final_val-i);
   1966                      png_memcpy(dstptr, srcptr, rep_bytes);
   1967                      srcptr += stride;
   1968                      dstptr += stride;
   1969                   }
   1970                }
   1971             } /* end of else (_mmx_supported) */
   1972 
   1973             break;
   1974          }       /* end 48 bpp */
   1975 
   1976          case 64:       /* png_ptr->row_info.pixel_depth */
   1977          {
   1978             png_bytep srcptr;
   1979             png_bytep dstptr;
   1980             register png_uint_32 i;
   1981             png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
   1982               /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
   1983             register int stride = BPP8 * png_pass_inc[png_ptr->pass];
   1984               /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
   1985             register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
   1986               /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
   1987             png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
   1988             int diff = (int) (png_ptr->width & 7); /* amount lost */
   1989             register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */
   1990 
   1991             srcptr = png_ptr->row_buf + 1 + initial_val;
   1992             dstptr = row + initial_val;
   1993 
   1994             for (i = initial_val; i < final_val; i += stride)
   1995             {
   1996                png_memcpy(dstptr, srcptr, rep_bytes);
   1997                srcptr += stride;
   1998                dstptr += stride;
   1999             }
   2000             if (diff)  /* number of leftover pixels:  3 for pngtest */
   2001             {
   2002                final_val += diff*BPP8;
   2003                for (; i < final_val; i += stride)
   2004                {
   2005                   if (rep_bytes > (int)(final_val-i))
   2006                      rep_bytes = (int)(final_val-i);
   2007                   png_memcpy(dstptr, srcptr, rep_bytes);
   2008                   srcptr += stride;
   2009                   dstptr += stride;
   2010                }
   2011             }
   2012 
   2013             break;
   2014          }       /* end 64 bpp */
   2015 
   2016          default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
   2017          {
   2018             // ERROR:  SHOULD NEVER BE REACHED
   2019 #if defined(PNG_DEBUG)
   2020             png_debug(1, "Internal libpng logic error (GCC "
   2021               "png_combine_row() pixel_depth)\n");
   2022 #endif
   2023             break;
   2024          }
   2025       } /* end switch (png_ptr->row_info.pixel_depth) */
   2026 
   2027    } /* end if (non-trivial mask) */
   2028 
   2029 } /* end png_combine_row() */
   2030 
   2031 #endif /* PNG_HAVE_MMX_COMBINE_ROW */
   2032 
   2033 
   2034 
   2035 
   2036 /*===========================================================================*/
   2037 /*                                                                           */
   2038 /*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
   2039 /*                                                                           */
   2040 /*===========================================================================*/
   2041 
   2042 #if defined(PNG_READ_INTERLACING_SUPPORTED)
   2043 #if defined(PNG_HAVE_MMX_READ_INTERLACE)
   2044 
   2045 /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
   2046  * has taken place.  [GRR: what other steps come before and/or after?]
   2047  */
   2048 
   2049 void /* PRIVATE */
   2050 png_do_read_interlace(png_structp png_ptr)
   2051 {
   2052    png_row_infop row_info = &(png_ptr->row_info);
   2053    png_bytep row = png_ptr->row_buf + 1;
   2054    int pass = png_ptr->pass;
   2055 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
   2056    png_uint_32 transformations = png_ptr->transformations;
   2057 #endif
   2058 
   2059    png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
   2060 
   2061    if (_mmx_supported == 2) {
   2062 #if !defined(PNG_1_0_X)
   2063        /* this should have happened in png_init_mmx_flags() already */
   2064        png_warning(png_ptr, "asm_flags may not have been initialized");
   2065 #endif
   2066        png_mmx_support();
   2067    }
   2068 
   2069    if (row != NULL && row_info != NULL)
   2070    {
   2071       png_uint_32 final_width;
   2072 
   2073       final_width = row_info->width * png_pass_inc[pass];
   2074 
   2075       switch (row_info->pixel_depth)
   2076       {
   2077          case 1:
   2078          {
   2079             png_bytep sp, dp;
   2080             int sshift, dshift;
   2081             int s_start, s_end, s_inc;
   2082             png_byte v;
   2083             png_uint_32 i;
   2084             int j;
   2085 
   2086             sp = row + (png_size_t)((row_info->width - 1) >> 3);
   2087             dp = row + (png_size_t)((final_width - 1) >> 3);
   2088 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
   2089             if (transformations & PNG_PACKSWAP)
   2090             {
   2091                sshift = (int)((row_info->width + 7) & 7);
   2092                dshift = (int)((final_width + 7) & 7);
   2093                s_start = 7;
   2094                s_end = 0;
   2095                s_inc = -1;
   2096             }
   2097             else
   2098 #endif
   2099             {
   2100                sshift = 7 - (int)((row_info->width + 7) & 7);
   2101                dshift = 7 - (int)((final_width + 7) & 7);
   2102                s_start = 0;
   2103                s_end = 7;
   2104                s_inc = 1;
   2105             }
   2106 
   2107             for (i = row_info->width; i; i--)
   2108             {
   2109                v = (png_byte)((*sp >> sshift) & 0x1);
   2110                for (j = 0; j < png_pass_inc[pass]; j++)
   2111                {
   2112                   *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
   2113                   *dp |= (png_byte)(v << dshift);
   2114                   if (dshift == s_end)
   2115                   {
   2116                      dshift = s_start;
   2117                      dp--;
   2118                   }
   2119                   else
   2120                      dshift += s_inc;
   2121                }
   2122                if (sshift == s_end)
   2123                {
   2124                   sshift = s_start;
   2125                   sp--;
   2126                }
   2127                else
   2128                   sshift += s_inc;
   2129             }
   2130             break;
   2131          }
   2132 
   2133          case 2:
   2134          {
   2135             png_bytep sp, dp;
   2136             int sshift, dshift;
   2137             int s_start, s_end, s_inc;
   2138             png_uint_32 i;
   2139 
   2140             sp = row + (png_size_t)((row_info->width - 1) >> 2);
   2141             dp = row + (png_size_t)((final_width - 1) >> 2);
   2142 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
   2143             if (transformations & PNG_PACKSWAP)
   2144             {
   2145                sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
   2146                dshift = (png_size_t)(((final_width + 3) & 3) << 1);
   2147                s_start = 6;
   2148                s_end = 0;
   2149                s_inc = -2;
   2150             }
   2151             else
   2152 #endif
   2153             {
   2154                sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
   2155                dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
   2156                s_start = 0;
   2157                s_end = 6;
   2158                s_inc = 2;
   2159             }
   2160 
   2161             for (i = row_info->width; i; i--)
   2162             {
   2163                png_byte v;
   2164                int j;
   2165 
   2166                v = (png_byte)((*sp >> sshift) & 0x3);
   2167                for (j = 0; j < png_pass_inc[pass]; j++)
   2168                {
   2169                   *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
   2170                   *dp |= (png_byte)(v << dshift);
   2171                   if (dshift == s_end)
   2172                   {
   2173                      dshift = s_start;
   2174                      dp--;
   2175                   }
   2176                   else
   2177                      dshift += s_inc;
   2178                }
   2179                if (sshift == s_end)
   2180                {
   2181                   sshift = s_start;
   2182                   sp--;
   2183                }
   2184                else
   2185                   sshift += s_inc;
   2186             }
   2187             break;
   2188          }
   2189 
   2190          case 4:
   2191          {
   2192             png_bytep sp, dp;
   2193             int sshift, dshift;
   2194             int s_start, s_end, s_inc;
   2195             png_uint_32 i;
   2196 
   2197             sp = row + (png_size_t)((row_info->width - 1) >> 1);
   2198             dp = row + (png_size_t)((final_width - 1) >> 1);
   2199 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
   2200             if (transformations & PNG_PACKSWAP)
   2201             {
   2202                sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
   2203                dshift = (png_size_t)(((final_width + 1) & 1) << 2);
   2204                s_start = 4;
   2205                s_end = 0;
   2206                s_inc = -4;
   2207             }
   2208             else
   2209 #endif
   2210             {
   2211                sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
   2212                dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
   2213                s_start = 0;
   2214                s_end = 4;
   2215                s_inc = 4;
   2216             }
   2217 
   2218             for (i = row_info->width; i; i--)
   2219             {
   2220                png_byte v;
   2221                int j;
   2222 
   2223                v = (png_byte)((*sp >> sshift) & 0xf);
   2224                for (j = 0; j < png_pass_inc[pass]; j++)
   2225                {
   2226                   *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
   2227                   *dp |= (png_byte)(v << dshift);
   2228                   if (dshift == s_end)
   2229                   {
   2230                      dshift = s_start;
   2231                      dp--;
   2232                   }
   2233                   else
   2234                      dshift += s_inc;
   2235                }
   2236                if (sshift == s_end)
   2237                {
   2238                   sshift = s_start;
   2239                   sp--;
   2240                }
   2241                else
   2242                   sshift += s_inc;
   2243             }
   2244             break;
   2245          }
   2246 
   2247        /*====================================================================*/
   2248 
   2249          default: /* 8-bit or larger (this is where the routine is modified) */
   2250          {
   2251             png_bytep sptr, dp;
   2252             png_uint_32 i;
   2253             png_size_t pixel_bytes;
   2254             int width = (int)row_info->width;
   2255 
   2256             pixel_bytes = (row_info->pixel_depth >> 3);
   2257 
   2258             /* point sptr at the last pixel in the pre-expanded row: */
   2259             sptr = row + (width - 1) * pixel_bytes;
   2260 
   2261             /* point dp at the last pixel position in the expanded row: */
   2262             dp = row + (final_width - 1) * pixel_bytes;
   2263 
   2264             /* New code by Nirav Chhatrapati - Intel Corporation */
   2265 
   2266 #if !defined(PNG_1_0_X)
   2267             if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
   2268 #else
   2269             if (_mmx_supported)
   2270 #endif
   2271             {
   2272                int dummy_value_c;        // fix 'forbidden register spilled'
   2273                png_bytep dummy_value_S;
   2274                png_bytep dummy_value_D;
   2275                png_bytep dummy_value_a;
   2276                png_bytep dummy_value_d;
   2277 
   2278                //--------------------------------------------------------------
   2279                if (pixel_bytes == BPP3)
   2280                {
   2281                   if (((pass == 4) || (pass == 5)) && width)
   2282                   {
   2283                      int width_mmx = ((width >> 1) << 1) - 8;   // GRR:  huh?
   2284                      if (width_mmx < 0)
   2285                          width_mmx = 0;
   2286                      width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
   2287                      if (width_mmx)
   2288                      {
   2289                         // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
   2290                         // sptr points at last pixel in pre-expanded row
   2291                         // dp points at last pixel position in expanded row
   2292                         __asm__ __volatile__ (
   2293                            "sub  $3, %1             \n\t"
   2294                            "sub  $9, %2             \n\t"
   2295                                         // (png_pass_inc[pass] + 1)*pixel_bytes
   2296 
   2297                         ".loop3_pass4:              \n\t"
   2298                            "movq (%1), %%mm0        \n\t" // x x 5 4 3 2 1 0
   2299                            "movq %%mm0, %%mm1       \n\t" // x x 5 4 3 2 1 0
   2300                            "movq %%mm0, %%mm2       \n\t" // x x 5 4 3 2 1 0
   2301                            "psllq $24, %%mm0        \n\t" // 4 3 2 1 0 z z z
   2302                            "pand (%3), %%mm1        \n\t" // z z z z z 2 1 0
   2303                            "psrlq $24, %%mm2        \n\t" // z z z x x 5 4 3
   2304                            "por %%mm1, %%mm0        \n\t" // 4 3 2 1 0 2 1 0
   2305                            "movq %%mm2, %%mm3       \n\t" // z z z x x 5 4 3
   2306                            "psllq $8, %%mm2         \n\t" // z z x x 5 4 3 z
   2307                            "movq %%mm0, (%2)        \n\t"
   2308                            "psrlq $16, %%mm3        \n\t" // z z z z z x x 5
   2309                            "pand (%4), %%mm3        \n\t" // z z z z z z z 5
   2310                            "por %%mm3, %%mm2        \n\t" // z z x x 5 4 3 5
   2311                            "sub  $6, %1             \n\t"
   2312                            "movd %%mm2, 8(%2)       \n\t"
   2313                            "sub  $12, %2            \n\t"
   2314                            "subl $2, %%ecx          \n\t"
   2315                            "jnz .loop3_pass4        \n\t"
   2316                            "EMMS                    \n\t" // DONE
   2317 
   2318                            : "=c" (dummy_value_c),        // output regs (dummy)
   2319                              "=S" (dummy_value_S),
   2320                              "=D" (dummy_value_D),
   2321                              "=a" (dummy_value_a),
   2322                              "=d" (dummy_value_d)
   2323 
   2324                            : "0" (width_mmx),     // ecx  // input regs
   2325                              "1" (sptr),          // esi/rsi
   2326                              "2" (dp),            // edi/rdi
   2327 #if defined(PNG_x86_64_USE_GOTPCREL)     // formerly _const4 and _const6:
   2328                              "3" (&_c64._amask5_3_0), // (0x0000000000FFFFFFLL)
   2329                              "4" (&_c64._amask7_1_0)  // (0x00000000000000FFLL)
   2330 #else
   2331                              "3" (&_amask5_3_0),  // eax (0x0000000000FFFFFFLL)
   2332                              "4" (&_amask7_1_0)   // edx (0x00000000000000FFLL)
   2333 #endif
   2334 
   2335 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
   2336                            : "%mm0", "%mm1"               // clobber list
   2337                            , "%mm2", "%mm3"
   2338 #endif
   2339                         );
   2340                      }
   2341 
   2342                      sptr -= width_mmx*BPP3;
   2343                      dp -= width_mmx*2*BPP3;
   2344                      for (i = width; i; i--)
   2345                      {
   2346                         png_byte v[8];
   2347                         int j;
   2348 
   2349                         png_memcpy(v, sptr, BPP3);
   2350                         for (j = 0; j < png_pass_inc[pass]; j++)
   2351                         {
   2352                            png_memcpy(dp, v, BPP3);
   2353                            dp -= BPP3;
   2354                         }
   2355                         sptr -= BPP3;
   2356                      }
   2357                   }
   2358                   else if (((pass == 2) || (pass == 3)) && width)
   2359                   {
   2360                      __asm__ __volatile__ (
   2361                         "sub  $9, %2             \n\t"
   2362                                      // (png_pass_inc[pass] - 1)*pixel_bytes
   2363 
   2364                      ".loop3_pass2:              \n\t"
   2365                         "movd (%1), %%mm0        \n\t" // x x x x x 2 1 0
   2366                         "pand (%3), %%mm0        \n\t" // z z z z z 2 1 0
   2367                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
   2368                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
   2369                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
   2370                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
   2371                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
   2372                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
   2373                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
   2374                         "movq %%mm0, 4(%2)       \n\t"
   2375                         "psrlq $16, %%mm0        \n\t" // z z 2 1 0 2 1 0
   2376                         "sub  $3, %1             \n\t"
   2377                         "movd %%mm0, (%2)        \n\t"
   2378                         "sub  $12, %2            \n\t"
   2379                         "decl %%ecx              \n\t"
   2380                         "jnz .loop3_pass2        \n\t"
   2381                         "EMMS                    \n\t" // DONE
   2382 
   2383                         : "=c" (dummy_value_c),        // output regs (dummy)
   2384                           "=S" (dummy_value_S),
   2385                           "=D" (dummy_value_D),
   2386                           "=a" (dummy_value_a)
   2387 
   2388                         : "0" (width),         // ecx  // input regs
   2389                           "1" (sptr),          // esi/rsi
   2390                           "2" (dp),            // edi/rdi
   2391 #if defined(PNG_x86_64_USE_GOTPCREL)           // formerly _const4:
   2392                           "3" (&_c64._amask5_3_0)  // (0x0000000000FFFFFFLL)
   2393 #else
   2394                           "3" (&_amask5_3_0)   // eax (0x0000000000FFFFFFLL)
   2395 #endif
   2396 
   2397 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   2398                         : "%mm0", "%mm1", "%mm2"       // clobber list
   2399 #endif
   2400                      );
   2401                   }
   2402                   else if (width)  // && ((pass == 0) || (pass == 1))
   2403                   {
   2404                      __asm__ __volatile__ (
   2405                         "sub  $21, %2            \n\t"
   2406                                      // (png_pass_inc[pass] - 1)*pixel_bytes
   2407 
   2408                      ".loop3_pass0:              \n\t"
   2409                         "movd (%1), %%mm0        \n\t" // x x x x x 2 1 0
   2410                         "pand (%3), %%mm0        \n\t" // z z z z z 2 1 0
   2411                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
   2412                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
   2413                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
   2414                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
   2415                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
   2416                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
   2417                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
   2418                         "movq %%mm0, %%mm3       \n\t" // 2 1 0 2 1 0 2 1
   2419                         "psllq $16, %%mm0        \n\t" // 0 2 1 0 2 1 z z
   2420                         "movq %%mm3, %%mm4       \n\t" // 2 1 0 2 1 0 2 1
   2421                         "punpckhdq %%mm0, %%mm3  \n\t" // 0 2 1 0 2 1 0 2
   2422                         "movq %%mm4, 16(%2)      \n\t"
   2423                         "psrlq $32, %%mm0        \n\t" // z z z z 0 2 1 0
   2424                         "movq %%mm3, 8(%2)       \n\t"
   2425                         "punpckldq %%mm4, %%mm0  \n\t" // 1 0 2 1 0 2 1 0
   2426                         "sub  $3, %1             \n\t"
   2427                         "movq %%mm0, (%2)        \n\t"
   2428                         "sub  $24, %2            \n\t"
   2429                         "decl %%ecx              \n\t"
   2430                         "jnz .loop3_pass0        \n\t"
   2431                         "EMMS                    \n\t" // DONE
   2432 
   2433                         : "=c" (dummy_value_c),        // output regs (dummy)
   2434                           "=S" (dummy_value_S),
   2435                           "=D" (dummy_value_D),
   2436                           "=a" (dummy_value_a)
   2437 
   2438                         : "0" (width),         // ecx  // input regs
   2439                           "1" (sptr),          // esi/rsi
   2440                           "2" (dp),            // edi/rdi
   2441 #if defined(PNG_x86_64_USE_GOTPCREL)           // formerly _const4:
   2442                           "3" (&_c64._amask5_3_0)  // (0x0000000000FFFFFFLL)
   2443 #else
   2444                           "3" (&_amask5_3_0)   // eax (0x0000000000FFFFFFLL)
   2445 #endif
   2446 
   2447 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
   2448                         : "%mm0", "%mm1", "%mm2"       // clobber list
   2449                         , "%mm3", "%mm4"
   2450 #endif
   2451                      );
   2452                   }
   2453                } /* end of pixel_bytes == 3 */
   2454 
   2455                //--------------------------------------------------------------
   2456                else if (pixel_bytes == BPP4)
   2457                {
   2458                   if (((pass == 4) || (pass == 5)) && width)
   2459                   {
   2460                      int width_mmx = ((width >> 1) << 1) ;
   2461                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
   2462                      if (width_mmx)
   2463                      {
   2464                         __asm__ __volatile__ (
   2465                            "sub  $4, %1             \n\t"
   2466                            "sub  $12, %2            \n\t"
   2467 
   2468                         ".loop4_pass4:              \n\t"
   2469                            "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
   2470                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
   2471                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
   2472                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
   2473                            "movq %%mm0, (%2)        \n\t"
   2474                            "sub  $8, %1             \n\t"
   2475                            "movq %%mm1, 8(%2)       \n\t"
   2476                            "sub  $16, %2            \n\t"
   2477                            "subl $2, %%ecx          \n\t"
   2478                            "jnz .loop4_pass4        \n\t"
   2479                            "EMMS                    \n\t" // DONE
   2480 
   2481                            : "=c" (dummy_value_c),        // output regs (dummy)
   2482                              "=S" (dummy_value_S),
   2483                              "=D" (dummy_value_D)
   2484 
   2485                            : "0" (width_mmx),     // ecx  // input regs
   2486                              "1" (sptr),          // esi/rsi
   2487                              "2" (dp)             // edi/rdi
   2488 
   2489 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   2490                            : "%mm0", "%mm1"               // clobber list
   2491 #endif
   2492                         );
   2493                      }
   2494 
   2495                      sptr -= (width_mmx*BPP4 - BPP4); // sign fixed
   2496                      dp -= (width_mmx*2*BPP4 - BPP4); // sign fixed
   2497                      for (i = width; i; i--)
   2498                      {
   2499                         png_byte v[8];
   2500                         int j;
   2501                         sptr -= BPP4;
   2502                         png_memcpy(v, sptr, BPP4);
   2503                         for (j = 0; j < png_pass_inc[pass]; j++)
   2504                         {
   2505                            dp -= BPP4;
   2506                            png_memcpy(dp, v, BPP4);
   2507                         }
   2508                      }
   2509                   }
   2510                   else if (((pass == 2) || (pass == 3)) && width)
   2511                   {
   2512                      int width_mmx = ((width >> 1) << 1);
   2513                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
   2514                      if (width_mmx)
   2515                      {
   2516                         __asm__ __volatile__ (
   2517                            "sub  $4, %1             \n\t"
   2518                            "sub  $28, %2            \n\t"
   2519 
   2520                         ".loop4_pass2:              \n\t"
   2521                            "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
   2522                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
   2523                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
   2524                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
   2525                            "movq %%mm0, (%2)        \n\t"
   2526                            "movq %%mm0, 8(%2)       \n\t"
   2527                            "movq %%mm1, 16(%2)      \n\t"
   2528                            "movq %%mm1, 24(%2)      \n\t"
   2529                            "sub  $8, %1             \n\t"
   2530                            "sub  $32, %2            \n\t"
   2531                            "subl $2, %%ecx          \n\t"
   2532                            "jnz .loop4_pass2        \n\t"
   2533                            "EMMS                    \n\t" // DONE
   2534 
   2535                            : "=c" (dummy_value_c),        // output regs (dummy)
   2536                              "=S" (dummy_value_S),
   2537                              "=D" (dummy_value_D)
   2538 
   2539                            : "0" (width_mmx),     // ecx  // input regs
   2540                              "1" (sptr),          // esi/rsi
   2541                              "2" (dp)             // edi/rdi
   2542 
   2543 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   2544                            : "%mm0", "%mm1"               // clobber list
   2545 #endif
   2546                         );
   2547                      }
   2548 
   2549                      sptr -= (width_mmx*4 - 4); // sign fixed
   2550                      dp -= (width_mmx*16 - 4);  // sign fixed
   2551                      for (i = width; i; i--)
   2552                      {
   2553                         png_byte v[8];
   2554                         int j;
   2555                         sptr -= 4;
   2556                         png_memcpy(v, sptr, 4);
   2557                         for (j = 0; j < png_pass_inc[pass]; j++)
   2558                         {
   2559                            dp -= 4;
   2560                            png_memcpy(dp, v, 4);
   2561                         }
   2562                      }
   2563                   }
   2564                   else if (width)  // && ((pass == 0) || (pass == 1))
   2565                   {
   2566                      int width_mmx = ((width >> 1) << 1);
   2567                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
   2568                      if (width_mmx)
   2569                      {
   2570                         __asm__ __volatile__ (
   2571                            "sub  $4, %1             \n\t"
   2572                            "sub  $60, %2            \n\t"
   2573 
   2574                         ".loop4_pass0:              \n\t"
   2575                            "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
   2576                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
   2577                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
   2578                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
   2579                            "movq %%mm0, (%2)        \n\t"
   2580                            "movq %%mm0, 8(%2)       \n\t"
   2581                            "movq %%mm0, 16(%2)      \n\t"
   2582                            "movq %%mm0, 24(%2)      \n\t"
   2583                            "movq %%mm1, 32(%2)      \n\t"
   2584                            "movq %%mm1, 40(%2)      \n\t"
   2585                            "movq %%mm1, 48(%2)      \n\t"
   2586                            "sub  $8, %1             \n\t"
   2587                            "movq %%mm1, 56(%2)      \n\t"
   2588                            "sub  $64, %2            \n\t"
   2589                            "subl $2, %%ecx          \n\t"
   2590                            "jnz .loop4_pass0        \n\t"
   2591                            "EMMS                    \n\t" // DONE
   2592 
   2593                            : "=c" (dummy_value_c),        // output regs (dummy)
   2594                              "=S" (dummy_value_S),
   2595                              "=D" (dummy_value_D)
   2596 
   2597                            : "0" (width_mmx),     // ecx  // input regs
   2598                              "1" (sptr),          // esi/rsi
   2599                              "2" (dp)             // edi/rdi
   2600 
   2601 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
   2602                            : "%mm0", "%mm1"               // clobber list
   2603 #endif
   2604                         );
   2605                      }
   2606 
   2607                      sptr -= (width_mmx*4 - 4); // sign fixed
   2608                      dp -= (width_mmx*32 - 4);  // sign fixed
   2609                      for (i = width; i; i--)
   2610                      {
   2611                         png_byte v[8];
   2612                         int j;
   2613                         sptr -= 4;
   2614                         png_memcpy(v, sptr, 4);
   2615                         for (j = 0; j < png_pass_inc[pass]; j++)
   2616                         {
   2617                            dp -= 4;
   2618                            png_memcpy(dp, v, 4);
   2619                         }
   2620                      }
   2621                   }
   2622                } /* end of pixel_bytes == 4 */
   2623 
   2624                //--------------------------------------------------------------
   2625                else if (pixel_bytes == 1)
   2626                {
   2627                   if (((pass == 4) || (pass == 5)) && width)
   2628                   {
   2629                      int width_mmx = ((width >> 3) << 3);
   2630                      width -= width_mmx;        // 0-7 pixels => 0-7 bytes
   2631                      if (width_mmx)
   2632                      {
   2633                         __asm__ __volatile__ (
   2634                            "sub  $7, %1             \n\t"
   2635                            "sub  $15, %2            \n\t"
   2636 
   2637                         ".loop1_pass4:              \n\t"
   2638                            "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
   2639                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
   2640                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
   2641                            "punpckhbw %%mm1, %%mm1  \n\t" // 7 7 6 6 5 5 4 4
   2642                            "movq %%mm1, 8(%2)       \n\t"
   2643                            "sub  $8, %1             \n\t"
   2644                            "movq %%mm0, (%2)        \n\t"
   2645                            "sub  $16, %2            \n\t"
   2646                            "subl $8, %%ecx          \n\t"
   2647                            "jnz .loop1_pass4        \n\t"
   2648                            "EMMS                    \n\t" // DONE
   2649 
   2650                            : "=c" (dummy_value_c),        // output regs (dummy)
   2651                              "=S" (dummy_value_S),
   2652                              "=D" (dummy_value_D)
   2653 
   2654                            : "0" (width_mmx),     // ecx  // input regs
   2655                              "1" (sptr),          // esi/rsi
   2656                              "2" (dp)             // edi/rdi
   2657 
   2658 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
   2659                            : "%mm0", "%mm1"               // clobber list
   2660 #endif
   2661                         );
   2662                      }
   2663 
   2664                      sptr -= width_mmx;
   2665                      dp -= width_mmx*2;
   2666                      for (i = width; i; i--)
   2667                      {
   2668                         int j;
   2669 
   2670                         for (j = 0; j < png_pass_inc[pass]; j++)
   2671                         {
   2672                            *dp-- = *sptr;
   2673                         }
   2674                         --sptr;
   2675                      }
   2676                   }
   2677                   else if (((pass == 2) || (pass == 3)) && width)
   2678                   {
   2679                      int width_mmx = ((width >> 2) << 2);
   2680                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
   2681                      if (width_mmx)
   2682                      {
   2683                         __asm__ __volatile__ (
   2684                            "sub  $3, %1             \n\t"
   2685                            "sub  $15, %2            \n\t"
   2686 
   2687                         ".loop1_pass2:              \n\t"
   2688                            "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
   2689                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
   2690                            "movq %%mm0, %%mm1       \n\t" // 3 3 2 2 1 1 0 0
   2691                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
   2692                            "punpckhwd %%mm1, %%mm1  \n\t" // 3 3 3 3 2 2 2 2
   2693                            "movq %%mm0, (%2)        \n\t"
   2694                            "sub  $4, %1             \n\t"
   2695                            "movq %%mm1, 8(%2)       \n\t"
   2696                            "sub  $16, %2            \n\t"
   2697                            "subl $4, %%ecx          \n\t"
   2698                            "jnz .loop1_pass2        \n\t"
   2699                            "EMMS                    \n\t" // DONE
   2700 
   2701                            : "=c" (dummy_value_c),        // output regs (dummy)
   2702                              "=S" (dummy_value_S),
   2703                              "=D" (dummy_value_D)
   2704 
   2705                            : "0" (width_mmx),     // ecx  // input regs
   2706                              "1" (sptr),          // esi/rsi
   2707                              "2" (dp)             // edi/rdi
   2708 
   2709 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
   2710                            : "%mm0", "%mm1"               // clobber list
   2711 #endif
   2712                         );
   2713                      }
   2714 
   2715                      sptr -= width_mmx;
   2716                      dp -= width_mmx*4;
   2717                      for (i = width; i; i--)
   2718                      {
   2719                         int j;
   2720 
   2721                         for (j = 0; j < png_pass_inc[pass]; j++)
   2722                         {
   2723                            *dp-- = *sptr;
   2724                         }
   2725                         --sptr;
   2726                      }
   2727                   }
   2728                   else if (width)  // && ((pass == 0) || (pass == 1))
   2729                   {
   2730                      int width_mmx = ((width >> 2) << 2);
   2731                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
   2732                      if (width_mmx)
   2733                      {
   2734                         __asm__ __volatile__ (
   2735                            "sub  $3, %1             \n\t"
   2736                            "sub  $31, %2            \n\t"
   2737 
   2738                         ".loop1_pass0:              \n\t"
   2739                            "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
   2740                            "movq %%mm0, %%mm1       \n\t" // x x x x 3 2 1 0
   2741                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
   2742                            "movq %%mm0, %%mm2       \n\t" // 3 3 2 2 1 1 0 0
   2743                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
   2744                            "movq %%mm0, %%mm3       \n\t" // 1 1 1 1 0 0 0 0
   2745                            "punpckldq %%mm0, %%mm0  \n\t" // 0 0 0 0 0 0 0 0
   2746                            "punpckhdq %%mm3, %%mm3  \n\t" // 1 1 1 1 1 1 1 1
   2747                            "movq %%mm0, (%2)        \n\t"
   2748                            "punpckhwd %%mm2, %%mm2  \n\t" // 3 3 3 3 2 2 2 2
   2749                            "movq %%mm3, 8(%2)       \n\t"
   2750                            "movq %%mm2, %%mm4       \n\t" // 3 3 3 3 2 2 2 2
   2751                            "punpckldq %%mm2, %%mm2  \n\t" // 2 2 2 2 2 2 2 2
   2752                            "punpckhdq %%mm4, %%mm4  \n\t" // 3 3 3 3 3 3 3 3
   2753                            "movq %%mm2, 16(%2)      \n\t"
   2754                            "sub  $4, %1             \n\t"
   2755                            "movq %%mm4, 24(%2)      \n\t"
   2756                            "sub  $32, %2            \n\t"
   2757                            "subl $4, %%ecx          \n\t"
   2758                            "jnz .loop1_pass0        \n\t"
   2759                            "EMMS                    \n\t" // DONE
   2760 
   2761                            : "=c" (dummy_value_c),        // output regs (dummy)
   2762                              "=S" (dummy_value_S),
   2763                              "=D" (dummy_value_D)
   2764 
   2765                            : "0" (width_mmx),     // ecx  // input regs
   2766                              "1" (sptr),          // esi/rsi
   2767                              "2" (dp)             // edi/rdi
   2768 
   2769 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
   2770                            : "%mm0", "%mm1", "%mm2"       // clobber list
   2771                            , "%mm3", "%mm4"
   2772 #endif
   2773                         );
   2774                      }
   2775 
   2776                      sptr -= width_mmx;
   2777                      dp -= width_mmx*8;
   2778                      for (i = width; i; i--)
   2779                      {
   2780                         int j;
   2781 
   2782                        /* I simplified this part in version 1.0.4e
   2783                         * here and in several other instances where
   2784                         * pixel_bytes == 1  -- GR-P
   2785                         *
   2786                         * Original code:
   2787                         *
   2788                         * png_byte v[8];
   2789                         * png_memcpy(v, sptr, pixel_bytes);
   2790                         * for (j = 0; j < png_pass_inc[pass]; j++)
   2791                         * {
   2792                         *    png_memcpy(dp, v, pixel_bytes);
   2793                         *    dp -= pixel_bytes;
   2794                         * }
   2795                         * sptr -= pixel_bytes;
   2796                         *
   2797                         * Replacement code is in the next three lines:
   2798                         */
   2799 
   2800                         for (j = 0; j < png_pass_inc[pass]; j++)
   2801                         {
   2802                            *dp-- = *sptr;
   2803                         }
   2804                         --sptr;
   2805                      }
   2806                   }
   2807                } /* end of pixel_bytes == 1 */
   2808 
   2809                //--------------------------------------------------------------
   2810                else if (pixel_bytes == BPP2)
   2811                {
   2812                   if (((pass == 4) || (pass == 5)) && width)
   2813                   {
   2814                      int width_mmx = ((width >> 1) << 1) ;
   2815                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
   2816                      if (width_mmx)
   2817                      {
   2818                         __asm__ __volatile__ (
   2819                            "sub  $2, %1             \n\t"
   2820                            "sub  $6, %2             \n\t"
   2821 
   2822                         ".loop2_pass4:              \n\t"
   2823                            "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
   2824                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
   2825                            "sub  $4, %1             \n\t"
   2826                            "movq %%mm0, (%2)        \n\t"
   2827                            "sub  $8, %2             \n\t"
   2828                            "subl $2, %%ecx          \n\t"
   2829                            "jnz .loop2_pass4        \n\t"
   2830                            "EMMS                    \n\t" // DONE
   2831 
   2832                            : "=c" (dummy_value_c),        // output regs (dummy)
   2833                              "=S" (dummy_value_S),
   2834                              "=D" (dummy_value_D)
   2835 
   2836                            : "0" (width_mmx),     // ecx  // input regs
   2837                              "1" (sptr),          // esi/rsi
   2838                              "2" (dp)             // edi/rdi
   2839 
   2840 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
   2841                            : "%mm0"                       // clobber list
   2842 #endif
   2843                         );
   2844                      }
   2845 
   2846                      sptr -= (width_mmx*BPP2 - BPP2); // sign fixed
   2847                      dp -= (width_mmx*2*BPP2 - BPP2); // sign fixed
   2848                      for (i = width; i; i--)
   2849                      {
   2850                         png_byte v[8];
   2851                         int j;
   2852                         sptr -= BPP2;
   2853                         png_memcpy(v, sptr, BPP2);
   2854                         for (j = 0; j < png_pass_inc[pass]; j++)
   2855                         {
   2856                            dp -= BPP2;
   2857                            png_memcpy(dp, v, BPP2);
   2858                         }
   2859                      }
   2860                   }
   2861                   else if (((pass == 2) || (pass == 3)) && width)
   2862                   {
   2863                      int width_mmx = ((width >> 1) << 1) ;
   2864                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
   2865                      if (width_mmx)
   2866                      {
   2867                         __asm__ __volatile__ (
   2868                            "sub  $2, %1             \n\t"
   2869                            "sub  $14, %2            \n\t"
   2870 
   2871                         ".loop2_pass2:              \n\t"
   2872                            "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
   2873                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
   2874                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
   2875                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
   2876                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
   2877                            "movq %%mm0, (%2)        \n\t"
   2878                            "sub  $4, %1             \n\t"
   2879                            "movq %%mm1, 8(%2)       \n\t"
   2880                            "sub  $16, %2            \n\t"
   2881                            "subl $2, %%ecx          \n\t"
   2882                            "jnz .loop2_pass2        \n\t"
   2883                            "EMMS                    \n\t" // DONE
   2884 
   2885                            : "=c" (dummy_value_c),        // output regs (dummy)
   2886                              "=S" (dummy_value_S),
   2887                              "=D" (dummy_value_D)
   2888 
   2889                            : "0" (width_mmx),     // ecx  // input regs
   2890                              "1" (sptr),          // esi/rsi
   2891                              "2" (dp)             // edi/rdi
   2892 
   2893 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   2894                            : "%mm0", "%mm1"               // clobber list
   2895 #endif
   2896                         );
   2897                      }
   2898 
   2899                      sptr -= (width_mmx*2 - 2); // sign fixed
   2900                      dp -= (width_mmx*8 - 2);   // sign fixed
   2901                      for (i = width; i; i--)
   2902                      {
   2903                         png_byte v[8];
   2904                         int j;
   2905                         sptr -= 2;
   2906                         png_memcpy(v, sptr, 2);
   2907                         for (j = 0; j < png_pass_inc[pass]; j++)
   2908                         {
   2909                            dp -= 2;
   2910                            png_memcpy(dp, v, 2);
   2911                         }
   2912                      }
   2913                   }
   2914                   else if (width)  // && ((pass == 0) || (pass == 1))
   2915                   {
   2916                      int width_mmx = ((width >> 1) << 1);
   2917                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
   2918                      if (width_mmx)
   2919                      {
   2920                         __asm__ __volatile__ (
   2921                            "sub  $2, %1             \n\t"
   2922                            "sub  $30, %2            \n\t"
   2923 
   2924                         ".loop2_pass0:              \n\t"
   2925                            "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
   2926                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
   2927                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
   2928                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
   2929                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
   2930                            "movq %%mm0, (%2)        \n\t"
   2931                            "movq %%mm0, 8(%2)       \n\t"
   2932                            "movq %%mm1, 16(%2)      \n\t"
   2933                            "sub  $4, %1             \n\t"
   2934                            "movq %%mm1, 24(%2)      \n\t"
   2935                            "sub  $32, %2            \n\t"
   2936                            "subl $2, %%ecx          \n\t"
   2937                            "jnz .loop2_pass0        \n\t"
   2938                            "EMMS                    \n\t" // DONE
   2939 
   2940                            : "=c" (dummy_value_c),        // output regs (dummy)
   2941                              "=S" (dummy_value_S),
   2942                              "=D" (dummy_value_D)
   2943 
   2944                            : "0" (width_mmx),     // ecx  // input regs
   2945                              "1" (sptr),          // esi/rsi
   2946                              "2" (dp)             // edi/rdi
   2947 
   2948 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   2949                            : "%mm0", "%mm1"               // clobber list
   2950 #endif
   2951                         );
   2952                      }
   2953 
   2954                      sptr -= (width_mmx*2 - 2); // sign fixed
   2955                      dp -= (width_mmx*16 - 2);  // sign fixed
   2956                      for (i = width; i; i--)
   2957                      {
   2958                         png_byte v[8];
   2959                         int j;
   2960                         sptr -= 2;
   2961                         png_memcpy(v, sptr, 2);
   2962                         for (j = 0; j < png_pass_inc[pass]; j++)
   2963                         {
   2964                            dp -= 2;
   2965                            png_memcpy(dp, v, 2);
   2966                         }
   2967                      }
   2968                   }
   2969                } /* end of pixel_bytes == 2 */
   2970 
   2971                //--------------------------------------------------------------
   2972                else if (pixel_bytes == BPP8)
   2973                {
   2974 // GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?)
   2975                   // GRR NOTE:  no need to combine passes here!
   2976                   if (((pass == 4) || (pass == 5)) && width)
   2977                   {
   2978                      // source is 8-byte RRGGBBAA
   2979                      // dest is 16-byte RRGGBBAA RRGGBBAA
   2980                      __asm__ __volatile__ (
   2981                         "sub  $8, %2             \n\t" // start of last block
   2982 
   2983                      ".loop8_pass4:              \n\t"
   2984                         "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
   2985                         "movq %%mm0, (%2)        \n\t"
   2986                         "sub  $8, %1             \n\t"
   2987                         "movq %%mm0, 8(%2)       \n\t"
   2988                         "sub  $16, %2            \n\t"
   2989                         "decl %%ecx              \n\t"
   2990                         "jnz .loop8_pass4        \n\t"
   2991                         "EMMS                    \n\t" // DONE
   2992 
   2993                         : "=c" (dummy_value_c),        // output regs (dummy)
   2994                           "=S" (dummy_value_S),
   2995                           "=D" (dummy_value_D)
   2996 
   2997                         : "0" (width),         // ecx  // input regs
   2998                           "1" (sptr),          // esi/rsi
   2999                           "2" (dp)             // edi/rdi
   3000 
   3001 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   3002                         : "%mm0"                       // clobber list
   3003 #endif
   3004                      );
   3005                   }
   3006                   else if (((pass == 2) || (pass == 3)) && width)
   3007                   {
   3008                      // source is 8-byte RRGGBBAA
   3009                      // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
   3010                      // (recall that expansion is _in place_:  sptr and dp
   3011                      //  both point at locations within same row buffer)
   3012                      __asm__ __volatile__ (
   3013                         "sub  $24, %2            \n\t" // start of last block
   3014 
   3015                      ".loop8_pass2:              \n\t"
   3016                         "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
   3017                         "movq %%mm0, (%2)        \n\t"
   3018                         "movq %%mm0, 8(%2)       \n\t"
   3019                         "movq %%mm0, 16(%2)      \n\t"
   3020                         "sub  $8, %1             \n\t"
   3021                         "movq %%mm0, 24(%2)      \n\t"
   3022                         "sub  $32, %2            \n\t"
   3023                         "decl %%ecx              \n\t"
   3024                         "jnz .loop8_pass2        \n\t"
   3025                         "EMMS                    \n\t" // DONE
   3026 
   3027                         : "=c" (dummy_value_c),        // output regs (dummy)
   3028                           "=S" (dummy_value_S),
   3029                           "=D" (dummy_value_D)
   3030 
   3031                         : "0" (width),         // ecx  // input regs
   3032                           "1" (sptr),          // esi/rsi
   3033                           "2" (dp)             // edi/rdi
   3034 
   3035 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
   3036                         : "%mm0"                       // clobber list
   3037 #endif
   3038                      );
   3039                   }
   3040                   else if (width)  // && ((pass == 0) || (pass == 1))
   3041                   {
   3042                      // source is 8-byte RRGGBBAA
   3043                      // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
   3044                      __asm__ __volatile__ (
   3045                         "sub  $56, %2            \n\t" // start of last block
   3046 
   3047                      ".loop8_pass0:              \n\t"
   3048                         "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
   3049                         "movq %%mm0, (%2)        \n\t"
   3050                         "movq %%mm0, 8(%2)       \n\t"
   3051                         "movq %%mm0, 16(%2)      \n\t"
   3052                         "movq %%mm0, 24(%2)      \n\t"
   3053                         "movq %%mm0, 32(%2)      \n\t"
   3054                         "movq %%mm0, 40(%2)      \n\t"
   3055                         "movq %%mm0, 48(%2)      \n\t"
   3056                         "sub  $8, %1             \n\t"
   3057                         "movq %%mm0, 56(%2)      \n\t"
   3058                         "sub  $64, %2            \n\t"
   3059                         "decl %%ecx              \n\t"
   3060                         "jnz .loop8_pass0        \n\t"
   3061                         "EMMS                    \n\t" // DONE
   3062 
   3063                         : "=c" (dummy_value_c),        // output regs (dummy)
   3064                           "=S" (dummy_value_S),
   3065                           "=D" (dummy_value_D)
   3066 
   3067                         : "0" (width),         // ecx  // input regs
   3068                           "1" (sptr),          // esi/rsi
   3069                           "2" (dp)             // edi/rdi
   3070 
   3071 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
   3072                         : "%mm0"                       // clobber list
   3073 #endif
   3074                      );
   3075                   }
   3076                } /* end of pixel_bytes == 8 */
   3077 
   3078                //--------------------------------------------------------------
   3079                else if (pixel_bytes == BPP6)   // why no MMX for this case?
   3080                {
   3081                   for (i = width; i; i--)
   3082                   {
   3083                      png_byte v[8];
   3084                      int j;
   3085                      png_memcpy(v, sptr, BPP6);
   3086                      for (j = 0; j < png_pass_inc[pass]; j++)
   3087                      {
   3088                         png_memcpy(dp, v, BPP6);
   3089                         dp -= BPP6;
   3090                      }
   3091                      sptr -= BPP6;
   3092                   }
   3093                } /* end of pixel_bytes == 6 */
   3094 
   3095                //--------------------------------------------------------------
   3096                else
   3097                {
   3098                   // ERROR:  SHOULD NEVER BE REACHED
   3099 #if defined(PNG_DEBUG)
   3100                   png_debug(1, "Internal libpng logic error (GCC "
   3101                     "png_do_read_interlace() _mmx_supported)\n");
   3102 #endif
   3103                }
   3104 
   3105             } // end of _mmx_supported ========================================
   3106 
   3107             else /* MMX not supported:  use modified C code - takes advantage
   3108                   *   of inlining of png_memcpy for a constant */
   3109             {
   3110                if (pixel_bytes == BPP3)
   3111                {
   3112                   for (i = width; i; i--)
   3113                   {
   3114                      png_byte v[8];
   3115                      int j;
   3116                      png_memcpy(v, sptr, BPP3);
   3117                      for (j = 0; j < png_pass_inc[pass]; j++)
   3118                      {
   3119                         png_memcpy(dp, v, BPP3);
   3120                         dp -= BPP3;
   3121                      }
   3122                      sptr -= BPP3;
   3123                   }
   3124                }
   3125                else if (pixel_bytes == BPP4)
   3126                {
   3127                   for (i = width; i; i--)
   3128                   {
   3129                      png_byte v[8];
   3130                      int j;
   3131                      png_memcpy(v, sptr, BPP4);
   3132                      for (j = 0; j < png_pass_inc[pass]; j++)
   3133                      {
   3134 #if defined(PNG_DEBUG) && defined(PNG_1_0_X)  // row_buf_size gone in 1.2.x
   3135                         if (dp < row || dp+3 > row+png_ptr->row_buf_size)
   3136                         {
   3137                            printf("dp out of bounds: row=%10p, dp=%10p, "
   3138                              "rp=%10p\n", row, dp, row+png_ptr->row_buf_size);
   3139                            printf("row_buf_size=%lu\n", png_ptr->row_buf_size);
   3140                         }
   3141 #endif
   3142                         png_memcpy(dp, v, BPP4);
   3143                         dp -= BPP4;
   3144                      }
   3145                      sptr -= BPP4;
   3146                   }
   3147                }
   3148                else if (pixel_bytes == 1)
   3149                {
   3150                   for (i = width; i; i--)
   3151                   {
   3152                      int j;
   3153                      for (j = 0; j < png_pass_inc[pass]; j++)
   3154                      {
   3155                         *dp-- = *sptr;
   3156                      }
   3157                      --sptr;
   3158                   }
   3159                }
   3160                else if (pixel_bytes == BPP2)
   3161                {
   3162                   for (i = width; i; i--)
   3163                   {
   3164                      png_byte v[8];
   3165                      int j;
   3166                      png_memcpy(v, sptr, BPP2);
   3167                      for (j = 0; j < png_pass_inc[pass]; j++)
   3168                      {
   3169                         png_memcpy(dp, v, BPP2);
   3170                         dp -= BPP2;
   3171                      }
   3172                      sptr -= BPP2;
   3173                   }
   3174                }
   3175                else if (pixel_bytes == BPP6)
   3176                {
   3177                   for (i = width; i; i--)
   3178                   {
   3179                      png_byte v[8];
   3180                      int j;
   3181                      png_memcpy(v, sptr, BPP6);
   3182                      for (j = 0; j < png_pass_inc[pass]; j++)
   3183                      {
   3184                         png_memcpy(dp, v, BPP6);
   3185                         dp -= BPP6;
   3186                      }
   3187                      sptr -= BPP6;
   3188                   }
   3189                }
   3190                else if (pixel_bytes == BPP8)
   3191                {
   3192                   for (i = width; i; i--)
   3193                   {
   3194                      png_byte v[8];
   3195                      int j;
   3196                      png_memcpy(v, sptr, BPP8);
   3197                      for (j = 0; j < png_pass_inc[pass]; j++)
   3198                      {
   3199                         png_memcpy(dp, v, BPP8);
   3200                         dp -= BPP8;
   3201                      }
   3202                      sptr -= BPP8;
   3203                   }
   3204                }
   3205                else
   3206                {
   3207                   // ERROR:  SHOULD NEVER BE REACHED
   3208 #if defined(PNG_DEBUG)
   3209                   png_debug(1, "Internal libpng logic error (GCC "
   3210                     "png_do_read_interlace() !_mmx_supported)\n");
   3211 #endif
   3212                }
   3213 
   3214             } /* end if (MMX not supported) */
   3215             break;
   3216          } /* end default (8-bit or larger) */
   3217       } /* end switch (row_info->pixel_depth) */
   3218 
   3219       row_info->width = final_width;
   3220 
   3221       row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
   3222    }
   3223 
   3224 } /* end png_do_read_interlace() */
   3225 
   3226 #endif /* PNG_HAVE_MMX_READ_INTERLACE */
   3227 #endif /* PNG_READ_INTERLACING_SUPPORTED */
   3228 
   3229 
   3230 
   3231 #if defined(PNG_HAVE_MMX_READ_FILTER_ROW)
   3232 #if defined(PNG_MMX_READ_FILTER_AVG_SUPPORTED)
   3233 
   3234 //===========================================================================//
   3235 //                                                                           //
   3236 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           //
   3237 //                                                                           //
   3238 //===========================================================================//
   3239 
   3240 // Optimized code for PNG Average filter decoder
   3241 
   3242 static void /* PRIVATE */
   3243 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
   3244                             png_bytep prev_row)
   3245 {
   3246    unsigned FullLength, MMXLength;  // png_uint_32 is actually 64-bit on x86-64
   3247    int bpp;
   3248    int dummy_value_a;
   3249    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
   3250    int dummy_value_d;
   3251    png_bytep dummy_value_S;
   3252    png_bytep dummy_value_D;
   3253    int diff; //     __attribute__((used));
   3254 
   3255    bpp = (row_info->pixel_depth + 7) >> 3;  // calc number of bytes per pixel
   3256    FullLength = row_info->rowbytes;         // number of bytes to filter
   3257 
   3258    __asm__ __volatile__ (
   3259    "avg_top:                       \n\t"
   3260       SAVE_GOT_ebx
   3261       SAVE_r15
   3262       SAVE_ebp
   3263       // initialize address pointers and offset
   3264 //pre "movl row, %5                \n\t" // edi/rdi:  ptr to Avg(x)
   3265       "xorl %%ebx, %%ebx           \n\t" // ebx:  x
   3266 //pre "movl prev_row, %4           \n\t" // esi/rsi:  ptr to Prior(x)
   3267       "mov  %5, " PDX "            \n\t" // copy of row ptr...
   3268 //pre "subl bpp, " PDX "           \n\t" // (bpp is preloaded into ecx)
   3269       "sub  " PCX "," PDX "        \n\t" // edx/rdx:  ptr to Raw(x-bpp)
   3270 //pre "movl FullLength, %%eax      \n\t" // bring in via eax...
   3271       SAVE_FullLength                    // ...but store for later use
   3272       "xorl %%eax, %%eax           \n\t"
   3273 
   3274       // Compute the Raw value for the first bpp bytes
   3275       //    Raw(x) = Avg(x) + (Prior(x)/2)
   3276    "avg_rlp:                       \n\t"
   3277       "movb (%4," PBX ",), %%al    \n\t" // load al with Prior(x)
   3278       "incl %%ebx                  \n\t"
   3279       "shrb %%al                   \n\t" // divide by 2
   3280       "addb -1(%5," PBX ",), %%al  \n\t" // add Avg(x); -1 to offset inc ebx
   3281 //pre "cmpl bpp, %%ebx             \n\t" // (bpp is preloaded into ecx)
   3282       "cmpl %%ecx, %%ebx           \n\t"
   3283       "movb %%al, -1(%5," PBX ",)  \n\t" // write Raw(x); -1 to offset inc ebx
   3284       "jb avg_rlp                  \n\t" // mov does not affect flags
   3285 
   3286       // get # of bytes to alignment (32-bit mask _would_ be good enough
   3287       // [computing delta], but 32-bit ops are zero-extended on 64-bit, argh)
   3288       // (if swapped edx and ebp, could do 8-bit or 16-bit mask...FIXME?)
   3289       "mov  %5, " PBP "            \n\t" // take start of row
   3290       "add  " PBX "," PBP "        \n\t" // add bpp
   3291       "add  $0xf, " PBP "          \n\t" // add 7+8 to incr past alignment bdry
   3292 //    "andl $0xfffffff8, %%ebp     \n\t" // mask to alignment boundary (32-bit!)
   3293       CLEAR_BOTTOM_3_BITS  PBP    "\n\t" // mask to alignment boundary
   3294       "sub  %5, " PBP "            \n\t" // subtract row ptr again => ebp =
   3295       "jz avg_go                   \n\t" //  target value of ebx at alignment
   3296 
   3297       "xorl %%ecx, %%ecx           \n\t"
   3298 
   3299       // fix alignment
   3300       // Compute the Raw value for the bytes up to the alignment boundary
   3301       //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
   3302    "avg_lp1:                       \n\t"
   3303       "xorl %%eax, %%eax           \n\t"
   3304       "movb (%4," PBX ",), %%cl    \n\t" // load cl with Prior(x)
   3305       "movb (" PDX "," PBX ",), %%al \n\t" // load al with Raw(x-bpp)
   3306       "addw %%cx, %%ax             \n\t"
   3307       "incl %%ebx                  \n\t"
   3308       "shrw %%ax                   \n\t" // divide by 2
   3309       "addb -1(%5," PBX ",), %%al  \n\t" // add Avg(x); -1 to offset inc ebx
   3310       "cmpl %%ebp, %%ebx           \n\t" // check if at alignment boundary
   3311       "movb %%al, -1(%5," PBX ",)  \n\t" // write Raw(x); -1 to offset inc ebx
   3312       "jb avg_lp1                  \n\t" // repeat until at alignment boundary
   3313 
   3314    "avg_go:                        \n\t"
   3315       RESTORE_FullLength "%%eax    \n\t" // FullLength -> eax
   3316       "movl %%eax, %%ecx           \n\t" // copy -> ecx
   3317       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
   3318       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
   3319       "subl %%eax, %%ecx           \n\t" // sub over-bytes from original length
   3320 //out "movl %%ecx, MMXLength       \n\t"
   3321       "movl %%ebp, %%eax           \n\t" // ebp = diff, but no reg constraint(?)
   3322       RESTORE_ebp                        //  (could swap ebp and edx functions)
   3323       RESTORE_r15
   3324       RESTORE_GOT_ebx
   3325 
   3326 // "There is no way for you to specify that an input operand is modified
   3327 // without also specifying it as an output operand."  [makes sense]
   3328 
   3329 // "Unless an output operand has the `&' constraint modifier, GCC may
   3330 // allocate it in the same register as an unrelated input operand, on the
   3331 // assumption the inputs are consumed before the outputs are produced."
   3332 // [trying to _force_ this]
   3333 
   3334 // "`='   Means that this operand is write-only for this instruction:
   3335 //        the previous value is discarded and replaced by output data."
   3336 //        [operand == variable name, presumably]
   3337 
   3338       // output regs
   3339       // these are operands 0-1 (originally 0-3):
   3340       : "=c" (MMXLength),      // %0 -> %0
   3341         "=a" (diff)            // %3 -> %1
   3342 //      "=S" (dummy_value_S),  // %1 -> GONE
   3343 //      "=D" (dummy_value_D),  // %2 -> GONE
   3344 
   3345       // input regs
   3346       // these are operands 2-5 (originally 4-7); two of their constraints say
   3347       // they must go in same places as operands 0-1 (originally 0-3) above:
   3348       : "0" (bpp),         // %4 -> %2 ecx
   3349         "1" (FullLength),  // %7 -> %3 eax
   3350         "S" (prev_row),    // %5 -> %4 esi/rsi
   3351         "D" (row)          // %6 -> %5 edi/rdi
   3352 
   3353       : "%edx"                           // clobber list
   3354         _CLOBBER_r15
   3355         _CLOBBER_ebp
   3356         _CLOBBER_GOT_ebx
   3357    );
   3358 
   3359    // now do the math for the rest of the row
   3360    switch (bpp)
   3361    {
   3362       case 3:
   3363       {
   3364 //       _ShiftBpp = 24;    // == 3 * 8
   3365 //       _ShiftRem = 40;    // == 64 - 24
   3366 
   3367          __asm__ __volatile__ (
   3368             // re-init address pointers and offset
   3369             LOAD_GOT_rbp
   3370             "movq " AMASK5_3_0 ", %%mm7    \n\t" // _amask5_3_0 -> mm7
   3371 // preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
   3372                                                  //  alignment boundary
   3373             "movq " LB_CARRY_MASK ", %%mm5 \n\t" // [interleave for parallel.?]
   3374 // preload  "movl  row, %1                 \n\t" // edi:  Avg(x)
   3375             "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
   3376 // preload  "movl  prev_row, %0            \n\t" // esi:  Prior(x)
   3377             RESTORE_rbp
   3378 
   3379             // prime the pump:  load the first Raw(x-bpp) data set
   3380             "movq  -8(%1," PCX ",), %%mm2 \n\t"// load previous aligned 8 bytes
   3381                                                // (correct pos. in loop below)
   3382          "avg_3lp:                        \n\t"
   3383             "movq  (%1," PCX ",), %%mm0   \n\t" // load mm0 with Avg(x)
   3384             "movq  %%mm5, %%mm3           \n\t"
   3385             "psrlq $40, %%mm2             \n\t" // correct position Raw(x-bpp)
   3386                                                 // data
   3387             "movq  (%0," PCX ",), %%mm1   \n\t" // load mm1 with Prior(x)
   3388             "movq  %%mm7, %%mm6           \n\t"
   3389             "pand  %%mm1, %%mm3           \n\t" // get lsb for each prevrow byte
   3390             "psrlq $1, %%mm1              \n\t" // divide prev_row bytes by 2
   3391             "pand  %%mm4, %%mm1           \n\t" // clear invalid bit 7 of each
   3392                                                 // byte
   3393             "paddb %%mm1, %%mm0           \n\t" // add (Prev_row/2) to Avg for
   3394                                                 // each byte
   3395             // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
   3396             "movq  %%mm3, %%mm1           \n\t" // now use mm1 for getting
   3397                                                 // LBCarrys
   3398             "pand  %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
   3399                                                 // where both lsb's were == 1
   3400                                                 // (valid only for active group)
   3401             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
   3402             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
   3403                                                 // byte
   3404             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to Raw(x-bpp)/2
   3405                                                 // for each byte
   3406             "pand  %%mm6, %%mm2           \n\t" // leave only Active Group 1
   3407                                                 // bytes to add to Avg
   3408             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
   3409                                                 // Avg for each Active byte
   3410             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
   3411             "psllq $24, %%mm6             \n\t" // shift the mm6 mask to cover
   3412                                                 // bytes 3-5
   3413             "movq  %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
   3414             "psllq $24, %%mm2             \n\t" // shift data to pos. correctly
   3415             "movq  %%mm3, %%mm1           \n\t" // now use mm1 for getting
   3416                                                 // LBCarrys
   3417             "pand  %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
   3418                                                 // where both lsb's were == 1
   3419                                                 // (valid only for active group)
   3420             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
   3421             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
   3422                                                 // byte
   3423             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to Raw(x-bpp)/2
   3424                                                 // for each byte
   3425             "pand  %%mm6, %%mm2           \n\t" // leave only Active Group 2
   3426                                                 // bytes to add to Avg
   3427             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
   3428                                                 // Avg for each Active byte
   3429 
   3430             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
   3431             "psllq $24, %%mm6             \n\t" // shift mm6 mask to cover last
   3432                                                 // two bytes
   3433             "movq  %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
   3434             "psllq $24, %%mm2             \n\t" // shift data to pos. correctly
   3435                               // Data need be shifted only once here to
   3436                               // get the correct x-bpp offset.
   3437             "movq  %%mm3, %%mm1           \n\t" // now use mm1 for getting
   3438                                                 // LBCarrys
   3439             "pand  %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
   3440                                                 // where both
   3441                               // lsb's were == 1 (only valid for active group)
   3442             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
   3443             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
   3444                                                 // byte
   3445             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to Raw(x-bpp)/2
   3446                                                 // for each byte
   3447             "pand  %%mm6, %%mm2           \n\t" // leave only Active Group 3
   3448                                                 // bytes to add to Avg
   3449             "addl  $8, %%ecx              \n\t"
   3450             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
   3451                                                 // Avg for each Active byte
   3452             // now ready to write back to memory
   3453             "movq  %%mm0, -8(%1," PCX ",) \n\t"
   3454             // move updated Raw(x) to use as Raw(x-bpp) for next loop
   3455             "cmpl  %%eax, %%ecx           \n\t" // MMXLength
   3456             "movq  %%mm0, %%mm2           \n\t" // mov updated Raw(x) to mm2
   3457             "jb avg_3lp                   \n\t"
   3458 
   3459             : "=S" (dummy_value_S),            // output regs (dummy)
   3460               "=D" (dummy_value_D),
   3461               "=c" (dummy_value_c),
   3462               "=a" (dummy_value_a)
   3463 
   3464             : "0" (prev_row),    // esi/rsi    // input regs
   3465               "1" (row),         // edi/rdi
   3466               "2" (diff),        // ecx
   3467               "3" (MMXLength)    // eax
   3468 
   3469 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   3470             : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
   3471             , "%mm4", "%mm5", "%mm6", "%mm7"
   3472 #endif
   3473          );
   3474       }
   3475       break;  // end 3 bpp
   3476 
   3477       case 4:   // formerly shared with 6 bpp case via _ShiftBpp and _ShiftRem,
   3478       {         // but loop uses all 8 MMX regs, and psrlq/psllq require 64-bit
   3479                 // mem (PIC/.so problems), MMX reg (none left), or immediate
   3480 //       _ShiftBpp = bpp << 3;        // 32 (psllq)
   3481 //       _ShiftRem = 64 - _ShiftBpp;  // 32 (psrlq)
   3482 
   3483          __asm__ __volatile__ (
   3484             LOAD_GOT_rbp
   3485             "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
   3486             "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
   3487             // re-init address pointers and offset
   3488 // preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
   3489                                                  // alignment boundary
   3490             "movq " AMASK0_8_0 ", %%mm7    \n\t" // _amask0_8_0 -> mm7
   3491             RESTORE_rbp
   3492 
   3493             // ... and clear all bytes except for 1st active group
   3494 // preload  "movl  row, %1               \n\t" // edi:  Avg(x)
   3495             "psrlq $32, %%mm7            \n\t" // was _ShiftRem
   3496 // preload  "movl  prev_row, %0          \n\t" // esi:  Prior(x)
   3497             "movq  %%mm7, %%mm6          \n\t"
   3498             "psllq $32, %%mm6            \n\t" // mask for 2nd active group
   3499 
   3500             // prime the pump:  load the first Raw(x-bpp) data set
   3501             "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
   3502                                              // (we correct pos. in loop below)
   3503          "avg_4lp:                       \n\t"
   3504             "movq (%1," PCX ",), %%mm0   \n\t"
   3505             "psrlq $32, %%mm2            \n\t" // shift data to pos. correctly
   3506             "movq (%0," PCX ",), %%mm1   \n\t"
   3507             // add (Prev_row/2) to average
   3508             "movq %%mm5, %%mm3           \n\t"
   3509             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
   3510             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
   3511             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
   3512                                                // byte
   3513             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
   3514                                                // each byte
   3515             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
   3516             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
   3517                                                // LBCarrys
   3518             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
   3519                                                // where both
   3520                               // lsb's were == 1 (only valid for active group)
   3521             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
   3522             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
   3523                                                // byte
   3524             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
   3525                                                // for each byte
   3526             "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1
   3527                                                // bytes to add to Avg
   3528             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
   3529                                                // for each Active
   3530                               // byte
   3531             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
   3532             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
   3533             "psllq $32, %%mm2            \n\t" // shift data to pos. correctly
   3534             "addl $8, %%ecx              \n\t"
   3535             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
   3536                                                // LBCarrys
   3537             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
   3538                                                // where both
   3539                               // lsb's were == 1 (only valid for active group)
   3540             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
   3541             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
   3542                                                // byte
   3543             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
   3544                                                // for each byte
   3545             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
   3546                                                // bytes to add to Avg
   3547             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
   3548                                                // Avg for each Active byte
   3549             "cmpl %%eax, %%ecx           \n\t" // MMXLength
   3550             // now ready to write back to memory
   3551             "movq %%mm0, -8(%1," PCX ",) \n\t"
   3552             // prep Raw(x-bpp) for next loop
   3553             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
   3554             "jb avg_4lp                  \n\t"
   3555 
   3556             : "=S" (dummy_value_S),            // output regs (dummy)
   3557               "=D" (dummy_value_D),
   3558               "=c" (dummy_value_c),
   3559               "=a" (dummy_value_a)
   3560 
   3561             : "0" (prev_row),    // esi/rsi    // input regs
   3562               "1" (row),         // edi/rdi
   3563               "2" (diff),        // ecx
   3564               "3" (MMXLength)    // eax
   3565 
   3566 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   3567             : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
   3568             , "%mm4", "%mm5", "%mm6", "%mm7"
   3569 #endif
   3570          );
   3571       }
   3572       break;  // end 4 bpp
   3573 
   3574       case 1:
   3575       {
   3576          __asm__ __volatile__ (
   3577             // re-init address pointers and offset
   3578 // preload  "movl diff, %%ecx            \n\t" // ecx: x = offset to align. bdry
   3579 // preload  "movl row, %1                \n\t" // edi/rdi:  Avg(x)
   3580 // preload  "movl FullLength, %%eax      \n\t"
   3581             "cmpl %%eax, %%ecx           \n\t" // test if offset at end of array
   3582             "jnb avg_1end                \n\t"
   3583 
   3584             SAVE_ebp
   3585 
   3586             // do Avg decode for remaining bytes
   3587 // preload  "movl prev_row, %0           \n\t" // esi/rsi:  Prior(x)
   3588             "mov  %1, " PBP "            \n\t" // copy of row pointer...
   3589             "dec  " PBP "                \n\t" // ebp/rbp:  Raw(x-bpp)
   3590             "xorl %%edx, %%edx           \n\t" // zero edx before using dl & dx
   3591                                                //  in loop below
   3592             SAVE_GOT_ebx
   3593 
   3594          "avg_1lp:                       \n\t"
   3595             // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
   3596             "xorl %%ebx, %%ebx           \n\t"
   3597             "movb (%0," PCX ",), %%dl    \n\t" // load dl with Prior(x)
   3598             "movb (" PBP "," PCX ",), %%bl \n\t" // load bl with Raw(x-bpp)
   3599             "addw %%dx, %%bx             \n\t"
   3600             "incl %%ecx                  \n\t"
   3601             "shrw %%bx                   \n\t" // divide by 2
   3602             "addb -1(%1," PCX ",), %%bl  \n\t" // add Avg(x); -1 to offset
   3603                                                // inc ecx
   3604             "cmpl %%eax, %%ecx           \n\t" // check if at end of array
   3605             "movb %%bl, -1(%1," PCX ",)  \n\t" // write back Raw(x);
   3606                          // mov does not affect flags; -1 to offset inc ecx
   3607             "jb avg_1lp                  \n\t"
   3608 
   3609             RESTORE_GOT_ebx
   3610             RESTORE_ebp
   3611 
   3612          "avg_1end:                      \n\t"
   3613 
   3614             : "=S" (dummy_value_S),            // output regs (dummy)
   3615               "=D" (dummy_value_D),
   3616               "=c" (dummy_value_c),
   3617               "=a" (dummy_value_a)
   3618 
   3619             : "0" (prev_row),    // esi/rsi    // input regs
   3620               "1" (row),         // edi/rdi
   3621               "2" (diff),        // ecx
   3622               "3" (FullLength)   // eax
   3623 
   3624             : "%edx"                           // clobber list
   3625               _CLOBBER_GOT_ebx
   3626               _CLOBBER_ebp
   3627          );
   3628       }
   3629       return;  // end 1 bpp
   3630 
   3631       case 2:
   3632       {
   3633 //       _ShiftBpp = 16;   // == 2 * 8
   3634 //       _ShiftRem = 48;   // == 64 - _ShiftBpp
   3635 
   3636          __asm__ __volatile__ (
   3637             LOAD_GOT_rbp
   3638             // load (former) _ActiveMask
   3639             "movq " AMASK6_2_0 ", %%mm7    \n\t" // _amask6_2_0 -> mm7
   3640             // re-init address pointers and offset
   3641 // preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
   3642                                                  // alignment boundary
   3643             "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
   3644 // preload  "movl  row, %1                 \n\t" // edi:  Avg(x)
   3645             "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
   3646 // preload  "movl  prev_row, %0            \n\t" // esi:  Prior(x)
   3647             RESTORE_rbp
   3648 
   3649             // prime the pump:  load the first Raw(x-bpp) data set
   3650             "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
   3651                                              // (we correct pos. in loop below)
   3652          "avg_2lp:                       \n\t"
   3653             "movq (%1," PCX ",), %%mm0   \n\t"
   3654             "psrlq $48, %%mm2            \n\t" // shift data to pos. correctly
   3655             "movq (%0," PCX ",), %%mm1   \n\t" //  (GRR BUGFIX:  was psllq)
   3656             // add (Prev_row/2) to average
   3657             "movq %%mm5, %%mm3           \n\t"
   3658             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
   3659             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
   3660             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
   3661                                                // byte
   3662             "movq %%mm7, %%mm6           \n\t"
   3663             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
   3664                                                // each byte
   3665 
   3666             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
   3667             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
   3668                                                // LBCarrys
   3669             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
   3670                                                // where both
   3671                                                // lsb's were == 1 (only valid
   3672                                                // for active group)
   3673             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
   3674             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
   3675                                                // byte
   3676             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
   3677                                                // for each byte
   3678             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 1
   3679                                                // bytes to add to Avg
   3680             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
   3681                                                // for each Active byte
   3682 
   3683             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
   3684             "psllq $16, %%mm6            \n\t" // shift the mm6 mask to cover
   3685                                                // bytes 2 & 3
   3686             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
   3687             "psllq $16, %%mm2            \n\t" // shift data to pos. correctly
   3688             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
   3689                                                // LBCarrys
   3690             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
   3691                                                // where both
   3692                                                // lsb's were == 1 (only valid
   3693                                                // for active group)
   3694             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
   3695             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
   3696                                                // byte
   3697             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
   3698                                                // for each byte
   3699             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
   3700                                                // bytes to add to Avg
   3701             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
   3702                                                // Avg for each Active byte
   3703 
   3704             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
   3705             "psllq $16, %%mm6            \n\t" // shift the mm6 mask to cover
   3706                                                // bytes 4 & 5
   3707             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
   3708             "psllq $16, %%mm2            \n\t" // shift data to pos. correctly
   3709             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
   3710                                                // LBCarrys
   3711             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
   3712                                                // where both lsb's were == 1
   3713                                                // (only valid for active group)
   3714             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
   3715             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
   3716                                                // byte
   3717             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
   3718                                                // for each byte
   3719             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 3
   3720                                                // bytes to add to Avg
   3721             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
   3722                                                // Avg for each Active byte
   3723 
   3724             // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
   3725             "psllq $16, %%mm6            \n\t" // shift the mm6 mask to cover
   3726                                                // bytes 6 & 7
   3727             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
   3728             "psllq $16, %%mm2            \n\t" // shift data to pos. correctly
   3729             "addl $8, %%ecx              \n\t"
   3730             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
   3731                                                // LBCarrys
   3732             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
   3733                                                // where both
   3734                                                // lsb's were == 1 (only valid
   3735                                                // for active group)
   3736             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
   3737             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
   3738                                                // byte
   3739             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
   3740                                                // for each byte
   3741             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 4
   3742                                                // bytes to add to Avg
   3743             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
   3744                                                // Avg for each Active byte
   3745             "cmpl %%eax, %%ecx           \n\t" // MMXLength
   3746             // now ready to write back to memory
   3747             "movq %%mm0, -8(%1," PCX ",) \n\t"
   3748             // prep Raw(x-bpp) for next loop
   3749             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
   3750             "jb avg_2lp                  \n\t"
   3751 
   3752             : "=S" (dummy_value_S),            // output regs (dummy)
   3753               "=D" (dummy_value_D),
   3754               "=c" (dummy_value_c),
   3755               "=a" (dummy_value_a)
   3756 
   3757             : "0" (prev_row),    // esi/rsi    // input regs
   3758               "1" (row),         // edi/rdi
   3759               "2" (diff),        // ecx
   3760               "3" (MMXLength)    // eax
   3761 
   3762 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   3763             : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
   3764             , "%mm4", "%mm5", "%mm6", "%mm7"
   3765 #endif
   3766          );
   3767       }
   3768       break;  // end 2 bpp
   3769 
   3770       case 6:   // formerly shared with 4 bpp case (see comments there)
   3771       {
   3772 //       _ShiftBpp = bpp << 3;        // 48 (psllq)
   3773 //       _ShiftRem = 64 - _ShiftBpp;  // 16 (psrlq)
   3774 
   3775          __asm__ __volatile__ (
   3776             LOAD_GOT_rbp
   3777             "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
   3778             "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
   3779             // re-init address pointers and offset
   3780 // preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
   3781                                                  // alignment boundary
   3782             "movq " AMASK0_8_0 ", %%mm7    \n\t" // _amask0_8_0 -> mm7
   3783             RESTORE_rbp
   3784 
   3785             // ... and clear all bytes except for 1st active group
   3786 // preload  "movl  row, %1               \n\t" // edi:  Avg(x)
   3787             "psrlq $16, %%mm7            \n\t"
   3788 // preload  "movl  prev_row, %0          \n\t" // esi:  Prior(x)
   3789             "movq  %%mm7, %%mm6          \n\t"
   3790             "psllq $48, %%mm6            \n\t" // mask for 2nd active group
   3791 
   3792             // prime the pump:  load the first Raw(x-bpp) data set
   3793             "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
   3794                                              // (we correct pos. in loop below)
   3795          "avg_6lp:                       \n\t"
   3796             "movq (%1," PCX ",), %%mm0   \n\t"
   3797             "psrlq $16, %%mm2            \n\t" // shift data to pos. correctly
   3798             "movq (%0," PCX ",), %%mm1   \n\t"
   3799             // add (Prev_row/2) to average
   3800             "movq %%mm5, %%mm3           \n\t"
   3801             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
   3802             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
   3803             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
   3804                                                // byte
   3805             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
   3806                                                // each byte
   3807             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
   3808             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
   3809                                                // LBCarrys
   3810             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
   3811                                                // where both
   3812                               // lsb's were == 1 (only valid for active group)
   3813             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
   3814             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
   3815                                                // byte
   3816             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
   3817                                                // for each byte
   3818             "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1
   3819                                                // bytes to add to Avg
   3820             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
   3821                                                // for each Active
   3822                               // byte
   3823             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
   3824             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
   3825             "psllq $48, %%mm2            \n\t" // shift data to pos. correctly
   3826             "addl $8, %%ecx              \n\t"
   3827             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
   3828                                                // LBCarrys
   3829             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
   3830                                                // where both
   3831                               // lsb's were == 1 (only valid for active group)
   3832             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
   3833             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
   3834                                                // byte
   3835             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
   3836                                                // for each byte
   3837             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
   3838                                                // bytes to add to Avg
   3839             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
   3840                                                // Avg for each Active byte
   3841             "cmpl %%eax, %%ecx           \n\t" // MMXLength
   3842             // now ready to write back to memory
   3843             "movq %%mm0, -8(%1," PCX ",) \n\t"
   3844             // prep Raw(x-bpp) for next loop
   3845             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
   3846             "jb avg_6lp                  \n\t"
   3847 
   3848             : "=S" (dummy_value_S),            // output regs (dummy)
   3849               "=D" (dummy_value_D),
   3850               "=c" (dummy_value_c),
   3851               "=a" (dummy_value_a)
   3852 
   3853             : "0" (prev_row),    // esi/rsi    // input regs
   3854               "1" (row),         // edi/rdi
   3855               "2" (diff),        // ecx
   3856               "3" (MMXLength)    // eax
   3857 
   3858 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   3859             : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
   3860             , "%mm4", "%mm5", "%mm6", "%mm7"
   3861 #endif
   3862          );
   3863       }
   3864       break;  // end 6 bpp
   3865 
   3866       case 8:
   3867       {
   3868          __asm__ __volatile__ (
   3869             // re-init address pointers and offset
   3870 // preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
   3871                                                  // alignment boundary
   3872             LOAD_GOT_rbp
   3873             "movq " LB_CARRY_MASK ", %%mm5 \n\t" // [interleave for parallel.?]
   3874 // preload  "movl  row, %1                 \n\t" // edi:  Avg(x)
   3875             "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
   3876 // preload  "movl  prev_row, %0            \n\t" // esi:  Prior(x)
   3877             RESTORE_rbp
   3878 
   3879             // prime the pump:  load the first Raw(x-bpp) data set
   3880             "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
   3881                                       // (NO NEED to correct pos. in loop below)
   3882 
   3883          "avg_8lp:                       \n\t"
   3884             "movq (%1," PCX ",), %%mm0   \n\t"
   3885             "movq %%mm5, %%mm3           \n\t"
   3886             "movq (%0," PCX ",), %%mm1   \n\t"
   3887             "addl $8, %%ecx              \n\t"
   3888             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
   3889             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
   3890             "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
   3891                                                //  where both lsb's were == 1
   3892             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
   3893             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7, each byte
   3894             "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg, each byte
   3895             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7, each byte
   3896             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg, each
   3897             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
   3898             "cmpl %%eax, %%ecx           \n\t" // MMXLength
   3899             "movq %%mm0, -8(%1," PCX ",) \n\t"
   3900             "movq %%mm0, %%mm2           \n\t" // reuse as Raw(x-bpp)
   3901             "jb avg_8lp                  \n\t"
   3902 
   3903             : "=S" (dummy_value_S),            // output regs (dummy)
   3904               "=D" (dummy_value_D),
   3905               "=c" (dummy_value_c),
   3906               "=a" (dummy_value_a)
   3907 
   3908             : "0" (prev_row),    // esi/rsi    // input regs
   3909               "1" (row),         // edi/rdi
   3910               "2" (diff),        // ecx
   3911               "3" (MMXLength)    // eax
   3912 
   3913 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   3914             : "%mm0", "%mm1", "%mm2"           // clobber list
   3915             , "%mm3", "%mm4", "%mm5"
   3916 #endif
   3917          );
   3918       }
   3919       break;  // end 8 bpp
   3920 
   3921       default:                // bpp != 1,2,3,4,6,8:  doesn't exist
   3922       {
   3923          // ERROR:  SHOULD NEVER BE REACHED
   3924 #if defined(PNG_DEBUG)
   3925          png_debug(1, "Internal libpng logic error (GCC "
   3926            "png_read_filter_row_mmx_avg())\n");
   3927 #endif
   3928       }
   3929       break;
   3930 
   3931    } // end switch (bpp)
   3932 
   3933    __asm__ __volatile__ (
   3934       // MMX acceleration complete; now do clean-up
   3935       // check if any remaining bytes left to decode
   3936 //pre "movl FullLength, %%edx      \n\t"
   3937 //pre "movl MMXLength, %%eax       \n\t" // eax:  x == offset bytes after MMX
   3938 //pre "movl row, %2                \n\t" // edi:  Avg(x)
   3939       "cmpl %%edx, %%eax           \n\t" // test if offset at end of array
   3940       "jnb avg_end                 \n\t"
   3941 
   3942       SAVE_ebp
   3943 
   3944       // do Avg decode for remaining bytes
   3945 //pre "movl prev_row, %1           \n\t" // esi:  Prior(x)
   3946       "mov  %2, " PBP "            \n\t" // copy of row pointer...
   3947 //pre "subl bpp, " PBP "           \n\t" // (bpp is preloaded into ecx)
   3948       "sub  " PCX "," PBP "        \n\t" // ebp:  Raw(x-bpp)
   3949       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
   3950 
   3951       SAVE_GOT_ebx
   3952 
   3953    "avg_lp2:                       \n\t"
   3954       // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
   3955       "xorl %%ebx, %%ebx           \n\t"
   3956       "movb (%1," PAX ",), %%cl    \n\t" // load cl with Prior(x)
   3957       "movb (" PBP "," PAX ",), %%bl \n\t" // load bl with Raw(x-bpp)
   3958       "addw %%cx, %%bx             \n\t"
   3959       "incl %%eax                  \n\t"
   3960       "shrw %%bx                   \n\t" // divide by 2
   3961       "addb -1(%2," PAX ",), %%bl  \n\t" // add Avg(x); -1 to offset inc eax
   3962       "cmpl %%edx, %%eax           \n\t" // check if at end of array
   3963       "movb %%bl, -1(%2," PAX ",)  \n\t" // write back Raw(x) [mov does not
   3964       "jb avg_lp2                  \n\t" //  affect flags; -1 to offset inc eax]
   3965 
   3966       RESTORE_GOT_ebx
   3967       RESTORE_ebp
   3968 
   3969    "avg_end:                       \n\t"
   3970       "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
   3971 
   3972       : "=c" (dummy_value_c),            // output regs (dummy)
   3973         "=S" (dummy_value_S),
   3974         "=D" (dummy_value_D),
   3975         "=a" (dummy_value_a),
   3976         "=d" (dummy_value_d)
   3977 
   3978       : "0" (bpp),         // ecx        // input regs
   3979         "1" (prev_row),    // esi/rsi
   3980         "2" (row),         // edi/rdi
   3981         "3" (MMXLength),   // eax
   3982         "4" (FullLength)   // edx
   3983 
   3984       CLOB_COLON_ebx_ebp                 // clobber list
   3985         CLOBBER_GOT_ebx
   3986         CLOB_COMMA_ebx_ebp
   3987         CLOBBER_ebp
   3988    );
   3989 
   3990 } /* end png_read_filter_row_mmx_avg() */
   3991 
   3992 #endif /* PNG_MMX_READ_FILTER_AVG_SUPPORTED */
   3993 
   3994 
   3995 
   3996 #if defined(PNG_MMX_READ_FILTER_PAETH_SUPPORTED)
   3997 #if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
   3998 
   3999 //===========================================================================//
   4000 //                                                                           //
   4001 //         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H         //
   4002 //                                                                           //
   4003 //===========================================================================//
   4004 
   4005 // Optimized code for PNG Paeth filter decoder
   4006 
   4007 static void /* PRIVATE */
   4008 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
   4009                               png_bytep prev_row)
   4010 {
   4011    unsigned FullLength, MMXLength;  // png_uint_32 is actually 64-bit on x86-64
   4012    int bpp;
   4013    int dummy_value_a;
   4014    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
   4015    int dummy_value_d;
   4016    png_charp dummy_value_S;
   4017    png_charp dummy_value_D;
   4018    int diff; //     __attribute__((used));
   4019 
   4020    bpp = (row_info->pixel_depth + 7) >> 3;  // calc number of bytes per pixel
   4021    FullLength = row_info->rowbytes;         // number of bytes to filter
   4022 
   4023    __asm__ __volatile__ (
   4024       SAVE_GOT_ebx
   4025       SAVE_r15
   4026       SAVE_ebp
   4027 //pre "movl row, %2                \n\t" // edi/rdi
   4028       "xorl %%ebx, %%ebx           \n\t" // ebx:  x offset
   4029 //pre "movl prev_row, %1           \n\t" // esi/rsi
   4030       "xorl %%edx, %%edx           \n\t" // edx:  x-bpp offset
   4031 //pre "movl FullLength, %%eax      \n\t" // bring in via eax...
   4032       SAVE_FullLength                    // ...but store for later use
   4033       "xorl %%eax, %%eax           \n\t"
   4034 
   4035       // Compute the Raw value for the first bpp bytes
   4036       // Note: the formula works out to be always
   4037       //   Raw(x) = Paeth(x) + Prior(x)      where x < bpp
   4038    "paeth_rlp:                     \n\t"
   4039       "movb (%2," PBX ",), %%al    \n\t"
   4040       "addb (%1," PBX ",), %%al    \n\t"
   4041       "incl %%ebx                  \n\t"
   4042 //pre "cmpl bpp, %%ebx             \n\t" (bpp is preloaded into ecx)
   4043       "cmpl %%ecx, %%ebx           \n\t"
   4044       "movb %%al, -1(%2," PBX ",)  \n\t"
   4045       "jb paeth_rlp                \n\t"
   4046 
   4047       // get # of bytes to alignment (note:  computing _delta_ of two pointers,
   4048       // so hereafter %%ebp is sufficient even on 64-bit)
   4049       "mov  %2, " PBP "            \n\t" // take start of row
   4050       "add  " PBX "," PBP "        \n\t" // add bpp
   4051       "add  $0xf, " PBP "          \n\t" // add 7+8 to incr past alignment bdry
   4052 //    "andl $0xfffffff8, %%ebp     \n\t" // mask to alignment boundary (32-bit!)
   4053       CLEAR_BOTTOM_3_BITS  PBP    "\n\t" // mask to alignment boundary
   4054       "sub  %2, " PBP "            \n\t" // subtract row ptr again => ebp =
   4055       "jz paeth_go                 \n\t" //  target value of ebx at alignment
   4056 
   4057       "xorl %%ecx, %%ecx           \n\t"
   4058 
   4059       SAVE_r11_r12_r13
   4060 
   4061       // fix alignment
   4062    "paeth_lp1:                     \n\t"
   4063       "xorl %%eax, %%eax           \n\t"
   4064       // pav = p - a = (a + b - c) - a = b - c
   4065       "movb (%1," PBX ",), %%al    \n\t" // load Prior(x) into al
   4066       "movb (%1," PDX ",), %%cl    \n\t" // load Prior(x-bpp) into cl
   4067       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
   4068       "movl %%eax, " pa_TEMP "     \n\t" // Save pav for later use
   4069       "xorl %%eax, %%eax           \n\t"
   4070       // pbv = p - b = (a + b - c) - b = a - c
   4071       "movb (%2," PDX ",), %%al    \n\t" // load Raw(x-bpp) into al
   4072       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
   4073       "movl %%eax, %%ecx           \n\t"
   4074       // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
   4075       "addl " pa_TEMP ", %%eax     \n\t" // pcv = pav + pbv
   4076       // pc = abs(pcv)
   4077       "testl $0x80000000, %%eax    \n\t"
   4078       "jz paeth_pca                \n\t"
   4079       "negl %%eax                  \n\t" // reverse sign of neg values
   4080 
   4081    "paeth_pca:                     \n\t"
   4082       "movl %%eax, " pc_TEMP "     \n\t" // save pc for later use
   4083       // pb = abs(pbv)
   4084       "testl $0x80000000, %%ecx    \n\t"
   4085       "jz paeth_pba                \n\t"
   4086       "negl %%ecx                  \n\t" // reverse sign of neg values
   4087 
   4088    "paeth_pba:                     \n\t"
   4089       "movl %%ecx, " pb_TEMP "     \n\t" // save pb for later use
   4090       // pa = abs(pav)
   4091       "movl " pa_TEMP ", %%eax     \n\t"
   4092       "testl $0x80000000, %%eax    \n\t"
   4093       "jz paeth_paa                \n\t"
   4094       "negl %%eax                  \n\t" // reverse sign of neg values
   4095 
   4096    "paeth_paa:                     \n\t"
   4097       "movl %%eax, " pa_TEMP "     \n\t" // save pa for later use
   4098       // test if pa <= pb
   4099       "cmpl %%ecx, %%eax           \n\t"
   4100       "jna paeth_abb               \n\t"
   4101       // pa > pb; now test if pb <= pc
   4102       "cmpl " pc_TEMP ", %%ecx     \n\t"
   4103       "jna paeth_bbc               \n\t"
   4104       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
   4105       "movb (%1," PDX ",), %%cl    \n\t" // load Prior(x-bpp) into cl
   4106       "jmp paeth_paeth             \n\t"
   4107 
   4108    "paeth_bbc:                     \n\t"
   4109       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
   4110       "movb (%1," PBX ",), %%cl    \n\t" // load Prior(x) into cl
   4111       "jmp paeth_paeth             \n\t"
   4112 
   4113    "paeth_abb:                     \n\t"
   4114       // pa <= pb; now test if pa <= pc
   4115       "cmpl " pc_TEMP ", %%eax     \n\t"
   4116       "jna paeth_abc               \n\t"
   4117       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
   4118       "movb (%1," PDX ",), %%cl    \n\t" // load Prior(x-bpp) into cl
   4119       "jmp paeth_paeth             \n\t"
   4120 
   4121    "paeth_abc:                     \n\t"
   4122       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
   4123       "movb (%2," PDX ",), %%cl    \n\t" // load Raw(x-bpp) into cl
   4124 
   4125    "paeth_paeth:                   \n\t"
   4126       "incl %%ebx                  \n\t"
   4127       "incl %%edx                  \n\t"
   4128       // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
   4129       "addb %%cl, -1(%2," PBX ",)  \n\t"
   4130       "cmpl %%ebp, %%ebx           \n\t"
   4131       "jb paeth_lp1                \n\t"
   4132 
   4133       RESTORE_r11_r12_r13
   4134 
   4135    "paeth_go:                      \n\t"
   4136       RESTORE_FullLength "%%ecx    \n\t" // FullLength -> ecx
   4137       "movl %%ecx, %%eax           \n\t"
   4138       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
   4139       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
   4140       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
   4141 //out "movl %%ecx, MMXLength       \n\t"
   4142       "movl %%ebp, %%eax           \n\t" // ebp = diff, but no reg constraint(?)
   4143       RESTORE_ebp                        //  (could swap ebp and edx functions)
   4144       RESTORE_r15
   4145       RESTORE_GOT_ebx
   4146 
   4147       : "=c" (MMXLength),                // output regs
   4148         "=S" (dummy_value_S),
   4149         "=D" (dummy_value_D),
   4150         "=a" (diff)
   4151 
   4152       : "0" (bpp),         // ecx        // input regs
   4153         "1" (prev_row),    // esi/rsi
   4154         "2" (row),         // edi/rdi
   4155         "3" (FullLength)   // eax
   4156 
   4157       : "%edx"                           // clobber list
   4158         _CLOBBER_r11_r12_r13
   4159         _CLOBBER_r15
   4160         _CLOBBER_ebp
   4161         _CLOBBER_GOT_ebx
   4162    );
   4163 
   4164    // now do the math for the rest of the row
   4165    switch (bpp)
   4166    {
   4167       case 3:
   4168       {
   4169 //       _ShiftBpp = 24;    // == bpp * 8
   4170 //       _ShiftRem = 40;    // == 64 - _ShiftBpp
   4171 
   4172          __asm__ __volatile__ (
   4173             LOAD_GOT_rbp
   4174 // preload  "movl diff, %%ecx            \n\t"
   4175 // preload  "movl row, %1                \n\t" // edi/rdi
   4176 // preload  "movl prev_row, %0           \n\t" // esi/rsi
   4177             "pxor %%mm0, %%mm0           \n\t"
   4178 
   4179             // prime the pump:  load the first Raw(x-bpp) data set
   4180             "movq -8(%1," PCX ",), %%mm1 \n\t"
   4181          "paeth_3lp:                     \n\t"
   4182             "psrlq $40, %%mm1            \n\t" // shift last 3 bytes to 1st
   4183                                                // 3 bytes
   4184             "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
   4185             "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
   4186             "movq -8(%0," PCX ",), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
   4187             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
   4188             "psrlq $40, %%mm3            \n\t" // shift last 3 bytes to 1st
   4189                                                // 3 bytes
   4190             // pav = p - a = (a + b - c) - a = b - c
   4191             "movq %%mm2, %%mm4           \n\t"
   4192             "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
   4193             // pbv = p - b = (a + b - c) - b = a - c
   4194             "movq %%mm1, %%mm5           \n\t"
   4195             "psubw %%mm3, %%mm4          \n\t"
   4196             "pxor %%mm7, %%mm7           \n\t"
   4197             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   4198             "movq %%mm4, %%mm6           \n\t"
   4199             "psubw %%mm3, %%mm5          \n\t"
   4200 
   4201             // pa = abs(p-a) = abs(pav)
   4202             // pb = abs(p-b) = abs(pbv)
   4203             // pc = abs(p-c) = abs(pcv)
   4204             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
   4205             "paddw %%mm5, %%mm6          \n\t"
   4206             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
   4207             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
   4208             "psubw %%mm0, %%mm4          \n\t"
   4209             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
   4210             "psubw %%mm0, %%mm4          \n\t"
   4211             "psubw %%mm7, %%mm5          \n\t"
   4212             "pxor %%mm0, %%mm0           \n\t"
   4213             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
   4214             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
   4215             "psubw %%mm7, %%mm5          \n\t"
   4216             "psubw %%mm0, %%mm6          \n\t"
   4217             //  test pa <= pb
   4218             "movq %%mm4, %%mm7           \n\t"
   4219             "psubw %%mm0, %%mm6          \n\t"
   4220             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
   4221             "movq %%mm7, %%mm0           \n\t"
   4222             // use mm7 mask to merge pa & pb
   4223             "pand %%mm7, %%mm5           \n\t"
   4224             // use mm0 mask copy to merge a & b
   4225             "pand %%mm0, %%mm2           \n\t"
   4226             "pandn %%mm4, %%mm7          \n\t"
   4227             "pandn %%mm1, %%mm0          \n\t"
   4228             "paddw %%mm5, %%mm7          \n\t"
   4229             "paddw %%mm2, %%mm0          \n\t"
   4230             //  test  ((pa <= pb)? pa:pb) <= pc
   4231             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
   4232             "pxor %%mm1, %%mm1           \n\t"
   4233             "pand %%mm7, %%mm3           \n\t"
   4234             "pandn %%mm0, %%mm7          \n\t"
   4235             "paddw %%mm3, %%mm7          \n\t"
   4236             "pxor %%mm0, %%mm0           \n\t"
   4237             "packuswb %%mm1, %%mm7       \n\t"
   4238             "movq (%0," PCX ",), %%mm3   \n\t" // load c=Prior(x-bpp)
   4239             "pand " AMASK5_3_0 ", %%mm7  \n\t" // _amask5_3_0 (was _ActiveMask)
   4240             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
   4241             "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
   4242             "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
   4243             "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
   4244             "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as
   4245                                                // Raw(x-bpp)
   4246             // now do Paeth for 2nd set of bytes (3-5)
   4247             "psrlq $24, %%mm2            \n\t" // load b=Prior(x) step 2
   4248             "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
   4249             "pxor %%mm7, %%mm7           \n\t"
   4250             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
   4251             // pbv = p - b = (a + b - c) - b = a - c
   4252             "movq %%mm1, %%mm5           \n\t"
   4253             // pav = p - a = (a + b - c) - a = b - c
   4254             "movq %%mm2, %%mm4           \n\t"
   4255             "psubw %%mm3, %%mm5          \n\t"
   4256             "psubw %%mm3, %%mm4          \n\t"
   4257             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
   4258             //       pav + pbv = pbv + pav
   4259             "movq %%mm5, %%mm6           \n\t"
   4260             "paddw %%mm4, %%mm6          \n\t"
   4261 
   4262             // pa = abs(p-a) = abs(pav)
   4263             // pb = abs(p-b) = abs(pbv)
   4264             // pc = abs(p-c) = abs(pcv)
   4265             "pcmpgtw %%mm5, %%mm0        \n\t" // create mask pbv bytes < 0
   4266             "pcmpgtw %%mm4, %%mm7        \n\t" // create mask pav bytes < 0
   4267             "pand %%mm5, %%mm0           \n\t" // only pbv bytes < 0 in mm0
   4268             "pand %%mm4, %%mm7           \n\t" // only pav bytes < 0 in mm7
   4269             "psubw %%mm0, %%mm5          \n\t"
   4270             "psubw %%mm7, %%mm4          \n\t"
   4271             "psubw %%mm0, %%mm5          \n\t"
   4272             "psubw %%mm7, %%mm4          \n\t"
   4273             "pxor %%mm0, %%mm0           \n\t"
   4274             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
   4275             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
   4276             "psubw %%mm0, %%mm6          \n\t"
   4277             //  test pa <= pb
   4278             "movq %%mm4, %%mm7           \n\t"
   4279             "psubw %%mm0, %%mm6          \n\t"
   4280             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
   4281             "movq %%mm7, %%mm0           \n\t"
   4282             // use mm7 mask to merge pa & pb
   4283             "pand %%mm7, %%mm5           \n\t"
   4284             // use mm0 mask copy to merge a & b
   4285             "pand %%mm0, %%mm2           \n\t"
   4286             "pandn %%mm4, %%mm7          \n\t"
   4287             "pandn %%mm1, %%mm0          \n\t"
   4288             "paddw %%mm5, %%mm7          \n\t"
   4289             "paddw %%mm2, %%mm0          \n\t"
   4290             //  test  ((pa <= pb)? pa:pb) <= pc
   4291             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
   4292             "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
   4293             "pand %%mm7, %%mm3           \n\t"
   4294             "pandn %%mm0, %%mm7          \n\t"
   4295             "pxor %%mm1, %%mm1           \n\t"
   4296             "paddw %%mm3, %%mm7          \n\t"
   4297             "pxor %%mm0, %%mm0           \n\t"
   4298             "packuswb %%mm1, %%mm7       \n\t"
   4299             "movq %%mm2, %%mm3           \n\t" // load c=Prior(x-bpp) step 1
   4300             "pand " AMASK5_3_0 ", %%mm7  \n\t" // _amask5_3_0 (was _ActiveMask)
   4301             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
   4302             "psllq $24, %%mm7            \n\t" // shift bytes to 2nd group of
   4303                                                // 3 bytes
   4304              // pav = p - a = (a + b - c) - a = b - c
   4305             "movq %%mm2, %%mm4           \n\t"
   4306             "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
   4307             "psllq $24, %%mm3            \n\t" // load c=Prior(x-bpp) step 2
   4308             "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
   4309             "movq %%mm7, %%mm1           \n\t"
   4310             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
   4311             "psllq $24, %%mm1            \n\t" // shift bytes (was _ShiftBpp)
   4312                                     // now mm1 will be used as Raw(x-bpp)
   4313             // now do Paeth for 3rd, and final, set of bytes (6-7)
   4314             "pxor %%mm7, %%mm7           \n\t"
   4315             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
   4316             "psubw %%mm3, %%mm4          \n\t"
   4317             // pbv = p - b = (a + b - c) - b = a - c
   4318             "movq %%mm1, %%mm5           \n\t"
   4319             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   4320             "movq %%mm4, %%mm6           \n\t"
   4321             "psubw %%mm3, %%mm5          \n\t"
   4322             "pxor %%mm0, %%mm0           \n\t"
   4323             "paddw %%mm5, %%mm6          \n\t"
   4324 
   4325             // pa = abs(p-a) = abs(pav)
   4326             // pb = abs(p-b) = abs(pbv)
   4327             // pc = abs(p-c) = abs(pcv)
   4328             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
   4329             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
   4330             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
   4331             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
   4332             "psubw %%mm0, %%mm4          \n\t"
   4333             "psubw %%mm7, %%mm5          \n\t"
   4334             "psubw %%mm0, %%mm4          \n\t"
   4335             "psubw %%mm7, %%mm5          \n\t"
   4336             "pxor %%mm0, %%mm0           \n\t"
   4337             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
   4338             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
   4339             "psubw %%mm0, %%mm6          \n\t"
   4340             //  test pa <= pb
   4341             "movq %%mm4, %%mm7           \n\t"
   4342             "psubw %%mm0, %%mm6          \n\t"
   4343             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
   4344             "movq %%mm7, %%mm0           \n\t"
   4345             // use mm0 mask copy to merge a & b
   4346             "pand %%mm0, %%mm2           \n\t"
   4347             // use mm7 mask to merge pa & pb
   4348             "pand %%mm7, %%mm5           \n\t"
   4349             "pandn %%mm1, %%mm0          \n\t"
   4350             "pandn %%mm4, %%mm7          \n\t"
   4351             "paddw %%mm2, %%mm0          \n\t"
   4352             "paddw %%mm5, %%mm7          \n\t"
   4353             //  test  ((pa <= pb)? pa:pb) <= pc
   4354             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
   4355             "pand %%mm7, %%mm3           \n\t"
   4356             "pandn %%mm0, %%mm7          \n\t"
   4357             "paddw %%mm3, %%mm7          \n\t"
   4358             "pxor %%mm1, %%mm1           \n\t"
   4359             "packuswb %%mm7, %%mm1       \n\t"
   4360             // step ecx to next set of 8 bytes and repeat loop til done
   4361             "addl $8, %%ecx              \n\t"
   4362             "pand " AMASK0_2_6 ", %%mm1  \n\t" // _amask0_2_6 (_ActiveMaskEnd)
   4363             "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
   4364             "cmpl %%eax, %%ecx           \n\t" // MMXLength
   4365             "pxor %%mm0, %%mm0           \n\t" // pxor does not affect flags
   4366             "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
   4367                                  // mm1 will be used as Raw(x-bpp) next loop
   4368                            // mm3 ready to be used as Prior(x-bpp) next loop
   4369             "jb paeth_3lp                \n\t"
   4370             RESTORE_rbp
   4371 
   4372             : "=S" (dummy_value_S),            // output regs (dummy)
   4373               "=D" (dummy_value_D),
   4374               "=c" (dummy_value_c),
   4375               "=a" (dummy_value_a)
   4376 
   4377             : "0" (prev_row),  // esi/rsi      // input regs
   4378               "1" (row),       // edi/rdi
   4379               "2" (diff),      // ecx
   4380               "3" (MMXLength)  // eax
   4381 
   4382 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   4383             : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
   4384             , "%mm4", "%mm5", "%mm6", "%mm7"
   4385 #endif
   4386          );
   4387       }
   4388       break;  // end 3 bpp
   4389 
   4390       case 4:  // bpp == 4: Paeth-decode 8 bytes per pass as two 4-byte groups
   4391       {
   4392          __asm__ __volatile__ (
   4393 // preload  "movl diff, %%ecx            \n\t"
   4394 // preload  "movl row, %1                \n\t" // edi/rdi
   4395 // preload  "movl prev_row, %0           \n\t" // esi/rsi
   4396             "pxor %%mm0, %%mm0           \n\t" // mm0 = 0 (zero source for unpacks)
   4397             // prime the pump:  load the first Raw(x-bpp) data set
   4398             "movq -8(%1," PCX ",), %%mm1 \n\t" // only time should need to read
   4399                                                //  a=Raw(x-bpp) bytes
   4400          "paeth_4lp:                     \n\t"
   4401             // do first set of 4 bytes
   4402             "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
   4403             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
   4404             "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
   4405             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
   4406             // pav = p - a = (a + b - c) - a = b - c
   4407             "movq %%mm2, %%mm4           \n\t"
   4408             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
   4409             // pbv = p - b = (a + b - c) - b = a - c
   4410             "movq %%mm1, %%mm5           \n\t"
   4411             "psubw %%mm3, %%mm4          \n\t" // mm4 = pav = b - c
   4412             "pxor %%mm7, %%mm7           \n\t"
   4413             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   4414             "movq %%mm4, %%mm6           \n\t"
   4415             "psubw %%mm3, %%mm5          \n\t" // mm5 = pbv = a - c
   4416             // pa = abs(p-a) = abs(pav)
   4417             // pb = abs(p-b) = abs(pbv)
   4418             // pc = abs(p-c) = abs(pcv)
   4419             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav words < 0
   4420             "paddw %%mm5, %%mm6          \n\t" // mm6 = pcv = pav + pbv
   4421             "pand %%mm4, %%mm0           \n\t" // only pav words < 0 in mm0
   4422             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv words < 0
   4423             "psubw %%mm0, %%mm4          \n\t" // subtracting the negative part
   4424             "pand %%mm5, %%mm7           \n\t" // only pbv words < 0 in mm7
   4425             "psubw %%mm0, %%mm4          \n\t" //  twice gives mm4 = pa = abs(pav)
   4426             "psubw %%mm7, %%mm5          \n\t"
   4427             "pxor %%mm0, %%mm0           \n\t"
   4428             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv words < 0
   4429             "pand %%mm6, %%mm0           \n\t" // only pcv words < 0 in mm0
   4430             "psubw %%mm7, %%mm5          \n\t" // mm5 = pb = abs(pbv)
   4431             "psubw %%mm0, %%mm6          \n\t"
   4432             //  test pa <= pb
   4433             "movq %%mm4, %%mm7           \n\t"
   4434             "psubw %%mm0, %%mm6          \n\t" // mm6 = pc = abs(pcv)
   4435             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
   4436             "movq %%mm7, %%mm0           \n\t"
   4437             // use mm7 mask to merge pa & pb
   4438             "pand %%mm7, %%mm5           \n\t"
   4439             // use mm0 mask copy to merge a & b
   4440             "pand %%mm0, %%mm2           \n\t"
   4441             "pandn %%mm4, %%mm7          \n\t"
   4442             "pandn %%mm1, %%mm0          \n\t"
   4443             "paddw %%mm5, %%mm7          \n\t" // mm7 = (pa <= pb) ? pa : pb
   4444             "paddw %%mm2, %%mm0          \n\t" // mm0 = (pa <= pb) ? a : b
   4445             //  test  ((pa <= pb)? pa:pb) <= pc
   4446             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
   4447             "pxor %%mm1, %%mm1           \n\t"
   4448             "pand %%mm7, %%mm3           \n\t" // keep c where pab > pc
   4449             "pandn %%mm0, %%mm7          \n\t" // keep a-or-b where pab <= pc
   4450             "paddw %%mm3, %%mm7          \n\t" // mm7 = Paeth predictor (words)
   4451             "pxor %%mm0, %%mm0           \n\t"
   4452             "packuswb %%mm1, %%mm7       \n\t" // predictor bytes low, zeros high
   4453             "movq (%0," PCX ",), %%mm3   \n\t" // load c=Prior(x-bpp) for 2nd set
   4454             LOAD_GOT_rbp
   4455             "pand " AMASK4_4_0 ", %%mm7  \n\t" // _amask4_4_0 (was _ActiveMask)
   4456             RESTORE_rbp
   4457             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
   4458             "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
   4459             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
   4460             "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
   4461             "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as
   4462                                                // Raw(x-bpp)
   4463             // do second set of 4 bytes
   4464             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
   4465             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
   4466             // pav = p - a = (a + b - c) - a = b - c
   4467             "movq %%mm2, %%mm4           \n\t"
   4468             // pbv = p - b = (a + b - c) - b = a - c
   4469             "movq %%mm1, %%mm5           \n\t"
   4470             "psubw %%mm3, %%mm4          \n\t" // mm4 = pav = b - c
   4471             "pxor %%mm7, %%mm7           \n\t"
   4472             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   4473             "movq %%mm4, %%mm6           \n\t"
   4474             "psubw %%mm3, %%mm5          \n\t" // mm5 = pbv = a - c
   4475             // pa = abs(p-a) = abs(pav)
   4476             // pb = abs(p-b) = abs(pbv)
   4477             // pc = abs(p-c) = abs(pcv)
   4478             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav words < 0
   4479             "paddw %%mm5, %%mm6          \n\t" // mm6 = pcv = pav + pbv
   4480             "pand %%mm4, %%mm0           \n\t" // only pav words < 0 in mm0
   4481             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv words < 0
   4482             "psubw %%mm0, %%mm4          \n\t" // subtracting the negative part
   4483             "pand %%mm5, %%mm7           \n\t" // only pbv words < 0 in mm7
   4484             "psubw %%mm0, %%mm4          \n\t" //  twice gives mm4 = pa = abs(pav)
   4485             "psubw %%mm7, %%mm5          \n\t"
   4486             "pxor %%mm0, %%mm0           \n\t"
   4487             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv words < 0
   4488             "pand %%mm6, %%mm0           \n\t" // only pcv words < 0 in mm0
   4489             "psubw %%mm7, %%mm5          \n\t" // mm5 = pb = abs(pbv)
   4490             "psubw %%mm0, %%mm6          \n\t"
   4491             //  test pa <= pb
   4492             "movq %%mm4, %%mm7           \n\t"
   4493             "psubw %%mm0, %%mm6          \n\t" // mm6 = pc = abs(pcv)
   4494             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
   4495             "movq %%mm7, %%mm0           \n\t"
   4496             // use mm7 mask to merge pa & pb
   4497             "pand %%mm7, %%mm5           \n\t"
   4498             // use mm0 mask copy to merge a & b
   4499             "pand %%mm0, %%mm2           \n\t"
   4500             "pandn %%mm4, %%mm7          \n\t"
   4501             "pandn %%mm1, %%mm0          \n\t"
   4502             "paddw %%mm5, %%mm7          \n\t" // mm7 = (pa <= pb) ? pa : pb
   4503             "paddw %%mm2, %%mm0          \n\t" // mm0 = (pa <= pb) ? a : b
   4504             //  test  ((pa <= pb)? pa:pb) <= pc
   4505             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
   4506             "pxor %%mm1, %%mm1           \n\t"
   4507             "pand %%mm7, %%mm3           \n\t" // keep c where pab > pc
   4508             "pandn %%mm0, %%mm7          \n\t" // keep a-or-b where pab <= pc
   4509             "pxor %%mm1, %%mm1           \n\t" // (redundant: mm1 is already 0)
   4510             "paddw %%mm3, %%mm7          \n\t" // mm7 = Paeth predictor (words)
   4511             "pxor %%mm0, %%mm0           \n\t"
   4512             // step ecx to next set of 8 bytes and repeat loop til done
   4513             "addl $8, %%ecx              \n\t"
   4514             "packuswb %%mm7, %%mm1       \n\t" // zeros low, predictor bytes high
   4515             "paddb -8(%1," PCX ",), %%mm1 \n\t" // add predictor with Raw(x)
   4516             "cmpl %%eax, %%ecx           \n\t" // MMXLength
   4517             "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
   4518                                  // mm1 will be used as Raw(x-bpp) next loop
   4519             "jb paeth_4lp                \n\t"
   4520 
   4521             : "=S" (dummy_value_S),            // output regs (dummy)
   4522               "=D" (dummy_value_D),
   4523               "=c" (dummy_value_c),
   4524               "=a" (dummy_value_a)
   4525 
   4526             : "0" (prev_row),  // esi/rsi      // input regs
   4527               "1" (row),       // edi/rdi
   4528               "2" (diff),      // ecx
   4529               "3" (MMXLength)  // eax
   4530 
   4531 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   4532             : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
   4533             , "%mm4", "%mm5", "%mm6", "%mm7"
   4534 #endif
   4535          );
   4536       }
   4537       break;  // end 4 bpp
   4538 
   4539       case 1:  // bpp 1 or 2: scalar (non-MMX) Paeth decode, one byte per pass,
   4540       case 2:  //  over offsets [diff, FullLength); then return from the function
   4541       {
   4542          __asm__ __volatile__ (
   4543 // preload  "movl diff, %%eax            \n\t" // eax: x = offset to align. bdry
   4544 // preload  "movl FullLength, %%edx      \n\t"
   4545             "cmpl %%edx, %%eax           \n\t"
   4546             "jnb paeth_dend              \n\t" // nothing to do if eax >= edx
   4547 
   4548             SAVE_ebp
   4549 
   4550 // preload  "movl row, %2                \n\t" // edi/rdi
   4551             // do Paeth decode for remaining bytes
   4552 // preload  "movl prev_row, %1           \n\t" // esi/rsi
   4553             "movl %%eax, %%ebp           \n\t"
   4554 // preload  "subl bpp, %%ebp             \n\t" // (bpp is preloaded into ecx)
   4555             "subl %%ecx, %%ebp           \n\t" // ebp = eax - bpp
   4556             "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
   4557 
   4558             SAVE_GOT_ebx
   4559             SAVE_r11_r12_r13
   4560 
   4561          "paeth_dlp:                     \n\t"
   4562             "xorl %%ebx, %%ebx           \n\t"
   4563             // pav = p - a = (a + b - c) - a = b - c
   4564             "movb (%1," PAX ",), %%bl    \n\t" // load Prior(x) into bl
   4565             "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
   4566             "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
   4567             "movl %%ebx, " pa_TEMP "     \n\t" // Save pav for later use
   4568             "xorl %%ebx, %%ebx           \n\t"
   4569             // pbv = p - b = (a + b - c) - b = a - c
   4570             "movb (%2," PBP ",), %%bl    \n\t" // load Raw(x-bpp) into bl
   4571             "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
   4572             "movl %%ebx, %%ecx           \n\t" // ecx = pbv
   4573             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   4574             "addl " pa_TEMP ", %%ebx     \n\t" // pcv = pav + pbv
   4575             // pc = abs(pcv)
   4576             "testl $0x80000000, %%ebx    \n\t" // sign bit set?
   4577             "jz paeth_dpca               \n\t"
   4578             "negl %%ebx                  \n\t" // reverse sign of neg values
   4579 
   4580          "paeth_dpca:                    \n\t"
   4581             "movl %%ebx, " pc_TEMP "     \n\t" // save pc for later use
   4582             // pb = abs(pbv)
   4583             "testl $0x80000000, %%ecx    \n\t" // sign bit set?
   4584             "jz paeth_dpba               \n\t"
   4585             "negl %%ecx                  \n\t" // reverse sign of neg values
   4586 
   4587          "paeth_dpba:                    \n\t"
   4588             "movl %%ecx, " pb_TEMP "     \n\t" // save pb for later use
   4589             // pa = abs(pav)
   4590             "movl " pa_TEMP ", %%ebx     \n\t"
   4591             "testl $0x80000000, %%ebx    \n\t" // sign bit set?
   4592             "jz paeth_dpaa               \n\t"
   4593             "negl %%ebx                  \n\t" // reverse sign of neg values
   4594 
   4595          "paeth_dpaa:                    \n\t"
   4596             "movl %%ebx, " pa_TEMP "     \n\t" // save pa for later use
   4597             // test if pa <= pb
   4598             "cmpl %%ecx, %%ebx           \n\t" // ebx = pa, ecx = pb
   4599             "jna paeth_dabb              \n\t"
   4600             // pa > pb; now test if pb <= pc
   4601             "cmpl " pc_TEMP ", %%ecx     \n\t"
   4602             "jna paeth_dbbc              \n\t"
   4603             // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
   4604             "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
   4605             "jmp paeth_dpaeth            \n\t"
   4606 
   4607          "paeth_dbbc:                    \n\t"
   4608             // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
   4609             "movb (%1," PAX ",), %%cl    \n\t" // load Prior(x) into cl
   4610             "jmp paeth_dpaeth            \n\t"
   4611 
   4612          "paeth_dabb:                    \n\t"
   4613             // pa <= pb; now test if pa <= pc
   4614             "cmpl " pc_TEMP ", %%ebx     \n\t"
   4615             "jna paeth_dabc              \n\t"
   4616             // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
   4617             "movb (%1," PBP ",), %%cl   \n\t" // load Prior(x-bpp) into cl
   4618             "jmp paeth_dpaeth            \n\t"
   4619 
   4620          "paeth_dabc:                    \n\t"
   4621             // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
   4622             "movb (%2," PBP ",), %%cl    \n\t" // load Raw(x-bpp) into cl
   4623 
   4624          "paeth_dpaeth:                  \n\t"
   4625             // cl = chosen predictor byte; the upper bits of ecx are still zero
   4626             // here because ecx last held pb = abs(a - c), which fits in a byte
   4627             // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
   4628             "incl %%eax                  \n\t" // advance x
   4629             "incl %%ebp                  \n\t" // advance x-bpp
   4630             "addb %%cl, -1(%2," PAX ",)  \n\t" // -1: eax was already advanced
   4631             "cmpl %%edx, %%eax           \n\t" // check against FullLength
   4632             "jb paeth_dlp                \n\t"
   4633 
   4634             RESTORE_r11_r12_r13
   4635             RESTORE_GOT_ebx
   4636             RESTORE_ebp
   4637 
   4638          "paeth_dend:                    \n\t"
   4639 
   4640             : "=c" (dummy_value_c),            // output regs (dummy)
   4641               "=S" (dummy_value_S),
   4642               "=D" (dummy_value_D),
   4643               "=a" (dummy_value_a),
   4644               "=d" (dummy_value_d)
   4645 
   4646             : "0" (bpp),         // ecx        // input regs
   4647               "1" (prev_row),    // esi/rsi
   4648               "2" (row),         // edi/rdi
   4649               "3" (diff),        // eax
   4650               "4" (FullLength)   // edx
   4651 
   4652             CLOB_COLON_ebx_ebp_r1X             // clobber list
   4653               CLOBBER_GOT_ebx
   4654               CLOB_COMMA_ebx_ebp
   4655               CLOBBER_ebp
   4656               CLOB_COMMA_ebX_r1X
   4657               CLOBBER_r11_r12_r13
   4658          );
   4659       }
   4660       return; // end 1 or 2 bpp (no need to go further with this one)
   4659 
   4660       case 6:  // bpp == 6: Paeth-decode 8 bytes per pass as two 4-byte groups
   4661       {
   4662 //       _ActiveMask2 = 0xffffffff00000000LL;  // NOT USED ("_amask_0_4_4")
   4663 //       _ShiftBpp = 48;       // bpp << 3 == bpp * 8
   4664 //       _ShiftRem = 16;       // 64 - _ShiftBpp
   4665 
   4666          __asm__ __volatile__ (
   4667 // preload  "movl diff, %%ecx            \n\t"
   4668 // preload  "movl row, %1                \n\t" // edi/rdi
   4669 // preload  "movl prev_row, %0           \n\t" // esi/rsi
   4670             // prime the pump:  load the first Raw(x-bpp) data set
   4671             "movq -8(%1," PCX ",), %%mm1 \n\t"
   4672             "pxor %%mm0, %%mm0           \n\t" // mm0 = 0 (zero source for unpacks)
   4673 
   4674          "paeth_6lp:                     \n\t"
   4675             // must shift to position Raw(x-bpp) data
   4676             "psrlq $16, %%mm1            \n\t" // was _ShiftRem
   4677             // do first set of 4 bytes
   4678             "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
   4679             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
   4680             "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
   4681             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
   4682             // must shift to position Prior(x-bpp) data
   4683             "psrlq $16, %%mm3            \n\t" // was _ShiftRem
   4684             // pav = p - a = (a + b - c) - a = b - c
   4685             "movq %%mm2, %%mm4           \n\t"
   4686             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
   4687             // pbv = p - b = (a + b - c) - b = a - c
   4688             "movq %%mm1, %%mm5           \n\t"
   4689             "psubw %%mm3, %%mm4          \n\t" // mm4 = pav = b - c
   4690             "pxor %%mm7, %%mm7           \n\t"
   4691             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   4692             "movq %%mm4, %%mm6           \n\t"
   4693             "psubw %%mm3, %%mm5          \n\t" // mm5 = pbv = a - c
   4694             // pa = abs(p-a) = abs(pav)
   4695             // pb = abs(p-b) = abs(pbv)
   4696             // pc = abs(p-c) = abs(pcv)
   4697             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav words < 0
   4698             "paddw %%mm5, %%mm6          \n\t" // mm6 = pcv = pav + pbv
   4699             "pand %%mm4, %%mm0           \n\t" // only pav words < 0 in mm0
   4700             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv words < 0
   4701             "psubw %%mm0, %%mm4          \n\t" // subtracting the negative part
   4702             "pand %%mm5, %%mm7           \n\t" // only pbv words < 0 in mm7
   4703             "psubw %%mm0, %%mm4          \n\t" //  twice gives mm4 = pa = abs(pav)
   4704             "psubw %%mm7, %%mm5          \n\t"
   4705             "pxor %%mm0, %%mm0           \n\t"
   4706             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv words < 0
   4707             "pand %%mm6, %%mm0           \n\t" // only pcv words < 0 in mm0
   4708             "psubw %%mm7, %%mm5          \n\t" // mm5 = pb = abs(pbv)
   4709             "psubw %%mm0, %%mm6          \n\t"
   4710             //  test pa <= pb
   4711             "movq %%mm4, %%mm7           \n\t"
   4712             "psubw %%mm0, %%mm6          \n\t" // mm6 = pc = abs(pcv)
   4713             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
   4714             "movq %%mm7, %%mm0           \n\t"
   4715             // use mm7 mask to merge pa & pb
   4716             "pand %%mm7, %%mm5           \n\t"
   4717             // use mm0 mask copy to merge a & b
   4718             "pand %%mm0, %%mm2           \n\t"
   4719             "pandn %%mm4, %%mm7          \n\t"
   4720             "pandn %%mm1, %%mm0          \n\t"
   4721             "paddw %%mm5, %%mm7          \n\t" // mm7 = (pa <= pb) ? pa : pb
   4722             "paddw %%mm2, %%mm0          \n\t" // mm0 = (pa <= pb) ? a : b
   4723             //  test  ((pa <= pb)? pa:pb) <= pc
   4724             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
   4725             "pxor %%mm1, %%mm1           \n\t"
   4726             "pand %%mm7, %%mm3           \n\t" // keep c where pab > pc
   4727             "pandn %%mm0, %%mm7          \n\t" // keep a-or-b where pab <= pc
   4728             "paddw %%mm3, %%mm7          \n\t" // mm7 = Paeth predictor (words)
   4729             "pxor %%mm0, %%mm0           \n\t"
   4730             "packuswb %%mm1, %%mm7       \n\t" // predictor bytes low, zeros high
   4731             "movq -8(%0," PCX ",), %%mm3 \n\t" // load c=Prior(x-bpp)
   4732             LOAD_GOT_rbp
   4733             "pand " AMASK4_4_0 ", %%mm7  \n\t" // _amask4_4_0 (was _ActiveMask)
   4734             RESTORE_rbp
   4735             "psrlq $16, %%mm3            \n\t" // drop the 2 oldest Prior bytes
   4736             "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x) step 1
   4737             "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
   4738             "movq %%mm2, %%mm6           \n\t"
   4739             "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
   4740             "movq -8(%1," PCX ",), %%mm1 \n\t" // reload the 8 Raw bytes before x
   4741             "psllq $48, %%mm6            \n\t" // bpp * 8 = bits per pixel
   4742             "movq %%mm7, %%mm5           \n\t"
   4743             "psrlq $16, %%mm1            \n\t" // 64 - (bpp * 8) = remainder
   4744             "por %%mm6, %%mm3            \n\t" // append first 2 bytes of Prior(x)
   4745             "psllq $48, %%mm5            \n\t" // was _ShiftBpp
   4746             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
   4747             "por %%mm5, %%mm1            \n\t" // append first 2 decoded Raw(x) bytes
   4748             // do second set of 4 bytes
   4749             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
   4750             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
   4751             // pav = p - a = (a + b - c) - a = b - c
   4752             "movq %%mm2, %%mm4           \n\t"
   4753             // pbv = p - b = (a + b - c) - b = a - c
   4754             "movq %%mm1, %%mm5           \n\t"
   4755             "psubw %%mm3, %%mm4          \n\t" // mm4 = pav = b - c
   4756             "pxor %%mm7, %%mm7           \n\t"
   4757             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   4758             "movq %%mm4, %%mm6           \n\t"
   4759             "psubw %%mm3, %%mm5          \n\t" // mm5 = pbv = a - c
   4760             // pa = abs(p-a) = abs(pav)
   4761             // pb = abs(p-b) = abs(pbv)
   4762             // pc = abs(p-c) = abs(pcv)
   4763             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav words < 0
   4764             "paddw %%mm5, %%mm6          \n\t" // mm6 = pcv = pav + pbv
   4765             "pand %%mm4, %%mm0           \n\t" // only pav words < 0 in mm0
   4766             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv words < 0
   4767             "psubw %%mm0, %%mm4          \n\t" // subtracting the negative part
   4768             "pand %%mm5, %%mm7           \n\t" // only pbv words < 0 in mm7
   4769             "psubw %%mm0, %%mm4          \n\t" //  twice gives mm4 = pa = abs(pav)
   4770             "psubw %%mm7, %%mm5          \n\t"
   4771             "pxor %%mm0, %%mm0           \n\t"
   4772             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv words < 0
   4773             "pand %%mm6, %%mm0           \n\t" // only pcv words < 0 in mm0
   4774             "psubw %%mm7, %%mm5          \n\t" // mm5 = pb = abs(pbv)
   4775             "psubw %%mm0, %%mm6          \n\t"
   4776             //  test pa <= pb
   4777             "movq %%mm4, %%mm7           \n\t"
   4778             "psubw %%mm0, %%mm6          \n\t" // mm6 = pc = abs(pcv)
   4779             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
   4780             "movq %%mm7, %%mm0           \n\t"
   4781             // use mm7 mask to merge pa & pb
   4782             "pand %%mm7, %%mm5           \n\t"
   4783             // use mm0 mask copy to merge a & b
   4784             "pand %%mm0, %%mm2           \n\t"
   4785             "pandn %%mm4, %%mm7          \n\t"
   4786             "pandn %%mm1, %%mm0          \n\t"
   4787             "paddw %%mm5, %%mm7          \n\t" // mm7 = (pa <= pb) ? pa : pb
   4788             "paddw %%mm2, %%mm0          \n\t" // mm0 = (pa <= pb) ? a : b
   4789             //  test  ((pa <= pb)? pa:pb) <= pc
   4790             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
   4791             "pxor %%mm1, %%mm1           \n\t"
   4792             "pand %%mm7, %%mm3           \n\t" // keep c where pab > pc
   4793             "pandn %%mm0, %%mm7          \n\t" // keep a-or-b where pab <= pc
   4794             "pxor %%mm1, %%mm1           \n\t" // (redundant: mm1 is already 0)
   4795             "paddw %%mm3, %%mm7          \n\t" // mm7 = Paeth predictor (words)
   4796             "pxor %%mm0, %%mm0           \n\t"
   4797             // step ecx to next set of 8 bytes and repeat loop til done
   4798             "addl $8, %%ecx              \n\t"
   4799             "packuswb %%mm7, %%mm1       \n\t" // zeros low, predictor bytes high
   4800             "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
   4801             "cmpl %%eax, %%ecx           \n\t" // MMXLength
   4802             "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
   4803                                  // mm1 will be used as Raw(x-bpp) next loop
   4804             "jb paeth_6lp                \n\t"
   4805 
   4806             : "=S" (dummy_value_S),            // output regs (dummy)
   4807               "=D" (dummy_value_D),
   4808               "=c" (dummy_value_c),
   4809               "=a" (dummy_value_a)
   4810 
   4811             : "0" (prev_row),  // esi/rsi      // input regs
   4812               "1" (row),       // edi/rdi
   4813               "2" (diff),      // ecx
   4814               "3" (MMXLength)  // eax
   4815 
   4816 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   4817             : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
   4818             , "%mm4", "%mm5", "%mm6", "%mm7"
   4819 #endif
   4820          );
   4821       }
   4822       break;  // end 6 bpp
   4823 
   4824       case 8:                          // bpp == 8
   4825       {
   4826          __asm__ __volatile__ (
   4827 // preload  "movl diff, %%ecx            \n\t"
   4828 // preload  "movl row, %1                \n\t" // edi/rdi
   4829 // preload  "movl prev_row, %0           \n\t" // esi/rsi
   4830             "pxor %%mm0, %%mm0           \n\t"
   4831             // prime the pump:  load the first Raw(x-bpp) data set
   4832             "movq -8(%1," PCX ",), %%mm1 \n\t" // only time should need to read
   4833                                                //  a=Raw(x-bpp) bytes
   4834          "paeth_8lp:                     \n\t"
   4835             // do first set of 4 bytes
   4836             "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
   4837             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
   4838             "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
   4839             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
   4840             // pav = p - a = (a + b - c) - a = b - c
   4841             "movq %%mm2, %%mm4           \n\t"
   4842             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
   4843             // pbv = p - b = (a + b - c) - b = a - c
   4844             "movq %%mm1, %%mm5           \n\t"
   4845             "psubw %%mm3, %%mm4          \n\t"
   4846             "pxor %%mm7, %%mm7           \n\t"
   4847             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   4848             "movq %%mm4, %%mm6           \n\t"
   4849             "psubw %%mm3, %%mm5          \n\t"
   4850             // pa = abs(p-a) = abs(pav)
   4851             // pb = abs(p-b) = abs(pbv)
   4852             // pc = abs(p-c) = abs(pcv)
   4853             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
   4854             "paddw %%mm5, %%mm6          \n\t"
   4855             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
   4856             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
   4857             "psubw %%mm0, %%mm4          \n\t"
   4858             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
   4859             "psubw %%mm0, %%mm4          \n\t"
   4860             "psubw %%mm7, %%mm5          \n\t"
   4861             "pxor %%mm0, %%mm0           \n\t"
   4862             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
   4863             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
   4864             "psubw %%mm7, %%mm5          \n\t"
   4865             "psubw %%mm0, %%mm6          \n\t"
   4866             //  test pa <= pb
   4867             "movq %%mm4, %%mm7           \n\t"
   4868             "psubw %%mm0, %%mm6          \n\t"
   4869             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
   4870             "movq %%mm7, %%mm0           \n\t"
   4871             // use mm7 mask to merge pa & pb
   4872             "pand %%mm7, %%mm5           \n\t"
   4873             // use mm0 mask copy to merge a & b
   4874             "pand %%mm0, %%mm2           \n\t"
   4875             "pandn %%mm4, %%mm7          \n\t"
   4876             "pandn %%mm1, %%mm0          \n\t"
   4877             "paddw %%mm5, %%mm7          \n\t"
   4878             "paddw %%mm2, %%mm0          \n\t"
   4879             //  test  ((pa <= pb)? pa:pb) <= pc
   4880             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
   4881             "pxor %%mm1, %%mm1           \n\t"
   4882             "pand %%mm7, %%mm3           \n\t"
   4883             "pandn %%mm0, %%mm7          \n\t"
   4884             "paddw %%mm3, %%mm7          \n\t"
   4885             "pxor %%mm0, %%mm0           \n\t"
   4886             "packuswb %%mm1, %%mm7       \n\t"
   4887             "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
   4888             LOAD_GOT_rbp
   4889             "pand " AMASK4_4_0 ", %%mm7  \n\t" // _amask4_4_0 (was _ActiveMask)
   4890             RESTORE_rbp
   4891             "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
   4892             "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
   4893             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
   4894             "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
   4895             "movq -8(%1," PCX ",), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
   4896 
   4897             // do second set of 4 bytes
   4898             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
   4899             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
   4900             // pav = p - a = (a + b - c) - a = b - c
   4901             "movq %%mm2, %%mm4           \n\t"
   4902             // pbv = p - b = (a + b - c) - b = a - c
   4903             "movq %%mm1, %%mm5           \n\t"
   4904             "psubw %%mm3, %%mm4          \n\t"
   4905             "pxor %%mm7, %%mm7           \n\t"
   4906             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
   4907             "movq %%mm4, %%mm6           \n\t"
   4908             "psubw %%mm3, %%mm5          \n\t"
   4909             // pa = abs(p-a) = abs(pav)
   4910             // pb = abs(p-b) = abs(pbv)
   4911             // pc = abs(p-c) = abs(pcv)
   4912             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
   4913             "paddw %%mm5, %%mm6          \n\t"
   4914             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
   4915             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
   4916             "psubw %%mm0, %%mm4          \n\t"
   4917             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
   4918             "psubw %%mm0, %%mm4          \n\t"
   4919             "psubw %%mm7, %%mm5          \n\t"
   4920             "pxor %%mm0, %%mm0           \n\t"
   4921             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
   4922             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
   4923             "psubw %%mm7, %%mm5          \n\t"
   4924             "psubw %%mm0, %%mm6          \n\t"
   4925             //  test pa <= pb
   4926             "movq %%mm4, %%mm7           \n\t"
   4927             "psubw %%mm0, %%mm6          \n\t"
   4928             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
   4929             "movq %%mm7, %%mm0           \n\t"
   4930             // use mm7 mask to merge pa & pb
   4931             "pand %%mm7, %%mm5           \n\t"
   4932             // use mm0 mask copy to merge a & b
   4933             "pand %%mm0, %%mm2           \n\t"
   4934             "pandn %%mm4, %%mm7          \n\t"
   4935             "pandn %%mm1, %%mm0          \n\t"
   4936             "paddw %%mm5, %%mm7          \n\t"
   4937             "paddw %%mm2, %%mm0          \n\t"
   4938             //  test  ((pa <= pb)? pa:pb) <= pc
   4939             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
   4940             "pxor %%mm1, %%mm1           \n\t"
   4941             "pand %%mm7, %%mm3           \n\t"
   4942             "pandn %%mm0, %%mm7          \n\t"
   4943             "pxor %%mm1, %%mm1           \n\t"
   4944             "paddw %%mm3, %%mm7          \n\t"
   4945             "pxor %%mm0, %%mm0           \n\t"
   4946             // step ecx to next set of 8 bytes and repeat loop til done
   4947             "addl $8, %%ecx              \n\t"
   4948             "packuswb %%mm7, %%mm1       \n\t"
   4949             "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
   4950             "cmpl %%eax, %%ecx           \n\t" // MMXLength
   4951             "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
   4952                                  // mm1 will be used as Raw(x-bpp) next loop
   4953             "jb paeth_8lp                \n\t"
   4954 
   4955             : "=S" (dummy_value_S),            // output regs (dummy)
   4956               "=D" (dummy_value_D),
   4957               "=c" (dummy_value_c),
   4958               "=a" (dummy_value_a)
   4959 
   4960             : "0" (prev_row),  // esi/rsi      // input regs
   4961               "1" (row),       // edi/rdi
   4962               "2" (diff),      // ecx
   4963               "3" (MMXLength)  // eax
   4964 
   4965 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   4966             : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
   4967             , "%mm4", "%mm5", "%mm6", "%mm7"
   4968 #endif
   4969          );
   4970       }
   4971       break;  // end 8 bpp
   4972 
   4973       default:                // bpp != 1,2,3,4,6,8:  doesn't exist
   4974       {
   4975          // ERROR:  SHOULD NEVER BE REACHED
   4976 #if defined(PNG_DEBUG)
   4977          png_debug(1, "Internal libpng logic error (GCC "
   4978            "png_read_filter_row_mmx_paeth())\n");
   4979 #endif
   4980       }
   4981       break;
   4982 
   4983    } // end switch (bpp)
   4984 
   4985    __asm__ __volatile__ (
   4986       // MMX acceleration complete; now do clean-up
   4987       // check if any remaining bytes left to decode
   4988 //pre "movl FullLength, %%edx      \n\t"
   4989 //pre "movl MMXLength, %%eax       \n\t"
   4990       "cmpl %%edx, %%eax           \n\t"
   4991       "jnb paeth_end               \n\t"
   4992 
   4993       SAVE_ebp
   4994 
   4995 //pre "movl row, %2                \n\t" // edi/rdi
   4996 //pre "movl prev_row, %1           \n\t" // esi/rsi
   4997       // do Paeth decode for remaining bytes
   4998       "movl %%eax, %%ebp           \n\t"
   4999 //pre "subl bpp, %%ebp             \n\t" // (bpp is preloaded into ecx)
   5000       "subl %%ecx, %%ebp           \n\t" // ebp = eax - bpp
   5001       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
   5002 
   5003       SAVE_GOT_ebx
   5004       SAVE_r11_r12_r13
   5005 
   5006    "paeth_lp2:                     \n\t"
   5007       "xorl %%ebx, %%ebx           \n\t"
   5008       // pav = p - a = (a + b - c) - a = b - c
   5009       "movb (%1," PAX ",), %%bl    \n\t" // load Prior(x) into bl
   5010       "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
   5011       "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
   5012       "movl %%ebx, " pa_TEMP "     \n\t" // Save pav for later use
   5013       "xorl %%ebx, %%ebx           \n\t"
   5014       // pbv = p - b = (a + b - c) - b = a - c
   5015       "movb (%2," PBP ",), %%bl    \n\t" // load Raw(x-bpp) into bl
   5016       "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
   5017       "movl %%ebx, %%ecx           \n\t"
   5018       // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
   5019       "addl " pa_TEMP ", %%ebx     \n\t" // pcv = pav + pbv
   5020       // pc = abs(pcv)
   5021       "testl $0x80000000, %%ebx    \n\t"
   5022       "jz paeth_pca2               \n\t"
   5023       "negl %%ebx                  \n\t" // reverse sign of neg values
   5024 
   5025    "paeth_pca2:                    \n\t"
   5026       "movl %%ebx, " pc_TEMP "     \n\t" // save pc for later use
   5027       // pb = abs(pbv)
   5028       "testl $0x80000000, %%ecx    \n\t"
   5029       "jz paeth_pba2               \n\t"
   5030       "negl %%ecx                  \n\t" // reverse sign of neg values
   5031 
   5032    "paeth_pba2:                    \n\t"
   5033       "movl %%ecx, " pb_TEMP "     \n\t" // save pb for later use
   5034       // pa = abs(pav)
   5035       "movl " pa_TEMP ", %%ebx     \n\t"
   5036       "testl $0x80000000, %%ebx    \n\t"
   5037       "jz paeth_paa2               \n\t"
   5038       "negl %%ebx                  \n\t" // reverse sign of neg values
   5039 
   5040    "paeth_paa2:                    \n\t"
   5041       "movl %%ebx, " pa_TEMP "     \n\t" // save pa for later use
   5042       // test if pa <= pb
   5043       "cmpl %%ecx, %%ebx           \n\t"
   5044       "jna paeth_abb2              \n\t"
   5045       // pa > pb; now test if pb <= pc
   5046       "cmpl " pc_TEMP ", %%ecx     \n\t"
   5047       "jna paeth_bbc2              \n\t"
   5048       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
   5049       "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
   5050       "jmp paeth_paeth2            \n\t"
   5051 
   5052    "paeth_bbc2:                    \n\t"
   5053       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
   5054       "movb (%1," PAX ",), %%cl    \n\t" // load Prior(x) into cl
   5055       "jmp paeth_paeth2            \n\t"
   5056 
   5057    "paeth_abb2:                    \n\t"
   5058       // pa <= pb; now test if pa <= pc
   5059       "cmpl " pc_TEMP ", %%ebx     \n\t"
   5060       "jna paeth_abc2              \n\t"
   5061       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
   5062       "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
   5063       "jmp paeth_paeth2            \n\t"
   5064 
   5065    "paeth_abc2:                    \n\t"
   5066       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
   5067       "movb (%2," PBP ",), %%cl    \n\t" // load Raw(x-bpp) into cl
   5068 
   5069    "paeth_paeth2:                  \n\t"
   5070       "incl %%eax                  \n\t"
   5071       "incl %%ebp                  \n\t"
   5072       // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
   5073       "addb %%cl, -1(%2," PAX ",)  \n\t"
   5074       "cmpl %%edx, %%eax           \n\t" // check against FullLength
   5075       "jb paeth_lp2                \n\t"
   5076 
   5077       RESTORE_r11_r12_r13
   5078       RESTORE_GOT_ebx
   5079       RESTORE_ebp
   5080 
   5081    "paeth_end:                     \n\t"
   5082       "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
   5083 
   5084       : "=c" (dummy_value_c),            // output regs (dummy)
   5085         "=S" (dummy_value_S),
   5086         "=D" (dummy_value_D),
   5087         "=a" (dummy_value_a),
   5088         "=d" (dummy_value_d)
   5089 
   5090       : "0" (bpp),         // ecx        // input regs
   5091         "1" (prev_row),    // esi/rsi
   5092         "2" (row),         // edi/rdi
   5093         "3" (MMXLength),   // eax
   5094         "4" (FullLength)   // edx
   5095 
   5096       CLOB_COLON_ebx_ebp_r1X             // clobber list
   5097         CLOBBER_GOT_ebx
   5098         CLOB_COMMA_ebx_ebp
   5099         CLOBBER_ebp
   5100         CLOB_COMMA_ebX_r1X
   5101         CLOBBER_r11_r12_r13
   5102    );
   5103 
   5104 } /* end png_read_filter_row_mmx_paeth() */
   5105 
   5106 #endif // PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK
   5107 #endif /* PNG_MMX_READ_FILTER_PAETH_SUPPORTED */
   5108 
   5109 
   5110 
   5111 
   5112 #if defined(PNG_MMX_READ_FILTER_SUB_SUPPORTED)
   5113 
   5114 //===========================================================================//
   5115 //                                                                           //
   5116 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B           //
   5117 //                                                                           //
   5118 //===========================================================================//
   5119 
   5120 // Optimized code for PNG Sub filter decoder
   5121 
   5122 static void /* PRIVATE */
   5123 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
   5124 {
   5125    unsigned FullLength, MMXLength;  // png_uint_32 is actually 64-bit on x86-64
   5126    int bpp;
   5127    int dummy_value_a;
   5128    int dummy_value_c;
   5129    int dummy_value_d;
   5130    png_bytep dummy_value_D;
   5131    int diff; //     __attribute__((used));
   5132 
   5133    bpp = (row_info->pixel_depth + 7) >> 3;  // calc number of bytes per pixel
   5134    FullLength = row_info->rowbytes - bpp;   // number of bytes to filter
   5135      // (why do we subtract off bpp?  not so in avg or paeth...)
   5136 
   5137    __asm__ __volatile__ (
   5138       SAVE_r15
   5139       SAVE_ebp
   5140 //pre "movl row, %1                \n\t" // edi/rdi
   5141       "mov  %1, " PSI "            \n\t" // lp = row
   5142 //pre "movl bpp, %%ecx             \n\t"
   5143       "add  " PCX ", %1            \n\t" // rp = row + bpp
   5144 //pre "movl FullLength, %%eax      \n\t" // bring in via eax...
   5145       SAVE_FullLength                    // ...but store for later use
   5146 
   5147       "xorl %%eax, %%eax           \n\t"
   5148 
   5149       // get # of bytes to alignment (note:  computing _delta_ of two pointers,
   5150       // so hereafter %%ebp is sufficient even on 64-bit)
   5151       "mov  %1, " PBP "            \n\t" // take start of row
   5152       "add  $0xf, " PBP "          \n\t" // add 7+8 to incr past alignment bdry
   5153 //    "andl $0xfffffff8, %%ebp     \n\t" // mask to alignment boundary (32-bit!)
   5154       CLEAR_BOTTOM_3_BITS  PBP    "\n\t" // mask to alignment boundary
   5155       "sub  %1, " PBP "            \n\t" // subtract row ptr again => ebp =
   5156       "jz sub_go                   \n\t" //  target value of eax at alignment
   5157 
   5158    "sub_lp1:                       \n\t" // fix alignment
   5159       "movb (" PSI "," PAX ",), %%cl \n\t"
   5160       "addb %%cl, (%1," PAX ",)    \n\t"
   5161       "incl %%eax                  \n\t"
   5162       "cmpl %%ebp, %%eax           \n\t"
   5163       "jb sub_lp1                  \n\t"
   5164 
   5165    "sub_go:                        \n\t"
   5166       RESTORE_FullLength "%%ecx    \n\t" // FullLength -> ecx
   5167       "movl %%ecx, %%edx           \n\t"
   5168       "subl %%eax, %%edx           \n\t" // subtract alignment fix
   5169       "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
   5170       "subl %%edx, %%ecx           \n\t" // drop over bytes from length
   5171 //out "movl %%ecx, MMXLength       \n\t"
   5172       "movl %%ebp, %%eax           \n\t" // ebp = diff, but no reg constraint(?)
   5173       RESTORE_ebp                        //  (could swap ebp and ecx functions,
   5174       RESTORE_r15                        //  but %%cl issues...)
   5175 
   5176       : "=c" (MMXLength),       // 0     // output regs
   5177         "=D" (dummy_value_D),   // 1
   5178         "=a" (diff)             // 2
   5179 
   5180       : "0" (bpp),              // ecx   // input regs
   5181         "1" (row),              // edi
   5182         "2" (FullLength)        // eax
   5183 
   5184       : "%esi", "%edx"                   // clobber list
   5185         _CLOBBER_r15
   5186         _CLOBBER_ebp
   5187    );
   5188 
   5189    // now do the math for the rest of the row
   5190    switch (bpp)
   5191    {
   5192       case 3:
   5193       {
   5194 //       _ShiftBpp = 24;       // == 3 * 8
   5195 //       _ShiftRem  = 40;      // == 64 - 24
   5196 
   5197          __asm__ __volatile__ (
   5198 // preload  "mov  row, %1                 \n\t" // edi/rdi
   5199             LOAD_GOT_rbp
   5200             // load (former) _ActiveMask for 2nd active byte group
   5201             "movq " AMASK2_3_3 ", %%mm7   \n\t" // _amask2_3_3
   5202             RESTORE_rbp
   5203 
   5204 // notused  "mov  %1, " PSI "             \n\t" // lp = row
   5205 // preload  "movl bpp, %%ecx              \n\t"
   5206             "add  " PCX ", %1             \n\t" // rp = row + bpp
   5207             "movq %%mm7, %%mm6            \n\t"
   5208 // preload  "movl diff, %%edx             \n\t"
   5209             "psllq $24, %%mm6             \n\t" // move mask in mm6 to cover
   5210                                                 //  3rd active byte group
   5211             // prime the pump:  load the first Raw(x-bpp) data set
   5212             "movq -8(%1," PDX ",), %%mm1  \n\t"
   5213 
   5214          "sub_3lp:                        \n\t" // shift data for adding first
   5215             "psrlq $40, %%mm1             \n\t" //  bpp bytes (no need for mask;
   5216                                                 //  shift clears inactive bytes)
   5217             // add 1st active group
   5218             "movq (%1," PDX ",), %%mm0    \n\t"
   5219             "paddb %%mm1, %%mm0           \n\t"
   5220 
   5221             // add 2nd active group
   5222             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
   5223             "psllq $24, %%mm1             \n\t" // shift data to pos. correctly
   5224             "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
   5225             "paddb %%mm1, %%mm0           \n\t"
   5226 
   5227             // add 3rd active group
   5228             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
   5229             "psllq $24, %%mm1             \n\t" // shift data to pos. correctly
   5230             "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
   5231             "addl $8, %%edx               \n\t"
   5232             "paddb %%mm1, %%mm0           \n\t"
   5233 
   5234             "cmpl %%eax, %%edx            \n\t" // MMXLength
   5235             "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
   5236             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
   5237             "jb sub_3lp                   \n\t"
   5238 
   5239             : "=c" (dummy_value_c),   // 0      // output regs (dummy)
   5240               "=D" (dummy_value_D),   // 1
   5241               "=d" (dummy_value_d),   // 2
   5242               "=a" (dummy_value_a)    // 3
   5243 
   5244             : "0" (bpp),              // ecx    // input regs
   5245               "1" (row),              // edi
   5246               "2" (diff),             // edx
   5247               "3" (MMXLength)         // eax
   5248 
   5249 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   5250             : "%mm0", "%mm1", "%mm6", "%mm7"    // clobber list
   5251 #endif
   5252          );
   5253       }
   5254       break;  // end 3 bpp
   5255 
   5256       case 4:   // formerly shared with 6 bpp case via _ShiftBpp and _ShiftRem,
   5257       {         // but 64-bit PIC/.so problems (could still share, moving vars
   5258                 // into unused MMX regs via ecx/edx, but kludgy)
   5259 //       _ShiftBpp = bpp << 3;        // 32 (psllq)
   5260 //       _ShiftRem = 64 - _ShiftBpp;  // 32 (psrlq)
   5261 
   5262          __asm__ __volatile__ (
   5263 // preload  "mov  row, %1                 \n\t" // edi/rdi
   5264 // preload  "movl diff, %%edx             \n\t"
   5265 // notused  "mov  %1, " PSI "             \n\t" // lp = row
   5266 // preload  "movl bpp, %%ecx              \n\t"
   5267             "add  " PCX ", %1             \n\t" // rp = row + bpp
   5268 
   5269             // prime the pump:  load the first Raw(x-bpp) data set
   5270             "movq -8(%1," PDX ",), %%mm1  \n\t"
   5271 
   5272          "sub_4lp:                        \n\t" // shift data for adding first
   5273             "psrlq $32, %%mm1             \n\t" //  bpp bytes (no need for mask;
   5274                                                 //  shift clears inactive bytes)
   5275             "movq (%1," PDX ",), %%mm0    \n\t"
   5276             "paddb %%mm1, %%mm0           \n\t"
   5277 
   5278             // add 2nd active group
   5279             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
   5280             "psllq $32, %%mm1             \n\t" // shift data to pos. correctly
   5281             "addl $8, %%edx               \n\t"
   5282             "paddb %%mm1, %%mm0           \n\t"
   5283 
   5284             "cmpl %%eax, %%edx            \n\t" // MMXLength
   5285             "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
   5286             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
   5287             "jb sub_4lp                   \n\t"
   5288 
   5289             : "=c" (dummy_value_c),   // 0      // output regs (dummy)
   5290               "=D" (dummy_value_D),   // 1
   5291               "=d" (dummy_value_d),   // 2
   5292               "=a" (dummy_value_a)    // 3
   5293 
   5294             : "0" (bpp),              // ecx    // input regs
   5295               "1" (row),              // edi
   5296               "2" (diff),             // edx
   5297               "3" (MMXLength)         // eax
   5298 
   5299 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   5300             : "%mm0", "%mm1"                    // clobber list
   5301 #endif
   5302          );
   5303       }
   5304       break;  // end 4 bpp
   5305 
   5306       case 1:
   5307       {
   5308          __asm__ __volatile__ (
   5309 // preload  "movl diff, %%edx              \n\t"
   5310 // preload  "mov  row, %1                  \n\t" // edi/rdi
   5311 // preload  "cmpl FullLength, %%edx        \n\t"
   5312             "cmpl %%eax, %%edx             \n\t"
   5313             "jnb sub_1end                  \n\t"
   5314             "mov  %1, " PSI "              \n\t" // lp = row
   5315 // irrel.   "xorl %%ecx, %%ecx             \n\t" // (actually bug with preload)
   5316 // preload  "movl bpp, %%ecx               \n\t"
   5317             "add  " PCX ", %1              \n\t" // rp = row + bpp
   5318 
   5319          "sub_1lp:                         \n\t"
   5320             "movb (" PSI "," PDX ",), %%cl \n\t"
   5321             "addb %%cl, (%1," PDX ",)      \n\t"
   5322             "incl %%edx                    \n\t"
   5323             "cmpl %%eax, %%edx             \n\t" // compare with FullLength
   5324             "jb sub_1lp                    \n\t"
   5325 
   5326          "sub_1end:                        \n\t"
   5327 
   5328             : "=c" (dummy_value_c),   // 0      // output regs (dummy)
   5329               "=D" (dummy_value_D),   // 1
   5330               "=d" (dummy_value_d),   // 2
   5331               "=a" (dummy_value_a)    // 3
   5332 
   5333             : "0" (bpp),              // ecx    // input regs
   5334               "1" (row),              // edi
   5335               "2" (diff),             // edx
   5336               "3" (FullLength)        // eax
   5337 
   5338             : "%esi"                            // clobber list
   5339          );
   5340       }
   5341       return;  // end 1 bpp (bypassing cleanup block!)
   5342 
   5343       case 2:
   5344       {
   5345 //       _ShiftBpp = 16;       // == 2 * 8
   5346 //       _ShiftRem = 48;       // == 64 - 16
   5347 
   5348          __asm__ __volatile__ (
   5349             LOAD_GOT_rbp
   5350             // load (former) _ActiveMask for 2nd active byte group
   5351             "movq " AMASK4_2_2 ", %%mm7   \n\t" // _amask4_2_2
   5352             RESTORE_rbp
   5353 // preload  "movl diff, %%edx             \n\t"
   5354             "movq %%mm7, %%mm6            \n\t"
   5355 // preload  "mov  row, %1                 \n\t" // edi/rdi
   5356             "psllq $16, %%mm6             \n\t" // move mask in mm6 to cover
   5357                                                 //  3rd active byte group
   5358 // notused  "mov  %1, " PSI "             \n\t" // lp = row
   5359             "movq %%mm6, %%mm5            \n\t"
   5360 // preload  "movl bpp, %%ecx              \n\t"
   5361             "add  " PCX ", %1             \n\t" // rp = row + bpp
   5362             "psllq $16, %%mm5             \n\t" // move mask in mm5 to cover
   5363                                                 //  4th active byte group
   5364             // prime the pump:  load the first Raw(x-bpp) data set
   5365             "movq -8(%1," PDX ",), %%mm1  \n\t"
   5366 
   5367          "sub_2lp:                        \n\t" // shift data for adding first
   5368             "psrlq $48, %%mm1             \n\t" //  bpp bytes (no need for mask;
   5369                                                 //  shift clears inactive bytes)
   5370             // add 1st active group
   5371             "movq (%1," PDX ",), %%mm0    \n\t"
   5372             "paddb %%mm1, %%mm0           \n\t"
   5373 
   5374             // add 2nd active group
   5375             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
   5376             "psllq $16, %%mm1             \n\t" // shift data to pos. correctly
   5377             "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
   5378             "paddb %%mm1, %%mm0           \n\t"
   5379 
   5380             // add 3rd active group
   5381             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
   5382             "psllq $16, %%mm1             \n\t" // shift data to pos. correctly
   5383             "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
   5384             "paddb %%mm1, %%mm0           \n\t"
   5385 
   5386             // add 4th active group
   5387             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
   5388             "psllq $16, %%mm1             \n\t" // shift data to pos. correctly
   5389             "pand %%mm5, %%mm1            \n\t" // mask to use 4th active group
   5390             "addl $8, %%edx               \n\t"
   5391             "paddb %%mm1, %%mm0           \n\t"
   5392             "cmpl %%eax, %%edx            \n\t" // MMXLength
   5393             "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
   5394             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
   5395             "jb sub_2lp                   \n\t"
   5396 
   5397             : "=c" (dummy_value_c),   // 0      // output regs (dummy)
   5398               "=D" (dummy_value_D),   // 1
   5399               "=d" (dummy_value_d),   // 2
   5400               "=a" (dummy_value_a)    // 3
   5401 
   5402             : "0" (bpp),              // ecx    // input regs
   5403               "1" (row),              // edi
   5404               "2" (diff),             // edx
   5405               "3" (MMXLength)         // eax
   5406 
   5407 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   5408             : "%mm0", "%mm1", "%mm5", "%mm6"    // clobber list
   5409             , "%mm7"
   5410 #endif
   5411          );
   5412       }
   5413       break;  // end 2 bpp
   5414 
   5415       case 6:   // formerly shared with 4 bpp case (see comments there)
   5416       {
   5417 //       _ShiftBpp = bpp << 3;        // 48 (psllq)
   5418 //       _ShiftRem = 64 - _ShiftBpp;  // 16 (psrlq)
   5419 
   5420          __asm__ __volatile__ (
   5421 // preload  "mov  row, %1                 \n\t" // edi/rdi
   5422 // preload  "movl diff, %%edx             \n\t"
   5423 // notused  "mov  %1, " PSI "             \n\t" // lp = row
   5424 // preload  "movl bpp, %%ecx              \n\t"
   5425             "add  " PCX ", %1             \n\t" // rp = row + bpp
   5426 
   5427             // prime the pump:  load the first Raw(x-bpp) data set
   5428             "movq -8(%1," PDX ",), %%mm1  \n\t"
   5429 
   5430          "sub_6lp:                        \n\t" // shift data for adding first
   5431             "psrlq $16, %%mm1             \n\t" //  bpp bytes (no need for mask;
   5432                                                 //  shift clears inactive bytes)
   5433             "movq (%1," PDX ",), %%mm0    \n\t"
   5434             "paddb %%mm1, %%mm0           \n\t"
   5435 
   5436             // add 2nd active group
   5437             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
   5438             "psllq $48, %%mm1             \n\t" // shift data to pos. correctly
   5439             "addl $8, %%edx               \n\t"
   5440             "paddb %%mm1, %%mm0           \n\t"
   5441 
   5442             "cmpl %%eax, %%edx            \n\t" // MMXLength
   5443             "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
   5444             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
   5445             "jb sub_6lp                   \n\t"
   5446 
   5447             : "=c" (dummy_value_c),   // 0      // output regs (dummy)
   5448               "=D" (dummy_value_D),   // 1
   5449               "=d" (dummy_value_d),   // 2
   5450               "=a" (dummy_value_a)    // 3
   5451 
   5452             : "0" (bpp),              // ecx    // input regs
   5453               "1" (row),              // edi
   5454               "2" (diff),             // edx
   5455               "3" (MMXLength)         // eax
   5456 
   5457 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   5458             : "%mm0", "%mm1"                    // clobber list
   5459 #endif
   5460          );
   5461       }
   5462       break;  // end 6 bpp
   5463 
   5464       case 8:
   5465       {
   5466          __asm__ __volatile__ (
   5467 // preload  "mov  row, %1                 \n\t" // edi/rdi
   5468 // preload  "movl diff, %%edx             \n\t"
   5469 // notused  "mov  %1, " PSI "             \n\t" // lp = row
   5470 // preload  "movl bpp, %%ecx              \n\t"
   5471             "add  " PCX ", %1             \n\t" // rp = row + bpp
   5472 // preload  "movl MMXLength, %%eax        \n\t"
   5473 
   5474             // prime the pump:  load the first Raw(x-bpp) data set
   5475             "movq -8(%1," PDX ",), %%mm7  \n\t"
   5476             "movl %%eax, %%esi            \n\t" // copy of MMXLength -> esi
   5477             "andl $0x0000003f, %%esi      \n\t" // calc bytes over mult of 64
   5478 
   5479          "sub_8lp:                        \n\t"
   5480             "movq (%1," PDX ",), %%mm0    \n\t" // load Sub(x) for 1st 8 bytes
   5481             "paddb %%mm7, %%mm0           \n\t"
   5482             "movq 8(%1," PDX ",), %%mm1   \n\t" // load Sub(x) for 2nd 8 bytes
   5483             "movq %%mm0, (%1," PDX ",)    \n\t" // write Raw(x) for 1st 8 bytes
   5484 
   5485             // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
   5486             // This will be repeated for each group of 8 bytes with the 8th
   5487             // group being used as the Raw(x-bpp) for the 1st group of the
   5488             // next loop.
   5489 
   5490             "paddb %%mm0, %%mm1           \n\t"
   5491             "movq 16(%1," PDX ",), %%mm2  \n\t" // load Sub(x) for 3rd 8 bytes
   5492             "movq %%mm1, 8(%1," PDX ",)   \n\t" // write Raw(x) for 2nd 8 bytes
   5493             "paddb %%mm1, %%mm2           \n\t"
   5494             "movq 24(%1," PDX ",), %%mm3  \n\t" // load Sub(x) for 4th 8 bytes
   5495             "movq %%mm2, 16(%1," PDX ",)  \n\t" // write Raw(x) for 3rd 8 bytes
   5496             "paddb %%mm2, %%mm3           \n\t"
   5497             "movq 32(%1," PDX ",), %%mm4  \n\t" // load Sub(x) for 5th 8 bytes
   5498             "movq %%mm3, 24(%1," PDX ",)  \n\t" // write Raw(x) for 4th 8 bytes
   5499             "paddb %%mm3, %%mm4           \n\t"
   5500             "movq 40(%1," PDX ",), %%mm5  \n\t" // load Sub(x) for 6th 8 bytes
   5501             "movq %%mm4, 32(%1," PDX ",)  \n\t" // write Raw(x) for 5th 8 bytes
   5502             "paddb %%mm4, %%mm5           \n\t"
   5503             "movq 48(%1," PDX ",), %%mm6  \n\t" // load Sub(x) for 7th 8 bytes
   5504             "movq %%mm5, 40(%1," PDX ",)  \n\t" // write Raw(x) for 6th 8 bytes
   5505             "paddb %%mm5, %%mm6           \n\t"
   5506             "movq 56(%1," PDX ",), %%mm7  \n\t" // load Sub(x) for 8th 8 bytes
   5507             "movq %%mm6, 48(%1," PDX ",)  \n\t" // write Raw(x) for 7th 8 bytes
   5508             "addl $64, %%edx              \n\t"
   5509             "paddb %%mm6, %%mm7           \n\t"
   5510             "cmpl %%esi, %%edx            \n\t" // cmp to bytes over mult of 64
   5511             "movq %%mm7, -8(%1," PDX ",)  \n\t" // write Raw(x) for 8th 8 bytes
   5512             "jb sub_8lp                   \n\t"
   5513 
   5514             "cmpl %%eax, %%edx            \n\t" // compare to MMXLength
   5515             "jnb sub_8lt8                 \n\t"
   5516 
   5517          "sub_8lpA:                       \n\t"
   5518             "movq (%1," PDX ",), %%mm0    \n\t"
   5519             "addl $8, %%edx               \n\t"
   5520             "paddb %%mm7, %%mm0           \n\t"
   5521             "cmpl %%eax, %%edx            \n\t" // compare to MMXLength
   5522             "movq %%mm0, -8(%1," PDX ",)  \n\t" // -8 to offset early addl edx
   5523             "movq %%mm0, %%mm7            \n\t" // move calculated Raw(x) data
   5524             "jb sub_8lpA                  \n\t" //  to mm7 to be new Raw(x-bpp)
   5525                                                 //  for next loop
   5526          "sub_8lt8:                       \n\t"
   5527 
   5528             : "=c" (dummy_value_c),   // 0      // output regs (dummy)
   5529               "=D" (dummy_value_D),   // 1
   5530               "=d" (dummy_value_d),   // 2
   5531               "=a" (dummy_value_a)    // 3
   5532 
   5533             : "0" (bpp),              // ecx    // input regs
   5534               "1" (row),              // edi
   5535               "2" (diff),             // edx
   5536               "3" (MMXLength)         // eax
   5537 
   5538             : "%esi"                            // clobber list
   5539 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
   5540             , "%mm0", "%mm1", "%mm2", "%mm3"
   5541             , "%mm4", "%mm5", "%mm6", "%mm7"
   5542 #endif
   5543          );
   5544       }
   5545       break;  // end 8 bpp
   5546 
   5547       default:                // bpp != 1,2,3,4,6,8:  doesn't exist
   5548       {
   5549          // ERROR:  SHOULD NEVER BE REACHED
   5550 #if defined(PNG_DEBUG)
   5551          png_debug(1, "Internal libpng logic error (GCC "
   5552            "png_read_filter_row_mmx_sub())\n");
   5553 #endif
   5554       }
   5555       break;
   5556 
   5557    } // end switch (bpp)
   5558 
   5559    __asm__ __volatile__ (
   5560 //pre "movl MMXLength, %%eax         \n\t"
   5561 //pre "mov  row, %1                  \n\t" // edi/rdi
   5562 //pre "cmpl FullLength, %%eax        \n\t"
   5563       "cmpl %%edx, %%eax             \n\t"
   5564       "jnb sub_end                   \n\t"
   5565 
   5566       "mov  %1, " PSI "              \n\t" // lp = row
   5567 //pre "movl bpp, %%ecx               \n\t"
   5568       "add  " PCX ", %1              \n\t" // rp = row + bpp
   5569       "xorl %%ecx, %%ecx             \n\t"
   5570 
   5571    "sub_lp2:                         \n\t"
   5572       "movb (" PSI "," PAX ",), %%cl \n\t"
   5573       "addb %%cl, (%1," PAX ",)      \n\t"
   5574       "incl %%eax                    \n\t"
   5575       "cmpl %%edx, %%eax             \n\t" // FullLength
   5576       "jb sub_lp2                    \n\t"
   5577 
   5578    "sub_end:                         \n\t"
   5579       "EMMS                          \n\t" // end MMX instructions
   5580 
   5581       : "=c" (dummy_value_c),   // 0      // output regs (dummy)
   5582         "=D" (dummy_value_D),   // 1
   5583         "=a" (dummy_value_a),   // 2
   5584         "=d" (dummy_value_d)    // 3
   5585 
   5586       : "0" (bpp),              // ecx    // input regs
   5587         "1" (row),              // edi
   5588         "2" (MMXLength),        // eax
   5589         "3" (FullLength)        // edx
   5590 
   5591       : "%esi"                            // clobber list
   5592    );
   5593 
   5594 } // end of png_read_filter_row_mmx_sub()
   5595 
   5596 #endif /* PNG_MMX_READ_FILTER_SUB_SUPPORTED */
   5597 
   5598 
   5599 
   5600 
   5601 #if defined(PNG_MMX_READ_FILTER_UP_SUPPORTED)
   5602 
   5603 //===========================================================================//
   5604 //                                                                           //
   5605 //            P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P            //
   5606 //                                                                           //
   5607 //===========================================================================//
   5608 
   5609 // Optimized code for PNG Up filter decoder
        //
        // Reverses the PNG "Up" filter in place:  every byte of row has the byte at
        // the same offset in prev_row added to it (byte-wise add; the MMX paths use
        // paddb, the scalar paths use addb on %al).
        //
        //   row_info  - only row_info->rowbytes is read (stored in an unsigned)
        //   row       - current scanline; read and overwritten
        //   prev_row  - previous scanline; only read
        //
        // The work is done in four phases, all indexed by the byte offset in ebx:
        //   up_lp1   - single bytes until row + ebx is 8-byte aligned
        //   up_loop  - 64 bytes per pass using mm0-mm7
        //   up_lpA   - 8 bytes per pass
        //   up_lp2   - single bytes for whatever is left
        //
        // NOTE(review):  up_lp1 and up_loop both execute their body before the
        // first comparison against the end offset, so a row shorter than the
        // alignment fix plus 64 bytes would be read and written past its end.
        // The dispatcher in this file only calls this routine when rowbytes >=
        // png_ptr->mmx_rowbytes_threshold (non-1.0.x builds); the threshold value
        // is not visible here -- confirm it is large enough.
   5610 
   5611 static void /* PRIVATE */
   5612 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
   5613                            png_bytep prev_row)
   5614 {
   5615    unsigned len;        // png_uint_32 is actually 64-bit on x86-64
   5616    int dummy_value_d;   // fix 'forbidden register 3 (dx) was spilled' error
   5617    png_bytep dummy_value_S;
   5618    png_bytep dummy_value_D;
   5619 
   5620    len = row_info->rowbytes;              // number of bytes to filter
   5621 
           // Operand map (see the constraint lists at the bottom):
           //   %0 = edx = len,  %1 = esi/rsi = prev_row,  %2 = edi/rdi = row
           // eax and ecx are scratch (declared as clobbers); ebx is used as the
           // running byte offset and is bracketed by SAVE_GOT_ebx/RESTORE_GOT_ebx
           // (macros defined elsewhere; presumably they preserve ebx for PIC).
   5622    __asm__ __volatile__ (
   5623       SAVE_GOT_ebx
   5624 //pre "mov  prev_row, %1           \n\t" // esi/rsi
   5625 //pre "movl row, %2                \n\t" // edi/rdi
   5626 
   5627       "xorl %%ebx, %%ebx           \n\t" // ebx = byte offset into both rows
   5628       "xorl %%eax, %%eax           \n\t"
   5629 
   5630       // get # of bytes to alignment (note:  computing _delta_ of two pointers,
   5631       // so hereafter %%ecx is sufficient even on 64-bit)
   5632       "mov  %2, " PCX "            \n\t" // take start of row
   5633       "add  $0x7, " PCX "          \n\t" // add 7 to incr past alignment bdry
   5634 //    "andl $0xfffffff8, %%ecx     \n\t" // mask to alignment boundary (32-bit!)
   5635       CLEAR_BOTTOM_3_BITS  PCX    "\n\t" // mask to alignment boundary
   5636       "sub  %2, " PCX "            \n\t" // subtract row ptr again => ecx =
   5637       "jz up_go                    \n\t" //  target value of ebx at alignment
   5638 
   5639    "up_lp1:                        \n\t" // fix alignment
   5640       "movb (%2," PBX ",), %%al    \n\t"
   5641       "addb (%1," PBX ",), %%al    \n\t"
   5642       "incl %%ebx                  \n\t"
   5643       "cmpl %%ecx, %%ebx           \n\t"
   5644       "movb %%al, -1(%2," PBX ",)  \n\t" // mov does not affect flags; -1 to
   5645       "jb up_lp1                   \n\t" //  offset incl ebx
   5646 
           // From here:  ecx = end offset of the 64-byte groups (len minus the
           // leftover), edx = leftover byte count ((len - ebx) mod 64).
   5647    "up_go:                         \n\t"
   5648 //pre "movl len, %%edx             \n\t"
   5649       "movl %%edx, %%ecx           \n\t"
   5650       "subl %%ebx, %%edx           \n\t" // subtract alignment fix
   5651       "andl $0x0000003f, %%edx     \n\t" // calc bytes over mult of 64
   5652       "subl %%edx, %%ecx           \n\t" // sub over-bytes from original length
   5653 
   5654       // unrolled loop - use all MMX registers and interleave to reduce
   5655       // number of branch instructions (loops) and reduce partial stalls
              // (odd-numbered mm registers hold prev_row data, even-numbered ones
              // hold row data and receive the sum that is stored back to row)
   5656    "up_loop:                       \n\t"
   5657       "movq (%1," PBX ",), %%mm1   \n\t"
   5658       "movq (%2," PBX ",), %%mm0   \n\t"
   5659       "movq 8(%1," PBX ",), %%mm3  \n\t"
   5660       "paddb %%mm1, %%mm0          \n\t"
   5661       "movq 8(%2," PBX ",), %%mm2  \n\t"
   5662       "movq %%mm0, (%2," PBX ",)   \n\t"
   5663       "paddb %%mm3, %%mm2          \n\t"
   5664       "movq 16(%1," PBX ",), %%mm5 \n\t"
   5665       "movq %%mm2, 8(%2," PBX ",)  \n\t"
   5666       "movq 16(%2," PBX ",), %%mm4 \n\t"
   5667       "movq 24(%1," PBX ",), %%mm7 \n\t"
   5668       "paddb %%mm5, %%mm4          \n\t"
   5669       "movq 24(%2," PBX ",), %%mm6 \n\t"
   5670       "movq %%mm4, 16(%2," PBX ",) \n\t"
   5671       "paddb %%mm7, %%mm6          \n\t"
   5672       "movq 32(%1," PBX ",), %%mm1 \n\t"
   5673       "movq %%mm6, 24(%2," PBX ",) \n\t"
   5674       "movq 32(%2," PBX ",), %%mm0 \n\t"
   5675       "movq 40(%1," PBX ",), %%mm3 \n\t"
   5676       "paddb %%mm1, %%mm0          \n\t"
   5677       "movq 40(%2," PBX ",), %%mm2 \n\t"
   5678       "movq %%mm0, 32(%2," PBX ",) \n\t"
   5679       "paddb %%mm3, %%mm2          \n\t"
   5680       "movq 48(%1," PBX ",), %%mm5 \n\t"
   5681       "movq %%mm2, 40(%2," PBX ",) \n\t"
   5682       "movq 48(%2," PBX ",), %%mm4 \n\t"
   5683       "movq 56(%1," PBX ",), %%mm7 \n\t"
   5684       "paddb %%mm5, %%mm4          \n\t"
   5685       "movq 56(%2," PBX ",), %%mm6 \n\t"
   5686       "movq %%mm4, 48(%2," PBX ",) \n\t"
   5687       "addl $64, %%ebx             \n\t"
   5688       "paddb %%mm7, %%mm6          \n\t"
   5689       "cmpl %%ecx, %%ebx           \n\t"
   5690       "movq %%mm6, -8(%2," PBX ",) \n\t" // (+56)movq does not affect flags;
   5691       "jb up_loop                  \n\t" //  -8 to offset addl ebx
   5692 
   5693       "cmpl $0, %%edx              \n\t" // test for bytes over mult of 64
   5694       "jz up_end                   \n\t"
   5695 
   5696       "cmpl $8, %%edx              \n\t" // test for less than 8 bytes
   5697       "jb up_lt8                   \n\t" //  [added by lcreeve at netins.net]
   5698 
              // at least 8 bytes remain:  ecx becomes the end offset of the whole
              // 8-byte groups, edx the final 0-7 byte remainder
   5699       "addl %%edx, %%ecx           \n\t"
   5700       "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
   5701       "subl %%edx, %%ecx           \n\t" // drop over-bytes from length
   5702       "jz up_lt8                   \n\t"
   5703 
   5704    "up_lpA:                        \n\t" // use MMX regs to update 8 bytes sim.
   5705       "movq (%1," PBX ",), %%mm1   \n\t"
   5706       "movq (%2," PBX ",), %%mm0   \n\t"
   5707       "addl $8, %%ebx              \n\t"
   5708       "paddb %%mm1, %%mm0          \n\t"
   5709       "cmpl %%ecx, %%ebx           \n\t"
   5710       "movq %%mm0, -8(%2," PBX ",) \n\t" // movq does not affect flags; -8 to
   5711       "jb up_lpA                   \n\t" //  offset add ebx
   5712       "cmpl $0, %%edx              \n\t" // test for bytes over mult of 8
   5713       "jz up_end                   \n\t"
   5714 
   5715    "up_lt8:                        \n\t"
   5716       "xorl %%eax, %%eax           \n\t"
   5717       "addl %%edx, %%ecx           \n\t" // move over byte count into counter
   5718 
   5719    "up_lp2:                        \n\t" // use x86 regs for remaining bytes
   5720       "movb (%2," PBX ",), %%al    \n\t"
   5721       "addb (%1," PBX ",), %%al    \n\t"
   5722       "incl %%ebx                  \n\t"
   5723       "cmpl %%ecx, %%ebx           \n\t"
   5724       "movb %%al, -1(%2," PBX ",)  \n\t" // mov does not affect flags; -1 to
   5725       "jb up_lp2                   \n\t" //  offset inc ebx
   5726 
   5727    "up_end:                        \n\t"
   5728       "EMMS                        \n\t" // conversion of filtered row complete
   5729       RESTORE_GOT_ebx
   5730 
   5731       : "=d" (dummy_value_d),   // 0     // output regs (dummy)
   5732         "=S" (dummy_value_S),   // 1
   5733         "=D" (dummy_value_D)    // 2
   5734 
   5735       : "0" (len),              // edx   // input regs
   5736         "1" (prev_row),         // esi
   5737         "2" (row)               // edi
   5738 
   5739       : "%eax", "%ecx"                   // clobber list (no input regs!)
   5740         _CLOBBER_GOT_ebx
           // NOTE(review):  the Sub filter code earlier in this file guards its MMX
           // clobbers with CLOBBER_MMX_REGS_SUPPORTED (no PNG_ prefix); confirm
           // which of the two macro names is actually defined.
   5741 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
   5742       , "%mm0", "%mm1", "%mm2", "%mm3"
   5743       , "%mm4", "%mm5", "%mm6", "%mm7"
   5744 #endif
   5745    );
   5746 
   5747 } // end of png_read_filter_row_mmx_up()
   5748 
   5749 #endif /* PNG_MMX_READ_FILTER_UP_SUPPORTED */
   5750 
   5751 
   5752 
   5753 
   5754 /*===========================================================================*/
   5755 /*                                                                           */
   5756 /*                   P N G _ R E A D _ F I L T E R _ R O W                   */
   5757 /*                                                                           */
   5758 /*===========================================================================*/
   5759 
   5760 /* Optimized png_read_filter_row routines */
        /*
         * Reverses the row filter of one scanline in place.
         *
         *   png_ptr   - read for asm_flags, the two MMX thresholds and (in debug
         *               builds) row_number; also passed to png_warning()
         *   row_info  - pixel_depth and rowbytes are read
         *   row       - scanline bytes; overwritten with the unfiltered result
         *   prev_row  - previous scanline; read by the Up, Avg and Paeth cases
         *   filter    - filter type, compared against PNG_FILTER_VALUE_*
         *
         * For Sub, Up, Avg and Paeth the matching png_read_filter_row_mmx_*()
         * routine is called when it is compiled in and the run-time test passes
         * (asm flag set and both thresholds met, or just _mmx_supported in 1.0.x
         * builds); otherwise the plain C loop that follows the test is used.  The
         * Paeth MMX path additionally requires PNG_x86_64_USE_GOTPCREL or
         * PNG_THREAD_UNSAFE_OK.  An unrecognized filter value produces a warning
         * and sets row[0] to 0.
         *
         * Note the preprocessor layout:  each "else" sits inside the #ifdef, so
         * when the MMX routine is not compiled in, the braced C fallback becomes
         * the unconditional body of the case.
         */
   5761 
   5762 void /* PRIVATE */
   5763 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
   5764    row, png_bytep prev_row, int filter)
   5765 {
   5766 #if defined(PNG_DEBUG)
           /* "paeth-MMX" (9 characters plus terminator) is the longest name */
   5767    char filtname[10];
   5768 #endif
   5769 
           /* NOTE(review): 2 is presumably the "not yet detected" state of
            * _mmx_supported; its definition is not visible here -- confirm. */
   5770    if (_mmx_supported == 2) {
   5771 #if !defined(PNG_1_0_X)
   5772        /* this should have happened in png_init_mmx_flags() already */
   5773        png_warning(png_ptr, "asm_flags may not have been initialized");
   5774 #endif
   5775        png_mmx_support();
   5776    }
   5777 
   5778 #if defined(PNG_DEBUG)
           /* Debug only:  build "<filter>-MMX" or "<filter>-C" using the same
            * run-time tests as the real dispatch below.  The case labels here are
            * literal 0-4; presumably they equal PNG_FILTER_VALUE_NONE..PAETH. */
   5779    png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
   5780    switch (filter)
   5781    {
   5782       case 0:
   5783          png_snprintf(filtname, 10, "none");
   5784          break;
   5785 
   5786       case 1:
   5787          png_snprintf(filtname, 10, "sub-%s",
   5788 #ifdef PNG_MMX_READ_FILTER_SUB_SUPPORTED
   5789 #if !defined(PNG_1_0_X)
   5790            ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
   5791             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
   5792             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
   5793 #else
   5794            _mmx_supported
   5795 #endif
   5796            ? "MMX" :
   5797 #endif
   5798            "C");
   5799          break;
   5800 
   5801       case 2:
   5802          png_snprintf(filtname, 10, "up-%s",
   5803 #ifdef PNG_MMX_READ_FILTER_UP_SUPPORTED
   5804 #if !defined(PNG_1_0_X)
   5805            ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
   5806             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
   5807             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
   5808 #else
   5809            _mmx_supported
   5810 #endif
   5811            ? "MMX" :
   5812 #endif
   5813            "C");
   5814          break;
   5815 
   5816       case 3:
   5817          png_snprintf(filtname, 10, "avg-%s",
   5818 #ifdef PNG_MMX_READ_FILTER_AVG_SUPPORTED
   5819 #if !defined(PNG_1_0_X)
   5820            ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
   5821             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
   5822             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
   5823 #else
   5824            _mmx_supported
   5825 #endif
   5826            ? "MMX" :
   5827 #endif
   5828            "C");
   5829          break;
   5830 
   5831       case 4:
   5832          png_snprintf(filtname, 10, "paeth-%s",
   5833 #ifdef PNG_MMX_READ_FILTER_PAETH_SUPPORTED
   5834 #if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
   5835 #if !defined(PNG_1_0_X)
   5836            ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
   5837             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
   5838             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
   5839 #else
   5840            _mmx_supported
   5841 #endif
   5842            ? "MMX" :
   5843 #endif /* PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK */
   5844 #endif
   5845            "C");
   5846          break;
   5847 
   5848       default:
   5849          png_snprintf(filtname, 10, "unknown");
   5850          break;
   5851    }
           /* NOTE(review): the %ld conversions assume row_number and rowbytes are
            * long-sized; their declarations are not visible here -- confirm. */
   5852    png_debug2(2, "row_number=%ld, %s, ", png_ptr->row_number, filtname);
   5853    //png_debug1(0, "png_ptr=%10p, ", png_ptr);
   5854    //png_debug1(0, "asm_flags=0x%08lx, ", png_ptr->asm_flags);
   5855    png_debug1(0, "row=%10p, ", row);
   5856    png_debug2(0, "pixdepth=%d, bytes=%d, ", (int)row_info->pixel_depth,
   5857       (int)((row_info->pixel_depth + 7) >> 3));
   5858    png_debug1(0, "rowbytes=%ld\n", row_info->rowbytes);
   5859 #endif /* PNG_DEBUG */
   5860 
   5861    switch (filter)
   5862    {
   5863       case PNG_FILTER_VALUE_NONE:
   5864          break;
   5865 
              /* Sub:  add the byte bpp positions to the left; the first bpp bytes
               * of the row are left untouched. */
   5866       case PNG_FILTER_VALUE_SUB:
   5867 #ifdef PNG_MMX_READ_FILTER_SUB_SUPPORTED
   5868 #if !defined(PNG_1_0_X)
   5869          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
   5870              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
   5871              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
   5872 #else
   5873          if (_mmx_supported)
   5874 #endif
   5875          {
   5876             png_read_filter_row_mmx_sub(row_info, row);
   5877          }
   5878          else
   5879 #endif
   5880          {
   5881             png_uint_32 i;
   5882             png_uint_32 istop = row_info->rowbytes;
                    /* bytes per pixel, rounded up to at least 1 */
   5883             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
   5884             png_bytep rp = row + bpp;
   5885             png_bytep lp = row;
   5886 
   5887             for (i = bpp; i < istop; i++)
   5888             {
   5889                *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
   5890                rp++;
   5891             }
   5892          }  /* end !UseMMX_sub */
   5893          break;
   5894 
              /* Up:  add the byte at the same offset in the previous row. */
   5895       case PNG_FILTER_VALUE_UP:
   5896 #ifdef PNG_MMX_READ_FILTER_UP_SUPPORTED
   5897 #if !defined(PNG_1_0_X)
   5898          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
   5899              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
   5900              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
   5901 #else
   5902          if (_mmx_supported)
   5903 #endif
   5904          {
   5905             png_read_filter_row_mmx_up(row_info, row, prev_row);
   5906          }
   5907           else
   5908 #endif
   5909          {
   5910             png_uint_32 i;
   5911             png_uint_32 istop = row_info->rowbytes;
   5912             png_bytep rp = row;
   5913             png_bytep pp = prev_row;
   5914 
   5915             for (i = 0; i < istop; ++i)
   5916             {
   5917                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
   5918                rp++;
   5919             }
   5920          }  /* end !UseMMX_up */
   5921          break;
   5922 
              /* Avg:  add the truncated mean of the left byte and the byte above. */
   5923       case PNG_FILTER_VALUE_AVG:
   5924 #ifdef PNG_MMX_READ_FILTER_AVG_SUPPORTED
   5925 #if !defined(PNG_1_0_X)
   5926          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
   5927              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
   5928              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
   5929 #else
   5930          if (_mmx_supported)
   5931 #endif
   5932          {
   5933             png_read_filter_row_mmx_avg(row_info, row, prev_row);
   5934          }
   5935          else
   5936 #endif
   5937          {
   5938             png_uint_32 i;
   5939             png_bytep rp = row;
   5940             png_bytep pp = prev_row;
   5941             png_bytep lp = row;
   5942             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
   5943             png_uint_32 istop = row_info->rowbytes - bpp;
   5944 
                    /* first pixel:  no left neighbour, so only half of the byte
                     * above is added */
   5945             for (i = 0; i < bpp; i++)
   5946             {
   5947                *rp = (png_byte)(((int)(*rp) +
   5948                   ((int)(*pp++) >> 1)) & 0xff);
   5949                rp++;
   5950             }
   5951 
                    /* remaining bytes:  lp trails rp by bpp bytes and therefore
                     * reads values that have already been reconstructed */
   5952             for (i = 0; i < istop; i++)
   5953             {
   5954                *rp = (png_byte)(((int)(*rp) +
   5955                   ((int)(*pp++ + *lp++) >> 1)) & 0xff);
   5956                rp++;
   5957             }
   5958          }  /* end !UseMMX_avg */
   5959          break;
   5960 
              /* Paeth:  add whichever of left (a), above (b) or upper-left (c)
               * is closest to a + b - c. */
   5961       case PNG_FILTER_VALUE_PAETH:
   5962 #ifdef PNG_MMX_READ_FILTER_PAETH_SUPPORTED
   5963 #if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
   5964 #if !defined(PNG_1_0_X)
   5965          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
   5966              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
   5967              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
   5968 #else
   5969          if (_mmx_supported)
   5970 #endif
   5971          {
   5972             png_read_filter_row_mmx_paeth(row_info, row, prev_row);
   5973          }
   5974          else
   5975 #endif /* PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK */
   5976 #endif
   5977          {
   5978             png_uint_32 i;
   5979             png_bytep rp = row;
   5980             png_bytep pp = prev_row;
   5981             png_bytep lp = row;
   5982             png_bytep cp = prev_row;
   5983             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
   5984             png_uint_32 istop = row_info->rowbytes - bpp;
   5985 
                    /* first pixel:  only the byte above is added */
   5986             for (i = 0; i < bpp; i++)
   5987             {
   5988                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
   5989                rp++;
   5990             }
   5991 
   5992             for (i = 0; i < istop; i++)   /* use leftover rp,pp */
   5993             {
   5994                int a, b, c, pa, pb, pc, p;
   5995 
                       /* lp and cp both trail rp/pp by bpp bytes */
   5996                a = *lp++;
   5997                b = *pp++;
   5998                c = *cp++;
   5999 
                       /* with estimate e = a + b - c:  |e - a| = |b - c|,
                        * |e - b| = |a - c|, |e - c| = |(b - c) + (a - c)|;
                        * p and pc are reused as temporaries for these terms */
   6000                p = b - c;
   6001                pc = a - c;
   6002 
   6003 #if defined(PNG_USE_ABS)
   6004                pa = abs(p);
   6005                pb = abs(pc);
   6006                pc = abs(p + pc);
   6007 #else
   6008                pa = p < 0 ? -p : p;
   6009                pb = pc < 0 ? -pc : pc;
   6010                pc = (p + pc) < 0 ? -(p + pc) : p + pc;
   6011 #endif
   6012 
   6013                /*
   6014                   if (pa <= pb && pa <= pc)
   6015                      p = a;
   6016                   else if (pb <= pc)
   6017                      p = b;
   6018                   else
   6019                      p = c;
   6020                 */
   6021 
   6022                p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
   6023 
   6024                *rp = (png_byte)(((int)(*rp) + p) & 0xff);
   6025                rp++;
   6026             }
   6027          }  /* end !UseMMX_paeth */
   6028          break;
   6029 
              /* unknown filter value:  warn, then zero only the first row byte */
   6030       default:
   6031          png_warning(png_ptr, "Ignoring bad row-filter type");
   6032          *row=0;
   6033          break;
   6034    }
   6035 }
   6036 
   6037 #endif /* PNG_HAVE_MMX_READ_FILTER_ROW */
   6038 
   6039 
   6040 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGGCCRD */
   6041 #endif /* __GNUC__ */
   6042