/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
 *
 * For Intel/AMD x86 or x86-64 CPU (Pentium-MMX or later) and GNU C compiler.
 *
 * Last changed in libpng 1.2.19 August 18, 2007
 * For conditions of distribution and use, see copyright notice in png.h
 * Copyright (c) 1998 Intel Corporation
 * Copyright (c) 1999-2002,2007 Greg Roelofs
 * Copyright (c) 1998-2007 Glenn Randers-Pehrson
 *
 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
 * Interface to libpng contributed by Gilles Vollant, 1999.
 * GNU C port by Greg Roelofs, 1999-2001.
 *
 * References:
 *
 *     http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
 *     http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
 *       [Intel's performance analysis of the MMX vs. non-MMX code;
 *        moved/deleted as of 2006, but text and some graphs still
 *        available via the WayBack Machine at archive.org]
 *
 *     http://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html
 *     http://sam.zoy.org/blog/2007-04-13-shlib-with-non-pic-code-have-inline-assembly-and-pic-mix-well
 *     http://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html
 *     http://gcc.gnu.org/onlinedocs/gcc/Variable-Attributes.html
 *     http://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html
 *     AMD64 Architecture Programmer's Manual, volumes 1 and 5
 *       [http://www.amd.com/us-en/Processors/TechnicalResources/0,,30_182_739_7044,00.html]
 *     Intel 64 and IA-32 Software Developer's Manuals
 *       [http://developer.intel.com/products/processor/manuals/]
 *
 * png_read_filter_row_mmx_*() were converted in place with intel2gas 1.3.1:
 *
 *     intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
 *
 * and then cleaned up by hand.  See http://hermes.terminal.at/intel2gas/ .
 *
 * NOTE:  A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
 * is required to assemble the newer asm instructions such as movq.  (Version
 * 2.5.2l.15 is definitely too old.)  See ftp://ftp.gnu.org/pub/gnu/binutils/ .
 */

/*
 * PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
 * ===========================
 *
 * 19991006:
 *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
 *
 * 19991007:
 *  - additional optimizations (possible or definite):
 *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
 *     - write MMX code for 48-bit case (pixel_bytes == 6)
 *     - figure out what's up with 24-bit case (pixel_bytes == 3):
 *        why subtract 8 from width_mmx in the pass 4/5 case?
 *        (only width_mmx case)  (near line 2335)
 *     x [DONE] replace pixel_bytes within each block with the true
 *        constant value (or are compilers smart enough to do that?)
 *     - rewrite all MMX interlacing code so it's aligned with
 *        the *beginning* of the row buffer, not the end.  This
 *        would not only allow one to eliminate half of the memory
 *        writes for odd passes (that is, pass == odd), it may also
 *        eliminate some unaligned-data-access exceptions (assuming
 *        there's a penalty for not aligning 64-bit accesses on
 *        64-bit boundaries).  The only catch is that the "leftover"
 *        pixel(s) at the end of the row would have to be saved,
 *        but there are enough unused MMX registers in every case,
 *        so this is not a problem.  A further benefit is that the
 *        post-MMX cleanup code (C code) in at least some of the
 *        cases could be done within the assembler block.
 *     x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
 *        inconsistent, and don't match the MMX Programmer's Reference
 *        Manual conventions anyway.  They should be changed to
 *        "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
 *        was lowest in memory (i.e., corresponding to a left pixel)
 *        and b7 is the byte that was highest (i.e., a right pixel).
 *
 * 19991016:
 *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
 *    want globals prefixed by underscores when referencing them--
 *    i.e., if the variable is const4, then refer to it as const4,
 *    not _const4.  This seems to be a djgpp-specific requirement.
 *    Also, such variables apparently *must* be declared outside
 *    of functions; neither static nor automatic variables work if
 *    defined within the scope of a single function, but both
 *    static and truly global (multi-module) variables work fine.
 *
 * 19991017:
 *  - replaced pixel_bytes in each png_memcpy() call with constant value for
 *    inlining (png_do_read_interlace() "non-MMX/modified C code" block)
 *
 * 19991023:
 *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
 *  - switched from string-concatenation-with-macros to cleaner method of
 *    renaming global variables for djgpp--i.e., always use prefixes in
 *    inlined assembler code (== strings) and conditionally rename the
 *    variables, not the other way around.  Hence _const4, _mask8_0, etc.
 *
 * 19991024:
 *  - fixed mmxsupport()/png_do_read_interlace() first-row bug
 *    This one was severely weird:  even though mmxsupport() doesn't touch
 *    ebx (where the "row" pointer was stored), it nevertheless managed to
 *    zero the register (even in static/non-fPIC code--see below), which in
 *    turn caused png_do_read_interlace() to return prematurely on the first
 *    row of interlaced images (i.e., without expanding the interlaced
 *    pixels).  Inspection of the generated assembly code didn't turn up any
 *    clues, although it did point at a minor optimization (i.e., get rid of
 *    mmx_supported_local variable and just use eax).  Possibly the CPUID
 *    instruction is more destructive than it looks?  (Not yet checked.)
 *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
 *    listings...  Apparently register spillage has to do with ebx, since
 *    it's used to index the global offset table.  Commenting it out of the
 *    input-reg lists in png_combine_row() eliminated compiler barfage, so
 *    ifdef'd with __PIC__ macro:  if defined, use a global for unmask
 *
 * 19991107:
 *  - verified CPUID clobberage:  12-char string constant ("GenuineIntel",
 *    "AuthenticAMD", etc.) placed in ebx:ecx:edx.  Still need to polish.
 *
 * 19991120:
 *  - made "diff" variable (now "_dif") global to simplify conversion of
 *    filtering routines (running out of regs, sigh).  "diff" is still used
 *    in interlacing routines, however.
 *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
 *    macro determines which is used); original not yet tested.
 *
 * 20000213:
 *  - when compiling with gcc, be sure to use -fomit-frame-pointer
 *
 * 20000319:
 *  - fixed a register-name typo in png_do_read_interlace(), default (MMX)
 *    case, pass == 4 or 5, that caused visible corruption of interlaced
 *    images
 *
 * 20000623:
 *  - Various problems were reported with gcc 2.95.2 in the Cygwin
 *    environment, many of the form "forbidden register 0 (ax) was spilled
 *    for class AREG."  This is explained at
 *    http://gcc.gnu.org/fom_serv/cache/23.html, and Chuck Wilson supplied a
 *    patch involving dummy output registers.  See
 *    http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
 *    for the original (anonymous) SourceForge bug report.
 *
 * 20000706:
 *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
 *       pnggccrd.c: In function `png_combine_row':
 *       pnggccrd.c:525: more than 10 operands in `asm'
 *       pnggccrd.c:669: more than 10 operands in `asm'
 *       pnggccrd.c:828: more than 10 operands in `asm'
 *       pnggccrd.c:994: more than 10 operands in `asm'
 *       pnggccrd.c:1177: more than 10 operands in `asm'
 *    They are all the same problem and can be worked around by using the
 *    global _unmask variable unconditionally, not just in the -fPIC case.
 *    Reportedly earlier versions of gcc also have the problem with more than
 *    10 operands; they just don't report it.  Much strangeness ensues, etc.
 *
 * 20000729:
 *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
 *    MMX routine); began converting png_read_filter_row_mmx_sub()
 *  - to finish remaining sections:
 *     - clean up indentation and comments
 *     - preload local variables
 *     - add output and input regs (order of former determines numerical
 *       mapping of latter)
 *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
 *     - remove "$" from addressing of Shift and Mask variables [20000823]
 *
 * 20000731:
 *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
 *
 * 20000822:
 *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
 *    shared-library (-fPIC) version!  Code works just fine as part of static
 *    library.  Should have tested that sooner.
 *    ebx is getting clobbered again (explicitly this time); need to save it
 *    on stack or rewrite asm code to avoid using it altogether.  Blargh!
 *
 * 20000823:
 *  - first section was trickiest; all remaining sections have ebx -> edx now.
 *    (-fPIC works again.)  Also added missing underscores to various Shift*
 *    and *Mask* globals and got rid of leading "$" signs.
 *
 * 20000826:
 *  - added visual separators to help navigate microscopic printed copies
 *    (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
 *    on png_read_filter_row_mmx_avg()
 *
 * 20000828:
 *  - finished png_read_filter_row_mmx_avg():  only Paeth left!  (930 lines...)
 *    What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
 *    cleaned up/shortened in either routine, but functionality is complete
 *    and seems to be working fine.
 *
 * 20000829:
 *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if a register is
 *    listed as an input reg (with dummy output variables, etc.), then it
 *    *cannot* also appear in the clobber list or gcc 2.95.2 will barf.
 *    The solution is simple enough...
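 *
 *    [Editorial sketch of the pattern just described--a minimal,
 *    hypothetical example, not code from this file:  route the register
 *    through a dummy output operand and a matching input operand instead
 *    of naming it in the clobber list.  "len" and "dummy_value_c" here
 *    are illustrative locals only:
 *
 *        int dummy_value_c;
 *        __asm__ __volatile__ (
 *            "decl %%ecx             \n\t"   // ecx may be modified freely
 *            : "=c" (dummy_value_c)          // ecx bound as dummy output...
 *            : "0" (len)                     // ...and reused as input; ecx
 *        );                                  //    must NOT also be clobbered
 *    ]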
 *
 * 20000914:
 *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
 *    correctly (but 48-bit RGB just fine)
 *
 * 20000916:
 *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
 *     - "_ShiftBpp.use = 24;"      should have been  "_ShiftBpp.use = 16;"
 *     - "_ShiftRem.use = 40;"      should have been  "_ShiftRem.use = 48;"
 *     - "psllq _ShiftRem, %%mm2"   should have been  "psrlq _ShiftRem, %%mm2"
 *
 * 20010101:
 *  - added new png_init_mmx_flags() function (here only because it needs to
 *    call mmxsupport(), which should probably become global png_mmxsupport());
 *    modified other MMX routines to run conditionally (png_ptr->asm_flags)
 *
 * 20010103:
 *  - renamed mmxsupport() to png_mmx_support(), with auto-set of
 *    mmx_supported, and made it public; moved png_init_mmx_flags() to png.c
 *    as internal func
 *
 * 20010104:
 *  - removed dependency on png_read_filter_row_c() (C code already duplicated
 *    within MMX version of png_read_filter_row()) so no longer necessary to
 *    compile it into pngrutil.o
 *
 * 20010310:
 *  - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
 *
 * 20010808:
 *  - added PNG_THREAD_UNSAFE_OK around code using global variables [GR-P]
 *
 * 20011124:
 *  - fixed missing save of Eflag in png_mmx_support() [Maxim Sobolev]
 *
 * 20020304:
 *  - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
 *
 * 20020407:
 *  - fixed insufficient preservation of ebx register [Sami Farin]
 *
 * 20040724:
 *  - more tinkering with clobber list at lines 4529 and 5033 to get it to
 *    compile with gcc 3.4 [GR-P]
 *
 * 20040809:
 *  - added "rim" definitions for CONST4 and CONST6 [GR-P]
 *
 * 20060303:
 *  - added "OS2" to list of systems that don't need leading underscores [GR-P]
 *
 * 20060320:
 *  - made PIC-compliant [Christian Aichinger]
 *
 * 20070313:
 *  - finally applied Giuseppe Ghibò's 64-bit patch of 20060803 (completely
 *    overlooked Dylan Alex Simon's similar patch of 20060414, oops...)
 *
 * 20070524:
 *  - fixed link failure caused by asm-only variables being optimized out
 *    (identified by Dimitri of Trolltech) with __attribute__((used)), which
 *    also gets rid of warnings => nuked ugly png_squelch_warnings() hack
 *  - dropped redundant ifdef
 *  - moved png_mmx_support() back up where originally intended (as in
 *    pngvcrd.c), using __attribute__((noinline)) in extra prototype
 *
 * 20070527:
 *  - revised png_combine_row() to reuse mask in lieu of external _unmask
 *  - moved 32-bit (RGBA) case to top of png_combine_row():  most common
 *  - just about ready to give up on x86-64 -fPIC mode; can't even access 16
 *    _mask*_* constants without triggering link error on shared library:
 *       /usr/bin/ld: pnggccrd.pic.o: relocation R_X86_64_32S against `a
 *       local symbol' can not be used when making a shared object;
 *       recompile with -fPIC
 *       pnggccrd.pic.o: could not read symbols: Bad value
 *    ("objdump -x pnggccrd.pic.o | grep rodata" to verify)
 *    [might be able to work around by doing within assembly code whatever
 *    -fPIC does, but given problems to date, seems like long shot...]
 *    [relevant ifdefs:  __x86_64__ && __PIC__ => C code only]
 *  - changed #if 0 to #ifdef PNG_CLOBBER_MMX_REGS_SUPPORTED in case gcc ever
 *    supports MMX regs (%mm0, etc.) in clobber list (not supported by gcc
 *    2.7.2.3, 2.91.66 (egcs 1.1.2), 3.x, or 4.1.2)
 *
 * 20070603:
 *  - revised png_combine_row() to use @GOTPCREL(%%rip) addressing on _c64
 *    struct of _mask*_* constants for x86-64 -fPIC; see sam.zoy.org link
 *    above for details
 *  - moved _const4 and _const6 into _c64 struct, renamed to _amask5_3_0 and
 *    _amask7_1_0, respectively
 *  - can't figure out how to use _c64._mask*_* vars within asm code, so still
 *    need single variables for non-x86-64/-fPIC half :-(
 *  - replaced various __PIC__ ifdefs with *_GOT_ebx macros
 *  - moved _LBCarryMask and _HBClearMask into _c64 struct
 *  - conditionally replaced _p*temp variables with %r11d-%r13d (via p*_TEMP
 *    and CLOBBER_r1*d macros)
 *
 * 20070604:
 *  - replaced all _ActiveMask and _ActiveMaskEnd with new _amask*_*_* consts
 *    (_amask naming convention:  numbers of 00-bytes, ff-bytes, 00-bytes)
 *     - _ActiveMask     // (10) // avg/paeth/sub; read-only; consts; movq/pand
 *        0x0000000000ffffffLL (bpp 3, avg)     _amask5_3_0
 *        0xffffffffffffffffLL (bpp 4, 6, avg)  _amask0_8_0
 *        0x000000000000ffffLL (bpp 2, avg)     _amask6_2_0
 *        0x0000000000ffffffLL (bpp 3, paeth)   _amask5_3_0
 *        0x00000000ffffffffLL (bpp 6, paeth)   _amask4_4_0
 *        0x00000000ffffffffLL (bpp 4, paeth)   _amask4_4_0
 *        0x00000000ffffffffLL (bpp 8, paeth)   _amask4_4_0
 *        0x0000ffffff000000LL (bpp 3, sub)     _amask2_3_3
 *        0x00000000ffff0000LL (bpp 2, sub)     _amask4_2_2
 *     - _ActiveMaskEnd  // (1)  // paeth only; read-only; const; pand
 *        0xffff000000000000LL (bpp 3, paeth)   _amask0_2_6
 *  - changed all "#if defined(__x86_64__) // later // && defined(__PIC__)"
 *    lines to "#ifdef PNG_x86_64_USE_GOTPCREL" for easier/safer testing
 *
 * 20070605:
 *  - merged PNG_x86_64_USE_GOTPCREL, non-PNG_x86_64_USE_GOTPCREL code via
 *    *MASK* and LOAD/RESTORE macros
 *
 * 20070607:
 *  - replaced all constant instances of _ShiftBpp, _ShiftRem with immediates
 *    (still have two shared cases in avg, sub routines)
 *
 * 20070609:
 *  - replaced remaining instances of _ShiftBpp, _ShiftRem with immediates
 *    (split sub and avg 4/6-bpp cases into separate blocks)
 *  - fixed paeth bug due to clobbered r11/r12/r13 regs
 *
 * 20070610:
 *  - made global "_dif" variable (avg/paeth/sub routines) local again (now
 *    "diff"--see 19991120 entry above), using register constraints
 *  - note that %ebp in clobber list doesn't actually work, at least for the
 *    32-bit version and gcc 4.1.2; must save and restore manually.  (Seems
 *    to work OK for the 64-bit version and gcc 3.4.3, but gcc may not be
 *    using ebp/rbp in that case.)
 *  - started replacing direct _MMXLength accesses with register constraints
 *
 * 20070612:
 *  - continued replacing direct _MMXLength accesses with register constraints
 *
 * 20070613:
 *  - finished replacing direct _MMXLength accesses with register constraints;
 *    switched to local variable (and renamed back to MMXLength)
 *
 * 20070614:
 *  - fixed sub bpp = 1 bug
 *  - started replacing direct _FullLength accesses with register constraints
 *
 * 20070615:
 *  - fixed 64-bit paeth bpp 3 crash bug (misplaced LOAD_GOT_rbp)
 *  - fixed 64-bit paeth bpp 1/2 and cleanup-block crash bugs (misplaced
 *    RESTORE_r11_r12_r13)
 *  - slightly optimized avg/paeth cleanup blocks and paeth bpp 1/2 block
 *    (save/restore ebx only if needed)
 *  - continued replacing direct _FullLength accesses with register
 *    constraints
 *
 * 20070616:
 *  - finished replacing direct _FullLength accesses with register constraints
 *    (*ugly* conditional clobber-separator macros for avg and paeth, sigh)
 *
 * 20070618:
 *  - fixed misplaced PNG_THREAD_UNSAFE_OK endif (was missing LOAD_GOT_rbp/
 *    RESTORE_rbp in 32-bit thread-safe case)
 *  - changed all "ifdef *" to "if defined(*)" [GR-P]
 *
 * 20070619:
 *  - rearranged most bitdepth-related case statements to put most frequent
 *    cases at top (24-bit, 32-bit, 8-bit, rest)
 *
 * 20070623:
 *  - cleaned up png_debug() warnings/formatting
 *  - removed PNG_MMX_CODE_SUPPORTED ifdefs and added outer __GNUC__ ifdef
 *    (module no longer used by non-x86/non-GCC builds as of libpng 1.2.19)
 *  - removed single libpng-1.2.x PNG_DEBUG dependency on 1.0.x png_struct
 *    member (row_buf_size)
 *  - rearranged pass-related if-blocks in png_do_read_interlace() to put
 *    most frequent cases (4, 5) at top [GR-P suggestion]
 *
 * 20070624-29:
 *  - fixed 64-bit crash bug:  pointers -> rsi/rdi, not esi/edi (switched to
 *    %0/%1/%2/%3/%4 notation; eliminated size suffixes from relevant add/
 *    inc/sub/mov instructions; changed dummy vars to pointers)
 *     - png_combine_row()
 *     - png_do_read_interlace()
 *     - png_read_filter_row_mmx_avg()
 *     - png_read_filter_row_mmx_paeth()
 *     - png_read_filter_row_mmx_sub()
 *     - png_read_filter_row_mmx_up()
 *  - NOTE:  this fix makes use of the fact that modifying a 32-bit reg
 *    (e.g., %%ebx) clears the top half of its corresponding 64-bit reg
 *    (%%rbx), so it's safe to mix 32-bit operations with 64-bit base/index
 *    addressing (see new PSI/PAX/PBX/PDX/PBP/etc. "pointer-register"
 *    macros); applies also to clobber lists
 *
 * 20070630:
 *  - cleaned up formatting, macros, minor png_read_filter_row_mmx_sub()
 *    8-bpp register-usage inefficiency
 *  - fixed 32-bit png_do_read_interlace() bug (was using pointer size for
 *    64-bit dummy values)
 *
 * 20070703:
 *  - added check for (manual) PIC macro to fix OpenBSD crash bug
 *
 * 20070717:
 *  - fixed 48-bit png_combine_row() bug (was acting like 32-bit):  copy 6
 *    bytes per pixel, not 4, and use stride of 6, not 4, in the second loop
 *    of interlace processing of 48-bit pixels [GR-P]
 *
 * 20070722:
 *  - fixed 64-bit png_uint_32 bug with MMXLength/FullLength temp vars
 *
 * [still broken:  tops of all row-filter blocks (input/output constraints);
 *  shows up on 64-bit dynamic (-fPIC) version with -O2, especially if debug-
 *  printfs enabled, but at right edge of odd-width images even if disabled]
 *
 *
 * STILL TO DO:
 *  - fix final thread-unsafe code using stack vars and pointer? (paeth top,
 *    default, bottom only:  default, bottom already 5 reg constraints; could
 *    replace bpp with pointer and group bpp/patemp/pbtemp/pctemp in array)
 *  - fix ebp/no-reg-constraint inefficiency (avg/paeth/sub top)
 *  - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
 *  - write MMX code for 48-bit case (pixel_bytes == 6)
 *  - figure out what's up with 24-bit case (pixel_bytes == 3):
 *     why subtract 8 from width_mmx in the pass 4/5 case?  due to
 *     odd number of bytes? (only width_mmx case) (near line 2335)
 *  - rewrite all MMX interlacing code so it's aligned with the beginning
 *    of the row buffer, not the end (see 19991007 for details)
 *  - add error messages to any remaining bogus default cases
 *  - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
 *  - try =r, etc., as reg constraints?  (would gcc use 64-bit ones on
 *    x86-64?)
 *  - need full, non-graphical, CRC-based test suite...  maybe autogenerate
 *    random data of various height/width/depth, compute CRCs, write (C
 *    funcs), read (asm/MMX), recompute CRCs, and compare?
 *  - write true x86-64 version using 128-bit "media instructions", %xmm0-15,
 *    and extra general-purpose registers
 */

#if defined(__GNUC__)

#define PNG_INTERNAL
#include "png.h"


/* for some inexplicable reason, gcc 3.3.5 on OpenBSD (and elsewhere?) does
 * *not* define __PIC__ when the -fPIC option is used, so we have to rely on
 * makefiles and whatnot to define the PIC macro explicitly */
#if defined(PIC) && !defined(__PIC__)   // (this can/should move to pngconf.h)
#  define __PIC__
#endif

#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)

/* if you want/need full thread-safety on x86-64 even when linking statically,
 * comment out the "&& defined(__PIC__)" part here: */
#if defined(__x86_64__) && defined(__PIC__)
#  define PNG_x86_64_USE_GOTPCREL            // GOTPCREL => full thread-safety
#  define PNG_CLOBBER_x86_64_REGS_SUPPORTED  // works as of gcc 3.4.3 ...
#endif

int PNGAPI png_mmx_support(void);

#if defined(PNG_USE_LOCAL_ARRAYS)
static PNG_CONST int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
static PNG_CONST int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
static PNG_CONST int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
#endif

/* djgpp, Win32, Cygwin, and OS2 add their own underscores to global variables,
 * so define them without: */
#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) || \
    defined(__OS2__)
#  define _mmx_supported  mmx_supported
#  define _mask8_0        mask8_0
#  define _mask16_1       mask16_1
#  define _mask16_0       mask16_0
#  define _mask24_2       mask24_2
#  define _mask24_1       mask24_1
#  define _mask24_0       mask24_0
#  define _mask32_3       mask32_3
#  define _mask32_2       mask32_2
#  define _mask32_1       mask32_1
#  define _mask32_0       mask32_0
#  define _mask48_5       mask48_5
#  define _mask48_4       mask48_4
#  define _mask48_3       mask48_3
#  define _mask48_2       mask48_2
#  define _mask48_1       mask48_1
#  define _mask48_0       mask48_0
#  define _amask5_3_0     amask5_3_0
#  define _amask7_1_0     amask7_1_0
#  define _LBCarryMask    LBCarryMask
#  define _HBClearMask    HBClearMask
#  define _amask0_8_0     amask0_8_0
#  define _amask6_2_0     amask6_2_0
#  define _amask4_4_0     amask4_4_0
#  define _amask0_2_6     amask0_2_6
#  define _amask2_3_3     amask2_3_3
#  define _amask4_2_2     amask4_2_2
#  if defined(PNG_THREAD_UNSAFE_OK)
#    define _patemp       patemp
#    define _pbtemp       pbtemp
#    define _pctemp       pctemp
#  endif
#endif // djgpp, Win32, Cygwin, OS2


/* These constants are used in the inlined MMX assembly code. */

typedef unsigned long long  ull;

#if defined(PNG_x86_64_USE_GOTPCREL)
static PNG_CONST struct {
    //ull _mask_array[26];

    // png_combine_row() constants:
    ull _mask8_0;
    ull _mask16_0, _mask16_1;
    ull _mask24_0, _mask24_1, _mask24_2;
    ull _mask32_0, _mask32_1, _mask32_2, _mask32_3;
    ull _mask48_0, _mask48_1, _mask48_2, _mask48_3, _mask48_4, _mask48_5;

    // png_do_read_interlace() constants:
    ull _amask5_3_0, _amask7_1_0;  // was _const4 and _const6, respectively

    // png_read_filter_row_mmx_avg() constants (also uses _amask5_3_0):
    ull _LBCarryMask, _HBClearMask;
    ull _amask0_8_0, _amask6_2_0;  // was ActiveMask for bpp 4/6 and 2 cases

    // png_read_filter_row_mmx_paeth() constants (also uses _amask5_3_0):
    ull _amask4_4_0, _amask0_2_6;  // was ActiveMask{,End} for bpp 6/4/8 and 3

    // png_read_filter_row_mmx_sub() constants:
    ull _amask2_3_3, _amask4_2_2;  // was ActiveMask for bpp 3 and 2 cases

} _c64 __attribute__((used, aligned(8))) = {

    // png_combine_row() constants:
    0x0102040810204080LL,  // _mask8_0      offset 0

    0x1010202040408080LL,  // _mask16_0     offset 8
    0x0101020204040808LL,  // _mask16_1     offset 16

    0x2020404040808080LL,  // _mask24_0     offset 24
    0x0408080810101020LL,  // _mask24_1     offset 32
    0x0101010202020404LL,  // _mask24_2     offset 40

    0x4040404080808080LL,  // _mask32_0     offset 48
    0x1010101020202020LL,  // _mask32_1     offset 56
    0x0404040408080808LL,  // _mask32_2     offset 64
    0x0101010102020202LL,  // _mask32_3     offset 72

    0x4040808080808080LL,  // _mask48_0     offset 80
    0x2020202040404040LL,  // _mask48_1     offset 88
    0x1010101010102020LL,  // _mask48_2     offset 96
    0x0404080808080808LL,  // _mask48_3     offset 104
    0x0202020204040404LL,  // _mask48_4     offset 112
    0x0101010101010202LL,  // _mask48_5     offset 120

    // png_do_read_interlace() constants:
    0x0000000000FFFFFFLL,  // _amask5_3_0   offset 128 (bpp 3, avg/paeth) const4
    0x00000000000000FFLL,  // _amask7_1_0   offset 136                    const6

    // png_read_filter_row_mmx_avg() constants:
    0x0101010101010101LL,  // _LBCarryMask  offset 144
    0x7F7F7F7F7F7F7F7FLL,  // _HBClearMask  offset 152
    0xFFFFFFFFFFFFFFFFLL,  // _amask0_8_0   offset 160 (bpp 4/6, avg)
    0x000000000000FFFFLL,  // _amask6_2_0   offset 168 (bpp 2, avg)

    // png_read_filter_row_mmx_paeth() constants:
    0x00000000FFFFFFFFLL,  // _amask4_4_0   offset 176 (bpp 6/4/8, paeth)
    0xFFFF000000000000LL,  // _amask0_2_6   offset 184 (bpp 3, paeth) A.M.End

    // png_read_filter_row_mmx_sub() constants:
    0x0000FFFFFF000000LL,  // _amask2_3_3   offset 192 (bpp 3, sub)
    0x00000000FFFF0000LL,  // _amask4_2_2   offset 200 (bpp 2, sub)

};

#define MASK8_0         "(%%rbp)"
#define MASK16_0       "8(%%rbp)"
#define MASK16_1      "16(%%rbp)"
#define MASK24_0      "24(%%rbp)"
#define MASK24_1      "32(%%rbp)"
#define MASK24_2      "40(%%rbp)"
#define MASK32_0      "48(%%rbp)"
#define MASK32_1      "56(%%rbp)"
#define MASK32_2      "64(%%rbp)"
#define MASK32_3      "72(%%rbp)"
#define MASK48_0      "80(%%rbp)"
#define MASK48_1      "88(%%rbp)"
#define MASK48_2      "96(%%rbp)"
#define MASK48_3     "104(%%rbp)"
#define MASK48_4     "112(%%rbp)"
#define MASK48_5     "120(%%rbp)"
#define AMASK5_3_0   "128(%%rbp)"
#define AMASK7_1_0   "136(%%rbp)"
#define LB_CARRY_MASK "144(%%rbp)"
#define HB_CLEAR_MASK "152(%%rbp)"
#define AMASK0_8_0   "160(%%rbp)"
#define AMASK6_2_0   "168(%%rbp)"
#define AMASK4_4_0   "176(%%rbp)"
#define AMASK0_2_6   "184(%%rbp)"
#define AMASK2_3_3   "192(%%rbp)"
#define AMASK4_2_2   "200(%%rbp)"

#else // !PNG_x86_64_USE_GOTPCREL

static PNG_CONST ull _mask8_0    __attribute__((used, aligned(8))) = 0x0102040810204080LL;

static PNG_CONST ull _mask16_1   __attribute__((used, aligned(8))) = 0x0101020204040808LL;
static PNG_CONST ull _mask16_0   __attribute__((used, aligned(8))) = 0x1010202040408080LL;

static PNG_CONST ull _mask24_2   __attribute__((used, aligned(8))) = 0x0101010202020404LL;
static PNG_CONST ull _mask24_1   __attribute__((used, aligned(8))) = 0x0408080810101020LL;
static PNG_CONST ull _mask24_0   __attribute__((used, aligned(8))) = 0x2020404040808080LL;

static PNG_CONST ull _mask32_3   __attribute__((used, aligned(8))) = 0x0101010102020202LL;
static PNG_CONST ull _mask32_2   __attribute__((used, aligned(8))) = 0x0404040408080808LL;
static PNG_CONST ull _mask32_1   __attribute__((used, aligned(8))) = 0x1010101020202020LL;
static PNG_CONST ull _mask32_0   __attribute__((used, aligned(8))) = 0x4040404080808080LL;

static PNG_CONST ull _mask48_5   __attribute__((used, aligned(8))) = 0x0101010101010202LL;
static PNG_CONST ull _mask48_4   __attribute__((used, aligned(8))) = 0x0202020204040404LL;
static PNG_CONST ull _mask48_3   __attribute__((used, aligned(8))) = 0x0404080808080808LL;
static PNG_CONST ull _mask48_2   __attribute__((used, aligned(8))) = 0x1010101010102020LL;
static PNG_CONST ull _mask48_1   __attribute__((used, aligned(8))) = 0x2020202040404040LL;
static PNG_CONST ull _mask48_0   __attribute__((used, aligned(8))) = 0x4040808080808080LL;

// png_do_read_interlace() constants:
static PNG_CONST ull _amask5_3_0 __attribute__((aligned(8))) = 0x0000000000FFFFFFLL;  // was _const4
static PNG_CONST ull _amask7_1_0
    __attribute__((aligned(8))) = 0x00000000000000FFLL;  // was _const6

// png_read_filter_row_mmx_avg() constants:
static PNG_CONST ull _LBCarryMask __attribute__((used, aligned(8))) = 0x0101010101010101LL;
static PNG_CONST ull _HBClearMask __attribute__((used, aligned(8))) = 0x7f7f7f7f7f7f7f7fLL;
static PNG_CONST ull _amask0_8_0  __attribute__((used, aligned(8))) = 0xFFFFFFFFFFFFFFFFLL;
static PNG_CONST ull _amask6_2_0  __attribute__((used, aligned(8))) = 0x000000000000FFFFLL;

// png_read_filter_row_mmx_paeth() constants:
static PNG_CONST ull _amask4_4_0  __attribute__((used, aligned(8))) = 0x00000000FFFFFFFFLL;
static PNG_CONST ull _amask0_2_6  __attribute__((used, aligned(8))) = 0xFFFF000000000000LL;

// png_read_filter_row_mmx_sub() constants:
static PNG_CONST ull _amask2_3_3  __attribute__((used, aligned(8))) = 0x0000FFFFFF000000LL;
static PNG_CONST ull _amask4_2_2  __attribute__((used, aligned(8))) = 0x00000000FFFF0000LL;

#define MASK8_0        "_mask8_0"
#define MASK16_0       "_mask16_0"
#define MASK16_1       "_mask16_1"
#define MASK24_0       "_mask24_0"
#define MASK24_1       "_mask24_1"
#define MASK24_2       "_mask24_2"
#define MASK32_0       "_mask32_0"
#define MASK32_1       "_mask32_1"
#define MASK32_2       "_mask32_2"
#define MASK32_3       "_mask32_3"
#define MASK48_0       "_mask48_0"
#define MASK48_1       "_mask48_1"
#define MASK48_2       "_mask48_2"
#define MASK48_3       "_mask48_3"
#define MASK48_4       "_mask48_4"
#define MASK48_5       "_mask48_5"
#define AMASK5_3_0     "_amask5_3_0"
#define AMASK7_1_0     "_amask7_1_0"
#define LB_CARRY_MASK  "_LBCarryMask"
#define HB_CLEAR_MASK  "_HBClearMask"
#define AMASK0_8_0     "_amask0_8_0"
#define AMASK6_2_0     "_amask6_2_0"
#define AMASK4_4_0     "_amask4_4_0"
#define AMASK0_2_6     "_amask0_2_6"
#define AMASK2_3_3     "_amask2_3_3"
#define AMASK4_2_2     "_amask4_2_2"

#endif // ?PNG_x86_64_USE_GOTPCREL


#if defined(PNG_HAVE_MMX_READ_FILTER_ROW) || defined(PNG_HAVE_MMX_COMBINE_ROW)

// this block is specific to png_read_filter_row_mmx_paeth() except for
// LOAD_GOT_rbp and RESTORE_rbp, which are also used in png_combine_row()
#if defined(PNG_x86_64_USE_GOTPCREL)
#  define pa_TEMP                "%%r11d"
#  define pb_TEMP                "%%r12d"
#  define pc_TEMP                "%%r13d"
#  if defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED)  // works as of gcc 3.4.3 ...
#    define SAVE_r11_r12_r13
#    define RESTORE_r11_r12_r13
#    define _CLOBBER_r11_r12_r13 ,"%r11", "%r12", "%r13"
#    define CLOBBER_r11_r12_r13  "%r11", "%r12", "%r13"
#  else // !PNG_CLOBBER_x86_64_REGS_SUPPORTED
#    define SAVE_r11_r12_r13     "pushq %%r11  \n\t" \
                                 "pushq %%r12  \n\t" \
                                 "pushq %%r13  \n\t"  // "normally 0-extended"
#    define RESTORE_r11_r12_r13  "popq  %%r13  \n\t" \
                                 "popq  %%r12  \n\t" \
                                 "popq  %%r11  \n\t"
#    define _CLOBBER_r11_r12_r13
#    define CLOBBER_r11_r12_r13
#  endif
#  define LOAD_GOT_rbp           "pushq %%rbp                        \n\t" \
                                 "movq  _c64@GOTPCREL(%%rip), %%rbp  \n\t"
#  define RESTORE_rbp            "popq  %%rbp                        \n\t"
#else // 32-bit and/or non-PIC
#  if defined(PNG_THREAD_UNSAFE_OK)
// These variables are used in png_read_filter_row_mmx_paeth() and would be
// local variables if not for gcc-inline-assembly addressing limitations
// (some apparently related to ELF format, others to CPU type).
//
// WARNING:  Their presence defeats the thread-safety of libpng.
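//
// Editorial sketch (hypothetical code, not part of the original file) of the
// addressing limitation these globals work around:  a file-scope variable is
// addressable by name inside an asm template, whereas a stack-local must be
// routed through an operand constraint--and the filter routines are already
// out of spare registers.  "_gtemp" and "ltemp" are illustrative names only:
//
//     static int _gtemp;                     // global:  gas can reference
//     __asm__ __volatile__                   //  the symbol directly
//         ("movl %%eax, _gtemp \n\t" : : : "memory");
//
//     int ltemp;                             // local:  needs a memory-
//     __asm__ __volatile__                   //  operand constraint instead
//         ("movl %%eax, %0 \n\t" : "=m" (ltemp));
//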
static int _patemp __attribute__((used));
static int _pbtemp __attribute__((used));
static int _pctemp __attribute__((used));
#    define pa_TEMP              "_patemp"
#    define pb_TEMP              "_pbtemp"   // temp variables for
#    define pc_TEMP              "_pctemp"   //  Paeth routine
#    define SAVE_r11_r12_r13
#    define RESTORE_r11_r12_r13
#    define _CLOBBER_r11_r12_r13 // not using regs => not clobbering
#    define CLOBBER_r11_r12_r13
#  endif // PNG_THREAD_UNSAFE_OK
#  define LOAD_GOT_rbp
#  define RESTORE_rbp
#endif

#if defined(__x86_64__)
#  define SAVE_ebp
#  define RESTORE_ebp
#  define _CLOBBER_ebp         ,"%ebp"
#  define CLOBBER_ebp          "%ebp"
#  define SAVE_FullLength      "movl %%eax, %%r15d  \n\t"
#  define RESTORE_FullLength   "movl %%r15d, "   // may go into eax or ecx
#  if defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED)  // works as of gcc 3.4.3 ...
#    define SAVE_r15
#    define RESTORE_r15
#    define _CLOBBER_r15       ,"%r15"
#  else
#    define SAVE_r15           "pushq %%r15  \n\t"
#    define RESTORE_r15        "popq  %%r15  \n\t"
#    define _CLOBBER_r15
#  endif
#  define PBP                  "%%rbp"  // regs used for 64-bit
#  define PAX                  "%%rax"  //  pointers or in
#  define PBX                  "%%rbx"  //   combination with
#  define PCX                  "%%rcx"  //    64-bit pointer-regs
#  define PDX                  "%%rdx"  //     (base/index pairs,
#  define PSI                  "%%rsi"  //      add/sub/mov pairs)
#  define CLEAR_BOTTOM_3_BITS  "and  $0xfffffffffffffff8, "
#else
#  define SAVE_ebp             "pushl %%ebp  \n\t"  // clobber list doesn't
#  define RESTORE_ebp          "popl  %%ebp  \n\t"  //  work for %ebp on
#  define _CLOBBER_ebp                              //   32-bit; not clear why
#  define CLOBBER_ebp
#  define SAVE_FullLength      "pushl %%eax  \n\t"
#  define RESTORE_FullLength   "popl "   // eax (avg) or ecx (paeth)
#  define SAVE_r15
#  define RESTORE_r15
#  define _CLOBBER_r15
#  define PBP                  "%%ebp"  // regs used for or in
#  define PAX                  "%%eax"  //  combination with
#  define PBX                  "%%ebx"  //   "normal," 32-bit
#  define PCX                  "%%ecx"  //    pointers
#  define PDX                  "%%edx"
#  define PSI                  "%%esi"
#  define CLEAR_BOTTOM_3_BITS  "and  $0xfffffff8, "
#endif

// CLOB_COMMA_ebx_ebp:  need comma ONLY if both CLOBBER_ebp and CLOBBER_GOT_ebx
//                      have values, i.e., only if __x86_64__ AND !__PIC__
#if defined(__x86_64__) && !defined(__PIC__)
#  define CLOB_COMMA_ebx_ebp   ,   // clobbering both ebp and ebx => need comma
#else
#  define CLOB_COMMA_ebx_ebp
#endif

// CLOB_COMMA_ebX_r1X:  need comma UNLESS both CLOBBER_ebp and CLOBBER_GOT_ebx
//                      are empty OR CLOBBER_r11_r12_r13 is empty--i.e., NO
//                      comma if (!__x86_64__ AND __PIC__) OR
//                      !(PNG_x86_64_USE_GOTPCREL AND
//                        PNG_CLOBBER_x86_64_REGS_SUPPORTED)  (double sigh...)
#if (!defined(__x86_64__) && defined(__PIC__)) || \
    !defined(PNG_x86_64_USE_GOTPCREL) || \
    !defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED)
#  define CLOB_COMMA_ebX_r1X
#else
#  define CLOB_COMMA_ebX_r1X   ,   // clobbering (ebp OR ebx) AND r11_r12_r13
#endif

// CLOB_COLON_ebx_ebp:  need colon unless CLOBBER_ebp and CLOBBER_GOT_ebx are
//                      BOTH empty--i.e., NO colon if (!__x86_64__ AND __PIC__)
// CLOB_COLON_ebx_ebp_r1X:  if, in addition, CLOBBER_r11_r12_r13 is empty, then
//                      no colon for Paeth blocks, either--i.e., NO colon if
//                      !(PNG_x86_64_USE_GOTPCREL AND
//                        PNG_CLOBBER_x86_64_REGS_SUPPORTED)
#if (!defined(__x86_64__) && defined(__PIC__))
#  define CLOB_COLON_ebx_ebp
#  if !(defined(PNG_x86_64_USE_GOTPCREL) && \
        defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED))
#    define CLOB_COLON_ebx_ebp_r1X
#  else
#    define CLOB_COLON_ebx_ebp_r1X   :  // clobbering ebp OR ebx OR r11_r12_r13
#  endif
#else
#  define CLOB_COLON_ebx_ebp         :  // clobbering ebp OR ebx
#  define CLOB_COLON_ebx_ebp_r1X     :  // clobbering ebp OR ebx OR r11_r12_r13
#endif
// (Net effect, for example:  in an x86-64 GOTPCREL build with clobber support,
//  the Paeth separators expand to   : "%ebp" , "%r11", "%r12", "%r13"   while
//  in a 32-bit -fPIC build they all expand to nothing.)

#endif // PNG_HAVE_MMX_READ_FILTER_ROW

#if defined(__PIC__)  // macros to save, restore index to Global Offset Table
#  if defined(__x86_64__)
#    define SAVE_GOT_ebx     "pushq %%rbx  \n\t"
#    define RESTORE_GOT_ebx  "popq  %%rbx  \n\t"
#  else
#    define SAVE_GOT_ebx     "pushl %%ebx  \n\t"
#    define RESTORE_GOT_ebx  "popl  %%ebx  \n\t"
#  endif
#  define _CLOBBER_GOT_ebx   // explicitly saved, restored => not clobbered
#  define CLOBBER_GOT_ebx
#else
#  define SAVE_GOT_ebx
#  define RESTORE_GOT_ebx
#  define _CLOBBER_GOT_ebx   ,"%ebx"
#  define CLOBBER_GOT_ebx    "%ebx"
#endif

#if defined(PNG_HAVE_MMX_COMBINE_ROW) || defined(PNG_HAVE_MMX_READ_INTERLACE)
#  define BPP2  2
#  define BPP3  3   // bytes per pixel (a.k.a. pixel_bytes)
#  define BPP4  4   // (defined only to help avoid cut-and-paste errors)
#  define BPP6  6
#  define BPP8  8
#endif



static int _mmx_supported = 2;  // 0: no MMX; 1: MMX supported; 2: not tested

/*===========================================================================*/
/*                                                                           */
/*                       P N G _ M M X _ S U P P O R T                       */
/*                                                                           */
/*===========================================================================*/

// GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
//             (2) all instructions compile with gcc 2.7.2.3 and later
//           x (3) the function is moved down here to prevent gcc from
//           x     inlining it in multiple places and then barfing be-
//           x     cause the ".NOT_SUPPORTED" label is multiply defined
//               [need to retest with gcc 2.7.2.3]

// GRR 20070524:  This declaration apparently is compatible with but
//   supersedes the one in png.h; in any case, the generated object file is
//   slightly smaller.  It is unnecessary with gcc 4.1.2, but gcc 2.x
//   apparently replicated the ".NOT_SUPPORTED" label in each location the
//   function was inlined, leading to compilation errors due to the "multiply
//   defined" label.  Old workaround was to leave the function at the end of
//   this file; new one (still testing) is to use a gcc-specific function
//   attribute to prevent local inlining.
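// Editorial sketch (hypothetical, mirroring the logic png_combine_row() uses
//   below) of how the tri-state _mmx_supported flag above is meant to be
//   consumed by callers:
//
//       if (_mmx_supported == 2)   // "not tested" => probe the CPU once
//           png_mmx_support();     //  (re)sets _mmx_supported to 0 or 1
//       if (_mmx_supported)
//           ;  /* MMX routine */
//       else
//           ;  /* modified C routine */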
int PNGAPI
png_mmx_support(void) __attribute__((noinline));

int PNGAPI
png_mmx_support(void)
{
#if defined(PNG_MMX_CODE_SUPPORTED)  // superfluous, but what the heck
    int result;
    __asm__ __volatile__ (
#if defined(__x86_64__)
        "pushq %%rbx            \n\t"  // rbx gets clobbered by CPUID instr.
        "pushq %%rcx            \n\t"  // so does rcx...
        "pushq %%rdx            \n\t"  // ...and rdx (but rcx, rdx safe on
                                       //  Linux)
        "pushfq                 \n\t"  // save Eflag to stack
        "popq  %%rax            \n\t"  // get Eflag from stack into rax
        "movq  %%rax, %%rcx     \n\t"  // make another copy of Eflag in rcx
        "xorl  $0x200000, %%eax \n\t"  // toggle ID bit in Eflag (i.e., bit 21)
        "pushq %%rax            \n\t"  // save modified Eflag back to stack
        "popfq                  \n\t"  // restore modified value to Eflag reg
        "pushfq                 \n\t"  // save Eflag to stack
        "popq  %%rax            \n\t"  // get Eflag from stack
        "pushq %%rcx            \n\t"  // save original Eflag to stack
        "popfq                  \n\t"  // restore original Eflag
#else
        "pushl %%ebx            \n\t"  // ebx gets clobbered by CPUID instr.
        "pushl %%ecx            \n\t"  // so does ecx...
        "pushl %%edx            \n\t"  // ...and edx (but ecx, edx safe on
                                       //  Linux)
        "pushfl                 \n\t"  // save Eflag to stack
        "popl  %%eax            \n\t"  // get Eflag from stack into eax
        "movl  %%eax, %%ecx     \n\t"  // make another copy of Eflag in ecx
        "xorl  $0x200000, %%eax \n\t"  // toggle ID bit in Eflag (i.e., bit 21)
        "pushl %%eax            \n\t"  // save modified Eflag back to stack
        "popfl                  \n\t"  // restore modified value to Eflag reg
        "pushfl                 \n\t"  // save Eflag to stack
        "popl  %%eax            \n\t"  // get Eflag from stack
        "pushl %%ecx            \n\t"  // save original Eflag to stack
        "popfl                  \n\t"  // restore original Eflag
#endif
        "xorl  %%ecx, %%eax     \n\t"  // compare new Eflag with original Eflag
        "jz    0f               \n\t"  // if same, CPUID instr. not supported

        "xorl  %%eax, %%eax     \n\t"  // set eax to zero
//      ".byte 0x0f, 0xa2       \n\t"  // CPUID instruction (two-byte opcode)
        "cpuid                  \n\t"  // get the CPU identification info
        "cmpl  $1, %%eax        \n\t"  // make sure eax returned a nonzero
                                       //  value
        "jl    0f               \n\t"  // if eax is zero, MMX not supported

        "xorl  %%eax, %%eax     \n\t"  // set eax to zero and...
        "incl  %%eax            \n\t"  // ...increment eax to 1.
                                       // This pair is faster than the
                                       //  instruction "mov eax, 1"
        "cpuid                  \n\t"  // get the CPU identification info again
        "andl  $0x800000, %%edx \n\t"  // mask out all bits but MMX bit (23)
        "cmpl  $0, %%edx        \n\t"  // 0 = MMX not supported
        "jz    0f               \n\t"  // non-zero = yes, MMX IS supported

        "movl  $1, %%eax        \n\t"  // set return value to 1
        "jmp   1f               \n\t"  // DONE:  have MMX support

    "0:                         \n\t"  // .NOT_SUPPORTED:  target label for
                                       //  jump instructions
        "movl  $0, %%eax        \n\t"  // set return value to 0
    "1:                         \n\t"  // .RETURN:  target label for jump
                                       //  instructions
#if defined(__x86_64__)
        "popq  %%rdx            \n\t"  // restore rdx
        "popq  %%rcx            \n\t"  // restore rcx
        "popq  %%rbx            \n\t"  // restore rbx
#else
        "popl  %%edx            \n\t"  // restore edx
        "popl  %%ecx            \n\t"  // restore ecx
        "popl  %%ebx            \n\t"  // restore ebx
#endif

//      "ret                    \n\t"  // DONE:  no MMX support
                                       // (fall through to standard C "ret")

        : "=a" (result)                // output list

        :                              // any variables used on input (none)

                                       // no clobber list
//      , "%ebx", "%ecx", "%edx"       // GRR:  we handle these manually
//      , "memory"                     // if write to a variable gcc thought
//                                     //  was in a reg
//      , "cc"                         // "condition codes" (flag bits)
    );
    _mmx_supported = result;
#else
    _mmx_supported = 0;
#endif /* PNG_MMX_CODE_SUPPORTED */

    return _mmx_supported;
}


/*===========================================================================*/
/*                                                                           */
/*                       P N G _ C O M B I N E _ R O W                       */
/*                                                                           */
/*===========================================================================*/

#if defined(PNG_HAVE_MMX_COMBINE_ROW)

/* Combines the row recently read in with the previous row.
   This routine takes care of alpha and transparency if requested.
   This routine also handles the two methods of progressive display
   of interlaced images, depending on the mask value.
   The mask value describes which pixels are to be combined with
   the row.  The pattern always repeats every 8 pixels, so just 8
   bits are needed.  A one indicates the pixel is to be combined; a
   zero indicates the pixel is to be skipped.  (For example, the
   first interlace pass uses mask 0x80:  only the leftmost pixel of
   each group of eight is combined.)  This is in addition to any
   alpha or transparency value associated with the pixel.  If you
   want all pixels to be combined, pass 0xff (255) in mask. */

/* Use this routine for the x86 platform - it uses a faster MMX routine
   if the machine supports MMX.
 */

void /* PRIVATE */
png_combine_row(png_structp png_ptr, png_bytep row, int mask)
{
   int dummy_value_a;   // fix 'forbidden register spilled' error
   int dummy_value_c;
   int dummy_value_d;
   png_bytep dummy_value_S;
   png_bytep dummy_value_D;

   png_debug(1, "in png_combine_row (pnggccrd.c)\n");

   if (_mmx_supported == 2) {
#if !defined(PNG_1_0_X)
       /* this should have happened in png_init_mmx_flags() already */
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }

   if (mask == 0xff)
   {
      png_debug(2, "mask == 0xff:  doing single png_memcpy()\n");
      png_memcpy(row, png_ptr->row_buf + 1,
       (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
   }
   else   /* (png_combine_row() is never called with mask == 0) */
   {
      switch (png_ptr->row_info.pixel_depth)
      {
         case 24:       /* png_ptr->row_info.pixel_depth */
         {
            png_bytep srcptr;
            png_bytep dstptr;

#if !defined(PNG_1_0_X)
            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
#else
            if (_mmx_supported)
#endif
            {
               png_uint_32 len;
               int diff;

               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               len  = png_ptr->width & ~7;         // reduce to multiple of 8
               diff = (int) (png_ptr->width & 7);  // amount lost

               __asm__ __volatile__ (
                  "not       %%edx            \n\t" // mask => unmask
                  "movd      %%edx, %%mm7     \n\t" // load bit pattern
                  "not       %%edx            \n\t" // unmask => mask for later
                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
                  "punpcklbw %%mm7, %%mm7     \n\t"
                  "punpcklwd %%mm7, %%mm7     \n\t"
                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks

                  LOAD_GOT_rbp
                  "movq   " MASK24_0 ", %%mm0 \n\t" // _mask24_0 -> mm0
                  "movq   " MASK24_1 ", %%mm1 \n\t" // _mask24_1 -> mm1
                  "movq   " MASK24_2 ", %%mm2 \n\t" // _mask24_2 -> mm2
                  RESTORE_rbp

                  "pand      %%mm7, %%mm0     \n\t"
                  "pand      %%mm7, %%mm1     \n\t"
                  "pand      %%mm7, %%mm2     \n\t"

                  "pcmpeqb   %%mm6, %%mm0     \n\t"
                  "pcmpeqb   %%mm6, %%mm1     \n\t"
                  "pcmpeqb   %%mm6, %%mm2     \n\t"

// preload        "movl      len, %%ecx       \n\t" // load length of line
// preload        "movl      srcptr, %3       \n\t" // load source
// preload        "movl      dstptr, %4       \n\t" // load dest

                  "cmpl      $0, %%ecx        \n\t"
                  "jz        mainloop24end    \n\t"

               "mainloop24:                   \n\t"
                  "movq      (%3), %%mm4      \n\t"
                  "pand      %%mm0, %%mm4     \n\t"
                  "movq      %%mm0, %%mm6     \n\t"
                  "movq      (%4), %%mm7      \n\t"
                  "pandn     %%mm7, %%mm6     \n\t"
                  "por       %%mm6, %%mm4     \n\t"
                  "movq      %%mm4, (%4)      \n\t"

                  "movq      8(%3), %%mm5     \n\t"
                  "pand      %%mm1, %%mm5     \n\t"
                  "movq      %%mm1, %%mm7     \n\t"
                  "movq      8(%4), %%mm6     \n\t"
                  "pandn     %%mm6, %%mm7     \n\t"
                  "por       %%mm7, %%mm5     \n\t"
                  "movq      %%mm5, 8(%4)     \n\t"

                  "movq      16(%3), %%mm6    \n\t"
                  "pand      %%mm2, %%mm6     \n\t"
                  "movq      %%mm2, %%mm4     \n\t"
                  "movq      16(%4), %%mm7    \n\t"
                  "pandn     %%mm7, %%mm4     \n\t"
                  "por       %%mm4, %%mm6     \n\t"
                  "movq      %%mm6, 16(%4)    \n\t"

                  "add       $24, %3          \n\t" // inc by 24 bytes processed
                  "add       $24, %4          \n\t"
                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed

                  "ja        mainloop24       \n\t"

               "mainloop24end:                \n\t"
// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
                  "movl      %%eax, %%ecx     \n\t"
                  "cmpl      $0, %%ecx        \n\t"
                  "jz        end24            \n\t"
// preload        "movl      mask, %%edx      \n\t"
                  "sall      $24, %%edx       \n\t" // make low byte, high byte

               "secondloop24:                 \n\t"
                  "sall      %%edx            \n\t" // move high bit to CF
                  "jnc       skip24           \n\t" // if CF = 0
                  "movw      (%3), %%ax       \n\t"
                  "movw      %%ax, (%4)       \n\t"
                  "xorl      %%eax, %%eax     \n\t"
                  "movb      2(%3), %%al      \n\t"
                  "movb      %%al, 2(%4)      \n\t"

               "skip24:                       \n\t"
                  "add       $3, %3           \n\t"
                  "add       $3, %4           \n\t"
                  "decl      %%ecx            \n\t"
                  "jnz       secondloop24     \n\t"

               "end24:                        \n\t"
                  "EMMS                       \n\t" // DONE

                  : "=a" (dummy_value_a),           // output regs (dummy)
                    "=d" (dummy_value_d),
                    "=c" (dummy_value_c),
                    "=S" (dummy_value_S),
                    "=D" (dummy_value_D)

                  : "0" (diff),        // eax       // input regs
                    "1" (mask),        // edx
                    "2" (len),         // ecx
// was (unmask)     "b" RESERVED       // ebx       // Global Offset Table idx
                    "3" (srcptr),      // esi/rsi
                    "4" (dstptr)       // edi/rdi

#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
                  : "%mm0", "%mm1", "%mm2"          // clobber list
                  , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
               );
            }
            else /* not _mmx_supported - use modified C routine */
            {
               register png_uint_32 i;
               png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
               register int stride = BPP3 * png_pass_inc[png_ptr->pass];
                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
               register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
               int diff = (int) (png_ptr->width & 7); /* amount lost */
               register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */

               srcptr = png_ptr->row_buf + 1 + initial_val;
               dstptr = row + initial_val;

               for (i = initial_val; i < final_val; i += stride)
               {
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
               if (diff)  /* number of leftover pixels:  3 for pngtest */
               {
                  final_val += diff*BPP3;
                  for (; i < final_val; i += stride)
                  {
                     if (rep_bytes > (int)(final_val-i))
                        rep_bytes = (int)(final_val-i);
                     png_memcpy(dstptr, srcptr, rep_bytes);
                     srcptr += stride;
                     dstptr += stride;
                  }
               }
            } /* end of else (_mmx_supported) */

            break;
         }  /* end 24 bpp */

         // formerly claimed to be most common case (combining 32-bit RGBA),
         // but almost certainly less common than 24-bit RGB case
         case 32:       /* png_ptr->row_info.pixel_depth */
         {
            png_bytep srcptr;
            png_bytep dstptr;

#if !defined(PNG_1_0_X)
            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
#else
            if (_mmx_supported)
#endif
            {
               png_uint_32 len;
               int diff;

               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               len  = png_ptr->width & ~7;         // reduce to multiple of 8
               diff = (int) (png_ptr->width & 7);  // amount lost

               __asm__ __volatile__ (
                  "not       %%edx            \n\t" // mask => unmask
                  "movd      %%edx, %%mm7     \n\t" // load bit pattern
                  "not       %%edx            \n\t" // unmask => mask for later
                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
                  "punpcklbw %%mm7, %%mm7     \n\t"
                  "punpcklwd %%mm7, %%mm7     \n\t"
                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks

                  LOAD_GOT_rbp
                  "movq   " MASK32_0 ", %%mm0 \n\t" // _mask32_0
                  "movq   " MASK32_1 ", %%mm1 \n\t" // _mask32_1
                  "movq   " MASK32_2 ", %%mm2 \n\t" // _mask32_2
                  "movq   " MASK32_3 ", %%mm3 \n\t" // _mask32_3
                  RESTORE_rbp

                  "pand      %%mm7, %%mm0     \n\t"
                  "pand      %%mm7, %%mm1     \n\t"
                  "pand      %%mm7, %%mm2     \n\t"
                  "pand      %%mm7, %%mm3     \n\t"
                  "pcmpeqb   %%mm6, %%mm0     \n\t"
                  "pcmpeqb   %%mm6, %%mm1     \n\t"
                  "pcmpeqb   %%mm6, %%mm2     \n\t"
                  "pcmpeqb   %%mm6, %%mm3     \n\t"

// preload        "movl      len, %%ecx       \n\t" // load length of line
// preload        "movl      srcptr, %3       \n\t" // load source
// preload        "movl      dstptr, %4       \n\t" // load dest

                  "cmpl      $0, %%ecx        \n\t" // lcr
                  "jz        mainloop32end    \n\t"

               "mainloop32:                   \n\t"
                  "movq      (%3), %%mm4      \n\t"
                  "pand      %%mm0, %%mm4     \n\t"
                  "movq      %%mm0, %%mm6     \n\t"
                  "movq      (%4), %%mm7      \n\t"
                  "pandn     %%mm7, %%mm6     \n\t"
                  "por       %%mm6, %%mm4     \n\t"
                  "movq      %%mm4, (%4)      \n\t"

                  "movq      8(%3), %%mm5     \n\t"
                  "pand      %%mm1, %%mm5     \n\t"
                  "movq      %%mm1, %%mm7     \n\t"
                  "movq      8(%4), %%mm6     \n\t"
                  "pandn     %%mm6, %%mm7     \n\t"
                  "por       %%mm7, %%mm5     \n\t"
                  "movq      %%mm5, 8(%4)     \n\t"

                  "movq      16(%3), %%mm6    \n\t"
                  "pand      %%mm2, %%mm6     \n\t"
                  "movq      %%mm2, %%mm4     \n\t"
                  "movq      16(%4), %%mm7    \n\t"
                  "pandn     %%mm7, %%mm4     \n\t"
                  "por       %%mm4, %%mm6     \n\t"
                  "movq      %%mm6, 16(%4)    \n\t"

                  "movq      24(%3), %%mm7    \n\t"
                  "pand      %%mm3, %%mm7     \n\t"
                  "movq      %%mm3, %%mm5     \n\t"
                  "movq      24(%4), %%mm4    \n\t"
                  "pandn     %%mm4, %%mm5     \n\t"
                  "por       %%mm5, %%mm7     \n\t"
                  "movq      %%mm7, 24(%4)    \n\t"

                  "add       $32, %3          \n\t" // inc by 32 bytes processed
                  "add       $32, %4          \n\t"
                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
                  "ja        mainloop32       \n\t"

               "mainloop32end:                \n\t"
// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
                  "movl      %%eax, %%ecx     \n\t"
                  "cmpl      $0, %%ecx        \n\t"
                  "jz        end32            \n\t"
// preload        "movl      mask, %%edx      \n\t"
                  "sall      $24, %%edx       \n\t" // low byte => high byte

               "secondloop32:                 \n\t"
                  "sall      %%edx            \n\t" // move high bit to CF
                  "jnc       skip32           \n\t" // if CF = 0
                  "movl      (%3), %%eax      \n\t"
                  "movl      %%eax, (%4)      \n\t"

               "skip32:                       \n\t"
                  "add       $4, %3           \n\t"
                  "add       $4, %4           \n\t"
                  "decl      %%ecx            \n\t"
                  "jnz       secondloop32     \n\t"

               "end32:                        \n\t"
                  "EMMS                       \n\t" // DONE

                  : "=a" (dummy_value_a),           // output regs (dummy)
                    "=d" (dummy_value_d),
                    "=c" (dummy_value_c),
                    "=S" (dummy_value_S),
                    "=D" (dummy_value_D)

                  : "0" (diff),        // eax       // input regs
                    "1" (mask),        // edx
                    "2" (len),         // ecx
// was (unmask)     "b" RESERVED       // ebx       // Global Offset Table idx
                    "3" (srcptr),      // esi/rsi
                    "4" (dstptr)       // edi/rdi

#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
                  : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
                  , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
               );
            }
            else /* not _mmx_supported - use modified C routine */
            {
               register png_uint_32 i;
               png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
               register int stride = BPP4 * png_pass_inc[png_ptr->pass];
                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
               register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult.
                                                         of 8 */
               int diff = (int) (png_ptr->width & 7); /* amount lost */
               register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */

               srcptr = png_ptr->row_buf + 1 + initial_val;
               dstptr = row + initial_val;

               for (i = initial_val; i < final_val; i += stride)
               {
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
               if (diff)  /* number of leftover pixels:  3 for pngtest */
               {
                  final_val += diff*BPP4;
                  for (; i < final_val; i += stride)
                  {
                     if (rep_bytes > (int)(final_val-i))
                        rep_bytes = (int)(final_val-i);
                     png_memcpy(dstptr, srcptr, rep_bytes);
                     srcptr += stride;
                     dstptr += stride;
                  }
               }
            } /* end of else (_mmx_supported) */

            break;
         }  /* end 32 bpp */

         case 8:        /* png_ptr->row_info.pixel_depth */
         {
            png_bytep srcptr;
            png_bytep dstptr;

#if !defined(PNG_1_0_X)
            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
#else
            if (_mmx_supported)
#endif
            {
               png_uint_32 len;
               int diff;

               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               len  = png_ptr->width & ~7;         // reduce to multiple of 8
               diff = (int) (png_ptr->width & 7);  // amount lost

               __asm__ __volatile__ (
                  "not       %%edx            \n\t" // mask => unmask
                  "movd      %%edx, %%mm7     \n\t" // load bit pattern
                  "not       %%edx            \n\t" // unmask => mask for later
                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
                  "punpcklbw %%mm7, %%mm7     \n\t"
                  "punpcklwd %%mm7, %%mm7     \n\t"
                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks

                  LOAD_GOT_rbp
                  "movq   " MASK8_0 ", %%mm0  \n\t" // _mask8_0 -> mm0
                  RESTORE_rbp

                  "pand      %%mm7, %%mm0     \n\t" // nonzero if keep byte
                  "pcmpeqb   %%mm6, %%mm0     \n\t" // zeros->1s, v versa

// preload        "movl      len, %%ecx       \n\t" // load length of line
// preload        "movl      srcptr, %3       \n\t" // load source
// preload        "movl      dstptr, %4       \n\t" // load dest

                  "cmpl      $0, %%ecx        \n\t" // len == 0 ?
                  "je        mainloop8end     \n\t"

               "mainloop8:                    \n\t"
                  "movq      (%3), %%mm4      \n\t" // *srcptr
                  "pand      %%mm0, %%mm4     \n\t"
                  "movq      %%mm0, %%mm6     \n\t"
                  "pandn     (%4), %%mm6      \n\t" // *dstptr
                  "por       %%mm6, %%mm4     \n\t"
                  "movq      %%mm4, (%4)      \n\t"
                  "add       $8, %3           \n\t" // inc by 8 bytes processed
                  "add       $8, %4           \n\t"
                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
                  "ja        mainloop8        \n\t"

               "mainloop8end:                 \n\t"
// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
                  "movl      %%eax, %%ecx     \n\t"
                  "cmpl      $0, %%ecx        \n\t"
                  "jz        end8             \n\t"
// preload        "movl      mask, %%edx      \n\t"
                  "sall      $24, %%edx       \n\t" // make low byte, high byte

               "secondloop8:                  \n\t"
                  "sall      %%edx            \n\t" // move high bit to CF
                  "jnc       skip8            \n\t" // if CF = 0
                  "movb      (%3), %%al       \n\t"
                  "movb      %%al, (%4)       \n\t"

               "skip8:                        \n\t"
                  "inc       %3               \n\t"
                  "inc       %4               \n\t"
                  "decl      %%ecx            \n\t"
                  "jnz       secondloop8      \n\t"

               "end8:                         \n\t"
                  "EMMS                       \n\t" // DONE

                  : "=a" (dummy_value_a),           // output regs (dummy)
                    "=d" (dummy_value_d),
                    "=c" (dummy_value_c),
                    "=S" (dummy_value_S),
                    "=D" (dummy_value_D)

                  : "0" (diff),        // eax       // input regs
                    "1" (mask),        // edx
                    "2" (len),         // ecx
// was (unmask)     "b" RESERVED       // ebx       // Global Offset Table idx
                    "3" (srcptr),      // esi/rsi
                    "4" (dstptr)       // edi/rdi

#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
                  : "%mm0", "%mm4", "%mm6", "%mm7"  // clobber list
#endif
               );
            }
            else /* not _mmx_supported - use modified C routine */
            {
               register png_uint_32 i;
               png_uint_32 initial_val = png_pass_start[png_ptr->pass];
                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
               register int stride = png_pass_inc[png_ptr->pass];
                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
               register int rep_bytes = png_pass_width[png_ptr->pass];
                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult.
of 8 */ 1440 int diff = (int) (png_ptr->width & 7); /* amount lost */ 1441 register png_uint_32 final_val = len; /* GRR bugfix */ 1442 1443 srcptr = png_ptr->row_buf + 1 + initial_val; 1444 dstptr = row + initial_val; 1445 1446 for (i = initial_val; i < final_val; i += stride) 1447 { 1448 png_memcpy(dstptr, srcptr, rep_bytes); 1449 srcptr += stride; 1450 dstptr += stride; 1451 } 1452 if (diff) /* number of leftover pixels: 3 for pngtest */ 1453 { 1454 final_val += diff /* *BPP1 */ ; 1455 for (; i < final_val; i += stride) 1456 { 1457 if (rep_bytes > (int)(final_val-i)) 1458 rep_bytes = (int)(final_val-i); 1459 png_memcpy(dstptr, srcptr, rep_bytes); 1460 srcptr += stride; 1461 dstptr += stride; 1462 } 1463 } 1464 1465 } /* end of else (_mmx_supported) */ 1466 1467 break; 1468 } /* end 8 bpp */ 1469 1470 case 1: /* png_ptr->row_info.pixel_depth */ 1471 { 1472 png_bytep sp; 1473 png_bytep dp; 1474 int s_inc, s_start, s_end; 1475 int m; 1476 int shift; 1477 png_uint_32 i; 1478 1479 sp = png_ptr->row_buf + 1; 1480 dp = row; 1481 m = 0x80; 1482 #if defined(PNG_READ_PACKSWAP_SUPPORTED) 1483 if (png_ptr->transformations & PNG_PACKSWAP) 1484 { 1485 s_start = 0; 1486 s_end = 7; 1487 s_inc = 1; 1488 } 1489 else 1490 #endif 1491 { 1492 s_start = 7; 1493 s_end = 0; 1494 s_inc = -1; 1495 } 1496 1497 shift = s_start; 1498 1499 for (i = 0; i < png_ptr->width; i++) 1500 { 1501 if (m & mask) 1502 { 1503 int value; 1504 1505 value = (*sp >> shift) & 0x1; 1506 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff); 1507 *dp |= (png_byte)(value << shift); 1508 } 1509 1510 if (shift == s_end) 1511 { 1512 shift = s_start; 1513 sp++; 1514 dp++; 1515 } 1516 else 1517 shift += s_inc; 1518 1519 if (m == 1) 1520 m = 0x80; 1521 else 1522 m >>= 1; 1523 } 1524 break; 1525 } /* end 1 bpp */ 1526 1527 case 2: /* png_ptr->row_info.pixel_depth */ 1528 { 1529 png_bytep sp; 1530 png_bytep dp; 1531 int s_start, s_end, s_inc; 1532 int m; 1533 int shift; 1534 png_uint_32 i; 1535 int value; 1536 1537 sp = png_ptr->row_buf + 1; 1538 dp = row; 1539 m = 0x80; 1540 #if defined(PNG_READ_PACKSWAP_SUPPORTED) 1541 if (png_ptr->transformations & PNG_PACKSWAP) 1542 { 1543 s_start = 0; 1544 s_end = 6; 1545 s_inc = 2; 1546 } 1547 else 1548 #endif 1549 { 1550 s_start = 6; 1551 s_end = 0; 1552 s_inc = -2; 1553 } 1554 1555 shift = s_start; 1556 1557 for (i = 0; i < png_ptr->width; i++) 1558 { 1559 if (m & mask) 1560 { 1561 value = (*sp >> shift) & 0x3; 1562 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff); 1563 *dp |= (png_byte)(value << shift); 1564 } 1565 1566 if (shift == s_end) 1567 { 1568 shift = s_start; 1569 sp++; 1570 dp++; 1571 } 1572 else 1573 shift += s_inc; 1574 if (m == 1) 1575 m = 0x80; 1576 else 1577 m >>= 1; 1578 } 1579 break; 1580 } /* end 2 bpp */ 1581 1582 case 4: /* png_ptr->row_info.pixel_depth */ 1583 { 1584 png_bytep sp; 1585 png_bytep dp; 1586 int s_start, s_end, s_inc; 1587 int m; 1588 int shift; 1589 png_uint_32 i; 1590 int value; 1591 1592 sp = png_ptr->row_buf + 1; 1593 dp = row; 1594 m = 0x80; 1595 #if defined(PNG_READ_PACKSWAP_SUPPORTED) 1596 if (png_ptr->transformations & PNG_PACKSWAP) 1597 { 1598 s_start = 0; 1599 s_end = 4; 1600 s_inc = 4; 1601 } 1602 else 1603 #endif 1604 { 1605 s_start = 4; 1606 s_end = 0; 1607 s_inc = -4; 1608 } 1609 1610 shift = s_start; 1611 1612 for (i = 0; i < png_ptr->width; i++) 1613 { 1614 if (m & mask) 1615 { 1616 value = (*sp >> shift) & 0xf; 1617 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff); 1618 *dp |= (png_byte)(value << shift); 1619 } 1620 1621 if (shift == s_end) 1622 { 1623 
shift = s_start; 1624 sp++; 1625 dp++; 1626 } 1627 else 1628 shift += s_inc; 1629 if (m == 1) 1630 m = 0x80; 1631 else 1632 m >>= 1; 1633 } 1634 break; 1635 } /* end 4 bpp */ 1636 1637 case 16: /* png_ptr->row_info.pixel_depth */ 1638 { 1639 png_bytep srcptr; 1640 png_bytep dstptr; 1641 1642 #if !defined(PNG_1_0_X) 1643 if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW) 1644 #else 1645 if (_mmx_supported) 1646 #endif 1647 { 1648 png_uint_32 len; 1649 int diff; 1650 1651 srcptr = png_ptr->row_buf + 1; 1652 dstptr = row; 1653 len = png_ptr->width & ~7; // reduce to multiple of 8 1654 diff = (int) (png_ptr->width & 7); // amount lost 1655 1656 __asm__ __volatile__ ( 1657 "not %%edx \n\t" // mask => unmask 1658 "movd %%edx, %%mm7 \n\t" // load bit pattern 1659 "not %%edx \n\t" // unmask => mask for later 1660 "psubb %%mm6, %%mm6 \n\t" // zero mm6 1661 "punpcklbw %%mm7, %%mm7 \n\t" 1662 "punpcklwd %%mm7, %%mm7 \n\t" 1663 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks 1664 1665 LOAD_GOT_rbp 1666 "movq " MASK16_0 ", %%mm0 \n\t" // _mask16_0 -> mm0 1667 "movq " MASK16_1 ", %%mm1 \n\t" // _mask16_1 -> mm1 1668 RESTORE_rbp 1669 1670 "pand %%mm7, %%mm0 \n\t" 1671 "pand %%mm7, %%mm1 \n\t" 1672 1673 "pcmpeqb %%mm6, %%mm0 \n\t" 1674 "pcmpeqb %%mm6, %%mm1 \n\t" 1675 1676 // preload "movl len, %%ecx \n\t" // load length of line 1677 // preload "movl srcptr, %3 \n\t" // load source 1678 // preload "movl dstptr, %4 \n\t" // load dest 1679 1680 "cmpl $0, %%ecx \n\t" 1681 "jz mainloop16end \n\t" 1682 1683 "mainloop16: \n\t" 1684 "movq (%3), %%mm4 \n\t" 1685 "pand %%mm0, %%mm4 \n\t" 1686 "movq %%mm0, %%mm6 \n\t" 1687 "movq (%4), %%mm7 \n\t" 1688 "pandn %%mm7, %%mm6 \n\t" 1689 "por %%mm6, %%mm4 \n\t" 1690 "movq %%mm4, (%4) \n\t" 1691 1692 "movq 8(%3), %%mm5 \n\t" 1693 "pand %%mm1, %%mm5 \n\t" 1694 "movq %%mm1, %%mm7 \n\t" 1695 "movq 8(%4), %%mm6 \n\t" 1696 "pandn %%mm6, %%mm7 \n\t" 1697 "por %%mm7, %%mm5 \n\t" 1698 "movq %%mm5, 8(%4) \n\t" 1699 1700 "add $16, %3 \n\t" // inc by 16 bytes processed 1701 "add $16, %4 \n\t" 1702 "subl $8, %%ecx \n\t" // dec by 8 pixels processed 1703 "ja mainloop16 \n\t" 1704 1705 "mainloop16end: \n\t" 1706 // preload "movl diff, %%ecx \n\t" // (diff is in eax) 1707 "movl %%eax, %%ecx \n\t" 1708 "cmpl $0, %%ecx \n\t" 1709 "jz end16 \n\t" 1710 // preload "movl mask, %%edx \n\t" 1711 "sall $24, %%edx \n\t" // make low byte, high byte 1712 1713 "secondloop16: \n\t" 1714 "sall %%edx \n\t" // move high bit to CF 1715 "jnc skip16 \n\t" // if CF = 0 1716 "movw (%3), %%ax \n\t" 1717 "movw %%ax, (%4) \n\t" 1718 1719 "skip16: \n\t" 1720 "add $2, %3 \n\t" 1721 "add $2, %4 \n\t" 1722 "decl %%ecx \n\t" 1723 "jnz secondloop16 \n\t" 1724 1725 "end16: \n\t" 1726 "EMMS \n\t" // DONE 1727 1728 : "=a" (dummy_value_a), // output regs (dummy) 1729 "=d" (dummy_value_d), 1730 "=c" (dummy_value_c), 1731 "=S" (dummy_value_S), 1732 "=D" (dummy_value_D) 1733 1734 : "0" (diff), // eax // input regs 1735 "1" (mask), // edx 1736 "2" (len), // ecx 1737 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx 1738 "3" (srcptr), // esi/rsi 1739 "4" (dstptr) // edi/rdi 1740 1741 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED) 1742 : "%mm0", "%mm1", "%mm4" // clobber list 1743 , "%mm5", "%mm6", "%mm7" 1744 #endif 1745 ); 1746 } 1747 else /* not _mmx_supported - use modified C routine */ 1748 { 1749 register png_uint_32 i; 1750 png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass]; 1751 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */ 1752 register int stride = BPP2 * 
png_pass_inc[png_ptr->pass]; 1753 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */ 1754 register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass]; 1755 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */ 1756 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */ 1757 int diff = (int) (png_ptr->width & 7); /* amount lost */ 1758 register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */ 1759 1760 srcptr = png_ptr->row_buf + 1 + initial_val; 1761 dstptr = row + initial_val; 1762 1763 for (i = initial_val; i < final_val; i += stride) 1764 { 1765 png_memcpy(dstptr, srcptr, rep_bytes); 1766 srcptr += stride; 1767 dstptr += stride; 1768 } 1769 if (diff) /* number of leftover pixels: 3 for pngtest */ 1770 { 1771 final_val += diff*BPP2; 1772 for (; i < final_val; i += stride) 1773 { 1774 if (rep_bytes > (int)(final_val-i)) 1775 rep_bytes = (int)(final_val-i); 1776 png_memcpy(dstptr, srcptr, rep_bytes); 1777 srcptr += stride; 1778 dstptr += stride; 1779 } 1780 } 1781 } /* end of else (_mmx_supported) */ 1782 1783 break; 1784 } /* end 16 bpp */ 1785 1786 case 48: /* png_ptr->row_info.pixel_depth */ 1787 { 1788 png_bytep srcptr; 1789 png_bytep dstptr; 1790 1791 #if !defined(PNG_1_0_X) 1792 if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW) 1793 #else 1794 if (_mmx_supported) 1795 #endif 1796 { 1797 png_uint_32 len; 1798 int diff; 1799 1800 srcptr = png_ptr->row_buf + 1; 1801 dstptr = row; 1802 len = png_ptr->width & ~7; // reduce to multiple of 8 1803 diff = (int) (png_ptr->width & 7); // amount lost 1804 1805 __asm__ __volatile__ ( 1806 "not %%edx \n\t" // mask => unmask 1807 "movd %%edx, %%mm7 \n\t" // load bit pattern 1808 "not %%edx \n\t" // unmask => mask for later 1809 "psubb %%mm6, %%mm6 \n\t" // zero mm6 1810 "punpcklbw %%mm7, %%mm7 \n\t" 1811 "punpcklwd %%mm7, %%mm7 \n\t" 1812 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks 1813 1814 LOAD_GOT_rbp 1815 "movq " MASK48_0 ", %%mm0 \n\t" // _mask48_0 -> mm0 1816 "movq " MASK48_1 ", %%mm1 \n\t" // _mask48_1 -> mm1 1817 "movq " MASK48_2 ", %%mm2 \n\t" // _mask48_2 -> mm2 1818 "movq " MASK48_3 ", %%mm3 \n\t" // _mask48_3 -> mm3 1819 "movq " MASK48_4 ", %%mm4 \n\t" // _mask48_4 -> mm4 1820 "movq " MASK48_5 ", %%mm5 \n\t" // _mask48_5 -> mm5 1821 RESTORE_rbp 1822 1823 "pand %%mm7, %%mm0 \n\t" 1824 "pand %%mm7, %%mm1 \n\t" 1825 "pand %%mm7, %%mm2 \n\t" 1826 "pand %%mm7, %%mm3 \n\t" 1827 "pand %%mm7, %%mm4 \n\t" 1828 "pand %%mm7, %%mm5 \n\t" 1829 1830 "pcmpeqb %%mm6, %%mm0 \n\t" 1831 "pcmpeqb %%mm6, %%mm1 \n\t" 1832 "pcmpeqb %%mm6, %%mm2 \n\t" 1833 "pcmpeqb %%mm6, %%mm3 \n\t" 1834 "pcmpeqb %%mm6, %%mm4 \n\t" 1835 "pcmpeqb %%mm6, %%mm5 \n\t" 1836 1837 // preload "movl len, %%ecx \n\t" // load length of line 1838 // preload "movl srcptr, %3 \n\t" // load source 1839 // preload "movl dstptr, %4 \n\t" // load dest 1840 1841 "cmpl $0, %%ecx \n\t" 1842 "jz mainloop48end \n\t" 1843 1844 "mainloop48: \n\t" 1845 "movq (%3), %%mm7 \n\t" 1846 "pand %%mm0, %%mm7 \n\t" 1847 "movq %%mm0, %%mm6 \n\t" 1848 "pandn (%4), %%mm6 \n\t" 1849 "por %%mm6, %%mm7 \n\t" 1850 "movq %%mm7, (%4) \n\t" 1851 1852 "movq 8(%3), %%mm6 \n\t" 1853 "pand %%mm1, %%mm6 \n\t" 1854 "movq %%mm1, %%mm7 \n\t" 1855 "pandn 8(%4), %%mm7 \n\t" 1856 "por %%mm7, %%mm6 \n\t" 1857 "movq %%mm6, 8(%4) \n\t" 1858 1859 "movq 16(%3), %%mm6 \n\t" 1860 "pand %%mm2, %%mm6 \n\t" 1861 "movq %%mm2, %%mm7 \n\t" 1862 "pandn 16(%4), %%mm7 \n\t" 1863 "por %%mm7, %%mm6 \n\t" 1864 "movq %%mm6, 16(%4) \n\t" 1865 1866 "movq 24(%3), %%mm7 \n\t" 1867 "pand %%mm3, %%mm7 \n\t" 1868 "movq 
%%mm3, %%mm6 \n\t" 1869 "pandn 24(%4), %%mm6 \n\t" 1870 "por %%mm6, %%mm7 \n\t" 1871 "movq %%mm7, 24(%4) \n\t" 1872 1873 "movq 32(%3), %%mm6 \n\t" 1874 "pand %%mm4, %%mm6 \n\t" 1875 "movq %%mm4, %%mm7 \n\t" 1876 "pandn 32(%4), %%mm7 \n\t" 1877 "por %%mm7, %%mm6 \n\t" 1878 "movq %%mm6, 32(%4) \n\t" 1879 1880 "movq 40(%3), %%mm7 \n\t" 1881 "pand %%mm5, %%mm7 \n\t" 1882 "movq %%mm5, %%mm6 \n\t" 1883 "pandn 40(%4), %%mm6 \n\t" 1884 "por %%mm6, %%mm7 \n\t" 1885 "movq %%mm7, 40(%4) \n\t" 1886 1887 "add $48, %3 \n\t" // inc by 48 bytes processed 1888 "add $48, %4 \n\t" 1889 "subl $8, %%ecx \n\t" // dec by 8 pixels processed 1890 1891 "ja mainloop48 \n\t" 1892 1893 "mainloop48end: \n\t" 1894 // preload "movl diff, %%ecx \n\t" // (diff is in eax) 1895 "movl %%eax, %%ecx \n\t" 1896 "cmpl $0, %%ecx \n\t" 1897 "jz end48 \n\t" 1898 // preload "movl mask, %%edx \n\t" 1899 "sall $24, %%edx \n\t" // make low byte, high byte 1900 1901 "secondloop48: \n\t" 1902 "sall %%edx \n\t" // move high bit to CF 1903 "jnc skip48 \n\t" // if CF = 0 1904 "movl (%3), %%eax \n\t" 1905 "movl %%eax, (%4) \n\t" 1906 "movw 4(%3), %%ax \n\t" // GR-P bugfix 20070717 1907 "movw %%ax, 4(%4) \n\t" // GR-P bugfix 20070717 1908 1909 "skip48: \n\t" 1910 "add $6, %3 \n\t" // GR-P bugfix 20070717 1911 "add $6, %4 \n\t" // GR-P bugfix 20070717 1912 "decl %%ecx \n\t" 1913 "jnz secondloop48 \n\t" 1914 1915 "end48: \n\t" 1916 "EMMS \n\t" // DONE 1917 1918 : "=a" (dummy_value_a), // output regs (dummy) 1919 "=d" (dummy_value_d), 1920 "=c" (dummy_value_c), 1921 "=S" (dummy_value_S), 1922 "=D" (dummy_value_D) 1923 1924 : "0" (diff), // eax // input regs 1925 "1" (mask), // edx 1926 "2" (len), // ecx 1927 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx 1928 "3" (srcptr), // esi/rsi 1929 "4" (dstptr) // edi/rdi 1930 1931 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED) 1932 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list 1933 , "%mm4", "%mm5", "%mm6", "%mm7" 1934 #endif 1935 ); 1936 } 1937 else /* not _mmx_supported - use modified C routine */ 1938 { 1939 register png_uint_32 i; 1940 png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass]; 1941 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */ 1942 register int stride = BPP6 * png_pass_inc[png_ptr->pass]; 1943 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */ 1944 register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass]; 1945 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */ 1946 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. 
of 8 */
            int diff = (int) (png_ptr->width & 7); /* amount lost */
            register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */

            srcptr = png_ptr->row_buf + 1 + initial_val;
            dstptr = row + initial_val;

            for (i = initial_val; i < final_val; i += stride)
            {
               png_memcpy(dstptr, srcptr, rep_bytes);
               srcptr += stride;
               dstptr += stride;
            }
            if (diff)  /* number of leftover pixels:  3 for pngtest */
            {
               final_val += diff*BPP6;
               for (; i < final_val; i += stride)
               {
                  if (rep_bytes > (int)(final_val-i))
                     rep_bytes = (int)(final_val-i);
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
            }
         } /* end of else (_mmx_supported) */

         break;
      }  /* end 48 bpp */

      case 64:       /* png_ptr->row_info.pixel_depth */
      {
         png_bytep srcptr;
         png_bytep dstptr;
         register png_uint_32 i;
         png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
           /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
         register int stride = BPP8 * png_pass_inc[png_ptr->pass];
           /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
         register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
           /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
         png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
         int diff = (int) (png_ptr->width & 7); /* amount lost */
         register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */

         srcptr = png_ptr->row_buf + 1 + initial_val;
         dstptr = row + initial_val;

         for (i = initial_val; i < final_val; i += stride)
         {
            png_memcpy(dstptr, srcptr, rep_bytes);
            srcptr += stride;
            dstptr += stride;
         }
         if (diff)  /* number of leftover pixels:  3 for pngtest */
         {
            final_val += diff*BPP8;
            for (; i < final_val; i += stride)
            {
               if (rep_bytes > (int)(final_val-i))
                  rep_bytes = (int)(final_val-i);
               png_memcpy(dstptr, srcptr, rep_bytes);
               srcptr += stride;
               dstptr += stride;
            }
         }

         break;
      }  /* end 64 bpp */

      default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
      {
         // ERROR:  SHOULD NEVER BE REACHED
#if defined(PNG_DEBUG)
         png_debug(1, "Internal libpng logic error (GCC "
           "png_combine_row() pixel_depth)\n");
#endif
         break;
      }
   } /* end switch (png_ptr->row_info.pixel_depth) */

   } /* end if (non-trivial mask) */

} /* end png_combine_row() */

#endif /* PNG_HAVE_MMX_COMBINE_ROW */




/*===========================================================================*/
/*                                                                           */
/*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
/*                                                                           */
/*===========================================================================*/

#if defined(PNG_READ_INTERLACING_SUPPORTED)
#if defined(PNG_HAVE_MMX_READ_INTERLACE)

/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
 * has taken place.  [GRR:  what other steps come before and/or after?]
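 *
 * In outline, the work below is an in-place, right-to-left expansion:  each
 * pixel of the narrow interlaced row is replicated png_pass_inc[pass] times
 * to fill the final row width.  A sketch of the generic C fallback used
 * later in this function (v[] is its small staging buffer); the MMX paths
 * are unrolled equivalents of this:
 *
 *    for (i = width; i; i--)
 *    {
 *       png_memcpy(v, sptr, pixel_bytes);
 *       for (j = 0; j < png_pass_inc[pass]; j++)
 *       {
 *          png_memcpy(dp, v, pixel_bytes);
 *          dp -= pixel_bytes;
 *       }
 *       sptr -= pixel_bytes;
 *    }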
2047 */ 2048 2049 void /* PRIVATE */ 2050 png_do_read_interlace(png_structp png_ptr) 2051 { 2052 png_row_infop row_info = &(png_ptr->row_info); 2053 png_bytep row = png_ptr->row_buf + 1; 2054 int pass = png_ptr->pass; 2055 #if defined(PNG_READ_PACKSWAP_SUPPORTED) 2056 png_uint_32 transformations = png_ptr->transformations; 2057 #endif 2058 2059 png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n"); 2060 2061 if (_mmx_supported == 2) { 2062 #if !defined(PNG_1_0_X) 2063 /* this should have happened in png_init_mmx_flags() already */ 2064 png_warning(png_ptr, "asm_flags may not have been initialized"); 2065 #endif 2066 png_mmx_support(); 2067 } 2068 2069 if (row != NULL && row_info != NULL) 2070 { 2071 png_uint_32 final_width; 2072 2073 final_width = row_info->width * png_pass_inc[pass]; 2074 2075 switch (row_info->pixel_depth) 2076 { 2077 case 1: 2078 { 2079 png_bytep sp, dp; 2080 int sshift, dshift; 2081 int s_start, s_end, s_inc; 2082 png_byte v; 2083 png_uint_32 i; 2084 int j; 2085 2086 sp = row + (png_size_t)((row_info->width - 1) >> 3); 2087 dp = row + (png_size_t)((final_width - 1) >> 3); 2088 #if defined(PNG_READ_PACKSWAP_SUPPORTED) 2089 if (transformations & PNG_PACKSWAP) 2090 { 2091 sshift = (int)((row_info->width + 7) & 7); 2092 dshift = (int)((final_width + 7) & 7); 2093 s_start = 7; 2094 s_end = 0; 2095 s_inc = -1; 2096 } 2097 else 2098 #endif 2099 { 2100 sshift = 7 - (int)((row_info->width + 7) & 7); 2101 dshift = 7 - (int)((final_width + 7) & 7); 2102 s_start = 0; 2103 s_end = 7; 2104 s_inc = 1; 2105 } 2106 2107 for (i = row_info->width; i; i--) 2108 { 2109 v = (png_byte)((*sp >> sshift) & 0x1); 2110 for (j = 0; j < png_pass_inc[pass]; j++) 2111 { 2112 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff); 2113 *dp |= (png_byte)(v << dshift); 2114 if (dshift == s_end) 2115 { 2116 dshift = s_start; 2117 dp--; 2118 } 2119 else 2120 dshift += s_inc; 2121 } 2122 if (sshift == s_end) 2123 { 2124 sshift = s_start; 2125 sp--; 2126 } 2127 else 2128 sshift += s_inc; 2129 } 2130 break; 2131 } 2132 2133 case 2: 2134 { 2135 png_bytep sp, dp; 2136 int sshift, dshift; 2137 int s_start, s_end, s_inc; 2138 png_uint_32 i; 2139 2140 sp = row + (png_size_t)((row_info->width - 1) >> 2); 2141 dp = row + (png_size_t)((final_width - 1) >> 2); 2142 #if defined(PNG_READ_PACKSWAP_SUPPORTED) 2143 if (transformations & PNG_PACKSWAP) 2144 { 2145 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1); 2146 dshift = (png_size_t)(((final_width + 3) & 3) << 1); 2147 s_start = 6; 2148 s_end = 0; 2149 s_inc = -2; 2150 } 2151 else 2152 #endif 2153 { 2154 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1); 2155 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1); 2156 s_start = 0; 2157 s_end = 6; 2158 s_inc = 2; 2159 } 2160 2161 for (i = row_info->width; i; i--) 2162 { 2163 png_byte v; 2164 int j; 2165 2166 v = (png_byte)((*sp >> sshift) & 0x3); 2167 for (j = 0; j < png_pass_inc[pass]; j++) 2168 { 2169 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff); 2170 *dp |= (png_byte)(v << dshift); 2171 if (dshift == s_end) 2172 { 2173 dshift = s_start; 2174 dp--; 2175 } 2176 else 2177 dshift += s_inc; 2178 } 2179 if (sshift == s_end) 2180 { 2181 sshift = s_start; 2182 sp--; 2183 } 2184 else 2185 sshift += s_inc; 2186 } 2187 break; 2188 } 2189 2190 case 4: 2191 { 2192 png_bytep sp, dp; 2193 int sshift, dshift; 2194 int s_start, s_end, s_inc; 2195 png_uint_32 i; 2196 2197 sp = row + (png_size_t)((row_info->width - 1) >> 1); 2198 dp = row + (png_size_t)((final_width - 1) >> 1); 2199 #if 
defined(PNG_READ_PACKSWAP_SUPPORTED) 2200 if (transformations & PNG_PACKSWAP) 2201 { 2202 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2); 2203 dshift = (png_size_t)(((final_width + 1) & 1) << 2); 2204 s_start = 4; 2205 s_end = 0; 2206 s_inc = -4; 2207 } 2208 else 2209 #endif 2210 { 2211 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2); 2212 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2); 2213 s_start = 0; 2214 s_end = 4; 2215 s_inc = 4; 2216 } 2217 2218 for (i = row_info->width; i; i--) 2219 { 2220 png_byte v; 2221 int j; 2222 2223 v = (png_byte)((*sp >> sshift) & 0xf); 2224 for (j = 0; j < png_pass_inc[pass]; j++) 2225 { 2226 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff); 2227 *dp |= (png_byte)(v << dshift); 2228 if (dshift == s_end) 2229 { 2230 dshift = s_start; 2231 dp--; 2232 } 2233 else 2234 dshift += s_inc; 2235 } 2236 if (sshift == s_end) 2237 { 2238 sshift = s_start; 2239 sp--; 2240 } 2241 else 2242 sshift += s_inc; 2243 } 2244 break; 2245 } 2246 2247 /*====================================================================*/ 2248 2249 default: /* 8-bit or larger (this is where the routine is modified) */ 2250 { 2251 png_bytep sptr, dp; 2252 png_uint_32 i; 2253 png_size_t pixel_bytes; 2254 int width = (int)row_info->width; 2255 2256 pixel_bytes = (row_info->pixel_depth >> 3); 2257 2258 /* point sptr at the last pixel in the pre-expanded row: */ 2259 sptr = row + (width - 1) * pixel_bytes; 2260 2261 /* point dp at the last pixel position in the expanded row: */ 2262 dp = row + (final_width - 1) * pixel_bytes; 2263 2264 /* New code by Nirav Chhatrapati - Intel Corporation */ 2265 2266 #if !defined(PNG_1_0_X) 2267 if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE) 2268 #else 2269 if (_mmx_supported) 2270 #endif 2271 { 2272 int dummy_value_c; // fix 'forbidden register spilled' 2273 png_bytep dummy_value_S; 2274 png_bytep dummy_value_D; 2275 png_bytep dummy_value_a; 2276 png_bytep dummy_value_d; 2277 2278 //-------------------------------------------------------------- 2279 if (pixel_bytes == BPP3) 2280 { 2281 if (((pass == 4) || (pass == 5)) && width) 2282 { 2283 int width_mmx = ((width >> 1) << 1) - 8; // GRR: huh? 
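                  // (A guess at the "- 8" above, not verified:  the movq
                  // loads in .loop3_pass4 below read 8 bytes at a time from
                  // a row of 3-byte pixels, so a handful of trailing pixels
                  // is reserved for the C cleanup loop that follows, keeping
                  // the 8-byte reads inside the row data.)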
2284 if (width_mmx < 0) 2285 width_mmx = 0; 2286 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes 2287 if (width_mmx) 2288 { 2289 // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; 2290 // sptr points at last pixel in pre-expanded row 2291 // dp points at last pixel position in expanded row 2292 __asm__ __volatile__ ( 2293 "sub $3, %1 \n\t" 2294 "sub $9, %2 \n\t" 2295 // (png_pass_inc[pass] + 1)*pixel_bytes 2296 2297 ".loop3_pass4: \n\t" 2298 "movq (%1), %%mm0 \n\t" // x x 5 4 3 2 1 0 2299 "movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0 2300 "movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0 2301 "psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z 2302 "pand (%3), %%mm1 \n\t" // z z z z z 2 1 0 2303 "psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3 2304 "por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0 2305 "movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3 2306 "psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z 2307 "movq %%mm0, (%2) \n\t" 2308 "psrlq $16, %%mm3 \n\t" // z z z z z x x 5 2309 "pand (%4), %%mm3 \n\t" // z z z z z z z 5 2310 "por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5 2311 "sub $6, %1 \n\t" 2312 "movd %%mm2, 8(%2) \n\t" 2313 "sub $12, %2 \n\t" 2314 "subl $2, %%ecx \n\t" 2315 "jnz .loop3_pass4 \n\t" 2316 "EMMS \n\t" // DONE 2317 2318 : "=c" (dummy_value_c), // output regs (dummy) 2319 "=S" (dummy_value_S), 2320 "=D" (dummy_value_D), 2321 "=a" (dummy_value_a), 2322 "=d" (dummy_value_d) 2323 2324 : "0" (width_mmx), // ecx // input regs 2325 "1" (sptr), // esi/rsi 2326 "2" (dp), // edi/rdi 2327 #if defined(PNG_x86_64_USE_GOTPCREL) // formerly _const4 and _const6: 2328 "3" (&_c64._amask5_3_0), // (0x0000000000FFFFFFLL) 2329 "4" (&_c64._amask7_1_0) // (0x00000000000000FFLL) 2330 #else 2331 "3" (&_amask5_3_0), // eax (0x0000000000FFFFFFLL) 2332 "4" (&_amask7_1_0) // edx (0x00000000000000FFLL) 2333 #endif 2334 2335 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED) 2336 : "%mm0", "%mm1" // clobber list 2337 , "%mm2", "%mm3" 2338 #endif 2339 ); 2340 } 2341 2342 sptr -= width_mmx*BPP3; 2343 dp -= width_mmx*2*BPP3; 2344 for (i = width; i; i--) 2345 { 2346 png_byte v[8]; 2347 int j; 2348 2349 png_memcpy(v, sptr, BPP3); 2350 for (j = 0; j < png_pass_inc[pass]; j++) 2351 { 2352 png_memcpy(dp, v, BPP3); 2353 dp -= BPP3; 2354 } 2355 sptr -= BPP3; 2356 } 2357 } 2358 else if (((pass == 2) || (pass == 3)) && width) 2359 { 2360 __asm__ __volatile__ ( 2361 "sub $9, %2 \n\t" 2362 // (png_pass_inc[pass] - 1)*pixel_bytes 2363 2364 ".loop3_pass2: \n\t" 2365 "movd (%1), %%mm0 \n\t" // x x x x x 2 1 0 2366 "pand (%3), %%mm0 \n\t" // z z z z z 2 1 0 2367 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0 2368 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z 2369 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z 2370 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z 2371 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1 2372 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z 2373 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1 2374 "movq %%mm0, 4(%2) \n\t" 2375 "psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0 2376 "sub $3, %1 \n\t" 2377 "movd %%mm0, (%2) \n\t" 2378 "sub $12, %2 \n\t" 2379 "decl %%ecx \n\t" 2380 "jnz .loop3_pass2 \n\t" 2381 "EMMS \n\t" // DONE 2382 2383 : "=c" (dummy_value_c), // output regs (dummy) 2384 "=S" (dummy_value_S), 2385 "=D" (dummy_value_D), 2386 "=a" (dummy_value_a) 2387 2388 : "0" (width), // ecx // input regs 2389 "1" (sptr), // esi/rsi 2390 "2" (dp), // edi/rdi 2391 #if defined(PNG_x86_64_USE_GOTPCREL) // formerly _const4: 2392 "3" (&_c64._amask5_3_0) // (0x0000000000FFFFFFLL) 2393 #else 2394 "3" (&_amask5_3_0) // eax (0x0000000000FFFFFFLL) 2395 #endif 2396 2397 #if 
defined(CLOBBER_MMX_REGS_SUPPORTED) 2398 : "%mm0", "%mm1", "%mm2" // clobber list 2399 #endif 2400 ); 2401 } 2402 else if (width) // && ((pass == 0) || (pass == 1)) 2403 { 2404 __asm__ __volatile__ ( 2405 "sub $21, %2 \n\t" 2406 // (png_pass_inc[pass] - 1)*pixel_bytes 2407 2408 ".loop3_pass0: \n\t" 2409 "movd (%1), %%mm0 \n\t" // x x x x x 2 1 0 2410 "pand (%3), %%mm0 \n\t" // z z z z z 2 1 0 2411 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0 2412 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z 2413 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z 2414 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z 2415 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1 2416 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z 2417 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1 2418 "movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1 2419 "psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z 2420 "movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1 2421 "punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2 2422 "movq %%mm4, 16(%2) \n\t" 2423 "psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0 2424 "movq %%mm3, 8(%2) \n\t" 2425 "punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0 2426 "sub $3, %1 \n\t" 2427 "movq %%mm0, (%2) \n\t" 2428 "sub $24, %2 \n\t" 2429 "decl %%ecx \n\t" 2430 "jnz .loop3_pass0 \n\t" 2431 "EMMS \n\t" // DONE 2432 2433 : "=c" (dummy_value_c), // output regs (dummy) 2434 "=S" (dummy_value_S), 2435 "=D" (dummy_value_D), 2436 "=a" (dummy_value_a) 2437 2438 : "0" (width), // ecx // input regs 2439 "1" (sptr), // esi/rsi 2440 "2" (dp), // edi/rdi 2441 #if defined(PNG_x86_64_USE_GOTPCREL) // formerly _const4: 2442 "3" (&_c64._amask5_3_0) // (0x0000000000FFFFFFLL) 2443 #else 2444 "3" (&_amask5_3_0) // eax (0x0000000000FFFFFFLL) 2445 #endif 2446 2447 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED) 2448 : "%mm0", "%mm1", "%mm2" // clobber list 2449 , "%mm3", "%mm4" 2450 #endif 2451 ); 2452 } 2453 } /* end of pixel_bytes == 3 */ 2454 2455 //-------------------------------------------------------------- 2456 else if (pixel_bytes == BPP4) 2457 { 2458 if (((pass == 4) || (pass == 5)) && width) 2459 { 2460 int width_mmx = ((width >> 1) << 1) ; 2461 width -= width_mmx; // 0,1 pixels => 0,4 bytes 2462 if (width_mmx) 2463 { 2464 __asm__ __volatile__ ( 2465 "sub $4, %1 \n\t" 2466 "sub $12, %2 \n\t" 2467 2468 ".loop4_pass4: \n\t" 2469 "movq (%1), %%mm0 \n\t" // 7 6 5 4 3 2 1 0 2470 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0 2471 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0 2472 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4 2473 "movq %%mm0, (%2) \n\t" 2474 "sub $8, %1 \n\t" 2475 "movq %%mm1, 8(%2) \n\t" 2476 "sub $16, %2 \n\t" 2477 "subl $2, %%ecx \n\t" 2478 "jnz .loop4_pass4 \n\t" 2479 "EMMS \n\t" // DONE 2480 2481 : "=c" (dummy_value_c), // output regs (dummy) 2482 "=S" (dummy_value_S), 2483 "=D" (dummy_value_D) 2484 2485 : "0" (width_mmx), // ecx // input regs 2486 "1" (sptr), // esi/rsi 2487 "2" (dp) // edi/rdi 2488 2489 #if defined(CLOBBER_MMX_REGS_SUPPORTED) 2490 : "%mm0", "%mm1" // clobber list 2491 #endif 2492 ); 2493 } 2494 2495 sptr -= (width_mmx*BPP4 - BPP4); // sign fixed 2496 dp -= (width_mmx*2*BPP4 - BPP4); // sign fixed 2497 for (i = width; i; i--) 2498 { 2499 png_byte v[8]; 2500 int j; 2501 sptr -= BPP4; 2502 png_memcpy(v, sptr, BPP4); 2503 for (j = 0; j < png_pass_inc[pass]; j++) 2504 { 2505 dp -= BPP4; 2506 png_memcpy(dp, v, BPP4); 2507 } 2508 } 2509 } 2510 else if (((pass == 2) || (pass == 3)) && width) 2511 { 2512 int width_mmx = ((width >> 1) << 1); 2513 width -= width_mmx; // 0,1 pixels => 0,4 bytes 2514 if (width_mmx) 2515 { 2516 __asm__ __volatile__ ( 2517 "sub 
$4, %1 \n\t" 2518 "sub $28, %2 \n\t" 2519 2520 ".loop4_pass2: \n\t" 2521 "movq (%1), %%mm0 \n\t" // 7 6 5 4 3 2 1 0 2522 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0 2523 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0 2524 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4 2525 "movq %%mm0, (%2) \n\t" 2526 "movq %%mm0, 8(%2) \n\t" 2527 "movq %%mm1, 16(%2) \n\t" 2528 "movq %%mm1, 24(%2) \n\t" 2529 "sub $8, %1 \n\t" 2530 "sub $32, %2 \n\t" 2531 "subl $2, %%ecx \n\t" 2532 "jnz .loop4_pass2 \n\t" 2533 "EMMS \n\t" // DONE 2534 2535 : "=c" (dummy_value_c), // output regs (dummy) 2536 "=S" (dummy_value_S), 2537 "=D" (dummy_value_D) 2538 2539 : "0" (width_mmx), // ecx // input regs 2540 "1" (sptr), // esi/rsi 2541 "2" (dp) // edi/rdi 2542 2543 #if defined(CLOBBER_MMX_REGS_SUPPORTED) 2544 : "%mm0", "%mm1" // clobber list 2545 #endif 2546 ); 2547 } 2548 2549 sptr -= (width_mmx*4 - 4); // sign fixed 2550 dp -= (width_mmx*16 - 4); // sign fixed 2551 for (i = width; i; i--) 2552 { 2553 png_byte v[8]; 2554 int j; 2555 sptr -= 4; 2556 png_memcpy(v, sptr, 4); 2557 for (j = 0; j < png_pass_inc[pass]; j++) 2558 { 2559 dp -= 4; 2560 png_memcpy(dp, v, 4); 2561 } 2562 } 2563 } 2564 else if (width) // && ((pass == 0) || (pass == 1)) 2565 { 2566 int width_mmx = ((width >> 1) << 1); 2567 width -= width_mmx; // 0,1 pixels => 0,4 bytes 2568 if (width_mmx) 2569 { 2570 __asm__ __volatile__ ( 2571 "sub $4, %1 \n\t" 2572 "sub $60, %2 \n\t" 2573 2574 ".loop4_pass0: \n\t" 2575 "movq (%1), %%mm0 \n\t" // 7 6 5 4 3 2 1 0 2576 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0 2577 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0 2578 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4 2579 "movq %%mm0, (%2) \n\t" 2580 "movq %%mm0, 8(%2) \n\t" 2581 "movq %%mm0, 16(%2) \n\t" 2582 "movq %%mm0, 24(%2) \n\t" 2583 "movq %%mm1, 32(%2) \n\t" 2584 "movq %%mm1, 40(%2) \n\t" 2585 "movq %%mm1, 48(%2) \n\t" 2586 "sub $8, %1 \n\t" 2587 "movq %%mm1, 56(%2) \n\t" 2588 "sub $64, %2 \n\t" 2589 "subl $2, %%ecx \n\t" 2590 "jnz .loop4_pass0 \n\t" 2591 "EMMS \n\t" // DONE 2592 2593 : "=c" (dummy_value_c), // output regs (dummy) 2594 "=S" (dummy_value_S), 2595 "=D" (dummy_value_D) 2596 2597 : "0" (width_mmx), // ecx // input regs 2598 "1" (sptr), // esi/rsi 2599 "2" (dp) // edi/rdi 2600 2601 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED) 2602 : "%mm0", "%mm1" // clobber list 2603 #endif 2604 ); 2605 } 2606 2607 sptr -= (width_mmx*4 - 4); // sign fixed 2608 dp -= (width_mmx*32 - 4); // sign fixed 2609 for (i = width; i; i--) 2610 { 2611 png_byte v[8]; 2612 int j; 2613 sptr -= 4; 2614 png_memcpy(v, sptr, 4); 2615 for (j = 0; j < png_pass_inc[pass]; j++) 2616 { 2617 dp -= 4; 2618 png_memcpy(dp, v, 4); 2619 } 2620 } 2621 } 2622 } /* end of pixel_bytes == 4 */ 2623 2624 //-------------------------------------------------------------- 2625 else if (pixel_bytes == 1) 2626 { 2627 if (((pass == 4) || (pass == 5)) && width) 2628 { 2629 int width_mmx = ((width >> 3) << 3); 2630 width -= width_mmx; // 0-3 pixels => 0-3 bytes 2631 if (width_mmx) 2632 { 2633 __asm__ __volatile__ ( 2634 "sub $7, %1 \n\t" 2635 "sub $15, %2 \n\t" 2636 2637 ".loop1_pass4: \n\t" 2638 "movq (%1), %%mm0 \n\t" // 7 6 5 4 3 2 1 0 2639 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0 2640 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0 2641 "punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4 2642 "movq %%mm1, 8(%2) \n\t" 2643 "sub $8, %1 \n\t" 2644 "movq %%mm0, (%2) \n\t" 2645 "sub $16, %2 \n\t" 2646 "subl $8, %%ecx \n\t" 2647 "jnz .loop1_pass4 \n\t" 2648 "EMMS \n\t" // DONE 2649 2650 : "=c" 
(dummy_value_c), // output regs (dummy) 2651 "=S" (dummy_value_S), 2652 "=D" (dummy_value_D) 2653 2654 : "0" (width_mmx), // ecx // input regs 2655 "1" (sptr), // esi/rsi 2656 "2" (dp) // edi/rdi 2657 2658 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED) 2659 : "%mm0", "%mm1" // clobber list 2660 #endif 2661 ); 2662 } 2663 2664 sptr -= width_mmx; 2665 dp -= width_mmx*2; 2666 for (i = width; i; i--) 2667 { 2668 int j; 2669 2670 for (j = 0; j < png_pass_inc[pass]; j++) 2671 { 2672 *dp-- = *sptr; 2673 } 2674 --sptr; 2675 } 2676 } 2677 else if (((pass == 2) || (pass == 3)) && width) 2678 { 2679 int width_mmx = ((width >> 2) << 2); 2680 width -= width_mmx; // 0-3 pixels => 0-3 bytes 2681 if (width_mmx) 2682 { 2683 __asm__ __volatile__ ( 2684 "sub $3, %1 \n\t" 2685 "sub $15, %2 \n\t" 2686 2687 ".loop1_pass2: \n\t" 2688 "movd (%1), %%mm0 \n\t" // x x x x 3 2 1 0 2689 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0 2690 "movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0 2691 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0 2692 "punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2 2693 "movq %%mm0, (%2) \n\t" 2694 "sub $4, %1 \n\t" 2695 "movq %%mm1, 8(%2) \n\t" 2696 "sub $16, %2 \n\t" 2697 "subl $4, %%ecx \n\t" 2698 "jnz .loop1_pass2 \n\t" 2699 "EMMS \n\t" // DONE 2700 2701 : "=c" (dummy_value_c), // output regs (dummy) 2702 "=S" (dummy_value_S), 2703 "=D" (dummy_value_D) 2704 2705 : "0" (width_mmx), // ecx // input regs 2706 "1" (sptr), // esi/rsi 2707 "2" (dp) // edi/rdi 2708 2709 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED) 2710 : "%mm0", "%mm1" // clobber list 2711 #endif 2712 ); 2713 } 2714 2715 sptr -= width_mmx; 2716 dp -= width_mmx*4; 2717 for (i = width; i; i--) 2718 { 2719 int j; 2720 2721 for (j = 0; j < png_pass_inc[pass]; j++) 2722 { 2723 *dp-- = *sptr; 2724 } 2725 --sptr; 2726 } 2727 } 2728 else if (width) // && ((pass == 0) || (pass == 1)) 2729 { 2730 int width_mmx = ((width >> 2) << 2); 2731 width -= width_mmx; // 0-3 pixels => 0-3 bytes 2732 if (width_mmx) 2733 { 2734 __asm__ __volatile__ ( 2735 "sub $3, %1 \n\t" 2736 "sub $31, %2 \n\t" 2737 2738 ".loop1_pass0: \n\t" 2739 "movd (%1), %%mm0 \n\t" // x x x x 3 2 1 0 2740 "movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0 2741 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0 2742 "movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0 2743 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0 2744 "movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0 2745 "punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0 2746 "punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1 2747 "movq %%mm0, (%2) \n\t" 2748 "punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2 2749 "movq %%mm3, 8(%2) \n\t" 2750 "movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2 2751 "punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2 2752 "punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3 2753 "movq %%mm2, 16(%2) \n\t" 2754 "sub $4, %1 \n\t" 2755 "movq %%mm4, 24(%2) \n\t" 2756 "sub $32, %2 \n\t" 2757 "subl $4, %%ecx \n\t" 2758 "jnz .loop1_pass0 \n\t" 2759 "EMMS \n\t" // DONE 2760 2761 : "=c" (dummy_value_c), // output regs (dummy) 2762 "=S" (dummy_value_S), 2763 "=D" (dummy_value_D) 2764 2765 : "0" (width_mmx), // ecx // input regs 2766 "1" (sptr), // esi/rsi 2767 "2" (dp) // edi/rdi 2768 2769 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED) 2770 : "%mm0", "%mm1", "%mm2" // clobber list 2771 , "%mm3", "%mm4" 2772 #endif 2773 ); 2774 } 2775 2776 sptr -= width_mmx; 2777 dp -= width_mmx*8; 2778 for (i = width; i; i--) 2779 { 2780 int j; 2781 2782 /* I simplified this part in version 1.0.4e 2783 * here and in several other instances where 2784 * pixel_bytes == 
1 -- GR-P 2785 * 2786 * Original code: 2787 * 2788 * png_byte v[8]; 2789 * png_memcpy(v, sptr, pixel_bytes); 2790 * for (j = 0; j < png_pass_inc[pass]; j++) 2791 * { 2792 * png_memcpy(dp, v, pixel_bytes); 2793 * dp -= pixel_bytes; 2794 * } 2795 * sptr -= pixel_bytes; 2796 * 2797 * Replacement code is in the next three lines: 2798 */ 2799 2800 for (j = 0; j < png_pass_inc[pass]; j++) 2801 { 2802 *dp-- = *sptr; 2803 } 2804 --sptr; 2805 } 2806 } 2807 } /* end of pixel_bytes == 1 */ 2808 2809 //-------------------------------------------------------------- 2810 else if (pixel_bytes == BPP2) 2811 { 2812 if (((pass == 4) || (pass == 5)) && width) 2813 { 2814 int width_mmx = ((width >> 1) << 1) ; 2815 width -= width_mmx; // 0,1 pixels => 0,2 bytes 2816 if (width_mmx) 2817 { 2818 __asm__ __volatile__ ( 2819 "sub $2, %1 \n\t" 2820 "sub $6, %2 \n\t" 2821 2822 ".loop2_pass4: \n\t" 2823 "movd (%1), %%mm0 \n\t" // x x x x 3 2 1 0 2824 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0 2825 "sub $4, %1 \n\t" 2826 "movq %%mm0, (%2) \n\t" 2827 "sub $8, %2 \n\t" 2828 "subl $2, %%ecx \n\t" 2829 "jnz .loop2_pass4 \n\t" 2830 "EMMS \n\t" // DONE 2831 2832 : "=c" (dummy_value_c), // output regs (dummy) 2833 "=S" (dummy_value_S), 2834 "=D" (dummy_value_D) 2835 2836 : "0" (width_mmx), // ecx // input regs 2837 "1" (sptr), // esi/rsi 2838 "2" (dp) // edi/rdi 2839 2840 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED) 2841 : "%mm0" // clobber list 2842 #endif 2843 ); 2844 } 2845 2846 sptr -= (width_mmx*BPP2 - BPP2); // sign fixed 2847 dp -= (width_mmx*2*BPP2 - BPP2); // sign fixed 2848 for (i = width; i; i--) 2849 { 2850 png_byte v[8]; 2851 int j; 2852 sptr -= BPP2; 2853 png_memcpy(v, sptr, BPP2); 2854 for (j = 0; j < png_pass_inc[pass]; j++) 2855 { 2856 dp -= BPP2; 2857 png_memcpy(dp, v, BPP2); 2858 } 2859 } 2860 } 2861 else if (((pass == 2) || (pass == 3)) && width) 2862 { 2863 int width_mmx = ((width >> 1) << 1) ; 2864 width -= width_mmx; // 0,1 pixels => 0,2 bytes 2865 if (width_mmx) 2866 { 2867 __asm__ __volatile__ ( 2868 "sub $2, %1 \n\t" 2869 "sub $14, %2 \n\t" 2870 2871 ".loop2_pass2: \n\t" 2872 "movd (%1), %%mm0 \n\t" // x x x x 3 2 1 0 2873 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0 2874 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0 2875 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0 2876 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2 2877 "movq %%mm0, (%2) \n\t" 2878 "sub $4, %1 \n\t" 2879 "movq %%mm1, 8(%2) \n\t" 2880 "sub $16, %2 \n\t" 2881 "subl $2, %%ecx \n\t" 2882 "jnz .loop2_pass2 \n\t" 2883 "EMMS \n\t" // DONE 2884 2885 : "=c" (dummy_value_c), // output regs (dummy) 2886 "=S" (dummy_value_S), 2887 "=D" (dummy_value_D) 2888 2889 : "0" (width_mmx), // ecx // input regs 2890 "1" (sptr), // esi/rsi 2891 "2" (dp) // edi/rdi 2892 2893 #if defined(CLOBBER_MMX_REGS_SUPPORTED) 2894 : "%mm0", "%mm1" // clobber list 2895 #endif 2896 ); 2897 } 2898 2899 sptr -= (width_mmx*2 - 2); // sign fixed 2900 dp -= (width_mmx*8 - 2); // sign fixed 2901 for (i = width; i; i--) 2902 { 2903 png_byte v[8]; 2904 int j; 2905 sptr -= 2; 2906 png_memcpy(v, sptr, 2); 2907 for (j = 0; j < png_pass_inc[pass]; j++) 2908 { 2909 dp -= 2; 2910 png_memcpy(dp, v, 2); 2911 } 2912 } 2913 } 2914 else if (width) // && ((pass == 0) || (pass == 1)) 2915 { 2916 int width_mmx = ((width >> 1) << 1); 2917 width -= width_mmx; // 0,1 pixels => 0,2 bytes 2918 if (width_mmx) 2919 { 2920 __asm__ __volatile__ ( 2921 "sub $2, %1 \n\t" 2922 "sub $30, %2 \n\t" 2923 2924 ".loop2_pass0: \n\t" 2925 "movd (%1), %%mm0 \n\t" // x x x x 3 2 1 0 2926 "punpcklwd 
%%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0 2927 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0 2928 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0 2929 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2 2930 "movq %%mm0, (%2) \n\t" 2931 "movq %%mm0, 8(%2) \n\t" 2932 "movq %%mm1, 16(%2) \n\t" 2933 "sub $4, %1 \n\t" 2934 "movq %%mm1, 24(%2) \n\t" 2935 "sub $32, %2 \n\t" 2936 "subl $2, %%ecx \n\t" 2937 "jnz .loop2_pass0 \n\t" 2938 "EMMS \n\t" // DONE 2939 2940 : "=c" (dummy_value_c), // output regs (dummy) 2941 "=S" (dummy_value_S), 2942 "=D" (dummy_value_D) 2943 2944 : "0" (width_mmx), // ecx // input regs 2945 "1" (sptr), // esi/rsi 2946 "2" (dp) // edi/rdi 2947 2948 #if defined(CLOBBER_MMX_REGS_SUPPORTED) 2949 : "%mm0", "%mm1" // clobber list 2950 #endif 2951 ); 2952 } 2953 2954 sptr -= (width_mmx*2 - 2); // sign fixed 2955 dp -= (width_mmx*16 - 2); // sign fixed 2956 for (i = width; i; i--) 2957 { 2958 png_byte v[8]; 2959 int j; 2960 sptr -= 2; 2961 png_memcpy(v, sptr, 2); 2962 for (j = 0; j < png_pass_inc[pass]; j++) 2963 { 2964 dp -= 2; 2965 png_memcpy(dp, v, 2); 2966 } 2967 } 2968 } 2969 } /* end of pixel_bytes == 2 */ 2970 2971 //-------------------------------------------------------------- 2972 else if (pixel_bytes == BPP8) 2973 { 2974 // GRR TEST: should work, but needs testing (special 64-bit version of rpng2?) 2975 // GRR NOTE: no need to combine passes here! 2976 if (((pass == 4) || (pass == 5)) && width) 2977 { 2978 // source is 8-byte RRGGBBAA 2979 // dest is 16-byte RRGGBBAA RRGGBBAA 2980 __asm__ __volatile__ ( 2981 "sub $8, %2 \n\t" // start of last block 2982 2983 ".loop8_pass4: \n\t" 2984 "movq (%1), %%mm0 \n\t" // 7 6 5 4 3 2 1 0 2985 "movq %%mm0, (%2) \n\t" 2986 "sub $8, %1 \n\t" 2987 "movq %%mm0, 8(%2) \n\t" 2988 "sub $16, %2 \n\t" 2989 "decl %%ecx \n\t" 2990 "jnz .loop8_pass4 \n\t" 2991 "EMMS \n\t" // DONE 2992 2993 : "=c" (dummy_value_c), // output regs (dummy) 2994 "=S" (dummy_value_S), 2995 "=D" (dummy_value_D) 2996 2997 : "0" (width), // ecx // input regs 2998 "1" (sptr), // esi/rsi 2999 "2" (dp) // edi/rdi 3000 3001 #if defined(CLOBBER_MMX_REGS_SUPPORTED) 3002 : "%mm0" // clobber list 3003 #endif 3004 ); 3005 } 3006 else if (((pass == 2) || (pass == 3)) && width) 3007 { 3008 // source is 8-byte RRGGBBAA 3009 // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA 3010 // (recall that expansion is _in place_: sptr and dp 3011 // both point at locations within same row buffer) 3012 __asm__ __volatile__ ( 3013 "sub $24, %2 \n\t" // start of last block 3014 3015 ".loop8_pass2: \n\t" 3016 "movq (%1), %%mm0 \n\t" // 7 6 5 4 3 2 1 0 3017 "movq %%mm0, (%2) \n\t" 3018 "movq %%mm0, 8(%2) \n\t" 3019 "movq %%mm0, 16(%2) \n\t" 3020 "sub $8, %1 \n\t" 3021 "movq %%mm0, 24(%2) \n\t" 3022 "sub $32, %2 \n\t" 3023 "decl %%ecx \n\t" 3024 "jnz .loop8_pass2 \n\t" 3025 "EMMS \n\t" // DONE 3026 3027 : "=c" (dummy_value_c), // output regs (dummy) 3028 "=S" (dummy_value_S), 3029 "=D" (dummy_value_D) 3030 3031 : "0" (width), // ecx // input regs 3032 "1" (sptr), // esi/rsi 3033 "2" (dp) // edi/rdi 3034 3035 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED) 3036 : "%mm0" // clobber list 3037 #endif 3038 ); 3039 } 3040 else if (width) // && ((pass == 0) || (pass == 1)) 3041 { 3042 // source is 8-byte RRGGBBAA 3043 // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ... 
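                  // Each iteration below copies one source pixel with a
                  // single movq load and eight movq stores; %2 starts at
                  // dp-56, so the stores ascend through one 64-byte block
                  // while the blocks themselves walk down the row.  The
                  // in-place, right-to-left order means no source pixel is
                  // overwritten before it has been read.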
3044 __asm__ __volatile__ ( 3045 "sub $56, %2 \n\t" // start of last block 3046 3047 ".loop8_pass0: \n\t" 3048 "movq (%1), %%mm0 \n\t" // 7 6 5 4 3 2 1 0 3049 "movq %%mm0, (%2) \n\t" 3050 "movq %%mm0, 8(%2) \n\t" 3051 "movq %%mm0, 16(%2) \n\t" 3052 "movq %%mm0, 24(%2) \n\t" 3053 "movq %%mm0, 32(%2) \n\t" 3054 "movq %%mm0, 40(%2) \n\t" 3055 "movq %%mm0, 48(%2) \n\t" 3056 "sub $8, %1 \n\t" 3057 "movq %%mm0, 56(%2) \n\t" 3058 "sub $64, %2 \n\t" 3059 "decl %%ecx \n\t" 3060 "jnz .loop8_pass0 \n\t" 3061 "EMMS \n\t" // DONE 3062 3063 : "=c" (dummy_value_c), // output regs (dummy) 3064 "=S" (dummy_value_S), 3065 "=D" (dummy_value_D) 3066 3067 : "0" (width), // ecx // input regs 3068 "1" (sptr), // esi/rsi 3069 "2" (dp) // edi/rdi 3070 3071 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED) 3072 : "%mm0" // clobber list 3073 #endif 3074 ); 3075 } 3076 } /* end of pixel_bytes == 8 */ 3077 3078 //-------------------------------------------------------------- 3079 else if (pixel_bytes == BPP6) // why no MMX for this case? 3080 { 3081 for (i = width; i; i--) 3082 { 3083 png_byte v[8]; 3084 int j; 3085 png_memcpy(v, sptr, BPP6); 3086 for (j = 0; j < png_pass_inc[pass]; j++) 3087 { 3088 png_memcpy(dp, v, BPP6); 3089 dp -= BPP6; 3090 } 3091 sptr -= BPP6; 3092 } 3093 } /* end of pixel_bytes == 6 */ 3094 3095 //-------------------------------------------------------------- 3096 else 3097 { 3098 // ERROR: SHOULD NEVER BE REACHED 3099 #if defined(PNG_DEBUG) 3100 png_debug(1, "Internal libpng logic error (GCC " 3101 "png_do_read_interlace() _mmx_supported)\n"); 3102 #endif 3103 } 3104 3105 } // end of _mmx_supported ======================================== 3106 3107 else /* MMX not supported: use modified C code - takes advantage 3108 * of inlining of png_memcpy for a constant */ 3109 { 3110 if (pixel_bytes == BPP3) 3111 { 3112 for (i = width; i; i--) 3113 { 3114 png_byte v[8]; 3115 int j; 3116 png_memcpy(v, sptr, BPP3); 3117 for (j = 0; j < png_pass_inc[pass]; j++) 3118 { 3119 png_memcpy(dp, v, BPP3); 3120 dp -= BPP3; 3121 } 3122 sptr -= BPP3; 3123 } 3124 } 3125 else if (pixel_bytes == BPP4) 3126 { 3127 for (i = width; i; i--) 3128 { 3129 png_byte v[8]; 3130 int j; 3131 png_memcpy(v, sptr, BPP4); 3132 for (j = 0; j < png_pass_inc[pass]; j++) 3133 { 3134 #if defined(PNG_DEBUG) && defined(PNG_1_0_X) // row_buf_size gone in 1.2.x 3135 if (dp < row || dp+3 > row+png_ptr->row_buf_size) 3136 { 3137 printf("dp out of bounds: row=%10p, dp=%10p, " 3138 "rp=%10p\n", row, dp, row+png_ptr->row_buf_size); 3139 printf("row_buf_size=%lu\n", png_ptr->row_buf_size); 3140 } 3141 #endif 3142 png_memcpy(dp, v, BPP4); 3143 dp -= BPP4; 3144 } 3145 sptr -= BPP4; 3146 } 3147 } 3148 else if (pixel_bytes == 1) 3149 { 3150 for (i = width; i; i--) 3151 { 3152 int j; 3153 for (j = 0; j < png_pass_inc[pass]; j++) 3154 { 3155 *dp-- = *sptr; 3156 } 3157 --sptr; 3158 } 3159 } 3160 else if (pixel_bytes == BPP2) 3161 { 3162 for (i = width; i; i--) 3163 { 3164 png_byte v[8]; 3165 int j; 3166 png_memcpy(v, sptr, BPP2); 3167 for (j = 0; j < png_pass_inc[pass]; j++) 3168 { 3169 png_memcpy(dp, v, BPP2); 3170 dp -= BPP2; 3171 } 3172 sptr -= BPP2; 3173 } 3174 } 3175 else if (pixel_bytes == BPP6) 3176 { 3177 for (i = width; i; i--) 3178 { 3179 png_byte v[8]; 3180 int j; 3181 png_memcpy(v, sptr, BPP6); 3182 for (j = 0; j < png_pass_inc[pass]; j++) 3183 { 3184 png_memcpy(dp, v, BPP6); 3185 dp -= BPP6; 3186 } 3187 sptr -= BPP6; 3188 } 3189 } 3190 else if (pixel_bytes == BPP8) 3191 { 3192 for (i = width; i; i--) 3193 { 3194 png_byte v[8]; 3195 int j; 3196 
png_memcpy(v, sptr, BPP8); 3197 for (j = 0; j < png_pass_inc[pass]; j++) 3198 { 3199 png_memcpy(dp, v, BPP8); 3200 dp -= BPP8; 3201 } 3202 sptr -= BPP8; 3203 } 3204 } 3205 else 3206 { 3207 // ERROR: SHOULD NEVER BE REACHED 3208 #if defined(PNG_DEBUG) 3209 png_debug(1, "Internal libpng logic error (GCC " 3210 "png_do_read_interlace() !_mmx_supported)\n"); 3211 #endif 3212 } 3213 3214 } /* end if (MMX not supported) */ 3215 break; 3216 } /* end default (8-bit or larger) */ 3217 } /* end switch (row_info->pixel_depth) */ 3218 3219 row_info->width = final_width; 3220 3221 row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width); 3222 } 3223 3224 } /* end png_do_read_interlace() */ 3225 3226 #endif /* PNG_HAVE_MMX_READ_INTERLACE */ 3227 #endif /* PNG_READ_INTERLACING_SUPPORTED */ 3228 3229 3230 3231 #if defined(PNG_HAVE_MMX_READ_FILTER_ROW) 3232 #if defined(PNG_MMX_READ_FILTER_AVG_SUPPORTED) 3233 3234 //===========================================================================// 3235 // // 3236 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G // 3237 // // 3238 //===========================================================================// 3239 3240 // Optimized code for PNG Average filter decoder 3241 3242 static void /* PRIVATE */ 3243 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row, 3244 png_bytep prev_row) 3245 { 3246 unsigned FullLength, MMXLength; // png_uint_32 is actually 64-bit on x86-64 3247 int bpp; 3248 int dummy_value_a; 3249 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error 3250 int dummy_value_d; 3251 png_bytep dummy_value_S; 3252 png_bytep dummy_value_D; 3253 int diff; // __attribute__((used)); 3254 3255 bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel 3256 FullLength = row_info->rowbytes; // number of bytes to filter 3257 3258 __asm__ __volatile__ ( 3259 "avg_top: \n\t" 3260 SAVE_GOT_ebx 3261 SAVE_r15 3262 SAVE_ebp 3263 // initialize address pointers and offset 3264 //pre "movl row, %5 \n\t" // edi/rdi: ptr to Avg(x) 3265 "xorl %%ebx, %%ebx \n\t" // ebx: x 3266 //pre "movl prev_row, %4 \n\t" // esi/rsi: ptr to Prior(x) 3267 "mov %5, " PDX " \n\t" // copy of row ptr... 3268 //pre "subl bpp, " PDX " \n\t" // (bpp is preloaded into ecx) 3269 "sub " PCX "," PDX " \n\t" // edx/rdx: ptr to Raw(x-bpp) 3270 //pre "movl FullLength, %%eax \n\t" // bring in via eax... 3271 SAVE_FullLength // ...but store for later use 3272 "xorl %%eax, %%eax \n\t" 3273 3274 // Compute the Raw value for the first bpp bytes 3275 // Raw(x) = Avg(x) + (Prior(x)/2) 3276 "avg_rlp: \n\t" 3277 "movb (%4," PBX ",), %%al \n\t" // load al with Prior(x) 3278 "incl %%ebx \n\t" 3279 "shrb %%al \n\t" // divide by 2 3280 "addb -1(%5," PBX ",), %%al \n\t" // add Avg(x); -1 to offset inc ebx 3281 //pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx) 3282 "cmpl %%ecx, %%ebx \n\t" 3283 "movb %%al, -1(%5," PBX ",) \n\t" // write Raw(x); -1 to offset inc ebx 3284 "jb avg_rlp \n\t" // mov does not affect flags 3285 3286 // get # of bytes to alignment (32-bit mask _would_ be good enough 3287 // [computing delta], but 32-bit ops are zero-extended on 64-bit, argh) 3288 // (if swapped edx and ebp, could do 8-bit or 16-bit mask...FIXME?) 3289 "mov %5, " PBP " \n\t" // take start of row 3290 "add " PBX "," PBP " \n\t" // add bpp 3291 "add $0xf, " PBP " \n\t" // add 7+8 to incr past alignment bdry 3292 // "andl $0xfffffff8, %%ebp \n\t" // mask to alignment boundary (32-bit!) 
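   // (The commented-out 32-bit "andl" above is kept for reference:  on
   // x86-64 a 32-bit op zero-extends into the full 64-bit register, which
   // would wipe the upper half of rbp and corrupt the pointer copy, hence
   // the width-neutral CLEAR_BOTTOM_3_BITS macro below.)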
3293 CLEAR_BOTTOM_3_BITS PBP "\n\t" // mask to alignment boundary 3294 "sub %5, " PBP " \n\t" // subtract row ptr again => ebp = 3295 "jz avg_go \n\t" // target value of ebx at alignment 3296 3297 "xorl %%ecx, %%ecx \n\t" 3298 3299 // fix alignment 3300 // Compute the Raw value for the bytes up to the alignment boundary 3301 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) 3302 "avg_lp1: \n\t" 3303 "xorl %%eax, %%eax \n\t" 3304 "movb (%4," PBX ",), %%cl \n\t" // load cl with Prior(x) 3305 "movb (" PDX "," PBX ",), %%al \n\t" // load al with Raw(x-bpp) 3306 "addw %%cx, %%ax \n\t" 3307 "incl %%ebx \n\t" 3308 "shrw %%ax \n\t" // divide by 2 3309 "addb -1(%5," PBX ",), %%al \n\t" // add Avg(x); -1 to offset inc ebx 3310 "cmpl %%ebp, %%ebx \n\t" // check if at alignment boundary 3311 "movb %%al, -1(%5," PBX ",) \n\t" // write Raw(x); -1 to offset inc ebx 3312 "jb avg_lp1 \n\t" // repeat until at alignment boundary 3313 3314 "avg_go: \n\t" 3315 RESTORE_FullLength "%%eax \n\t" // FullLength -> eax 3316 "movl %%eax, %%ecx \n\t" // copy -> ecx 3317 "subl %%ebx, %%eax \n\t" // subtract alignment fix 3318 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8 3319 "subl %%eax, %%ecx \n\t" // sub over-bytes from original length 3320 //out "movl %%ecx, MMXLength \n\t" 3321 "movl %%ebp, %%eax \n\t" // ebp = diff, but no reg constraint(?) 3322 RESTORE_ebp // (could swap ebp and edx functions) 3323 RESTORE_r15 3324 RESTORE_GOT_ebx 3325 3326 // "There is no way for you to specify that an input operand is modified 3327 // without also specifying it as an output operand." [makes sense] 3328 3329 // "Unless an output operand has the `&' constraint modifier, GCC may 3330 // allocate it in the same register as an unrelated input operand, on the 3331 // assumption the inputs are consumed before the outputs are produced." 3332 // [trying to _force_ this] 3333 3334 // "`=' Means that this operand is write-only for this instruction: 3335 // the previous value is discarded and replaced by output data." 3336 // [operand == variable name, presumably] 3337 3338 // output regs 3339 // these are operands 0-1 (originally 0-3): 3340 : "=c" (MMXLength), // %0 -> %0 3341 "=a" (diff) // %3 -> %1 3342 // "=S" (dummy_value_S), // %1 -> GONE 3343 // "=D" (dummy_value_D), // %2 -> GONE 3344 3345 // input regs 3346 // these are operands 2-5 (originally 4-7); two of their constraints say 3347 // they must go in same places as operands 0-1 (originally 0-3) above: 3348 : "0" (bpp), // %4 -> %2 ecx 3349 "1" (FullLength), // %7 -> %3 eax 3350 "S" (prev_row), // %5 -> %4 esi/rsi 3351 "D" (row) // %6 -> %5 edi/rdi 3352 3353 : "%edx" // clobber list 3354 _CLOBBER_r15 3355 _CLOBBER_ebp 3356 _CLOBBER_GOT_ebx 3357 ); 3358 3359 // now do the math for the rest of the row 3360 switch (bpp) 3361 { 3362 case 3: 3363 { 3364 // _ShiftBpp = 24; // == 3 * 8 3365 // _ShiftRem = 40; // == 64 - 24 3366 3367 __asm__ __volatile__ ( 3368 // re-init address pointers and offset 3369 LOAD_GOT_rbp 3370 "movq " AMASK5_3_0 ", %%mm7 \n\t" // _amask5_3_0 -> mm7 3371 // preload "movl diff, %%ecx \n\t" // ecx: x = offset to 3372 // alignment boundary 3373 "movq " LB_CARRY_MASK ", %%mm5 \n\t" // [interleave for parallel.?] 
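            // The per-byte average below is computed without any divide:
            // with a = Prior(x) and b = Raw(x-bpp), the loops rely on
            //    (a + b) / 2  ==  (a >> 1) + (b >> 1) + (a & b & 1)
            // halving each operand with psrlq (then pand with _HBClearMask
            // to strip the bit shifted in from the neighboring byte) and
            // adding the low-bit carry back via the _LBCarryMask bytes.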
3374 // preload "movl row, %1 \n\t" // edi: Avg(x) 3375 "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4 3376 // preload "movl prev_row, %0 \n\t" // esi: Prior(x) 3377 RESTORE_rbp 3378 3379 // prime the pump: load the first Raw(x-bpp) data set 3380 "movq -8(%1," PCX ",), %%mm2 \n\t"// load previous aligned 8 bytes 3381 // (correct pos. in loop below) 3382 "avg_3lp: \n\t" 3383 "movq (%1," PCX ",), %%mm0 \n\t" // load mm0 with Avg(x) 3384 "movq %%mm5, %%mm3 \n\t" 3385 "psrlq $40, %%mm2 \n\t" // correct position Raw(x-bpp) 3386 // data 3387 "movq (%0," PCX ",), %%mm1 \n\t" // load mm1 with Prior(x) 3388 "movq %%mm7, %%mm6 \n\t" 3389 "pand %%mm1, %%mm3 \n\t" // get lsb for each prevrow byte 3390 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2 3391 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each 3392 // byte 3393 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for 3394 // each byte 3395 // add 1st active group (Raw(x-bpp)/2) to average with LBCarry 3396 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting 3397 // LBCarrys 3398 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte 3399 // where both lsb's were == 1 3400 // (valid only for active group) 3401 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2 3402 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each 3403 // byte 3404 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to Raw(x-bpp)/2 3405 // for each byte 3406 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1 3407 // bytes to add to Avg 3408 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to 3409 // Avg for each Active byte 3410 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry 3411 "psllq $24, %%mm6 \n\t" // shift the mm6 mask to cover 3412 // bytes 3-5 3413 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2 3414 "psllq $24, %%mm2 \n\t" // shift data to pos. correctly 3415 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting 3416 // LBCarrys 3417 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte 3418 // where both lsb's were == 1 3419 // (valid only for active group) 3420 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2 3421 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each 3422 // byte 3423 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to Raw(x-bpp)/2 3424 // for each byte 3425 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2 3426 // bytes to add to Avg 3427 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to 3428 // Avg for each Active byte 3429 3430 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry 3431 "psllq $24, %%mm6 \n\t" // shift mm6 mask to cover last 3432 // two bytes 3433 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2 3434 "psllq $24, %%mm2 \n\t" // shift data to pos. correctly 3435 // Data need be shifted only once here to 3436 // get the correct x-bpp offset. 
3437 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting 3438 // LBCarrys 3439 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte 3440 // where both 3441 // lsb's were == 1 (only valid for active group) 3442 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2 3443 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each 3444 // byte 3445 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to Raw(x-bpp)/2 3446 // for each byte 3447 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2 3448 // bytes to add to Avg 3449 "addl $8, %%ecx \n\t" 3450 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to 3451 // Avg for each Active byte 3452 // now ready to write back to memory 3453 "movq %%mm0, -8(%1," PCX ",) \n\t" 3454 // move updated Raw(x) to use as Raw(x-bpp) for next loop 3455 "cmpl %%eax, %%ecx \n\t" // MMXLength 3456 "movq %%mm0, %%mm2 \n\t" // mov updated Raw(x) to mm2 3457 "jb avg_3lp \n\t" 3458 3459 : "=S" (dummy_value_S), // output regs (dummy) 3460 "=D" (dummy_value_D), 3461 "=c" (dummy_value_c), 3462 "=a" (dummy_value_a) 3463 3464 : "0" (prev_row), // esi/rsi // input regs 3465 "1" (row), // edi/rdi 3466 "2" (diff), // ecx 3467 "3" (MMXLength) // eax 3468 3469 #if defined(CLOBBER_MMX_REGS_SUPPORTED) 3470 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list 3471 , "%mm4", "%mm5", "%mm6", "%mm7" 3472 #endif 3473 ); 3474 } 3475 break; // end 3 bpp 3476 3477 case 4: // formerly shared with 6 bpp case via _ShiftBpp and _ShiftRem, 3478 { // but loop uses all 8 MMX regs, and psrlq/psllq require 64-bit 3479 // mem (PIC/.so problems), MMX reg (none left), or immediate 3480 // _ShiftBpp = bpp << 3; // 32 (psllq) 3481 // _ShiftRem = 64 - _ShiftBpp; // 32 (psrlq) 3482 3483 __asm__ __volatile__ ( 3484 LOAD_GOT_rbp 3485 "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4 3486 "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5 3487 // re-init address pointers and offset 3488 // preload "movl diff, %%ecx \n\t" // ecx: x = offset to 3489 // alignment boundary 3490 "movq " AMASK0_8_0 ", %%mm7 \n\t" // _amask0_8_0 -> mm7 3491 RESTORE_rbp 3492 3493 // ... and clear all bytes except for 1st active group 3494 // preload "movl row, %1 \n\t" // edi: Avg(x) 3495 "psrlq $32, %%mm7 \n\t" // was _ShiftRem 3496 // preload "movl prev_row, %0 \n\t" // esi: Prior(x) 3497 "movq %%mm7, %%mm6 \n\t" 3498 "psllq $32, %%mm6 \n\t" // mask for 2nd active group 3499 3500 // prime the pump: load the first Raw(x-bpp) data set 3501 "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes 3502 // (we correct pos. in loop below) 3503 "avg_4lp: \n\t" 3504 "movq (%1," PCX ",), %%mm0 \n\t" 3505 "psrlq $32, %%mm2 \n\t" // shift data to pos. 
correctly 3506 "movq (%0," PCX ",), %%mm1 \n\t" 3507 // add (Prev_row/2) to average 3508 "movq %%mm5, %%mm3 \n\t" 3509 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte 3510 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2 3511 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each 3512 // byte 3513 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for 3514 // each byte 3515 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry 3516 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting 3517 // LBCarrys 3518 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte 3519 // where both 3520 // lsb's were == 1 (only valid for active group) 3521 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2 3522 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each 3523 // byte 3524 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) 3525 // for each byte 3526 "pand %%mm7, %%mm2 \n\t" // leave only Active Group 1 3527 // bytes to add to Avg 3528 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg 3529 // for each Active 3530 // byte 3531 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry 3532 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2 3533 "psllq $32, %%mm2 \n\t" // shift data to pos. correctly 3534 "addl $8, %%ecx \n\t" 3535 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting 3536 // LBCarrys 3537 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte 3538 // where both 3539 // lsb's were == 1 (only valid for active group) 3540 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2 3541 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each 3542 // byte 3543 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) 3544 // for each byte 3545 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2 3546 // bytes to add to Avg 3547 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to 3548 // Avg for each Active byte 3549 "cmpl %%eax, %%ecx \n\t" // MMXLength 3550 // now ready to write back to memory 3551 "movq %%mm0, -8(%1," PCX ",) \n\t" 3552 // prep Raw(x-bpp) for next loop 3553 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2 3554 "jb avg_4lp \n\t" 3555 3556 : "=S" (dummy_value_S), // output regs (dummy) 3557 "=D" (dummy_value_D), 3558 "=c" (dummy_value_c), 3559 "=a" (dummy_value_a) 3560 3561 : "0" (prev_row), // esi/rsi // input regs 3562 "1" (row), // edi/rdi 3563 "2" (diff), // ecx 3564 "3" (MMXLength) // eax 3565 3566 #if defined(CLOBBER_MMX_REGS_SUPPORTED) 3567 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list 3568 , "%mm4", "%mm5", "%mm6", "%mm7" 3569 #endif 3570 ); 3571 } 3572 break; // end 4 bpp 3573 3574 case 1: 3575 { 3576 __asm__ __volatile__ ( 3577 // re-init address pointers and offset 3578 // preload "movl diff, %%ecx \n\t" // ecx: x = offset to align. bdry 3579 // preload "movl row, %1 \n\t" // edi/rdi: Avg(x) 3580 // preload "movl FullLength, %%eax \n\t" 3581 "cmpl %%eax, %%ecx \n\t" // test if offset at end of array 3582 "jnb avg_1end \n\t" 3583 3584 SAVE_ebp 3585 3586 // do Avg decode for remaining bytes 3587 // preload "movl prev_row, %0 \n\t" // esi/rsi: Prior(x) 3588 "mov %1, " PBP " \n\t" // copy of row pointer... 
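// C equivalent of the 1-bpp loop below (a sketch; row, prev_row,
// diff, and FullLength as in the enclosing function):
//
//    for (x = diff; x < FullLength; x++)
//       row[x] = (png_byte)(row[x] + ((row[x-1] + prev_row[x]) >> 1));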
3589 "dec " PBP " \n\t" // ebp/rbp: Raw(x-bpp) 3590 "xorl %%edx, %%edx \n\t" // zero edx before using dl & dx 3591 // in loop below 3592 SAVE_GOT_ebx 3593 3594 "avg_1lp: \n\t" 3595 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) 3596 "xorl %%ebx, %%ebx \n\t" 3597 "movb (%0," PCX ",), %%dl \n\t" // load dl with Prior(x) 3598 "movb (" PBP "," PCX ",), %%bl \n\t" // load bl with Raw(x-bpp) 3599 "addw %%dx, %%bx \n\t" 3600 "incl %%ecx \n\t" 3601 "shrw %%bx \n\t" // divide by 2 3602 "addb -1(%1," PCX ",), %%bl \n\t" // add Avg(x); -1 to offset 3603 // inc ecx 3604 "cmpl %%eax, %%ecx \n\t" // check if at end of array 3605 "movb %%bl, -1(%1," PCX ",) \n\t" // write back Raw(x); 3606 // mov does not affect flags; -1 to offset inc ecx 3607 "jb avg_1lp \n\t" 3608 3609 RESTORE_GOT_ebx 3610 RESTORE_ebp 3611 3612 "avg_1end: \n\t" 3613 3614 : "=S" (dummy_value_S), // output regs (dummy) 3615 "=D" (dummy_value_D), 3616 "=c" (dummy_value_c), 3617 "=a" (dummy_value_a) 3618 3619 : "0" (prev_row), // esi/rsi // input regs 3620 "1" (row), // edi/rdi 3621 "2" (diff), // ecx 3622 "3" (FullLength) // eax 3623 3624 : "%edx" // clobber list 3625 _CLOBBER_GOT_ebx 3626 _CLOBBER_ebp 3627 ); 3628 } 3629 return; // end 1 bpp 3630 3631 case 2: 3632 { 3633 // _ShiftBpp = 16; // == 2 * 8 3634 // _ShiftRem = 48; // == 64 - _ShiftBpp 3635 3636 __asm__ __volatile__ ( 3637 LOAD_GOT_rbp 3638 // load (former) _ActiveMask 3639 "movq " AMASK6_2_0 ", %%mm7 \n\t" // _amask6_2_0 -> mm7 3640 // re-init address pointers and offset 3641 // preload "movl diff, %%ecx \n\t" // ecx: x = offset to 3642 // alignment boundary 3643 "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5 3644 // preload "movl row, %1 \n\t" // edi: Avg(x) 3645 "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4 3646 // preload "movl prev_row, %0 \n\t" // esi: Prior(x) 3647 RESTORE_rbp 3648 3649 // prime the pump: load the first Raw(x-bpp) data set 3650 "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes 3651 // (we correct pos. in loop below) 3652 "avg_2lp: \n\t" 3653 "movq (%1," PCX ",), %%mm0 \n\t" 3654 "psrlq $48, %%mm2 \n\t" // shift data to pos. correctly 3655 "movq (%0," PCX ",), %%mm1 \n\t" // (GRR BUGFIX: was psllq) 3656 // add (Prev_row/2) to average 3657 "movq %%mm5, %%mm3 \n\t" 3658 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte 3659 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2 3660 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each 3661 // byte 3662 "movq %%mm7, %%mm6 \n\t" 3663 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for 3664 // each byte 3665 3666 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry 3667 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting 3668 // LBCarrys 3669 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte 3670 // where both 3671 // lsb's were == 1 (only valid 3672 // for active group) 3673 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2 3674 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each 3675 // byte 3676 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) 3677 // for each byte 3678 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1 3679 // bytes to add to Avg 3680 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg 3681 // for each Active byte 3682 3683 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry 3684 "psllq $16, %%mm6 \n\t" // shift the mm6 mask to cover 3685 // bytes 2 & 3 3686 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2 3687 "psllq $16, %%mm2 \n\t" // shift data to pos. 
correctly
3688 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3689 // LBCarrys
3690 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3691 // where both
3692 // lsb's were == 1 (only valid
3693 // for active group)
3694 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3695 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3696 // byte
3697 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3698 // for each byte
3699 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3700 // bytes to add to Avg
3701 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3702 // Avg for each Active byte
3703
3704 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
3705 "psllq $16, %%mm6 \n\t" // shift the mm6 mask to cover
3706 // bytes 4 & 5
3707 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3708 "psllq $16, %%mm2 \n\t" // shift data to pos. correctly
3709 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3710 // LBCarrys
3711 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3712 // where both lsb's were == 1
3713 // (only valid for active group)
3714 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3715 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3716 // byte
3717 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3718 // for each byte
3719 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 3
3720 // bytes to add to Avg
3721 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3722 // Avg for each Active byte
3723
3724 // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
3725 "psllq $16, %%mm6 \n\t" // shift the mm6 mask to cover
3726 // bytes 6 & 7
3727 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3728 "psllq $16, %%mm2 \n\t" // shift data to pos. correctly
3729 "addl $8, %%ecx \n\t"
3730 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3731 // LBCarrys
3732 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3733 // where both
3734 // lsb's were == 1 (only valid
3735 // for active group)
3736 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3737 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3738 // byte
3739 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3740 // for each byte
3741 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 4
3742 // bytes to add to Avg
3743 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3744 // Avg for each Active byte
3745 "cmpl %%eax, %%ecx \n\t" // MMXLength
3746 // now ready to write back to memory
3747 "movq %%mm0, -8(%1," PCX ",) \n\t"
3748 // prep Raw(x-bpp) for next loop
3749 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3750 "jb avg_2lp \n\t"
3751
3752 : "=S" (dummy_value_S), // output regs (dummy)
3753 "=D" (dummy_value_D),
3754 "=c" (dummy_value_c),
3755 "=a" (dummy_value_a)
3756
3757 : "0" (prev_row), // esi/rsi // input regs
3758 "1" (row), // edi/rdi
3759 "2" (diff), // ecx
3760 "3" (MMXLength) // eax
3761
3762 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
3763 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
3764 , "%mm4", "%mm5", "%mm6", "%mm7"
3765 #endif
3766 );
3767 }
3768 break; // end 2 bpp
3769
3770 case 6: // formerly shared with 4 bpp case (see comments there)
3771 {
3772 // _ShiftBpp = bpp << 3; // 48 (psllq)
3773 // _ShiftRem = 64 - _ShiftBpp; // 16 (psrlq)
3774
3775 __asm__ __volatile__ (
3776 LOAD_GOT_rbp
3777 "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
3778 "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
3779 // re-init address pointers and offset
3780 // preload "movl diff, %%ecx \n\t" // ecx:
x = offset to 3781 // alignment boundary 3782 "movq " AMASK0_8_0 ", %%mm7 \n\t" // _amask0_8_0 -> mm7 3783 RESTORE_rbp 3784 3785 // ... and clear all bytes except for 1st active group 3786 // preload "movl row, %1 \n\t" // edi: Avg(x) 3787 "psrlq $16, %%mm7 \n\t" 3788 // preload "movl prev_row, %0 \n\t" // esi: Prior(x) 3789 "movq %%mm7, %%mm6 \n\t" 3790 "psllq $48, %%mm6 \n\t" // mask for 2nd active group 3791 3792 // prime the pump: load the first Raw(x-bpp) data set 3793 "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes 3794 // (we correct pos. in loop below) 3795 "avg_6lp: \n\t" 3796 "movq (%1," PCX ",), %%mm0 \n\t" 3797 "psrlq $16, %%mm2 \n\t" // shift data to pos. correctly 3798 "movq (%0," PCX ",), %%mm1 \n\t" 3799 // add (Prev_row/2) to average 3800 "movq %%mm5, %%mm3 \n\t" 3801 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte 3802 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2 3803 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each 3804 // byte 3805 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for 3806 // each byte 3807 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry 3808 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting 3809 // LBCarrys 3810 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte 3811 // where both 3812 // lsb's were == 1 (only valid for active group) 3813 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2 3814 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each 3815 // byte 3816 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) 3817 // for each byte 3818 "pand %%mm7, %%mm2 \n\t" // leave only Active Group 1 3819 // bytes to add to Avg 3820 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg 3821 // for each Active 3822 // byte 3823 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry 3824 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2 3825 "psllq $48, %%mm2 \n\t" // shift data to pos. 
correctly 3826 "addl $8, %%ecx \n\t" 3827 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting 3828 // LBCarrys 3829 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte 3830 // where both 3831 // lsb's were == 1 (only valid for active group) 3832 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2 3833 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each 3834 // byte 3835 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) 3836 // for each byte 3837 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2 3838 // bytes to add to Avg 3839 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to 3840 // Avg for each Active byte 3841 "cmpl %%eax, %%ecx \n\t" // MMXLength 3842 // now ready to write back to memory 3843 "movq %%mm0, -8(%1," PCX ",) \n\t" 3844 // prep Raw(x-bpp) for next loop 3845 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2 3846 "jb avg_6lp \n\t" 3847 3848 : "=S" (dummy_value_S), // output regs (dummy) 3849 "=D" (dummy_value_D), 3850 "=c" (dummy_value_c), 3851 "=a" (dummy_value_a) 3852 3853 : "0" (prev_row), // esi/rsi // input regs 3854 "1" (row), // edi/rdi 3855 "2" (diff), // ecx 3856 "3" (MMXLength) // eax 3857 3858 #if defined(CLOBBER_MMX_REGS_SUPPORTED) 3859 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list 3860 , "%mm4", "%mm5", "%mm6", "%mm7" 3861 #endif 3862 ); 3863 } 3864 break; // end 6 bpp 3865 3866 case 8: 3867 { 3868 __asm__ __volatile__ ( 3869 // re-init address pointers and offset 3870 // preload "movl diff, %%ecx \n\t" // ecx: x = offset to 3871 // alignment boundary 3872 LOAD_GOT_rbp 3873 "movq " LB_CARRY_MASK ", %%mm5 \n\t" // [interleave for parallel.?] 3874 // preload "movl row, %1 \n\t" // edi: Avg(x) 3875 "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4 3876 // preload "movl prev_row, %0 \n\t" // esi: Prior(x) 3877 RESTORE_rbp 3878 3879 // prime the pump: load the first Raw(x-bpp) data set 3880 "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes 3881 // (NO NEED to correct pos. 
in loop below) 3882 3883 "avg_8lp: \n\t" 3884 "movq (%1," PCX ",), %%mm0 \n\t" 3885 "movq %%mm5, %%mm3 \n\t" 3886 "movq (%0," PCX ",), %%mm1 \n\t" 3887 "addl $8, %%ecx \n\t" 3888 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte 3889 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2 3890 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte 3891 // where both lsb's were == 1 3892 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2 3893 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7, each byte 3894 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg, each byte 3895 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7, each byte 3896 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg, each 3897 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each 3898 "cmpl %%eax, %%ecx \n\t" // MMXLength 3899 "movq %%mm0, -8(%1," PCX ",) \n\t" 3900 "movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp) 3901 "jb avg_8lp \n\t" 3902 3903 : "=S" (dummy_value_S), // output regs (dummy) 3904 "=D" (dummy_value_D), 3905 "=c" (dummy_value_c), 3906 "=a" (dummy_value_a) 3907 3908 : "0" (prev_row), // esi/rsi // input regs 3909 "1" (row), // edi/rdi 3910 "2" (diff), // ecx 3911 "3" (MMXLength) // eax 3912 3913 #if defined(CLOBBER_MMX_REGS_SUPPORTED) 3914 : "%mm0", "%mm1", "%mm2" // clobber list 3915 , "%mm3", "%mm4", "%mm5" 3916 #endif 3917 ); 3918 } 3919 break; // end 8 bpp 3920 3921 default: // bpp != 1,2,3,4,6,8: doesn't exist 3922 { 3923 // ERROR: SHOULD NEVER BE REACHED 3924 #if defined(PNG_DEBUG) 3925 png_debug(1, "Internal libpng logic error (GCC " 3926 "png_read_filter_row_mmx_avg())\n"); 3927 #endif 3928 } 3929 break; 3930 3931 } // end switch (bpp) 3932 3933 __asm__ __volatile__ ( 3934 // MMX acceleration complete; now do clean-up 3935 // check if any remaining bytes left to decode 3936 //pre "movl FullLength, %%edx \n\t" 3937 //pre "movl MMXLength, %%eax \n\t" // eax: x == offset bytes after MMX 3938 //pre "movl row, %2 \n\t" // edi: Avg(x) 3939 "cmpl %%edx, %%eax \n\t" // test if offset at end of array 3940 "jnb avg_end \n\t" 3941 3942 SAVE_ebp 3943 3944 // do Avg decode for remaining bytes 3945 //pre "movl prev_row, %1 \n\t" // esi: Prior(x) 3946 "mov %2, " PBP " \n\t" // copy of row pointer... 3947 //pre "subl bpp, " PBP " \n\t" // (bpp is preloaded into ecx) 3948 "sub " PCX "," PBP " \n\t" // ebp: Raw(x-bpp) 3949 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below 3950 3951 SAVE_GOT_ebx 3952 3953 "avg_lp2: \n\t" 3954 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) 3955 "xorl %%ebx, %%ebx \n\t" 3956 "movb (%1," PAX ",), %%cl \n\t" // load cl with Prior(x) 3957 "movb (" PBP "," PAX ",), %%bl \n\t" // load bl with Raw(x-bpp) 3958 "addw %%cx, %%bx \n\t" 3959 "incl %%eax \n\t" 3960 "shrw %%bx \n\t" // divide by 2 3961 "addb -1(%2," PAX ",), %%bl \n\t" // add Avg(x); -1 to offset inc eax 3962 "cmpl %%edx, %%eax \n\t" // check if at end of array 3963 "movb %%bl, -1(%2," PAX ",) \n\t" // write back Raw(x) [mov does not 3964 "jb avg_lp2 \n\t" // affect flags; -1 to offset inc eax] 3965 3966 RESTORE_GOT_ebx 3967 RESTORE_ebp 3968 3969 "avg_end: \n\t" 3970 "EMMS \n\t" // end MMX; prep for poss. FP instrs. 
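// (EMMS is required, not optional: the MMX registers alias the x87
// floating-point stack, and skipping it would leave the FP tag word
// marking all eight FP registers as in use)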
3971 3972 : "=c" (dummy_value_c), // output regs (dummy) 3973 "=S" (dummy_value_S), 3974 "=D" (dummy_value_D), 3975 "=a" (dummy_value_a), 3976 "=d" (dummy_value_d) 3977 3978 : "0" (bpp), // ecx // input regs 3979 "1" (prev_row), // esi/rsi 3980 "2" (row), // edi/rdi 3981 "3" (MMXLength), // eax 3982 "4" (FullLength) // edx 3983 3984 CLOB_COLON_ebx_ebp // clobber list 3985 CLOBBER_GOT_ebx 3986 CLOB_COMMA_ebx_ebp 3987 CLOBBER_ebp 3988 ); 3989 3990 } /* end png_read_filter_row_mmx_avg() */ 3991 3992 #endif /* PNG_MMX_READ_FILTER_AVG_SUPPORTED */ 3993 3994 3995 3996 #if defined(PNG_MMX_READ_FILTER_PAETH_SUPPORTED) 3997 #if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK) 3998 3999 //===========================================================================// 4000 // // 4001 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H // 4002 // // 4003 //===========================================================================// 4004 4005 // Optimized code for PNG Paeth filter decoder 4006 4007 static void /* PRIVATE */ 4008 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row, 4009 png_bytep prev_row) 4010 { 4011 unsigned FullLength, MMXLength; // png_uint_32 is actually 64-bit on x86-64 4012 int bpp; 4013 int dummy_value_a; 4014 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error 4015 int dummy_value_d; 4016 png_charp dummy_value_S; 4017 png_charp dummy_value_D; 4018 int diff; // __attribute__((used)); 4019 4020 bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel 4021 FullLength = row_info->rowbytes; // number of bytes to filter 4022 4023 __asm__ __volatile__ ( 4024 SAVE_GOT_ebx 4025 SAVE_r15 4026 SAVE_ebp 4027 //pre "movl row, %2 \n\t" // edi/rdi 4028 "xorl %%ebx, %%ebx \n\t" // ebx: x offset 4029 //pre "movl prev_row, %1 \n\t" // esi/rsi 4030 "xorl %%edx, %%edx \n\t" // edx: x-bpp offset 4031 //pre "movl FullLength, %%eax \n\t" // bring in via eax... 4032 SAVE_FullLength // ...but store for later use 4033 "xorl %%eax, %%eax \n\t" 4034 4035 // Compute the Raw value for the first bpp bytes 4036 // Note: the formula works out to be always 4037 // Paeth(x) = Raw(x) + Prior(x) where x < bpp 4038 "paeth_rlp: \n\t" 4039 "movb (%2," PBX ",), %%al \n\t" 4040 "addb (%1," PBX ",), %%al \n\t" 4041 "incl %%ebx \n\t" 4042 //pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx) 4043 "cmpl %%ecx, %%ebx \n\t" 4044 "movb %%al, -1(%2," PBX ",) \n\t" 4045 "jb paeth_rlp \n\t" 4046 4047 // get # of bytes to alignment (note: computing _delta_ of two pointers, 4048 // so hereafter %%ebp is sufficient even on 64-bit) 4049 "mov %2, " PBP " \n\t" // take start of row 4050 "add " PBX "," PBP " \n\t" // add bpp 4051 "add $0xf, " PBP " \n\t" // add 7+8 to incr past alignment bdry 4052 // "andl $0xfffffff8, %%ebp \n\t" // mask to alignment boundary (32-bit!) 
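// illustrative trace of the alignment math (made-up numbers, not real
// data): row = 0x1005, bpp = 3 -> 0x1005+3+0xf = 0x1017, cleared to
// 0x1010, minus row = 0x0b, i.e. the first offset past the initial
// bpp bytes at which row+x is 8-byte aligned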
4053 CLEAR_BOTTOM_3_BITS PBP "\n\t" // mask to alignment boundary 4054 "sub %2, " PBP " \n\t" // subtract row ptr again => ebp = 4055 "jz paeth_go \n\t" // target value of ebx at alignment 4056 4057 "xorl %%ecx, %%ecx \n\t" 4058 4059 SAVE_r11_r12_r13 4060 4061 // fix alignment 4062 "paeth_lp1: \n\t" 4063 "xorl %%eax, %%eax \n\t" 4064 // pav = p - a = (a + b - c) - a = b - c 4065 "movb (%1," PBX ",), %%al \n\t" // load Prior(x) into al 4066 "movb (%1," PDX ",), %%cl \n\t" // load Prior(x-bpp) into cl 4067 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp) 4068 "movl %%eax, " pa_TEMP " \n\t" // Save pav for later use 4069 "xorl %%eax, %%eax \n\t" 4070 // pbv = p - b = (a + b - c) - b = a - c 4071 "movb (%2," PDX ",), %%al \n\t" // load Raw(x-bpp) into al 4072 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp) 4073 "movl %%eax, %%ecx \n\t" 4074 // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv 4075 "addl " pa_TEMP ", %%eax \n\t" // pcv = pav + pbv 4076 // pc = abs(pcv) 4077 "testl $0x80000000, %%eax \n\t" 4078 "jz paeth_pca \n\t" 4079 "negl %%eax \n\t" // reverse sign of neg values 4080 4081 "paeth_pca: \n\t" 4082 "movl %%eax, " pc_TEMP " \n\t" // save pc for later use 4083 // pb = abs(pbv) 4084 "testl $0x80000000, %%ecx \n\t" 4085 "jz paeth_pba \n\t" 4086 "negl %%ecx \n\t" // reverse sign of neg values 4087 4088 "paeth_pba: \n\t" 4089 "movl %%ecx, " pb_TEMP " \n\t" // save pb for later use 4090 // pa = abs(pav) 4091 "movl " pa_TEMP ", %%eax \n\t" 4092 "testl $0x80000000, %%eax \n\t" 4093 "jz paeth_paa \n\t" 4094 "negl %%eax \n\t" // reverse sign of neg values 4095 4096 "paeth_paa: \n\t" 4097 "movl %%eax, " pa_TEMP " \n\t" // save pa for later use 4098 // test if pa <= pb 4099 "cmpl %%ecx, %%eax \n\t" 4100 "jna paeth_abb \n\t" 4101 // pa > pb; now test if pb <= pc 4102 "cmpl " pc_TEMP ", %%ecx \n\t" 4103 "jna paeth_bbc \n\t" 4104 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) 4105 "movb (%1," PDX ",), %%cl \n\t" // load Prior(x-bpp) into cl 4106 "jmp paeth_paeth \n\t" 4107 4108 "paeth_bbc: \n\t" 4109 // pb <= pc; Raw(x) = Paeth(x) + Prior(x) 4110 "movb (%1," PBX ",), %%cl \n\t" // load Prior(x) into cl 4111 "jmp paeth_paeth \n\t" 4112 4113 "paeth_abb: \n\t" 4114 // pa <= pb; now test if pa <= pc 4115 "cmpl " pc_TEMP ", %%eax \n\t" 4116 "jna paeth_abc \n\t" 4117 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) 4118 "movb (%1," PDX ",), %%cl \n\t" // load Prior(x-bpp) into cl 4119 "jmp paeth_paeth \n\t" 4120 4121 "paeth_abc: \n\t" 4122 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) 4123 "movb (%2," PDX ",), %%cl \n\t" // load Raw(x-bpp) into cl 4124 4125 "paeth_paeth: \n\t" 4126 "incl %%ebx \n\t" 4127 "incl %%edx \n\t" 4128 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 4129 "addb %%cl, -1(%2," PBX ",) \n\t" 4130 "cmpl %%ebp, %%ebx \n\t" 4131 "jb paeth_lp1 \n\t" 4132 4133 RESTORE_r11_r12_r13 4134 4135 "paeth_go: \n\t" 4136 RESTORE_FullLength "%%ecx \n\t" // FullLength -> ecx 4137 "movl %%ecx, %%eax \n\t" 4138 "subl %%ebx, %%eax \n\t" // subtract alignment fix 4139 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8 4140 "subl %%eax, %%ecx \n\t" // drop over bytes from original length 4141 //out "movl %%ecx, MMXLength \n\t" 4142 "movl %%ebp, %%eax \n\t" // ebp = diff, but no reg constraint(?) 
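// net effect of the block above: MMXLength = FullLength -
// ((FullLength - diff) & 7), so the MMX loops below always consume
// whole 8-byte groups and the scalar cleanup code at the end of the
// function picks up the 0-7 leftover bytes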
4143 RESTORE_ebp // (could swap ebp and edx functions)
4144 RESTORE_r15
4145 RESTORE_GOT_ebx
4146
4147 : "=c" (MMXLength), // output regs
4148 "=S" (dummy_value_S),
4149 "=D" (dummy_value_D),
4150 "=a" (diff)
4151
4152 : "0" (bpp), // ecx // input regs
4153 "1" (prev_row), // esi/rsi
4154 "2" (row), // edi/rdi
4155 "3" (FullLength) // eax
4156
4157 : "%edx" // clobber list
4158 _CLOBBER_r11_r12_r13
4159 _CLOBBER_r15
4160 _CLOBBER_ebp
4161 _CLOBBER_GOT_ebx
4162 );
4163
4164 // now do the math for the rest of the row
4165 switch (bpp)
4166 {
4167 case 3:
4168 {
4169 // _ShiftBpp = 24; // == bpp * 8
4170 // _ShiftRem = 40; // == 64 - _ShiftBpp
4171
4172 __asm__ __volatile__ (
4173 LOAD_GOT_rbp
4174 // preload "movl diff, %%ecx \n\t"
4175 // preload "movl row, %1 \n\t" // edi/rdi
4176 // preload "movl prev_row, %0 \n\t" // esi/rsi
4177 "pxor %%mm0, %%mm0 \n\t"
4178
4179 // prime the pump: load the first Raw(x-bpp) data set
4180 "movq -8(%1," PCX ",), %%mm1 \n\t"
4181 "paeth_3lp: \n\t"
4182 "psrlq $40, %%mm1 \n\t" // shift last 3 bytes to 1st
4183 // 3 bytes
4184 "movq (%0," PCX ",), %%mm2 \n\t" // load b=Prior(x)
4185 "punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
4186 "movq -8(%0," PCX ",), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
4187 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4188 "psrlq $40, %%mm3 \n\t" // shift last 3 bytes to 1st
4189 // 3 bytes
4190 // pav = p - a = (a + b - c) - a = b - c
4191 "movq %%mm2, %%mm4 \n\t"
4192 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4193 // pbv = p - b = (a + b - c) - b = a - c
4194 "movq %%mm1, %%mm5 \n\t"
4195 "psubw %%mm3, %%mm4 \n\t"
4196 "pxor %%mm7, %%mm7 \n\t"
4197 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4198 "movq %%mm4, %%mm6 \n\t"
4199 "psubw %%mm3, %%mm5 \n\t"
4200
4201 // pa = abs(p-a) = abs(pav)
4202 // pb = abs(p-b) = abs(pbv)
4203 // pc = abs(p-c) = abs(pcv)
4204 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4205 "paddw %%mm5, %%mm6 \n\t"
4206 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4207 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4208 "psubw %%mm0, %%mm4 \n\t"
4209 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4210 "psubw %%mm0, %%mm4 \n\t"
4211 "psubw %%mm7, %%mm5 \n\t"
4212 "pxor %%mm0, %%mm0 \n\t"
4213 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4214 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4215 "psubw %%mm7, %%mm5 \n\t"
4216 "psubw %%mm0, %%mm6 \n\t"
4217 // test pa <= pb
4218 "movq %%mm4, %%mm7 \n\t"
4219 "psubw %%mm0, %%mm6 \n\t"
4220 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4221 "movq %%mm7, %%mm0 \n\t"
4222 // use mm7 mask to merge pa & pb
4223 "pand %%mm7, %%mm5 \n\t"
4224 // use mm0 mask copy to merge a & b
4225 "pand %%mm0, %%mm2 \n\t"
4226 "pandn %%mm4, %%mm7 \n\t"
4227 "pandn %%mm1, %%mm0 \n\t"
4228 "paddw %%mm5, %%mm7 \n\t"
4229 "paddw %%mm2, %%mm0 \n\t"
4230 // test ((pa <= pb)? pa:pb) <= pc
4231 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
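// branchless selection, per 16-bit lane (a sketch of what the
// pand/pandn/paddw triple below computes; pred_ab is the already-
// merged (pa <= pb ? a : b) value built just above):
//
//    mask = (pab > pc) ? 0xffff : 0;          // pcmpgtw
//    pred = (mask & c) + (~mask & pred_ab);   // pand/pandn/paddw
//
// paddw acts as OR here because the two masked terms are disjoint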
4232 "pxor %%mm1, %%mm1 \n\t"
4233 "pand %%mm7, %%mm3 \n\t"
4234 "pandn %%mm0, %%mm7 \n\t"
4235 "paddw %%mm3, %%mm7 \n\t"
4236 "pxor %%mm0, %%mm0 \n\t"
4237 "packuswb %%mm1, %%mm7 \n\t"
4238 "movq (%0," PCX ",), %%mm3 \n\t" // load c=Prior(x-bpp)
4239 "pand " AMASK5_3_0 ", %%mm7 \n\t" // _amask5_3_0 (was _ActiveMask)
4240 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
4241 "paddb (%1," PCX ",), %%mm7 \n\t" // add Paeth predictor + Raw(x)
4242 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4243 "movq %%mm7, (%1," PCX ",) \n\t" // write back updated value
4244 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as
4245 // Raw(x-bpp)
4246 // now do Paeth for 2nd set of bytes (3-5)
4247 "psrlq $24, %%mm2 \n\t" // load b=Prior(x) step 2
4248 "punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
4249 "pxor %%mm7, %%mm7 \n\t"
4250 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4251 // pbv = p - b = (a + b - c) - b = a - c
4252 "movq %%mm1, %%mm5 \n\t"
4253 // pav = p - a = (a + b - c) - a = b - c
4254 "movq %%mm2, %%mm4 \n\t"
4255 "psubw %%mm3, %%mm5 \n\t"
4256 "psubw %%mm3, %%mm4 \n\t"
4257 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
4258 // pav + pbv = pbv + pav
4259 "movq %%mm5, %%mm6 \n\t"
4260 "paddw %%mm4, %%mm6 \n\t"
4261
4262 // pa = abs(p-a) = abs(pav)
4263 // pb = abs(p-b) = abs(pbv)
4264 // pc = abs(p-c) = abs(pcv)
4265 "pcmpgtw %%mm5, %%mm0 \n\t" // create mask pbv bytes < 0
4266 "pcmpgtw %%mm4, %%mm7 \n\t" // create mask pav bytes < 0
4267 "pand %%mm5, %%mm0 \n\t" // only pbv bytes < 0 in mm0
4268 "pand %%mm4, %%mm7 \n\t" // only pav bytes < 0 in mm7
4269 "psubw %%mm0, %%mm5 \n\t"
4270 "psubw %%mm7, %%mm4 \n\t"
4271 "psubw %%mm0, %%mm5 \n\t"
4272 "psubw %%mm7, %%mm4 \n\t"
4273 "pxor %%mm0, %%mm0 \n\t"
4274 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4275 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4276 "psubw %%mm0, %%mm6 \n\t"
4277 // test pa <= pb
4278 "movq %%mm4, %%mm7 \n\t"
4279 "psubw %%mm0, %%mm6 \n\t"
4280 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4281 "movq %%mm7, %%mm0 \n\t"
4282 // use mm7 mask to merge pa & pb
4283 "pand %%mm7, %%mm5 \n\t"
4284 // use mm0 mask copy to merge a & b
4285 "pand %%mm0, %%mm2 \n\t"
4286 "pandn %%mm4, %%mm7 \n\t"
4287 "pandn %%mm1, %%mm0 \n\t"
4288 "paddw %%mm5, %%mm7 \n\t"
4289 "paddw %%mm2, %%mm0 \n\t"
4290 // test ((pa <= pb)? pa:pb) <= pc
4291 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
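// for reference, the scalar predictor each of these word-lane blocks
// implements (per the PNG spec; a = left, b = above, c = upper-left):
//
//    p  = a + b - c;
//    pa = abs(p - a); pb = abs(p - b); pc = abs(p - c);
//    pred = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;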
4292 "movq (%0," PCX ",), %%mm2 \n\t" // load b=Prior(x)
4293 "pand %%mm7, %%mm3 \n\t"
4294 "pandn %%mm0, %%mm7 \n\t"
4295 "pxor %%mm1, %%mm1 \n\t"
4296 "paddw %%mm3, %%mm7 \n\t"
4297 "pxor %%mm0, %%mm0 \n\t"
4298 "packuswb %%mm1, %%mm7 \n\t"
4299 "movq %%mm2, %%mm3 \n\t" // load c=Prior(x-bpp) step 1
4300 "pand " AMASK5_3_0 ", %%mm7 \n\t" // _amask5_3_0 (was _ActiveMask)
4301 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4302 "psllq $24, %%mm7 \n\t" // shift bytes to 2nd group of
4303 // 3 bytes
4304 // pav = p - a = (a + b - c) - a = b - c
4305 "movq %%mm2, %%mm4 \n\t"
4306 "paddb (%1," PCX ",), %%mm7 \n\t" // add Paeth predictor + Raw(x)
4307 "psllq $24, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
4308 "movq %%mm7, (%1," PCX ",) \n\t" // write back updated value
4309 "movq %%mm7, %%mm1 \n\t"
4310 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4311 "psllq $24, %%mm1 \n\t" // shift bytes (was _ShiftBpp)
4312 // now mm1 will be used as Raw(x-bpp)
4313 // now do Paeth for 3rd, and final, set of bytes (6-7)
4314 "pxor %%mm7, %%mm7 \n\t"
4315 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
4316 "psubw %%mm3, %%mm4 \n\t"
4317 // pbv = p - b = (a + b - c) - b = a - c
4318 "movq %%mm1, %%mm5 \n\t"
4319 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4320 "movq %%mm4, %%mm6 \n\t"
4321 "psubw %%mm3, %%mm5 \n\t"
4322 "pxor %%mm0, %%mm0 \n\t"
4323 "paddw %%mm5, %%mm6 \n\t"
4324
4325 // pa = abs(p-a) = abs(pav)
4326 // pb = abs(p-b) = abs(pbv)
4327 // pc = abs(p-c) = abs(pcv)
4328 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4329 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4330 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4331 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4332 "psubw %%mm0, %%mm4 \n\t"
4333 "psubw %%mm7, %%mm5 \n\t"
4334 "psubw %%mm0, %%mm4 \n\t"
4335 "psubw %%mm7, %%mm5 \n\t"
4336 "pxor %%mm0, %%mm0 \n\t"
4337 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4338 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4339 "psubw %%mm0, %%mm6 \n\t"
4340 // test pa <= pb
4341 "movq %%mm4, %%mm7 \n\t"
4342 "psubw %%mm0, %%mm6 \n\t"
4343 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4344 "movq %%mm7, %%mm0 \n\t"
4345 // use mm0 mask copy to merge a & b
4346 "pand %%mm0, %%mm2 \n\t"
4347 // use mm7 mask to merge pa & pb
4348 "pand %%mm7, %%mm5 \n\t"
4349 "pandn %%mm1, %%mm0 \n\t"
4350 "pandn %%mm4, %%mm7 \n\t"
4351 "paddw %%mm2, %%mm0 \n\t"
4352 "paddw %%mm5, %%mm7 \n\t"
4353 // test ((pa <= pb)? pa:pb) <= pc
4354 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
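// abs() without an MMX abs instruction: with m = (v < 0 ? v : 0),
// i.e., the pcmpgtw mask ANDed with v, the doubled psubw above
// computes v - m - m == |v| in each 16-bit lane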
4355 "pand %%mm7, %%mm3 \n\t"
4356 "pandn %%mm0, %%mm7 \n\t"
4357 "paddw %%mm3, %%mm7 \n\t"
4358 "pxor %%mm1, %%mm1 \n\t"
4359 "packuswb %%mm7, %%mm1 \n\t"
4360 // step ecx to next set of 8 bytes and repeat loop til done
4361 "addl $8, %%ecx \n\t"
4362 "pand " AMASK0_2_6 ", %%mm1 \n\t" // _amask0_2_6 (_ActiveMaskEnd)
4363 "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
4364 "cmpl %%eax, %%ecx \n\t" // MMXLength
4365 "pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
4366 "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
4367 // mm1 will be used as Raw(x-bpp) next loop
4368 // mm3 ready to be used as Prior(x-bpp) next loop
4369 "jb paeth_3lp \n\t"
4370 RESTORE_rbp
4371
4372 : "=S" (dummy_value_S), // output regs (dummy)
4373 "=D" (dummy_value_D),
4374 "=c" (dummy_value_c),
4375 "=a" (dummy_value_a)
4376
4377 : "0" (prev_row), // esi/rsi // input regs
4378 "1" (row), // edi/rdi
4379 "2" (diff), // ecx
4380 "3" (MMXLength) // eax
4381
4382 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
4383 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
4384 , "%mm4", "%mm5", "%mm6", "%mm7"
4385 #endif
4386 );
4387 }
4388 break; // end 3 bpp
4389
4390 case 4:
4391 {
4392 __asm__ __volatile__ (
4393 // preload "movl diff, %%ecx \n\t"
4394 // preload "movl row, %1 \n\t" // edi/rdi
4395 // preload "movl prev_row, %0 \n\t" // esi/rsi
4396 "pxor %%mm0, %%mm0 \n\t"
4397 // prime the pump: load the first Raw(x-bpp) data set
4398 "movq -8(%1," PCX ",), %%mm1 \n\t" // the only time we should need
4399 // to read a=Raw(x-bpp) bytes
4400 "paeth_4lp: \n\t"
4401 // do first set of 4 bytes
4402 "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4403 "punpckhbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4404 "movq (%0," PCX ",), %%mm2 \n\t" // load b=Prior(x)
4405 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4406 // pav = p - a = (a + b - c) - a = b - c
4407 "movq %%mm2, %%mm4 \n\t"
4408 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4409 // pbv = p - b = (a + b - c) - b = a - c
4410 "movq %%mm1, %%mm5 \n\t"
4411 "psubw %%mm3, %%mm4 \n\t"
4412 "pxor %%mm7, %%mm7 \n\t"
4413 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4414 "movq %%mm4, %%mm6 \n\t"
4415 "psubw %%mm3, %%mm5 \n\t"
4416 // pa = abs(p-a) = abs(pav)
4417 // pb = abs(p-b) = abs(pbv)
4418 // pc = abs(p-c) = abs(pcv)
4419 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4420 "paddw %%mm5, %%mm6 \n\t"
4421 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4422 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4423 "psubw %%mm0, %%mm4 \n\t"
4424 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4425 "psubw %%mm0, %%mm4 \n\t"
4426 "psubw %%mm7, %%mm5 \n\t"
4427 "pxor %%mm0, %%mm0 \n\t"
4428 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4429 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4430 "psubw %%mm7, %%mm5 \n\t"
4431 "psubw %%mm0, %%mm6 \n\t"
4432 // test pa <= pb
4433 "movq %%mm4, %%mm7 \n\t"
4434 "psubw %%mm0, %%mm6 \n\t"
4435 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4436 "movq %%mm7, %%mm0 \n\t"
4437 // use mm7 mask to merge pa & pb
4438 "pand %%mm7, %%mm5 \n\t"
4439 // use mm0 mask copy to merge a & b
4440 "pand %%mm0, %%mm2 \n\t"
4441 "pandn %%mm4, %%mm7 \n\t"
4442 "pandn %%mm1, %%mm0 \n\t"
4443 "paddw %%mm5, %%mm7 \n\t"
4444 "paddw %%mm2, %%mm0 \n\t"
4445 // test ((pa <= pb)? pa:pb) <= pc
4446 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
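// (bytes are widened to 16-bit lanes with punpcklbw/punpckhbw against
// zeroed mm0 because pav/pbv/pcv may be negative and as large as
// +/-255, which signed byte arithmetic cannot hold)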
4447 "pxor %%mm1, %%mm1 \n\t"
4448 "pand %%mm7, %%mm3 \n\t"
4449 "pandn %%mm0, %%mm7 \n\t"
4450 "paddw %%mm3, %%mm7 \n\t"
4451 "pxor %%mm0, %%mm0 \n\t"
4452 "packuswb %%mm1, %%mm7 \n\t"
4453 "movq (%0," PCX ",), %%mm3 \n\t" // load c=Prior(x-bpp)
4454 LOAD_GOT_rbp
4455 "pand " AMASK4_4_0 ", %%mm7 \n\t" // _amask4_4_0 (was _ActiveMask)
4456 RESTORE_rbp
4457 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
4458 "paddb (%1," PCX ",), %%mm7 \n\t" // add Paeth predictor + Raw(x)
4459 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4460 "movq %%mm7, (%1," PCX ",) \n\t" // write back updated value
4461 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as
4462 // Raw(x-bpp)
4463 // do second set of 4 bytes
4464 "punpckhbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4465 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4466 // pav = p - a = (a + b - c) - a = b - c
4467 "movq %%mm2, %%mm4 \n\t"
4468 // pbv = p - b = (a + b - c) - b = a - c
4469 "movq %%mm1, %%mm5 \n\t"
4470 "psubw %%mm3, %%mm4 \n\t"
4471 "pxor %%mm7, %%mm7 \n\t"
4472 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4473 "movq %%mm4, %%mm6 \n\t"
4474 "psubw %%mm3, %%mm5 \n\t"
4475 // pa = abs(p-a) = abs(pav)
4476 // pb = abs(p-b) = abs(pbv)
4477 // pc = abs(p-c) = abs(pcv)
4478 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4479 "paddw %%mm5, %%mm6 \n\t"
4480 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4481 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4482 "psubw %%mm0, %%mm4 \n\t"
4483 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4484 "psubw %%mm0, %%mm4 \n\t"
4485 "psubw %%mm7, %%mm5 \n\t"
4486 "pxor %%mm0, %%mm0 \n\t"
4487 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4488 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4489 "psubw %%mm7, %%mm5 \n\t"
4490 "psubw %%mm0, %%mm6 \n\t"
4491 // test pa <= pb
4492 "movq %%mm4, %%mm7 \n\t"
4493 "psubw %%mm0, %%mm6 \n\t"
4494 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4495 "movq %%mm7, %%mm0 \n\t"
4496 // use mm7 mask to merge pa & pb
4497 "pand %%mm7, %%mm5 \n\t"
4498 // use mm0 mask copy to merge a & b
4499 "pand %%mm0, %%mm2 \n\t"
4500 "pandn %%mm4, %%mm7 \n\t"
4501 "pandn %%mm1, %%mm0 \n\t"
4502 "paddw %%mm5, %%mm7 \n\t"
4503 "paddw %%mm2, %%mm0 \n\t"
4504 // test ((pa <= pb)? pa:pb) <= pc
4505 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4506 "pxor %%mm1, %%mm1 \n\t"
4507 "pand %%mm7, %%mm3 \n\t"
4508 "pandn %%mm0, %%mm7 \n\t"
4509 "pxor %%mm1, %%mm1 \n\t"
4510 "paddw %%mm3, %%mm7 \n\t"
4511 "pxor %%mm0, %%mm0 \n\t"
4512 // step ecx to next set of 8 bytes and repeat loop til done
4513 "addl $8, %%ecx \n\t"
4514 "packuswb %%mm7, %%mm1 \n\t"
4515 "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
4516 "cmpl %%eax, %%ecx \n\t" // MMXLength
4517 "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
4518 // mm1 will be used as Raw(x-bpp) next loop
4519 "jb paeth_4lp \n\t"
4520
4521 : "=S" (dummy_value_S), // output regs (dummy)
4522 "=D" (dummy_value_D),
4523 "=c" (dummy_value_c),
4524 "=a" (dummy_value_a)
4525
4526 : "0" (prev_row), // esi/rsi // input regs
4527 "1" (row), // edi/rdi
4528 "2" (diff), // ecx
4529 "3" (MMXLength) // eax
4530
4531 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
4532 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
4533 , "%mm4", "%mm5", "%mm6", "%mm7"
4534 #endif
4535 );
4536 }
4537 break; // end 4 bpp
4538
4539 case 1:
4540 case 2:
4541 {
4542 __asm__ __volatile__ (
4543 // preload "movl diff, %%eax \n\t" // eax: x = offset to align.
bdry 4544 // preload "movl FullLength, %%edx \n\t" 4545 "cmpl %%edx, %%eax \n\t" 4546 "jnb paeth_dend \n\t" 4547 4548 SAVE_ebp 4549 4550 // preload "movl row, %2 \n\t" // edi/rdi 4551 // do Paeth decode for remaining bytes 4552 // preload "movl prev_row, %1 \n\t" // esi/rsi 4553 "movl %%eax, %%ebp \n\t" 4554 // preload "subl bpp, %%ebp \n\t" // (bpp is preloaded into ecx) 4555 "subl %%ecx, %%ebp \n\t" // ebp = eax - bpp 4556 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx 4557 4558 SAVE_GOT_ebx 4559 SAVE_r11_r12_r13 4560 4561 "paeth_dlp: \n\t" 4562 "xorl %%ebx, %%ebx \n\t" 4563 // pav = p - a = (a + b - c) - a = b - c 4564 "movb (%1," PAX ",), %%bl \n\t" // load Prior(x) into bl 4565 "movb (%1," PBP ",), %%cl \n\t" // load Prior(x-bpp) into cl 4566 "subl %%ecx, %%ebx \n\t" // subtract Prior(x-bpp) 4567 "movl %%ebx, " pa_TEMP " \n\t" // Save pav for later use 4568 "xorl %%ebx, %%ebx \n\t" 4569 // pbv = p - b = (a + b - c) - b = a - c 4570 "movb (%2," PBP ",), %%bl \n\t" // load Raw(x-bpp) into bl 4571 "subl %%ecx, %%ebx \n\t" // subtract Prior(x-bpp) 4572 "movl %%ebx, %%ecx \n\t" 4573 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv 4574 "addl " pa_TEMP ", %%ebx \n\t" // pcv = pav + pbv 4575 // pc = abs(pcv) 4576 "testl $0x80000000, %%ebx \n\t" 4577 "jz paeth_dpca \n\t" 4578 "negl %%ebx \n\t" // reverse sign of neg values 4579 4580 "paeth_dpca: \n\t" 4581 "movl %%ebx, " pc_TEMP " \n\t" // save pc for later use 4582 // pb = abs(pbv) 4583 "testl $0x80000000, %%ecx \n\t" 4584 "jz paeth_dpba \n\t" 4585 "negl %%ecx \n\t" // reverse sign of neg values 4586 4587 "paeth_dpba: \n\t" 4588 "movl %%ecx, " pb_TEMP " \n\t" // save pb for later use 4589 // pa = abs(pav) 4590 "movl " pa_TEMP ", %%ebx \n\t" 4591 "testl $0x80000000, %%ebx \n\t" 4592 "jz paeth_dpaa \n\t" 4593 "negl %%ebx \n\t" // reverse sign of neg values 4594 4595 "paeth_dpaa: \n\t" 4596 "movl %%ebx, " pa_TEMP " \n\t" // save pa for later use 4597 // test if pa <= pb 4598 "cmpl %%ecx, %%ebx \n\t" 4599 "jna paeth_dabb \n\t" 4600 // pa > pb; now test if pb <= pc 4601 "cmpl " pc_TEMP ", %%ecx \n\t" 4602 "jna paeth_dbbc \n\t" 4603 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) 4604 "movb (%1," PBP ",), %%cl \n\t" // load Prior(x-bpp) into cl 4605 "jmp paeth_dpaeth \n\t" 4606 4607 "paeth_dbbc: \n\t" 4608 // pb <= pc; Raw(x) = Paeth(x) + Prior(x) 4609 "movb (%1," PAX ",), %%cl \n\t" // load Prior(x) into cl 4610 "jmp paeth_dpaeth \n\t" 4611 4612 "paeth_dabb: \n\t" 4613 // pa <= pb; now test if pa <= pc 4614 "cmpl " pc_TEMP ", %%ebx \n\t" 4615 "jna paeth_dabc \n\t" 4616 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) 4617 "movb (%1," PBP ",), %%cl \n\t" // load Prior(x-bpp) into cl 4618 "jmp paeth_dpaeth \n\t" 4619 4620 "paeth_dabc: \n\t" 4621 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) 4622 "movb (%2," PBP ",), %%cl \n\t" // load Raw(x-bpp) into cl 4623 4624 "paeth_dpaeth: \n\t" 4625 "incl %%eax \n\t" 4626 "incl %%ebp \n\t" 4627 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 4628 "addb %%cl, -1(%2," PAX ",) \n\t" 4629 "cmpl %%edx, %%eax \n\t" // check against FullLength 4630 "jb paeth_dlp \n\t" 4631 4632 RESTORE_r11_r12_r13 4633 RESTORE_GOT_ebx 4634 RESTORE_ebp 4635 4636 "paeth_dend: \n\t" 4637 4638 : "=c" (dummy_value_c), // output regs (dummy) 4639 "=S" (dummy_value_S), 4640 "=D" (dummy_value_D), 4641 "=a" (dummy_value_a), 4642 "=d" (dummy_value_d) 4643 4644 : "0" (bpp), // ecx // input regs 4645 "1" (prev_row), // esi/rsi 4646 "2" (row), // edi/rdi 4647 "3" (diff), // eax 4648 "4" (FullLength) // edx 4649 4650 
CLOB_COLON_ebx_ebp_r1X // clobber list
4651 CLOBBER_GOT_ebx
4652 CLOB_COMMA_ebx_ebp
4653 CLOBBER_ebp
4654 CLOB_COMMA_ebX_r1X
4655 CLOBBER_r11_r12_r13
4656 );
4657 }
4658 return; // end 1 or 2 bpp (no need to go further with this one)
4659
4660 case 6:
4661 {
4662 // _ActiveMask2 = 0xffffffff00000000LL; // NOT USED ("_amask_0_4_4")
4663 // _ShiftBpp = 48; // bpp << 3 == bpp * 8
4664 // _ShiftRem = 16; // 64 - _ShiftBpp
4665
4666 __asm__ __volatile__ (
4667 // preload "movl diff, %%ecx \n\t"
4668 // preload "movl row, %1 \n\t" // edi/rdi
4669 // preload "movl prev_row, %0 \n\t" // esi/rsi
4670 // prime the pump: load the first Raw(x-bpp) data set
4671 "movq -8(%1," PCX ",), %%mm1 \n\t"
4672 "pxor %%mm0, %%mm0 \n\t"
4673
4674 "paeth_6lp: \n\t"
4675 // must shift to position Raw(x-bpp) data
4676 "psrlq $16, %%mm1 \n\t" // was _ShiftRem
4677 // do first set of 4 bytes
4678 "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4679 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4680 "movq (%0," PCX ",), %%mm2 \n\t" // load b=Prior(x)
4681 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4682 // must shift to position Prior(x-bpp) data
4683 "psrlq $16, %%mm3 \n\t" // was _ShiftRem
4684 // pav = p - a = (a + b - c) - a = b - c
4685 "movq %%mm2, %%mm4 \n\t"
4686 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
4687 // pbv = p - b = (a + b - c) - b = a - c
4688 "movq %%mm1, %%mm5 \n\t"
4689 "psubw %%mm3, %%mm4 \n\t"
4690 "pxor %%mm7, %%mm7 \n\t"
4691 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4692 "movq %%mm4, %%mm6 \n\t"
4693 "psubw %%mm3, %%mm5 \n\t"
4694 // pa = abs(p-a) = abs(pav)
4695 // pb = abs(p-b) = abs(pbv)
4696 // pc = abs(p-c) = abs(pcv)
4697 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4698 "paddw %%mm5, %%mm6 \n\t"
4699 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4700 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4701 "psubw %%mm0, %%mm4 \n\t"
4702 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4703 "psubw %%mm0, %%mm4 \n\t"
4704 "psubw %%mm7, %%mm5 \n\t"
4705 "pxor %%mm0, %%mm0 \n\t"
4706 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4707 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4708 "psubw %%mm7, %%mm5 \n\t"
4709 "psubw %%mm0, %%mm6 \n\t"
4710 // test pa <= pb
4711 "movq %%mm4, %%mm7 \n\t"
4712 "psubw %%mm0, %%mm6 \n\t"
4713 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4714 "movq %%mm7, %%mm0 \n\t"
4715 // use mm7 mask to merge pa & pb
4716 "pand %%mm7, %%mm5 \n\t"
4717 // use mm0 mask copy to merge a & b
4718 "pand %%mm0, %%mm2 \n\t"
4719 "pandn %%mm4, %%mm7 \n\t"
4720 "pandn %%mm1, %%mm0 \n\t"
4721 "paddw %%mm5, %%mm7 \n\t"
4722 "paddw %%mm2, %%mm0 \n\t"
4723 // test ((pa <= pb)? pa:pb) <= pc
4724 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4725 "pxor %%mm1, %%mm1 \n\t"
4726 "pand %%mm7, %%mm3 \n\t"
4727 "pandn %%mm0, %%mm7 \n\t"
4728 "paddw %%mm3, %%mm7 \n\t"
4729 "pxor %%mm0, %%mm0 \n\t"
4730 "packuswb %%mm1, %%mm7 \n\t"
4731 "movq -8(%0," PCX ",), %%mm3 \n\t" // load c=Prior(x-bpp)
4732 LOAD_GOT_rbp
4733 "pand " AMASK4_4_0 ", %%mm7 \n\t" // _amask4_4_0 (was _ActiveMask)
4734 RESTORE_rbp
4735 "psrlq $16, %%mm3 \n\t"
4736 "movq (%0," PCX ",), %%mm2 \n\t" // load b=Prior(x) step 1
4737 "paddb (%1," PCX ",), %%mm7 \n\t" // add Paeth predictor + Raw(x)
4738 "movq %%mm2, %%mm6 \n\t"
4739 "movq %%mm7, (%1," PCX ",) \n\t" // write back updated value
4740 "movq -8(%1," PCX ",), %%mm1 \n\t"
4741 "psllq $48, %%mm6 \n\t" // bpp * 8 = bits per pixel
4742 "movq %%mm7, %%mm5 \n\t"
4743 "psrlq $16, %%mm1 \n\t" // 64 - (bpp * 8) = remainder
4744 "por %%mm6, %%mm3 \n\t"
4745 "psllq $48, %%mm5 \n\t" // was _ShiftBpp
4746 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4747 "por %%mm5, %%mm1 \n\t"
4748 // do second set of 4 bytes
4749 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4750 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
4751 // pav = p - a = (a + b - c) - a = b - c
4752 "movq %%mm2, %%mm4 \n\t"
4753 // pbv = p - b = (a + b - c) - b = a - c
4754 "movq %%mm1, %%mm5 \n\t"
4755 "psubw %%mm3, %%mm4 \n\t"
4756 "pxor %%mm7, %%mm7 \n\t"
4757 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4758 "movq %%mm4, %%mm6 \n\t"
4759 "psubw %%mm3, %%mm5 \n\t"
4760 // pa = abs(p-a) = abs(pav)
4761 // pb = abs(p-b) = abs(pbv)
4762 // pc = abs(p-c) = abs(pcv)
4763 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4764 "paddw %%mm5, %%mm6 \n\t"
4765 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4766 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4767 "psubw %%mm0, %%mm4 \n\t"
4768 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4769 "psubw %%mm0, %%mm4 \n\t"
4770 "psubw %%mm7, %%mm5 \n\t"
4771 "pxor %%mm0, %%mm0 \n\t"
4772 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4773 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4774 "psubw %%mm7, %%mm5 \n\t"
4775 "psubw %%mm0, %%mm6 \n\t"
4776 // test pa <= pb
4777 "movq %%mm4, %%mm7 \n\t"
4778 "psubw %%mm0, %%mm6 \n\t"
4779 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4780 "movq %%mm7, %%mm0 \n\t"
4781 // use mm7 mask to merge pa & pb
4782 "pand %%mm7, %%mm5 \n\t"
4783 // use mm0 mask copy to merge a & b
4784 "pand %%mm0, %%mm2 \n\t"
4785 "pandn %%mm4, %%mm7 \n\t"
4786 "pandn %%mm1, %%mm0 \n\t"
4787 "paddw %%mm5, %%mm7 \n\t"
4788 "paddw %%mm2, %%mm0 \n\t"
4789 // test ((pa <= pb)? pa:pb) <= pc
4790 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
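// (for bpp == 6 the second set of lanes straddles two quadwords, so
// the a and c inputs are stitched together above with psrlq/psllq/por
// before being unpacked)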
4791 "pxor %%mm1, %%mm1 \n\t"
4792 "pand %%mm7, %%mm3 \n\t"
4793 "pandn %%mm0, %%mm7 \n\t"
4794 "pxor %%mm1, %%mm1 \n\t"
4795 "paddw %%mm3, %%mm7 \n\t"
4796 "pxor %%mm0, %%mm0 \n\t"
4797 // step ecx to next set of 8 bytes and repeat loop til done
4798 "addl $8, %%ecx \n\t"
4799 "packuswb %%mm7, %%mm1 \n\t"
4800 "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
4801 "cmpl %%eax, %%ecx \n\t" // MMXLength
4802 "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
4803 // mm1 will be used as Raw(x-bpp) next loop
4804 "jb paeth_6lp \n\t"
4805
4806 : "=S" (dummy_value_S), // output regs (dummy)
4807 "=D" (dummy_value_D),
4808 "=c" (dummy_value_c),
4809 "=a" (dummy_value_a)
4810
4811 : "0" (prev_row), // esi/rsi // input regs
4812 "1" (row), // edi/rdi
4813 "2" (diff), // ecx
4814 "3" (MMXLength) // eax
4815
4816 #if defined(CLOBBER_MMX_REGS_SUPPORTED)
4817 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
4818 , "%mm4", "%mm5", "%mm6", "%mm7"
4819 #endif
4820 );
4821 }
4822 break; // end 6 bpp
4823
4824 case 8: // bpp == 8
4825 {
4826 __asm__ __volatile__ (
4827 // preload "movl diff, %%ecx \n\t"
4828 // preload "movl row, %1 \n\t" // edi/rdi
4829 // preload "movl prev_row, %0 \n\t" // esi/rsi
4830 "pxor %%mm0, %%mm0 \n\t"
4831 // prime the pump: load the first Raw(x-bpp) data set
4832 "movq -8(%1," PCX ",), %%mm1 \n\t" // the only time we should need
4833 // to read a=Raw(x-bpp) bytes
4834 "paeth_8lp: \n\t"
4835 // do first set of 4 bytes
4836 "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4837 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4838 "movq (%0," PCX ",), %%mm2 \n\t" // load b=Prior(x)
4839 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4840 // pav = p - a = (a + b - c) - a = b - c
4841 "movq %%mm2, %%mm4 \n\t"
4842 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
4843 // pbv = p - b = (a + b - c) - b = a - c
4844 "movq %%mm1, %%mm5 \n\t"
4845 "psubw %%mm3, %%mm4 \n\t"
4846 "pxor %%mm7, %%mm7 \n\t"
4847 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4848 "movq %%mm4, %%mm6 \n\t"
4849 "psubw %%mm3, %%mm5 \n\t"
4850 // pa = abs(p-a) = abs(pav)
4851 // pb = abs(p-b) = abs(pbv)
4852 // pc = abs(p-c) = abs(pcv)
4853 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4854 "paddw %%mm5, %%mm6 \n\t"
4855 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4856 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4857 "psubw %%mm0, %%mm4 \n\t"
4858 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4859 "psubw %%mm0, %%mm4 \n\t"
4860 "psubw %%mm7, %%mm5 \n\t"
4861 "pxor %%mm0, %%mm0 \n\t"
4862 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4863 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4864 "psubw %%mm7, %%mm5 \n\t"
4865 "psubw %%mm0, %%mm6 \n\t"
4866 // test pa <= pb
4867 "movq %%mm4, %%mm7 \n\t"
4868 "psubw %%mm0, %%mm6 \n\t"
4869 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4870 "movq %%mm7, %%mm0 \n\t"
4871 // use mm7 mask to merge pa & pb
4872 "pand %%mm7, %%mm5 \n\t"
4873 // use mm0 mask copy to merge a & b
4874 "pand %%mm0, %%mm2 \n\t"
4875 "pandn %%mm4, %%mm7 \n\t"
4876 "pandn %%mm1, %%mm0 \n\t"
4877 "paddw %%mm5, %%mm7 \n\t"
4878 "paddw %%mm2, %%mm0 \n\t"
4879 // test ((pa <= pb)? pa:pb) <= pc
4880 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4881 "pxor %%mm1, %%mm1 \n\t"
4882 "pand %%mm7, %%mm3 \n\t"
4883 "pandn %%mm0, %%mm7 \n\t"
4884 "paddw %%mm3, %%mm7 \n\t"
4885 "pxor %%mm0, %%mm0 \n\t"
4886 "packuswb %%mm1, %%mm7 \n\t"
4887 "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4888 LOAD_GOT_rbp
4889 "pand " AMASK4_4_0 ", %%mm7 \n\t" // _amask4_4_0 (was _ActiveMask)
4890 RESTORE_rbp
4891 "movq (%0," PCX ",), %%mm2 \n\t" // load b=Prior(x)
4892 "paddb (%1," PCX ",), %%mm7 \n\t" // add Paeth predictor + Raw(x)
4893 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4894 "movq %%mm7, (%1," PCX ",) \n\t" // write back updated value
4895 "movq -8(%1," PCX ",), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
4896
4897 // do second set of 4 bytes
4898 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4899 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
4900 // pav = p - a = (a + b - c) - a = b - c
4901 "movq %%mm2, %%mm4 \n\t"
4902 // pbv = p - b = (a + b - c) - b = a - c
4903 "movq %%mm1, %%mm5 \n\t"
4904 "psubw %%mm3, %%mm4 \n\t"
4905 "pxor %%mm7, %%mm7 \n\t"
4906 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4907 "movq %%mm4, %%mm6 \n\t"
4908 "psubw %%mm3, %%mm5 \n\t"
4909 // pa = abs(p-a) = abs(pav)
4910 // pb = abs(p-b) = abs(pbv)
4911 // pc = abs(p-c) = abs(pcv)
4912 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4913 "paddw %%mm5, %%mm6 \n\t"
4914 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4915 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4916 "psubw %%mm0, %%mm4 \n\t"
4917 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4918 "psubw %%mm0, %%mm4 \n\t"
4919 "psubw %%mm7, %%mm5 \n\t"
4920 "pxor %%mm0, %%mm0 \n\t"
4921 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4922 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4923 "psubw %%mm7, %%mm5 \n\t"
4924 "psubw %%mm0, %%mm6 \n\t"
4925 // test pa <= pb
4926 "movq %%mm4, %%mm7 \n\t"
4927 "psubw %%mm0, %%mm6 \n\t"
4928 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4929 "movq %%mm7, %%mm0 \n\t"
4930 // use mm7 mask to merge pa & pb
4931 "pand %%mm7, %%mm5 \n\t"
4932 // use mm0 mask copy to merge a & b
4933 "pand %%mm0, %%mm2 \n\t"
4934 "pandn %%mm4, %%mm7 \n\t"
4935 "pandn %%mm1, %%mm0 \n\t"
4936 "paddw %%mm5, %%mm7 \n\t"
4937 "paddw %%mm2, %%mm0 \n\t"
4938 // test ((pa <= pb)? pa:pb) <= pc
4939 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4940 "pxor %%mm1, %%mm1 \n\t" 4941 "pand %%mm7, %%mm3 \n\t" 4942 "pandn %%mm0, %%mm7 \n\t" 4943 "pxor %%mm1, %%mm1 \n\t" 4944 "paddw %%mm3, %%mm7 \n\t" 4945 "pxor %%mm0, %%mm0 \n\t" 4946 // step ecx to next set of 8 bytes and repeat loop til done 4947 "addl $8, %%ecx \n\t" 4948 "packuswb %%mm7, %%mm1 \n\t" 4949 "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x) 4950 "cmpl %%eax, %%ecx \n\t" // MMXLength 4951 "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value 4952 // mm1 will be used as Raw(x-bpp) next loop 4953 "jb paeth_8lp \n\t" 4954 4955 : "=S" (dummy_value_S), // output regs (dummy) 4956 "=D" (dummy_value_D), 4957 "=c" (dummy_value_c), 4958 "=a" (dummy_value_a) 4959 4960 : "0" (prev_row), // esi/rsi // input regs 4961 "1" (row), // edi/rdi 4962 "2" (diff), // ecx 4963 "3" (MMXLength) // eax 4964 4965 #if defined(CLOBBER_MMX_REGS_SUPPORTED) 4966 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list 4967 , "%mm4", "%mm5", "%mm6", "%mm7" 4968 #endif 4969 ); 4970 } 4971 break; // end 8 bpp 4972 4973 default: // bpp != 1,2,3,4,6,8: doesn't exist 4974 { 4975 // ERROR: SHOULD NEVER BE REACHED 4976 #if defined(PNG_DEBUG) 4977 png_debug(1, "Internal libpng logic error (GCC " 4978 "png_read_filter_row_mmx_paeth())\n"); 4979 #endif 4980 } 4981 break; 4982 4983 } // end switch (bpp) 4984 4985 __asm__ __volatile__ ( 4986 // MMX acceleration complete; now do clean-up 4987 // check if any remaining bytes left to decode 4988 //pre "movl FullLength, %%edx \n\t" 4989 //pre "movl MMXLength, %%eax \n\t" 4990 "cmpl %%edx, %%eax \n\t" 4991 "jnb paeth_end \n\t" 4992 4993 SAVE_ebp 4994 4995 //pre "movl row, %2 \n\t" // edi/rdi 4996 //pre "movl prev_row, %1 \n\t" // esi/rsi 4997 // do Paeth decode for remaining bytes 4998 "movl %%eax, %%ebp \n\t" 4999 //pre "subl bpp, %%ebp \n\t" // (bpp is preloaded into ecx) 5000 "subl %%ecx, %%ebp \n\t" // ebp = eax - bpp 5001 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below 5002 5003 SAVE_GOT_ebx 5004 SAVE_r11_r12_r13 5005 5006 "paeth_lp2: \n\t" 5007 "xorl %%ebx, %%ebx \n\t" 5008 // pav = p - a = (a + b - c) - a = b - c 5009 "movb (%1," PAX ",), %%bl \n\t" // load Prior(x) into bl 5010 "movb (%1," PBP ",), %%cl \n\t" // load Prior(x-bpp) into cl 5011 "subl %%ecx, %%ebx \n\t" // subtract Prior(x-bpp) 5012 "movl %%ebx, " pa_TEMP " \n\t" // Save pav for later use 5013 "xorl %%ebx, %%ebx \n\t" 5014 // pbv = p - b = (a + b - c) - b = a - c 5015 "movb (%2," PBP ",), %%bl \n\t" // load Raw(x-bpp) into bl 5016 "subl %%ecx, %%ebx \n\t" // subtract Prior(x-bpp) 5017 "movl %%ebx, %%ecx \n\t" 5018 // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv 5019 "addl " pa_TEMP ", %%ebx \n\t" // pcv = pav + pbv 5020 // pc = abs(pcv) 5021 "testl $0x80000000, %%ebx \n\t" 5022 "jz paeth_pca2 \n\t" 5023 "negl %%ebx \n\t" // reverse sign of neg values 5024 5025 "paeth_pca2: \n\t" 5026 "movl %%ebx, " pc_TEMP " \n\t" // save pc for later use 5027 // pb = abs(pbv) 5028 "testl $0x80000000, %%ecx \n\t" 5029 "jz paeth_pba2 \n\t" 5030 "negl %%ecx \n\t" // reverse sign of neg values 5031 5032 "paeth_pba2: \n\t" 5033 "movl %%ecx, " pb_TEMP " \n\t" // save pb for later use 5034 // pa = abs(pav) 5035 "movl " pa_TEMP ", %%ebx \n\t" 5036 "testl $0x80000000, %%ebx \n\t" 5037 "jz paeth_paa2 \n\t" 5038 "negl %%ebx \n\t" // reverse sign of neg values 5039 5040 "paeth_paa2: \n\t" 5041 "movl %%ebx, " pa_TEMP " \n\t" // save pa for later use 5042 // test if pa <= pb 5043 "cmpl %%ecx, %%ebx \n\t" 5044 "jna paeth_abb2 \n\t" 5045 // pa > pb; now test if pb <= pc 5046 
"cmpl " pc_TEMP ", %%ecx \n\t" 5047 "jna paeth_bbc2 \n\t" 5048 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) 5049 "movb (%1," PBP ",), %%cl \n\t" // load Prior(x-bpp) into cl 5050 "jmp paeth_paeth2 \n\t" 5051 5052 "paeth_bbc2: \n\t" 5053 // pb <= pc; Raw(x) = Paeth(x) + Prior(x) 5054 "movb (%1," PAX ",), %%cl \n\t" // load Prior(x) into cl 5055 "jmp paeth_paeth2 \n\t" 5056 5057 "paeth_abb2: \n\t" 5058 // pa <= pb; now test if pa <= pc 5059 "cmpl " pc_TEMP ", %%ebx \n\t" 5060 "jna paeth_abc2 \n\t" 5061 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) 5062 "movb (%1," PBP ",), %%cl \n\t" // load Prior(x-bpp) into cl 5063 "jmp paeth_paeth2 \n\t" 5064 5065 "paeth_abc2: \n\t" 5066 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) 5067 "movb (%2," PBP ",), %%cl \n\t" // load Raw(x-bpp) into cl 5068 5069 "paeth_paeth2: \n\t" 5070 "incl %%eax \n\t" 5071 "incl %%ebp \n\t" 5072 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 5073 "addb %%cl, -1(%2," PAX ",) \n\t" 5074 "cmpl %%edx, %%eax \n\t" // check against FullLength 5075 "jb paeth_lp2 \n\t" 5076 5077 RESTORE_r11_r12_r13 5078 RESTORE_GOT_ebx 5079 RESTORE_ebp 5080 5081 "paeth_end: \n\t" 5082 "EMMS \n\t" // end MMX; prep for poss. FP instrs. 5083 5084 : "=c" (dummy_value_c), // output regs (dummy) 5085 "=S" (dummy_value_S), 5086 "=D" (dummy_value_D), 5087 "=a" (dummy_value_a), 5088 "=d" (dummy_value_d) 5089 5090 : "0" (bpp), // ecx // input regs 5091 "1" (prev_row), // esi/rsi 5092 "2" (row), // edi/rdi 5093 "3" (MMXLength), // eax 5094 "4" (FullLength) // edx 5095 5096 CLOB_COLON_ebx_ebp_r1X // clobber list 5097 CLOBBER_GOT_ebx 5098 CLOB_COMMA_ebx_ebp 5099 CLOBBER_ebp 5100 CLOB_COMMA_ebX_r1X 5101 CLOBBER_r11_r12_r13 5102 ); 5103 5104 } /* end png_read_filter_row_mmx_paeth() */ 5105 5106 #endif // PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK 5107 #endif /* PNG_MMX_READ_FILTER_PAETH_SUPPORTED */ 5108 5109 5110 5111 5112 #if defined(PNG_MMX_READ_FILTER_SUB_SUPPORTED) 5113 5114 //===========================================================================// 5115 // // 5116 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B // 5117 // // 5118 //===========================================================================// 5119 5120 // Optimized code for PNG Sub filter decoder 5121 5122 static void /* PRIVATE */ 5123 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row) 5124 { 5125 unsigned FullLength, MMXLength; // png_uint_32 is actually 64-bit on x86-64 5126 int bpp; 5127 int dummy_value_a; 5128 int dummy_value_c; 5129 int dummy_value_d; 5130 png_bytep dummy_value_D; 5131 int diff; // __attribute__((used)); 5132 5133 bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel 5134 FullLength = row_info->rowbytes - bpp; // number of bytes to filter 5135 // (why do we subtract off bpp? not so in avg or paeth...) 5136 5137 __asm__ __volatile__ ( 5138 SAVE_r15 5139 SAVE_ebp 5140 //pre "movl row, %1 \n\t" // edi/rdi 5141 "mov %1, " PSI " \n\t" // lp = row 5142 //pre "movl bpp, %%ecx \n\t" 5143 "add " PCX ", %1 \n\t" // rp = row + bpp 5144 //pre "movl FullLength, %%eax \n\t" // bring in via eax... 5145 SAVE_FullLength // ...but store for later use 5146 5147 "xorl %%eax, %%eax \n\t" 5148 5149 // get # of bytes to alignment (note: computing _delta_ of two pointers, 5150 // so hereafter %%ebp is sufficient even on 64-bit) 5151 "mov %1, " PBP " \n\t" // take start of row 5152 "add $0xf, " PBP " \n\t" // add 7+8 to incr past alignment bdry 5153 // "andl $0xfffffff8, %%ebp \n\t" // mask to alignment boundary (32-bit!) 
#if defined(PNG_MMX_READ_FILTER_SUB_SUPPORTED)

//===========================================================================//
//                                                                           //
//           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B           //
//                                                                           //
//===========================================================================//

// Optimized code for PNG Sub filter decoder

static void /* PRIVATE */
png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
{
   unsigned FullLength, MMXLength;  // png_uint_32 is actually 64-bit on x86-64
   int bpp;
   int dummy_value_a;
   int dummy_value_c;
   int dummy_value_d;
   png_bytep dummy_value_D;
   int diff; //     __attribute__((used));

   bpp = (row_info->pixel_depth + 7) >> 3;  // calc number of bytes per pixel
   FullLength = row_info->rowbytes - bpp;   // number of bytes to filter
     // (bpp is subtracted here, unlike in avg or paeth, because Sub leaves
     //  the first bpp bytes unchanged--Raw(x-bpp) is zero for x < bpp--so
     //  filtering starts at row + bpp)

   __asm__ __volatile__ (
      SAVE_r15
      SAVE_ebp
      //pre "movl row, %1                \n\t" // edi/rdi
      "mov  %1, " PSI "            \n\t" // lp = row
      //pre "movl bpp, %%ecx             \n\t"
      "add  " PCX ", %1            \n\t" // rp = row + bpp
      //pre "movl FullLength, %%eax      \n\t" // bring in via eax...
      SAVE_FullLength                    // ...but store for later use

      "xorl %%eax, %%eax           \n\t"

      // get # of bytes to alignment (note:  computing _delta_ of two pointers,
      // so hereafter %%ebp is sufficient even on 64-bit)
      "mov  %1, " PBP "            \n\t" // take start of row
      "add  $0xf, " PBP "          \n\t" // add 7+8 to incr past alignment bdry
      // "andl $0xfffffff8, %%ebp     \n\t" // mask to alignment boundary (32-bit!)
      CLEAR_BOTTOM_3_BITS  PBP  "\n\t"   // mask to alignment boundary
      "sub  %1, " PBP "            \n\t" // subtract row ptr again => ebp =
      "jz sub_go                   \n\t" //  target value of eax at alignment

   "sub_lp1:                       \n\t" // fix alignment
      "movb (" PSI "," PAX ",), %%cl \n\t"
      "addb %%cl, (%1," PAX ",)    \n\t"
      "incl %%eax                  \n\t"
      "cmpl %%ebp, %%eax           \n\t"
      "jb sub_lp1                  \n\t"

   "sub_go:                        \n\t"
      RESTORE_FullLength "%%ecx    \n\t" // FullLength -> ecx
      "movl %%ecx, %%edx           \n\t"
      "subl %%eax, %%edx           \n\t" // subtract alignment fix
      "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
      "subl %%edx, %%ecx           \n\t" // drop over bytes from length
      //out "movl %%ecx, MMXLength       \n\t"
      "movl %%ebp, %%eax           \n\t" // ebp = diff, but no reg constraint(?)
      RESTORE_ebp                        //  (could swap ebp and ecx functions,
      RESTORE_r15                        //  but %%cl issues...)

      : "=c" (MMXLength),       // 0     // output regs
        "=D" (dummy_value_D),   // 1
        "=a" (diff)             // 2

      : "0" (bpp),              // ecx    // input regs
        "1" (row),              // edi
        "2" (FullLength)        // eax

      : "%esi", "%edx"                   // clobber list
        _CLOBBER_r15
        _CLOBBER_ebp
   );
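
   // The block above just computed, in effect (C sketch, illustrative only):
#if 0
   {
      png_bytep rp = row + bpp;

      // bytes to process serially before the MMX loop (always 8..15, so the
      // MMX loop's priming load of the quad at rp+diff-8 reads only bytes
      // that have already been decoded):
      diff = (int)((((uintptr_t)rp + 0xf) & ~(uintptr_t)7) - (uintptr_t)rp);

      // largest length such that MMXLength - diff is a multiple of 8; the
      // scalar cleanup block at the end finishes bytes MMXLength..FullLength:
      MMXLength = FullLength - ((FullLength - diff) & 7);
   }
#endif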

   // now do the math for the rest of the row
   switch (bpp)
   {
      case 3:
      {
         // _ShiftBpp = 24;    // == 3 * 8
         // _ShiftRem = 40;    // == 64 - 24

         __asm__ __volatile__ (
// preload  "mov  row, %1                \n\t" // edi/rdi
            LOAD_GOT_rbp
            // load (former) _ActiveMask for 2nd active byte group
            "movq " AMASK2_3_3 ", %%mm7  \n\t" // _amask2_3_3
            RESTORE_rbp

// notused  "mov  %1, " PSI "            \n\t" // lp = row
// preload  "movl bpp, %%ecx             \n\t"
            "add  " PCX ", %1            \n\t" // rp = row + bpp
            "movq %%mm7, %%mm6           \n\t"
// preload  "movl diff, %%edx            \n\t"
            "psllq $24, %%mm6            \n\t" // move mask in mm6 to cover
                                               //  3rd active byte group
            // prime the pump:  load the first Raw(x-bpp) data set
            "movq -8(%1," PDX ",), %%mm1 \n\t"

         "sub_3lp:                       \n\t" // shift data for adding first
            "psrlq $40, %%mm1            \n\t" //  bpp bytes (no need for mask;
                                               //  shift clears inactive bytes)
            // add 1st active group
            "movq (%1," PDX ",), %%mm0   \n\t"
            "paddb %%mm1, %%mm0          \n\t"

            // add 2nd active group
            "movq %%mm0, %%mm1           \n\t" // mov updated Raws to mm1
            "psllq $24, %%mm1            \n\t" // shift data to pos. correctly
            "pand %%mm7, %%mm1           \n\t" // mask to use 2nd active group
            "paddb %%mm1, %%mm0          \n\t"

            // add 3rd active group
            "movq %%mm0, %%mm1           \n\t" // mov updated Raws to mm1
            "psllq $24, %%mm1            \n\t" // shift data to pos. correctly
            "pand %%mm6, %%mm1           \n\t" // mask to use 3rd active group
            "addl $8, %%edx              \n\t"
            "paddb %%mm1, %%mm0          \n\t"

            "cmpl %%eax, %%edx           \n\t" // MMXLength
            "movq %%mm0, -8(%1," PDX ",) \n\t" // write updated Raws to array
            "movq %%mm0, %%mm1           \n\t" // prep 1st add at top of loop
            "jb sub_3lp                  \n\t"

            : "=c" (dummy_value_c),   // 0     // output regs (dummy)
              "=D" (dummy_value_D),   // 1
              "=d" (dummy_value_d),   // 2
              "=a" (dummy_value_a)    // 3

            : "0" (bpp),              // ecx    // input regs
              "1" (row),              // edi
              "2" (diff),             // edx
              "3" (MMXLength)         // eax

#if defined(CLOBBER_MMX_REGS_SUPPORTED)
            : "%mm0", "%mm1", "%mm6", "%mm7"   // clobber list
#endif
         );
      }
      break;  // end 3 bpp
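
      // Per 8-byte quad, the 3-bpp loop above is equivalent to the serial
      // byte updates below; the asm realizes the three dependent "groups"
      // with shift (psllq), mask (pand), and bytewise add (paddb), which
      // keeps each byte from carrying into its neighbor.  Illustrative
      // sketch only:
#if 0
      {
         png_bytep rp = row + bpp + diff;        // current 8-byte quad
         int k;

         for (k = 0; k < 8; ++k)
            rp[k] = (png_byte)(rp[k] + rp[k - 3]);
      }
#endif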

      case 4:   // formerly shared with 6 bpp case via _ShiftBpp and _ShiftRem,
      {         // but 64-bit PIC/.so problems (could still share, moving vars
                //  into unused MMX regs via ecx/edx, but kludgy)
         // _ShiftBpp = bpp << 3;        // 32 (psllq)
         // _ShiftRem = 64 - _ShiftBpp;  // 32 (psrlq)

         __asm__ __volatile__ (
// preload  "mov  row, %1                \n\t" // edi/rdi
// preload  "movl diff, %%edx            \n\t"
// notused  "mov  %1, " PSI "            \n\t" // lp = row
// preload  "movl bpp, %%ecx             \n\t"
            "add  " PCX ", %1            \n\t" // rp = row + bpp

            // prime the pump:  load the first Raw(x-bpp) data set
            "movq -8(%1," PDX ",), %%mm1 \n\t"

         "sub_4lp:                       \n\t" // shift data for adding first
            "psrlq $32, %%mm1            \n\t" //  bpp bytes (no need for mask;
                                               //  shift clears inactive bytes)
            "movq (%1," PDX ",), %%mm0   \n\t"
            "paddb %%mm1, %%mm0          \n\t"

            // add 2nd active group
            "movq %%mm0, %%mm1           \n\t" // mov updated Raws to mm1
            "psllq $32, %%mm1            \n\t" // shift data to pos. correctly
            "addl $8, %%edx              \n\t"
            "paddb %%mm1, %%mm0          \n\t"

            "cmpl %%eax, %%edx           \n\t" // MMXLength
            "movq %%mm0, -8(%1," PDX ",) \n\t" // write updated Raws to array
            "movq %%mm0, %%mm1           \n\t" // prep 1st add at top of loop
            "jb sub_4lp                  \n\t"

            : "=c" (dummy_value_c),   // 0     // output regs (dummy)
              "=D" (dummy_value_D),   // 1
              "=d" (dummy_value_d),   // 2
              "=a" (dummy_value_a)    // 3

            : "0" (bpp),              // ecx    // input regs
              "1" (row),              // edi
              "2" (diff),             // edx
              "3" (MMXLength)         // eax

#if defined(CLOBBER_MMX_REGS_SUPPORTED)
            : "%mm0", "%mm1"                   // clobber list
#endif
         );
      }
      break;  // end 4 bpp

      case 1:
      {
         __asm__ __volatile__ (
// preload  "movl diff, %%edx            \n\t"
// preload  "mov  row, %1                \n\t" // edi/rdi
// preload  "cmpl FullLength, %%edx      \n\t"
            "cmpl %%eax, %%edx           \n\t"
            "jnb sub_1end                \n\t"
            "mov  %1, " PSI "            \n\t" // lp = row
// irrel.   "xorl %%ecx, %%ecx           \n\t" // (actually bug with preload)
// preload  "movl bpp, %%ecx             \n\t"
            "add  " PCX ", %1            \n\t" // rp = row + bpp

         "sub_1lp:                       \n\t"
            "movb (" PSI "," PDX ",), %%cl \n\t"
            "addb %%cl, (%1," PDX ",)    \n\t"
            "incl %%edx                  \n\t"
            "cmpl %%eax, %%edx           \n\t" // compare with FullLength
            "jb sub_1lp                  \n\t"

         "sub_1end:                      \n\t"

            : "=c" (dummy_value_c),   // 0     // output regs (dummy)
              "=D" (dummy_value_D),   // 1
              "=d" (dummy_value_d),   // 2
              "=a" (dummy_value_a)    // 3

            : "0" (bpp),              // ecx    // input regs
              "1" (row),              // edi
              "2" (diff),             // edx
              "3" (FullLength)        // eax

            : "%esi"                           // clobber list
         );
      }
      return;  // end 1 bpp (bypassing cleanup block!)
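
      // Note that the 1-bpp case above is inherently serial--every byte
      // depends on the byte immediately before it--so there is nothing for
      // MMX to do in parallel:  plain x86 code decodes the whole row and
      // returns early.  Since no MMX instructions have executed on this
      // path, skipping the trailing EMMS should be safe.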

      case 2:
      {
         // _ShiftBpp = 16;    // == 2 * 8
         // _ShiftRem = 48;    // == 64 - 16

         __asm__ __volatile__ (
            LOAD_GOT_rbp
            // load (former) _ActiveMask for 2nd active byte group
            "movq " AMASK4_2_2 ", %%mm7  \n\t" // _amask4_2_2
            RESTORE_rbp
// preload  "movl diff, %%edx            \n\t"
            "movq %%mm7, %%mm6           \n\t"
// preload  "mov  row, %1                \n\t" // edi/rdi
            "psllq $16, %%mm6            \n\t" // move mask in mm6 to cover
                                               //  3rd active byte group
// notused  "mov  %1, " PSI "            \n\t" // lp = row
            "movq %%mm6, %%mm5           \n\t"
// preload  "movl bpp, %%ecx             \n\t"
            "add  " PCX ", %1            \n\t" // rp = row + bpp
            "psllq $16, %%mm5            \n\t" // move mask in mm5 to cover
                                               //  4th active byte group
            // prime the pump:  load the first Raw(x-bpp) data set
            "movq -8(%1," PDX ",), %%mm1 \n\t"

         "sub_2lp:                       \n\t" // shift data for adding first
            "psrlq $48, %%mm1            \n\t" //  bpp bytes (no need for mask;
                                               //  shift clears inactive bytes)
            // add 1st active group
            "movq (%1," PDX ",), %%mm0   \n\t"
            "paddb %%mm1, %%mm0          \n\t"

            // add 2nd active group
            "movq %%mm0, %%mm1           \n\t" // mov updated Raws to mm1
            "psllq $16, %%mm1            \n\t" // shift data to pos. correctly
            "pand %%mm7, %%mm1           \n\t" // mask to use 2nd active group
            "paddb %%mm1, %%mm0          \n\t"

            // add 3rd active group
            "movq %%mm0, %%mm1           \n\t" // mov updated Raws to mm1
            "psllq $16, %%mm1            \n\t" // shift data to pos. correctly
            "pand %%mm6, %%mm1           \n\t" // mask to use 3rd active group
            "paddb %%mm1, %%mm0          \n\t"

            // add 4th active group
            "movq %%mm0, %%mm1           \n\t" // mov updated Raws to mm1
            "psllq $16, %%mm1            \n\t" // shift data to pos. correctly
            "pand %%mm5, %%mm1           \n\t" // mask to use 4th active group
            "addl $8, %%edx              \n\t"
            "paddb %%mm1, %%mm0          \n\t"
            "cmpl %%eax, %%edx           \n\t" // MMXLength
            "movq %%mm0, -8(%1," PDX ",) \n\t" // write updated Raws to array
            "movq %%mm0, %%mm1           \n\t" // prep 1st add at top of loop
            "jb sub_2lp                  \n\t"

            : "=c" (dummy_value_c),   // 0     // output regs (dummy)
              "=D" (dummy_value_D),   // 1
              "=d" (dummy_value_d),   // 2
              "=a" (dummy_value_a)    // 3

            : "0" (bpp),              // ecx    // input regs
              "1" (row),              // edi
              "2" (diff),             // edx
              "3" (MMXLength)         // eax

#if defined(CLOBBER_MMX_REGS_SUPPORTED)
            : "%mm0", "%mm1", "%mm5", "%mm6"   // clobber list
            , "%mm7"
#endif
         );
      }
      break;  // end 2 bpp

      case 6:   // formerly shared with 4 bpp case (see comments there)
      {
         // _ShiftBpp = bpp << 3;        // 48 (psllq)
         // _ShiftRem = 64 - _ShiftBpp;  // 16 (psrlq)

         __asm__ __volatile__ (
// preload  "mov  row, %1                \n\t" // edi/rdi
// preload  "movl diff, %%edx            \n\t"
// notused  "mov  %1, " PSI "            \n\t" // lp = row
// preload  "movl bpp, %%ecx             \n\t"
            "add  " PCX ", %1            \n\t" // rp = row + bpp

            // prime the pump:  load the first Raw(x-bpp) data set
            "movq -8(%1," PDX ",), %%mm1 \n\t"

         "sub_6lp:                       \n\t" // shift data for adding first
            "psrlq $16, %%mm1            \n\t" //  bpp bytes (no need for mask;
                                               //  shift clears inactive bytes)
            "movq (%1," PDX ",), %%mm0   \n\t"
            "paddb %%mm1, %%mm0          \n\t"

            // add 2nd active group
            "movq %%mm0, %%mm1           \n\t" // mov updated Raws to mm1
            "psllq $48, %%mm1            \n\t" // shift data to pos. correctly
            "addl $8, %%edx              \n\t"
            "paddb %%mm1, %%mm0          \n\t"

            "cmpl %%eax, %%edx           \n\t" // MMXLength
            "movq %%mm0, -8(%1," PDX ",) \n\t" // write updated Raws to array
            "movq %%mm0, %%mm1           \n\t" // prep 1st add at top of loop
            "jb sub_6lp                  \n\t"

            : "=c" (dummy_value_c),   // 0     // output regs (dummy)
              "=D" (dummy_value_D),   // 1
              "=d" (dummy_value_d),   // 2
              "=a" (dummy_value_a)    // 3

            : "0" (bpp),              // ecx    // input regs
              "1" (row),              // edi
              "2" (diff),             // edx
              "3" (MMXLength)         // eax

#if defined(CLOBBER_MMX_REGS_SUPPORTED)
            : "%mm0", "%mm1"                   // clobber list
#endif
         );
      }
      break;  // end 6 bpp
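
      // In the 8-bpp case below, each 8-byte quad depends only on the quad
      // before it, so the previous Raw() quad can be forwarded in an MMX
      // register with no shifting or masking at all; the main loop is
      // unrolled 8x (64 bytes per iteration).  Bytewise C equivalent of one
      // quad step (illustrative only):
#if 0
      {
         png_bytep rp = row + bpp + diff;
         int k;

         for (k = 0; k < 8; ++k)
            rp[k] = (png_byte)(rp[k] + rp[k - 8]);
      }
#endif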
5529 "=D" (dummy_value_D), // 1 5530 "=d" (dummy_value_d), // 2 5531 "=a" (dummy_value_a) // 3 5532 5533 : "0" (bpp), // ecx // input regs 5534 "1" (row), // edi 5535 "2" (diff), // edx 5536 "3" (MMXLength) // eax 5537 5538 : "%esi" // clobber list 5539 #if defined(CLOBBER_MMX_REGS_SUPPORTED) 5540 , "%mm0", "%mm1", "%mm2", "%mm3" 5541 , "%mm4", "%mm5", "%mm6", "%mm7" 5542 #endif 5543 ); 5544 } 5545 break; // end 8 bpp 5546 5547 default: // bpp != 1,2,3,4,6,8: doesn't exist 5548 { 5549 // ERROR: SHOULD NEVER BE REACHED 5550 #if defined(PNG_DEBUG) 5551 png_debug(1, "Internal libpng logic error (GCC " 5552 "png_read_filter_row_mmx_sub())\n"); 5553 #endif 5554 } 5555 break; 5556 5557 } // end switch (bpp) 5558 5559 __asm__ __volatile__ ( 5560 //pre "movl MMXLength, %%eax \n\t" 5561 //pre "mov row, %1 \n\t" // edi/rdi 5562 //pre "cmpl FullLength, %%eax \n\t" 5563 "cmpl %%edx, %%eax \n\t" 5564 "jnb sub_end \n\t" 5565 5566 "mov %1, " PSI " \n\t" // lp = row 5567 //pre "movl bpp, %%ecx \n\t" 5568 "add " PCX ", %1 \n\t" // rp = row + bpp 5569 "xorl %%ecx, %%ecx \n\t" 5570 5571 "sub_lp2: \n\t" 5572 "movb (" PSI "," PAX ",), %%cl \n\t" 5573 "addb %%cl, (%1," PAX ",) \n\t" 5574 "incl %%eax \n\t" 5575 "cmpl %%edx, %%eax \n\t" // FullLength 5576 "jb sub_lp2 \n\t" 5577 5578 "sub_end: \n\t" 5579 "EMMS \n\t" // end MMX instructions 5580 5581 : "=c" (dummy_value_c), // 0 // output regs (dummy) 5582 "=D" (dummy_value_D), // 1 5583 "=a" (dummy_value_a), // 2 5584 "=d" (dummy_value_d) // 3 5585 5586 : "0" (bpp), // ecx // input regs 5587 "1" (row), // edi 5588 "2" (MMXLength), // eax 5589 "3" (FullLength) // edx 5590 5591 : "%esi" // clobber list 5592 ); 5593 5594 } // end of png_read_filter_row_mmx_sub() 5595 5596 #endif /* PNG_MMX_READ_FILTER_SUB_SUPPORTED */ 5597 5598 5599 5600 5601 #if defined(PNG_MMX_READ_FILTER_UP_SUPPORTED) 5602 5603 //===========================================================================// 5604 // // 5605 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P // 5606 // // 5607 //===========================================================================// 5608 5609 // Optimized code for PNG Up filter decoder 5610 5611 static void /* PRIVATE */ 5612 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row, 5613 png_bytep prev_row) 5614 { 5615 unsigned len; // png_uint_32 is actually 64-bit on x86-64 5616 int dummy_value_d; // fix 'forbidden register 3 (dx) was spilled' error 5617 png_bytep dummy_value_S; 5618 png_bytep dummy_value_D; 5619 5620 len = row_info->rowbytes; // number of bytes to filter 5621 5622 __asm__ __volatile__ ( 5623 SAVE_GOT_ebx 5624 //pre "mov prev_row, %1 \n\t" // esi/rsi 5625 //pre "movl row, %2 \n\t" // edi/rdi 5626 5627 "xorl %%ebx, %%ebx \n\t" 5628 "xorl %%eax, %%eax \n\t" 5629 5630 // get # of bytes to alignment (note: computing _delta_ of two pointers, 5631 // so hereafter %%ecx is sufficient even on 64-bit) 5632 "mov %2, " PCX " \n\t" // take start of row 5633 "add $0x7, " PCX " \n\t" // add 7 to incr past alignment bdry 5634 // "andl $0xfffffff8, %%ecx \n\t" // mask to alignment boundary (32-bit!) 

   __asm__ __volatile__ (
      //pre "movl MMXLength, %%eax       \n\t"
      //pre "mov  row, %1                \n\t" // edi/rdi
      //pre "cmpl FullLength, %%eax      \n\t"
      "cmpl %%edx, %%eax           \n\t"
      "jnb sub_end                 \n\t"

      "mov  %1, " PSI "            \n\t" // lp = row
      //pre "movl bpp, %%ecx             \n\t"
      "add  " PCX ", %1            \n\t" // rp = row + bpp
      "xorl %%ecx, %%ecx           \n\t"

   "sub_lp2:                       \n\t"
      "movb (" PSI "," PAX ",), %%cl \n\t"
      "addb %%cl, (%1," PAX ",)    \n\t"
      "incl %%eax                  \n\t"
      "cmpl %%edx, %%eax           \n\t" // FullLength
      "jb sub_lp2                  \n\t"

   "sub_end:                       \n\t"
      "EMMS                        \n\t" // end MMX instructions

      : "=c" (dummy_value_c),   // 0     // output regs (dummy)
        "=D" (dummy_value_D),   // 1
        "=a" (dummy_value_a),   // 2
        "=d" (dummy_value_d)    // 3

      : "0" (bpp),              // ecx    // input regs
        "1" (row),              // edi
        "2" (MMXLength),        // eax
        "3" (FullLength)        // edx

      : "%esi"                           // clobber list
   );

} // end of png_read_filter_row_mmx_sub()

#endif /* PNG_MMX_READ_FILTER_SUB_SUPPORTED */


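
// Unlike Sub, Avg, and Paeth, the Up filter has no dependence on earlier
// bytes of the *current* row:  Raw(x) = Up(x) + Prior(x).  That makes it
// trivially parallel, which is why the routine below can simply stream
// 64 bytes per iteration.  Minimal scalar sketch (illustrative only):
#if 0
static void
up_filter_sketch(png_bytep row, png_bytep prev_row, unsigned rowbytes)
{
   unsigned x;

   for (x = 0; x < rowbytes; ++x)
      row[x] = (png_byte)(row[x] + prev_row[x]);
}
#endif
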
5705 "movq (%1," PBX ",), %%mm1 \n\t" 5706 "movq (%2," PBX ",), %%mm0 \n\t" 5707 "addl $8, %%ebx \n\t" 5708 "paddb %%mm1, %%mm0 \n\t" 5709 "cmpl %%ecx, %%ebx \n\t" 5710 "movq %%mm0, -8(%2," PBX ",) \n\t" // movq does not affect flags; -8 to 5711 "jb up_lpA \n\t" // offset add ebx 5712 "cmpl $0, %%edx \n\t" // test for bytes over mult of 8 5713 "jz up_end \n\t" 5714 5715 "up_lt8: \n\t" 5716 "xorl %%eax, %%eax \n\t" 5717 "addl %%edx, %%ecx \n\t" // move over byte count into counter 5718 5719 "up_lp2: \n\t" // use x86 regs for remaining bytes 5720 "movb (%2," PBX ",), %%al \n\t" 5721 "addb (%1," PBX ",), %%al \n\t" 5722 "incl %%ebx \n\t" 5723 "cmpl %%ecx, %%ebx \n\t" 5724 "movb %%al, -1(%2," PBX ",) \n\t" // mov does not affect flags; -1 to 5725 "jb up_lp2 \n\t" // offset inc ebx 5726 5727 "up_end: \n\t" 5728 "EMMS \n\t" // conversion of filtered row complete 5729 RESTORE_GOT_ebx 5730 5731 : "=d" (dummy_value_d), // 0 // output regs (dummy) 5732 "=S" (dummy_value_S), // 1 5733 "=D" (dummy_value_D) // 2 5734 5735 : "0" (len), // edx // input regs 5736 "1" (prev_row), // esi 5737 "2" (row) // edi 5738 5739 : "%eax", "%ecx" // clobber list (no input regs!) 5740 _CLOBBER_GOT_ebx 5741 #if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED) 5742 , "%mm0", "%mm1", "%mm2", "%mm3" 5743 , "%mm4", "%mm5", "%mm6", "%mm7" 5744 #endif 5745 ); 5746 5747 } // end of png_read_filter_row_mmx_up() 5748 5749 #endif /* PNG_MMX_READ_FILTER_UP_SUPPORTED */ 5750 5751 5752 5753 5754 /*===========================================================================*/ 5755 /* */ 5756 /* P N G _ R E A D _ F I L T E R _ R O W */ 5757 /* */ 5758 /*===========================================================================*/ 5759 5760 /* Optimized png_read_filter_row routines */ 5761 5762 void /* PRIVATE */ 5763 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep 5764 row, png_bytep prev_row, int filter) 5765 { 5766 #if defined(PNG_DEBUG) 5767 char filtname[10]; 5768 #endif 5769 5770 if (_mmx_supported == 2) { 5771 #if !defined(PNG_1_0_X) 5772 /* this should have happened in png_init_mmx_flags() already */ 5773 png_warning(png_ptr, "asm_flags may not have been initialized"); 5774 #endif 5775 png_mmx_support(); 5776 } 5777 5778 #if defined(PNG_DEBUG) 5779 png_debug(1, "in png_read_filter_row (pnggccrd.c)\n"); 5780 switch (filter) 5781 { 5782 case 0: 5783 png_snprintf(filtname, 10, "none"); 5784 break; 5785 5786 case 1: 5787 png_snprintf(filtname, 10, "sub-%s", 5788 #ifdef PNG_MMX_READ_FILTER_SUB_SUPPORTED 5789 #if !defined(PNG_1_0_X) 5790 ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) && 5791 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) && 5792 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold)) 5793 #else 5794 _mmx_supported 5795 #endif 5796 ? "MMX" : 5797 #endif 5798 "C"); 5799 break; 5800 5801 case 2: 5802 png_snprintf(filtname, 10, "up-%s", 5803 #ifdef PNG_MMX_READ_FILTER_UP_SUPPORTED 5804 #if !defined(PNG_1_0_X) 5805 ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) && 5806 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) && 5807 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold)) 5808 #else 5809 _mmx_supported 5810 #endif 5811 ? 
"MMX" : 5812 #endif 5813 "C"); 5814 break; 5815 5816 case 3: 5817 png_snprintf(filtname, 10, "avg-%s", 5818 #ifdef PNG_MMX_READ_FILTER_AVG_SUPPORTED 5819 #if !defined(PNG_1_0_X) 5820 ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) && 5821 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) && 5822 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold)) 5823 #else 5824 _mmx_supported 5825 #endif 5826 ? "MMX" : 5827 #endif 5828 "C"); 5829 break; 5830 5831 case 4: 5832 png_snprintf(filtname, 10, "paeth-%s", 5833 #ifdef PNG_MMX_READ_FILTER_PAETH_SUPPORTED 5834 #if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK) 5835 #if !defined(PNG_1_0_X) 5836 ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) && 5837 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) && 5838 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold)) 5839 #else 5840 _mmx_supported 5841 #endif 5842 ? "MMX" : 5843 #endif /* PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK */ 5844 #endif 5845 "C"); 5846 break; 5847 5848 default: 5849 png_snprintf(filtname, 10, "unknown"); 5850 break; 5851 } 5852 png_debug2(2, "row_number=%ld, %s, ", png_ptr->row_number, filtname); 5853 //png_debug1(0, "png_ptr=%10p, ", png_ptr); 5854 //png_debug1(0, "asm_flags=0x%08lx, ", png_ptr->asm_flags); 5855 png_debug1(0, "row=%10p, ", row); 5856 png_debug2(0, "pixdepth=%d, bytes=%d, ", (int)row_info->pixel_depth, 5857 (int)((row_info->pixel_depth + 7) >> 3)); 5858 png_debug1(0, "rowbytes=%ld\n", row_info->rowbytes); 5859 #endif /* PNG_DEBUG */ 5860 5861 switch (filter) 5862 { 5863 case PNG_FILTER_VALUE_NONE: 5864 break; 5865 5866 case PNG_FILTER_VALUE_SUB: 5867 #ifdef PNG_MMX_READ_FILTER_SUB_SUPPORTED 5868 #if !defined(PNG_1_0_X) 5869 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) && 5870 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) && 5871 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold)) 5872 #else 5873 if (_mmx_supported) 5874 #endif 5875 { 5876 png_read_filter_row_mmx_sub(row_info, row); 5877 } 5878 else 5879 #endif 5880 { 5881 png_uint_32 i; 5882 png_uint_32 istop = row_info->rowbytes; 5883 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3; 5884 png_bytep rp = row + bpp; 5885 png_bytep lp = row; 5886 5887 for (i = bpp; i < istop; i++) 5888 { 5889 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff); 5890 rp++; 5891 } 5892 } /* end !UseMMX_sub */ 5893 break; 5894 5895 case PNG_FILTER_VALUE_UP: 5896 #ifdef PNG_MMX_READ_FILTER_UP_SUPPORTED 5897 #if !defined(PNG_1_0_X) 5898 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) && 5899 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) && 5900 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold)) 5901 #else 5902 if (_mmx_supported) 5903 #endif 5904 { 5905 png_read_filter_row_mmx_up(row_info, row, prev_row); 5906 } 5907 else 5908 #endif 5909 { 5910 png_uint_32 i; 5911 png_uint_32 istop = row_info->rowbytes; 5912 png_bytep rp = row; 5913 png_bytep pp = prev_row; 5914 5915 for (i = 0; i < istop; ++i) 5916 { 5917 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); 5918 rp++; 5919 } 5920 } /* end !UseMMX_up */ 5921 break; 5922 5923 case PNG_FILTER_VALUE_AVG: 5924 #ifdef PNG_MMX_READ_FILTER_AVG_SUPPORTED 5925 #if !defined(PNG_1_0_X) 5926 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) && 5927 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) && 5928 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold)) 5929 #else 5930 if (_mmx_supported) 5931 #endif 5932 { 5933 
/*===========================================================================*/
/*                                                                           */
/*                   P N G _ R E A D _ F I L T E R _ R O W                   */
/*                                                                           */
/*===========================================================================*/

/* Optimized png_read_filter_row routines */

void /* PRIVATE */
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
   row, png_bytep prev_row, int filter)
{
#if defined(PNG_DEBUG)
   char filtname[10];
#endif

   if (_mmx_supported == 2) {
#if !defined(PNG_1_0_X)
       /* this should have happened in png_init_mmx_flags() already */
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }

#if defined(PNG_DEBUG)
   png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
   switch (filter)
   {
      case 0:
         png_snprintf(filtname, 10, "none");
         break;

      case 1:
         png_snprintf(filtname, 10, "sub-%s",
#ifdef PNG_MMX_READ_FILTER_SUB_SUPPORTED
#if !defined(PNG_1_0_X)
           ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
            (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
            (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
           _mmx_supported
#endif
           ? "MMX" :
#endif
           "C");
         break;

      case 2:
         png_snprintf(filtname, 10, "up-%s",
#ifdef PNG_MMX_READ_FILTER_UP_SUPPORTED
#if !defined(PNG_1_0_X)
           ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
            (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
            (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
           _mmx_supported
#endif
           ? "MMX" :
#endif
           "C");
         break;

      case 3:
         png_snprintf(filtname, 10, "avg-%s",
#ifdef PNG_MMX_READ_FILTER_AVG_SUPPORTED
#if !defined(PNG_1_0_X)
           ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
            (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
            (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
           _mmx_supported
#endif
           ? "MMX" :
#endif
           "C");
         break;

      case 4:
         png_snprintf(filtname, 10, "paeth-%s",
#ifdef PNG_MMX_READ_FILTER_PAETH_SUPPORTED
#if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
           ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
            (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
            (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
           _mmx_supported
#endif
           ? "MMX" :
#endif /* PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK */
#endif
           "C");
         break;

      default:
         png_snprintf(filtname, 10, "unknown");
         break;
   }
   png_debug2(2, "row_number=%ld, %s, ", png_ptr->row_number, filtname);
   //png_debug1(0, "png_ptr=%10p, ", png_ptr);
   //png_debug1(0, "asm_flags=0x%08lx, ", png_ptr->asm_flags);
   png_debug1(0, "row=%10p, ", row);
   png_debug2(0, "pixdepth=%d, bytes=%d, ", (int)row_info->pixel_depth,
      (int)((row_info->pixel_depth + 7) >> 3));
   png_debug1(0, "rowbytes=%ld\n", row_info->rowbytes);
#endif /* PNG_DEBUG */

   switch (filter)
   {
      case PNG_FILTER_VALUE_NONE:
         break;

      case PNG_FILTER_VALUE_SUB:
#ifdef PNG_MMX_READ_FILTER_SUB_SUPPORTED
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_sub(row_info, row);
         }
         else
#endif
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_bytep rp = row + bpp;
            png_bytep lp = row;

            for (i = bpp; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
               rp++;
            }
         } /* end !UseMMX_sub */
         break;

      case PNG_FILTER_VALUE_UP:
#ifdef PNG_MMX_READ_FILTER_UP_SUPPORTED
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_up(row_info, row, prev_row);
         }
         else
#endif
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_bytep rp = row;
            png_bytep pp = prev_row;

            for (i = 0; i < istop; ++i)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }
         } /* end !UseMMX_up */
         break;

      case PNG_FILTER_VALUE_AVG:
#ifdef PNG_MMX_READ_FILTER_AVG_SUPPORTED
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_avg(row_info, row, prev_row);
         }
         else
#endif
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++) >> 1)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
               rp++;
            }
         } /* end !UseMMX_avg */
         break;

      case PNG_FILTER_VALUE_PAETH:
#ifdef PNG_MMX_READ_FILTER_PAETH_SUPPORTED
#if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
         }
         else
#endif /* PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK */
#endif
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_bytep cp = prev_row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)   /* use leftover rp,pp */
            {
               int a, b, c, pa, pb, pc, p;

               a = *lp++;
               b = *pp++;
               c = *cp++;

               p = b - c;
               pc = a - c;

#if defined(PNG_USE_ABS)
               pa = abs(p);
               pb = abs(pc);
               pc = abs(p + pc);
#else
               pa = p < 0 ? -p : p;
               pb = pc < 0 ? -pc : pc;
               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
#endif

               /*
                  if (pa <= pb && pa <= pc)
                     p = a;
                  else if (pb <= pc)
                     p = b;
                  else
                     p = c;
                */

               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;

               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
               rp++;
            }
         } /* end !UseMMX_paeth */
         break;

      default:
         png_warning(png_ptr, "Ignoring bad row-filter type");
         *row = 0;
         break;
   }
}

#endif /* PNG_HAVE_MMX_READ_FILTER_ROW */


#endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGGCCRD */
#endif /* __GNUC__ */