Home | History | Annotate | Download | only in masmx86
      1 ; match686.asm -- Asm portion of the optimized longest_match for 32 bits x86
      2 ; Copyright (C) 1995-1996 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
      3 ; File written by Gilles Vollant, by converting match686.S from Brian Raiter
      4 ; for MASM. This is as assembly version of longest_match
      5 ;  from Jean-loup Gailly in deflate.c
      6 ;
      7 ;         http://www.zlib.net
      8 ;         http://www.winimage.com/zLibDll
      9 ;         http://www.muppetlabs.com/~breadbox/software/assembly.html
     10 ;
     11 ; For Visual C++ 4.x and higher and ML 6.x and higher
     12 ;   ml.exe is distributed in
     13 ;  http://www.microsoft.com/downloads/details.aspx?FamilyID=7a1c9da0-0510-44a2-b042-7ef370530c64
     14 ;
     15 ; this file contain two implementation of longest_match
     16 ;
     17 ;  this longest_match was written by Brian raiter (1998), optimized for Pentium Pro
     18 ;   (and the faster known version of match_init on modern Core 2 Duo and AMD Phenom)
     19 ;
     20 ;  for using an assembly version of longest_match, you need define ASMV in project
     21 ;
     22 ;    compile the asm file running
     23 ;           ml /coff /Zi /c /Flmatch686.lst match686.asm
     24 ;    and do not include match686.obj in your project
     25 ;
     26 ; note: contrib of zLib 1.2.3 and earlier contained both a deprecated version for
     27 ;  Pentium (prior Pentium Pro) and this version for Pentium Pro and modern processor
     28 ;  with autoselect (with cpu detection code)
     29 ;  if you want support the old pentium optimization, you can still use these version
     30 ;
     31 ; this file is not optimized for old pentium, but it compatible with all x86 32 bits
     32 ; processor (starting 80386)
     33 ;
     34 ;
     35 ; see below : zlib1222add must be adjuster if you use a zlib version < 1.2.2.2
     36 
     37 ;uInt longest_match(s, cur_match)
     38 ;    deflate_state *s;
     39 ;    IPos cur_match;                             /* current match */
     40 
     41     NbStack         equ     76
     42     cur_match       equ     dword ptr[esp+NbStack-0]
     43     str_s           equ     dword ptr[esp+NbStack-4]
     44 ; 5 dword on top (ret,ebp,esi,edi,ebx)
     45     adrret          equ     dword ptr[esp+NbStack-8]
     46     pushebp         equ     dword ptr[esp+NbStack-12]
     47     pushedi         equ     dword ptr[esp+NbStack-16]
     48     pushesi         equ     dword ptr[esp+NbStack-20]
     49     pushebx         equ     dword ptr[esp+NbStack-24]
     50 
     51     chain_length    equ     dword ptr [esp+NbStack-28]
     52     limit           equ     dword ptr [esp+NbStack-32]
     53     best_len        equ     dword ptr [esp+NbStack-36]
     54     window          equ     dword ptr [esp+NbStack-40]
     55     prev            equ     dword ptr [esp+NbStack-44]
     56     scan_start      equ      word ptr [esp+NbStack-48]
     57     wmask           equ     dword ptr [esp+NbStack-52]
     58     match_start_ptr equ     dword ptr [esp+NbStack-56]
     59     nice_match      equ     dword ptr [esp+NbStack-60]
     60     scan            equ     dword ptr [esp+NbStack-64]
     61 
     62     windowlen       equ     dword ptr [esp+NbStack-68]
     63     match_start     equ     dword ptr [esp+NbStack-72]
     64     strend          equ     dword ptr [esp+NbStack-76]
     65     NbStackAdd      equ     (NbStack-24)
     66 
     67     .386p
     68 
     69     name    gvmatch
     70     .MODEL  FLAT
     71 
     72 
     73 
     74 ;  all the +zlib1222add offsets are due to the addition of fields
     75 ;  in zlib in the deflate_state structure since the asm code was first written
     76 ;  (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
     77 ;  (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
     78 ;  if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
     79 
     80     zlib1222add         equ     8
     81 
     82 ;  Note : these value are good with a 8 bytes boundary pack structure
     83     dep_chain_length    equ     74h+zlib1222add
     84     dep_window          equ     30h+zlib1222add
     85     dep_strstart        equ     64h+zlib1222add
     86     dep_prev_length     equ     70h+zlib1222add
     87     dep_nice_match      equ     88h+zlib1222add
     88     dep_w_size          equ     24h+zlib1222add
     89     dep_prev            equ     38h+zlib1222add
     90     dep_w_mask          equ     2ch+zlib1222add
     91     dep_good_match      equ     84h+zlib1222add
     92     dep_match_start     equ     68h+zlib1222add
     93     dep_lookahead       equ     6ch+zlib1222add
     94 
     95 
     96 _TEXT                   segment
     97 
     98 IFDEF NOUNDERLINE
     99             public  longest_match
    100             public  match_init
    101 ELSE
    102             public  _longest_match
    103             public  _match_init
    104 ENDIF
    105 
    106     MAX_MATCH           equ     258
    107     MIN_MATCH           equ     3
    108     MIN_LOOKAHEAD       equ     (MAX_MATCH+MIN_MATCH+1)
    109 
    110 
    111 
    112 MAX_MATCH       equ     258
    113 MIN_MATCH       equ     3
    114 MIN_LOOKAHEAD   equ     (MAX_MATCH + MIN_MATCH + 1)
    115 MAX_MATCH_8_     equ     ((MAX_MATCH + 7) AND 0FFF0h)
    116 
    117 
    118 ;;; stack frame offsets
    119 
    120 chainlenwmask   equ  esp + 0    ; high word: current chain len
    121                     ; low word: s->wmask
    122 window      equ  esp + 4    ; local copy of s->window
    123 windowbestlen   equ  esp + 8    ; s->window + bestlen
    124 scanstart   equ  esp + 16   ; first two bytes of string
    125 scanend     equ  esp + 12   ; last two bytes of string
    126 scanalign   equ  esp + 20   ; dword-misalignment of string
    127 nicematch   equ  esp + 24   ; a good enough match size
    128 bestlen     equ  esp + 28   ; size of best match so far
    129 scan        equ  esp + 32   ; ptr to string wanting match
    130 
    131 LocalVarsSize   equ 36
    132 ;   saved ebx   byte esp + 36
    133 ;   saved edi   byte esp + 40
    134 ;   saved esi   byte esp + 44
    135 ;   saved ebp   byte esp + 48
    136 ;   return address  byte esp + 52
    137 deflatestate    equ  esp + 56   ; the function arguments
    138 curmatch    equ  esp + 60
    139 
    140 ;;; Offsets for fields in the deflate_state structure. These numbers
    141 ;;; are calculated from the definition of deflate_state, with the
    142 ;;; assumption that the compiler will dword-align the fields. (Thus,
    143 ;;; changing the definition of deflate_state could easily cause this
    144 ;;; program to crash horribly, without so much as a warning at
    145 ;;; compile time. Sigh.)
    146 
    147 dsWSize     equ 36+zlib1222add
    148 dsWMask     equ 44+zlib1222add
    149 dsWindow    equ 48+zlib1222add
    150 dsPrev      equ 56+zlib1222add
    151 dsMatchLen  equ 88+zlib1222add
    152 dsPrevMatch equ 92+zlib1222add
    153 dsStrStart  equ 100+zlib1222add
    154 dsMatchStart    equ 104+zlib1222add
    155 dsLookahead equ 108+zlib1222add
    156 dsPrevLen   equ 112+zlib1222add
    157 dsMaxChainLen   equ 116+zlib1222add
    158 dsGoodMatch equ 132+zlib1222add
    159 dsNiceMatch equ 136+zlib1222add
    160 
    161 
    162 ;;; match686.asm -- Pentium-Pro-optimized version of longest_match()
    163 ;;; Written for zlib 1.1.2
    164 ;;; Copyright (C) 1998 Brian Raiter <breadbox (a] muppetlabs.com>
    165 ;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html
    166 ;;;
    167 ;;
    168 ;;  This software is provided 'as-is', without any express or implied
    169 ;;  warranty.  In no event will the authors be held liable for any damages
    170 ;;  arising from the use of this software.
    171 ;;
    172 ;;  Permission is granted to anyone to use this software for any purpose,
    173 ;;  including commercial applications, and to alter it and redistribute it
    174 ;;  freely, subject to the following restrictions:
    175 ;;
    176 ;;  1. The origin of this software must not be misrepresented; you must not
    177 ;;     claim that you wrote the original software. If you use this software
    178 ;;     in a product, an acknowledgment in the product documentation would be
    179 ;;     appreciated but is not required.
    180 ;;  2. Altered source versions must be plainly marked as such, and must not be
    181 ;;     misrepresented as being the original software
    182 ;;  3. This notice may not be removed or altered from any source distribution.
    183 ;;
    184 
    185 ;GLOBAL _longest_match, _match_init
    186 
    187 
    188 ;SECTION    .text
    189 
    190 ;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
    191 
    192 ;_longest_match:
    193     IFDEF NOUNDERLINE
    194     longest_match       proc near
    195     ELSE
    196     _longest_match      proc near
    197     ENDIF
    198 
    199 ;;; Save registers that the compiler may be using, and adjust esp to
    200 ;;; make room for our stack frame.
    201 
    202         push    ebp
    203         push    edi
    204         push    esi
    205         push    ebx
    206         sub esp, LocalVarsSize
    207 
    208 ;;; Retrieve the function arguments. ecx will hold cur_match
    209 ;;; throughout the entire function. edx will hold the pointer to the
    210 ;;; deflate_state structure during the function's setup (before
    211 ;;; entering the main loop.
    212 
    213         mov edx, [deflatestate]
    214         mov ecx, [curmatch]
    215 
    216 ;;; uInt wmask = s->w_mask;
    217 ;;; unsigned chain_length = s->max_chain_length;
    218 ;;; if (s->prev_length >= s->good_match) {
    219 ;;;     chain_length >>= 2;
    220 ;;; }
    221 
    222         mov eax, [edx + dsPrevLen]
    223         mov ebx, [edx + dsGoodMatch]
    224         cmp eax, ebx
    225         mov eax, [edx + dsWMask]
    226         mov ebx, [edx + dsMaxChainLen]
    227         jl  LastMatchGood
    228         shr ebx, 2
    229 LastMatchGood:
    230 
    231 ;;; chainlen is decremented once beforehand so that the function can
    232 ;;; use the sign flag instead of the zero flag for the exit test.
    233 ;;; It is then shifted into the high word, to make room for the wmask
    234 ;;; value, which it will always accompany.
    235 
    236         dec ebx
    237         shl ebx, 16
    238         or  ebx, eax
    239         mov [chainlenwmask], ebx
    240 
    241 ;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
    242 
    243         mov eax, [edx + dsNiceMatch]
    244         mov ebx, [edx + dsLookahead]
    245         cmp ebx, eax
    246         jl  LookaheadLess
    247         mov ebx, eax
    248 LookaheadLess:  mov [nicematch], ebx
    249 
    250 ;;; register Bytef *scan = s->window + s->strstart;
    251 
    252         mov esi, [edx + dsWindow]
    253         mov [window], esi
    254         mov ebp, [edx + dsStrStart]
    255         lea edi, [esi + ebp]
    256         mov [scan], edi
    257 
    258 ;;; Determine how many bytes the scan ptr is off from being
    259 ;;; dword-aligned.
    260 
    261         mov eax, edi
    262         neg eax
    263         and eax, 3
    264         mov [scanalign], eax
    265 
    266 ;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
    267 ;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;
    268 
    269         mov eax, [edx + dsWSize]
    270         sub eax, MIN_LOOKAHEAD
    271         sub ebp, eax
    272         jg  LimitPositive
    273         xor ebp, ebp
    274 LimitPositive:
    275 
    276 ;;; int best_len = s->prev_length;
    277 
    278         mov eax, [edx + dsPrevLen]
    279         mov [bestlen], eax
    280 
    281 ;;; Store the sum of s->window + best_len in esi locally, and in esi.
    282 
    283         add esi, eax
    284         mov [windowbestlen], esi
    285 
    286 ;;; register ush scan_start = *(ushf*)scan;
    287 ;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
    288 ;;; Posf *prev = s->prev;
    289 
    290         movzx   ebx, word ptr [edi]
    291         mov [scanstart], ebx
    292         movzx   ebx, word ptr [edi + eax - 1]
    293         mov [scanend], ebx
    294         mov edi, [edx + dsPrev]
    295 
    296 ;;; Jump into the main loop.
    297 
    298         mov edx, [chainlenwmask]
    299         jmp short LoopEntry
    300 
    301 align 4
    302 
    303 ;;; do {
    304 ;;;     match = s->window + cur_match;
    305 ;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
    306 ;;;         *(ushf*)match != scan_start) continue;
    307 ;;;     [...]
    308 ;;; } while ((cur_match = prev[cur_match & wmask]) > limit
    309 ;;;          && --chain_length != 0);
    310 ;;;
    311 ;;; Here is the inner loop of the function. The function will spend the
    312 ;;; majority of its time in this loop, and majority of that time will
    313 ;;; be spent in the first ten instructions.
    314 ;;;
    315 ;;; Within this loop:
    316 ;;; ebx = scanend
    317 ;;; ecx = curmatch
    318 ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
    319 ;;; esi = windowbestlen - i.e., (window + bestlen)
    320 ;;; edi = prev
    321 ;;; ebp = limit
    322 
    323 LookupLoop:
    324         and ecx, edx
    325         movzx   ecx, word ptr [edi + ecx*2]
    326         cmp ecx, ebp
    327         jbe LeaveNow
    328         sub edx, 00010000h
    329         js  LeaveNow
    330 LoopEntry:  movzx   eax, word ptr [esi + ecx - 1]
    331         cmp eax, ebx
    332         jnz LookupLoop
    333         mov eax, [window]
    334         movzx   eax, word ptr [eax + ecx]
    335         cmp eax, [scanstart]
    336         jnz LookupLoop
    337 
    338 ;;; Store the current value of chainlen.
    339 
    340         mov [chainlenwmask], edx
    341 
    342 ;;; Point edi to the string under scrutiny, and esi to the string we
    343 ;;; are hoping to match it up with. In actuality, esi and edi are
    344 ;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
    345 ;;; initialized to -(MAX_MATCH_8 - scanalign).
    346 
    347         mov esi, [window]
    348         mov edi, [scan]
    349         add esi, ecx
    350         mov eax, [scanalign]
    351         mov edx, 0fffffef8h; -(MAX_MATCH_8)
    352         lea edi, [edi + eax + 0108h] ;MAX_MATCH_8]
    353         lea esi, [esi + eax + 0108h] ;MAX_MATCH_8]
    354 
    355 ;;; Test the strings for equality, 8 bytes at a time. At the end,
    356 ;;; adjust edx so that it is offset to the exact byte that mismatched.
    357 ;;;
    358 ;;; We already know at this point that the first three bytes of the
    359 ;;; strings match each other, and they can be safely passed over before
    360 ;;; starting the compare loop. So what this code does is skip over 0-3
    361 ;;; bytes, as much as necessary in order to dword-align the edi
    362 ;;; pointer. (esi will still be misaligned three times out of four.)
    363 ;;;
    364 ;;; It should be confessed that this loop usually does not represent
    365 ;;; much of the total running time. Replacing it with a more
    366 ;;; straightforward "rep cmpsb" would not drastically degrade
    367 ;;; performance.
    368 
    369 LoopCmps:
    370         mov eax, [esi + edx]
    371         xor eax, [edi + edx]
    372         jnz LeaveLoopCmps
    373         mov eax, [esi + edx + 4]
    374         xor eax, [edi + edx + 4]
    375         jnz LeaveLoopCmps4
    376         add edx, 8
    377         jnz LoopCmps
    378         jmp short LenMaximum
    379 LeaveLoopCmps4: add edx, 4
    380 LeaveLoopCmps:  test    eax, 0000FFFFh
    381         jnz LenLower
    382         add edx,  2
    383         shr eax, 16
    384 LenLower:   sub al, 1
    385         adc edx, 0
    386 
    387 ;;; Calculate the length of the match. If it is longer than MAX_MATCH,
    388 ;;; then automatically accept it as the best possible match and leave.
    389 
    390         lea eax, [edi + edx]
    391         mov edi, [scan]
    392         sub eax, edi
    393         cmp eax, MAX_MATCH
    394         jge LenMaximum
    395 
    396 ;;; If the length of the match is not longer than the best match we
    397 ;;; have so far, then forget it and return to the lookup loop.
    398 
    399         mov edx, [deflatestate]
    400         mov ebx, [bestlen]
    401         cmp eax, ebx
    402         jg  LongerMatch
    403         mov esi, [windowbestlen]
    404         mov edi, [edx + dsPrev]
    405         mov ebx, [scanend]
    406         mov edx, [chainlenwmask]
    407         jmp LookupLoop
    408 
    409 ;;;         s->match_start = cur_match;
    410 ;;;         best_len = len;
    411 ;;;         if (len >= nice_match) break;
    412 ;;;         scan_end = *(ushf*)(scan+best_len-1);
    413 
    414 LongerMatch:    mov ebx, [nicematch]
    415         mov [bestlen], eax
    416         mov [edx + dsMatchStart], ecx
    417         cmp eax, ebx
    418         jge LeaveNow
    419         mov esi, [window]
    420         add esi, eax
    421         mov [windowbestlen], esi
    422         movzx   ebx, word ptr [edi + eax - 1]
    423         mov edi, [edx + dsPrev]
    424         mov [scanend], ebx
    425         mov edx, [chainlenwmask]
    426         jmp LookupLoop
    427 
    428 ;;; Accept the current string, with the maximum possible length.
    429 
    430 LenMaximum: mov edx, [deflatestate]
    431         mov dword ptr [bestlen], MAX_MATCH
    432         mov [edx + dsMatchStart], ecx
    433 
    434 ;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
    435 ;;; return s->lookahead;
    436 
    437 LeaveNow:
    438         mov edx, [deflatestate]
    439         mov ebx, [bestlen]
    440         mov eax, [edx + dsLookahead]
    441         cmp ebx, eax
    442         jg  LookaheadRet
    443         mov eax, ebx
    444 LookaheadRet:
    445 
    446 ;;; Restore the stack and return from whence we came.
    447 
    448         add esp, LocalVarsSize
    449         pop ebx
    450         pop esi
    451         pop edi
    452         pop ebp
    453 
    454         ret
    455 ; please don't remove this string !
    456 ; Your can freely use match686 in any free or commercial app if you don't remove the string in the binary!
    457     db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah
    458 
    459 
    460     IFDEF NOUNDERLINE
    461     longest_match       endp
    462     ELSE
    463     _longest_match      endp
    464     ENDIF
    465 
    466     IFDEF NOUNDERLINE
    467     match_init      proc near
    468                     ret
    469     match_init      endp
    470     ELSE
    471     _match_init     proc near
    472                     ret
    473     _match_init     endp
    474     ENDIF
    475 
    476 
    477 _TEXT   ends
    478 end
    479