Home | History | Annotate | Download | only in masmx86
      1 ; match686.asm -- Asm portion of the optimized longest_match for 32 bits x86
      2 ; Copyright (C) 1995-1996 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
      3 ; File written by Gilles Vollant, by converting match686.S from Brian Raiter
      4 ; for MASM. This is as assembly version of longest_match
      5 ;  from Jean-loup Gailly in deflate.c
      6 ;
      7 ;         http://www.zlib.net
      8 ;         http://www.winimage.com/zLibDll
      9 ;         http://www.muppetlabs.com/~breadbox/software/assembly.html
     10 ;
     11 ; For Visual C++ 4.x and higher and ML 6.x and higher
     12 ;   ml.exe is distributed in
     13 ;  http://www.microsoft.com/downloads/details.aspx?FamilyID=7a1c9da0-0510-44a2-b042-7ef370530c64
     14 ;
     15 ; this file contain two implementation of longest_match
     16 ;
     17 ;  this longest_match was written by Brian raiter (1998), optimized for Pentium Pro
     18 ;   (and the faster known version of match_init on modern Core 2 Duo and AMD Phenom)
     19 ;
     20 ;  for using an assembly version of longest_match, you need define ASMV in project
     21 ;
     22 ;    compile the asm file running
     23 ;           ml /coff /Zi /c /Flmatch686.lst match686.asm
     24 ;    and do not include match686.obj in your project
     25 ;
     26 ; note: contrib of zLib 1.2.3 and earlier contained both a deprecated version for
     27 ;  Pentium (prior Pentium Pro) and this version for Pentium Pro and modern processor
     28 ;  with autoselect (with cpu detection code)
     29 ;  if you want support the old pentium optimization, you can still use these version
     30 ;
     31 ; this file is not optimized for old pentium, but it compatible with all x86 32 bits
     32 ; processor (starting 80386)
     33 ;
     34 ;
     35 ; see below : zlib1222add must be adjuster if you use a zlib version < 1.2.2.2
     36 
     37 ;uInt longest_match(s, cur_match)
     38 ;    deflate_state *s;
     39 ;    IPos cur_match;                             /* current match */
     40 
     41     NbStack         equ     76
     42     cur_match       equ     dword ptr[esp+NbStack-0]
     43     str_s           equ     dword ptr[esp+NbStack-4]
     44 ; 5 dword on top (ret,ebp,esi,edi,ebx)
     45     adrret          equ     dword ptr[esp+NbStack-8]
     46     pushebp         equ     dword ptr[esp+NbStack-12]
     47     pushedi         equ     dword ptr[esp+NbStack-16]
     48     pushesi         equ     dword ptr[esp+NbStack-20]
     49     pushebx         equ     dword ptr[esp+NbStack-24]
     50 
     51     chain_length    equ     dword ptr [esp+NbStack-28]
     52     limit           equ     dword ptr [esp+NbStack-32]
     53     best_len        equ     dword ptr [esp+NbStack-36]
     54     window          equ     dword ptr [esp+NbStack-40]
     55     prev            equ     dword ptr [esp+NbStack-44]
     56     scan_start      equ      word ptr [esp+NbStack-48]
     57     wmask           equ     dword ptr [esp+NbStack-52]
     58     match_start_ptr equ     dword ptr [esp+NbStack-56]
     59     nice_match      equ     dword ptr [esp+NbStack-60]
     60     scan            equ     dword ptr [esp+NbStack-64]
     61 
     62     windowlen       equ     dword ptr [esp+NbStack-68]
     63     match_start     equ     dword ptr [esp+NbStack-72]
     64     strend          equ     dword ptr [esp+NbStack-76]
     65     NbStackAdd      equ     (NbStack-24)
     66 
     67     .386p
     68 
     69     name    gvmatch
     70     .MODEL  FLAT
     71 
     72 
     73 
     74 ;  all the +zlib1222add offsets are due to the addition of fields
     75 ;  in zlib in the deflate_state structure since the asm code was first written
     76 ;  (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
     77 ;  (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
     78 ;  if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
     79 
     80     zlib1222add         equ     8
     81 
     82 ;  Note : these value are good with a 8 bytes boundary pack structure
     83     dep_chain_length    equ     74h+zlib1222add
     84     dep_window          equ     30h+zlib1222add
     85     dep_strstart        equ     64h+zlib1222add
     86     dep_prev_length     equ     70h+zlib1222add
     87     dep_nice_match      equ     88h+zlib1222add
     88     dep_w_size          equ     24h+zlib1222add
     89     dep_prev            equ     38h+zlib1222add
     90     dep_w_mask          equ     2ch+zlib1222add
     91     dep_good_match      equ     84h+zlib1222add
     92     dep_match_start     equ     68h+zlib1222add
     93     dep_lookahead       equ     6ch+zlib1222add
     94 
     95 
     96 _TEXT                   segment
     97 
     98 IFDEF NOUNDERLINE
     99             public  longest_match
    100             public  match_init
    101 ELSE
    102             public  _longest_match
    103             public  _match_init
    104 ENDIF
    105 
    106     MAX_MATCH           equ     258
    107     MIN_MATCH           equ     3
    108     MIN_LOOKAHEAD       equ     (MAX_MATCH+MIN_MATCH+1)
    109 
    110 
    111 
    112 MAX_MATCH       equ     258
    113 MIN_MATCH       equ     3
    114 MIN_LOOKAHEAD   equ     (MAX_MATCH + MIN_MATCH + 1)
    115 MAX_MATCH_8_     equ     ((MAX_MATCH + 7) AND 0FFF0h)
    116 
    117 
    118 ;;; stack frame offsets
    119 
    120 chainlenwmask   equ  esp + 0    ; high word: current chain len
    121                     ; low word: s->wmask
    122 window      equ  esp + 4    ; local copy of s->window
    123 windowbestlen   equ  esp + 8    ; s->window + bestlen
    124 scanstart   equ  esp + 16   ; first two bytes of string
    125 scanend     equ  esp + 12   ; last two bytes of string
    126 scanalign   equ  esp + 20   ; dword-misalignment of string
    127 nicematch   equ  esp + 24   ; a good enough match size
    128 bestlen     equ  esp + 28   ; size of best match so far
    129 scan        equ  esp + 32   ; ptr to string wanting match
    130 
    131 LocalVarsSize   equ 36
    132 ;   saved ebx   byte esp + 36
    133 ;   saved edi   byte esp + 40
    134 ;   saved esi   byte esp + 44
    135 ;   saved ebp   byte esp + 48
    136 ;   return address  byte esp + 52
    137 deflatestate    equ  esp + 56   ; the function arguments
    138 curmatch    equ  esp + 60
    139 
    140 ;;; Offsets for fields in the deflate_state structure. These numbers
    141 ;;; are calculated from the definition of deflate_state, with the
    142 ;;; assumption that the compiler will dword-align the fields. (Thus,
    143 ;;; changing the definition of deflate_state could easily cause this
    144 ;;; program to crash horribly, without so much as a warning at
    145 ;;; compile time. Sigh.)
    146 
    147 dsWSize     equ 36+zlib1222add
    148 dsWMask     equ 44+zlib1222add
    149 dsWindow    equ 48+zlib1222add
    150 dsPrev      equ 56+zlib1222add
    151 dsMatchLen  equ 88+zlib1222add
    152 dsPrevMatch equ 92+zlib1222add
    153 dsStrStart  equ 100+zlib1222add
    154 dsMatchStart    equ 104+zlib1222add
    155 dsLookahead equ 108+zlib1222add
    156 dsPrevLen   equ 112+zlib1222add
    157 dsMaxChainLen   equ 116+zlib1222add
    158 dsGoodMatch equ 132+zlib1222add
    159 dsNiceMatch equ 136+zlib1222add
    160 
    161 
    162 ;;; match686.asm -- Pentium-Pro-optimized version of longest_match()
    163 ;;; Written for zlib 1.1.2
    164 ;;; Copyright (C) 1998 Brian Raiter <breadbox (a] muppetlabs.com>
    165 ;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html
    166 ;;;
    167 ;;
    168 ;;  This software is provided 'as-is', without any express or implied
    169 ;;  warranty.  In no event will the authors be held liable for any damages
    170 ;;  arising from the use of this software.
    171 ;;
    172 ;;  Permission is granted to anyone to use this software for any purpose,
    173 ;;  including commercial applications, and to alter it and redistribute it
    174 ;;  freely, subject to the following restrictions:
    175 ;;
    176 ;;  1. The origin of this software must not be misrepresented; you must not
    177 ;;     claim that you wrote the original software. If you use this software
    178 ;;     in a product, an acknowledgment in the product documentation would be
    179 ;;     appreciated but is not required.
    180 ;;  2. Altered source versions must be plainly marked as such, and must not be
    181 ;;     misrepresented as being the original software
    182 ;;  3. This notice may not be removed or altered from any source distribution.
    183 ;;
    184 
    185 ;GLOBAL _longest_match, _match_init
    186 
    187 
    188 ;SECTION    .text
    189 
    190 ;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
    191 
    192 ;_longest_match:
    193     IFDEF NOUNDERLINE
    194     longest_match       proc near
    195     ELSE
    196     _longest_match      proc near
    197     ENDIF
    198 .FPO (9, 4, 0, 0, 1, 0)
    199 
    200 ;;; Save registers that the compiler may be using, and adjust esp to
    201 ;;; make room for our stack frame.
    202 
    203         push    ebp
    204         push    edi
    205         push    esi
    206         push    ebx
    207         sub esp, LocalVarsSize
    208 
    209 ;;; Retrieve the function arguments. ecx will hold cur_match
    210 ;;; throughout the entire function. edx will hold the pointer to the
    211 ;;; deflate_state structure during the function's setup (before
    212 ;;; entering the main loop.
    213 
    214         mov edx, [deflatestate]
    215         mov ecx, [curmatch]
    216 
    217 ;;; uInt wmask = s->w_mask;
    218 ;;; unsigned chain_length = s->max_chain_length;
    219 ;;; if (s->prev_length >= s->good_match) {
    220 ;;;     chain_length >>= 2;
    221 ;;; }
    222 
    223         mov eax, [edx + dsPrevLen]
    224         mov ebx, [edx + dsGoodMatch]
    225         cmp eax, ebx
    226         mov eax, [edx + dsWMask]
    227         mov ebx, [edx + dsMaxChainLen]
    228         jl  LastMatchGood
    229         shr ebx, 2
    230 LastMatchGood:
    231 
    232 ;;; chainlen is decremented once beforehand so that the function can
    233 ;;; use the sign flag instead of the zero flag for the exit test.
    234 ;;; It is then shifted into the high word, to make room for the wmask
    235 ;;; value, which it will always accompany.
    236 
    237         dec ebx
    238         shl ebx, 16
    239         or  ebx, eax
    240         mov [chainlenwmask], ebx
    241 
    242 ;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
    243 
    244         mov eax, [edx + dsNiceMatch]
    245         mov ebx, [edx + dsLookahead]
    246         cmp ebx, eax
    247         jl  LookaheadLess
    248         mov ebx, eax
    249 LookaheadLess:  mov [nicematch], ebx
    250 
    251 ;;; register Bytef *scan = s->window + s->strstart;
    252 
    253         mov esi, [edx + dsWindow]
    254         mov [window], esi
    255         mov ebp, [edx + dsStrStart]
    256         lea edi, [esi + ebp]
    257         mov [scan], edi
    258 
    259 ;;; Determine how many bytes the scan ptr is off from being
    260 ;;; dword-aligned.
    261 
    262         mov eax, edi
    263         neg eax
    264         and eax, 3
    265         mov [scanalign], eax
    266 
    267 ;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
    268 ;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;
    269 
    270         mov eax, [edx + dsWSize]
    271         sub eax, MIN_LOOKAHEAD
    272         sub ebp, eax
    273         jg  LimitPositive
    274         xor ebp, ebp
    275 LimitPositive:
    276 
    277 ;;; int best_len = s->prev_length;
    278 
    279         mov eax, [edx + dsPrevLen]
    280         mov [bestlen], eax
    281 
    282 ;;; Store the sum of s->window + best_len in esi locally, and in esi.
    283 
    284         add esi, eax
    285         mov [windowbestlen], esi
    286 
    287 ;;; register ush scan_start = *(ushf*)scan;
    288 ;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
    289 ;;; Posf *prev = s->prev;
    290 
    291         movzx   ebx, word ptr [edi]
    292         mov [scanstart], ebx
    293         movzx   ebx, word ptr [edi + eax - 1]
    294         mov [scanend], ebx
    295         mov edi, [edx + dsPrev]
    296 
    297 ;;; Jump into the main loop.
    298 
    299         mov edx, [chainlenwmask]
    300         jmp short LoopEntry
    301 
    302 align 4
    303 
    304 ;;; do {
    305 ;;;     match = s->window + cur_match;
    306 ;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
    307 ;;;         *(ushf*)match != scan_start) continue;
    308 ;;;     [...]
    309 ;;; } while ((cur_match = prev[cur_match & wmask]) > limit
    310 ;;;          && --chain_length != 0);
    311 ;;;
    312 ;;; Here is the inner loop of the function. The function will spend the
    313 ;;; majority of its time in this loop, and majority of that time will
    314 ;;; be spent in the first ten instructions.
    315 ;;;
    316 ;;; Within this loop:
    317 ;;; ebx = scanend
    318 ;;; ecx = curmatch
    319 ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
    320 ;;; esi = windowbestlen - i.e., (window + bestlen)
    321 ;;; edi = prev
    322 ;;; ebp = limit
    323 
    324 LookupLoop:
    325         and ecx, edx
    326         movzx   ecx, word ptr [edi + ecx*2]
    327         cmp ecx, ebp
    328         jbe LeaveNow
    329         sub edx, 00010000h
    330         js  LeaveNow
    331 LoopEntry:  movzx   eax, word ptr [esi + ecx - 1]
    332         cmp eax, ebx
    333         jnz LookupLoop
    334         mov eax, [window]
    335         movzx   eax, word ptr [eax + ecx]
    336         cmp eax, [scanstart]
    337         jnz LookupLoop
    338 
    339 ;;; Store the current value of chainlen.
    340 
    341         mov [chainlenwmask], edx
    342 
    343 ;;; Point edi to the string under scrutiny, and esi to the string we
    344 ;;; are hoping to match it up with. In actuality, esi and edi are
    345 ;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
    346 ;;; initialized to -(MAX_MATCH_8 - scanalign).
    347 
    348         mov esi, [window]
    349         mov edi, [scan]
    350         add esi, ecx
    351         mov eax, [scanalign]
    352         mov edx, 0fffffef8h; -(MAX_MATCH_8)
    353         lea edi, [edi + eax + 0108h] ;MAX_MATCH_8]
    354         lea esi, [esi + eax + 0108h] ;MAX_MATCH_8]
    355 
    356 ;;; Test the strings for equality, 8 bytes at a time. At the end,
    357 ;;; adjust edx so that it is offset to the exact byte that mismatched.
    358 ;;;
    359 ;;; We already know at this point that the first three bytes of the
    360 ;;; strings match each other, and they can be safely passed over before
    361 ;;; starting the compare loop. So what this code does is skip over 0-3
    362 ;;; bytes, as much as necessary in order to dword-align the edi
    363 ;;; pointer. (esi will still be misaligned three times out of four.)
    364 ;;;
    365 ;;; It should be confessed that this loop usually does not represent
    366 ;;; much of the total running time. Replacing it with a more
    367 ;;; straightforward "rep cmpsb" would not drastically degrade
    368 ;;; performance.
    369 
    370 LoopCmps:
    371         mov eax, [esi + edx]
    372         xor eax, [edi + edx]
    373         jnz LeaveLoopCmps
    374         mov eax, [esi + edx + 4]
    375         xor eax, [edi + edx + 4]
    376         jnz LeaveLoopCmps4
    377         add edx, 8
    378         jnz LoopCmps
    379         jmp short LenMaximum
    380 LeaveLoopCmps4: add edx, 4
    381 LeaveLoopCmps:  test    eax, 0000FFFFh
    382         jnz LenLower
    383         add edx,  2
    384         shr eax, 16
    385 LenLower:   sub al, 1
    386         adc edx, 0
    387 
    388 ;;; Calculate the length of the match. If it is longer than MAX_MATCH,
    389 ;;; then automatically accept it as the best possible match and leave.
    390 
    391         lea eax, [edi + edx]
    392         mov edi, [scan]
    393         sub eax, edi
    394         cmp eax, MAX_MATCH
    395         jge LenMaximum
    396 
    397 ;;; If the length of the match is not longer than the best match we
    398 ;;; have so far, then forget it and return to the lookup loop.
    399 
    400         mov edx, [deflatestate]
    401         mov ebx, [bestlen]
    402         cmp eax, ebx
    403         jg  LongerMatch
    404         mov esi, [windowbestlen]
    405         mov edi, [edx + dsPrev]
    406         mov ebx, [scanend]
    407         mov edx, [chainlenwmask]
    408         jmp LookupLoop
    409 
    410 ;;;         s->match_start = cur_match;
    411 ;;;         best_len = len;
    412 ;;;         if (len >= nice_match) break;
    413 ;;;         scan_end = *(ushf*)(scan+best_len-1);
    414 
    415 LongerMatch:    mov ebx, [nicematch]
    416         mov [bestlen], eax
    417         mov [edx + dsMatchStart], ecx
    418         cmp eax, ebx
    419         jge LeaveNow
    420         mov esi, [window]
    421         add esi, eax
    422         mov [windowbestlen], esi
    423         movzx   ebx, word ptr [edi + eax - 1]
    424         mov edi, [edx + dsPrev]
    425         mov [scanend], ebx
    426         mov edx, [chainlenwmask]
    427         jmp LookupLoop
    428 
    429 ;;; Accept the current string, with the maximum possible length.
    430 
    431 LenMaximum: mov edx, [deflatestate]
    432         mov dword ptr [bestlen], MAX_MATCH
    433         mov [edx + dsMatchStart], ecx
    434 
    435 ;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
    436 ;;; return s->lookahead;
    437 
    438 LeaveNow:
    439         mov edx, [deflatestate]
    440         mov ebx, [bestlen]
    441         mov eax, [edx + dsLookahead]
    442         cmp ebx, eax
    443         jg  LookaheadRet
    444         mov eax, ebx
    445 LookaheadRet:
    446 
    447 ;;; Restore the stack and return from whence we came.
    448 
    449         add esp, LocalVarsSize
    450         pop ebx
    451         pop esi
    452         pop edi
    453         pop ebp
    454 
    455         ret
    456 ; please don't remove this string !
    457 ; Your can freely use match686 in any free or commercial app if you don't remove the string in the binary!
    458     db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah
    459 
    460 
    461     IFDEF NOUNDERLINE
    462     longest_match       endp
    463     ELSE
    464     _longest_match      endp
    465     ENDIF
    466 
    467     IFDEF NOUNDERLINE
    468     match_init      proc near
    469                     ret
    470     match_init      endp
    471     ELSE
    472     _match_init     proc near
    473                     ret
    474     _match_init     endp
    475     ENDIF
    476 
    477 
    478 _TEXT   ends
    479 end
    480