Home | History | Annotate | Download | only in masmx86
      1 ; gvmat32.asm -- Asm portion of the optimized longest_match for 32 bits x86
      2 ; Copyright (C) 1995-1996 Jean-loup Gailly and Gilles Vollant.
      3 ; File written by Gilles Vollant, by modifiying the longest_match
      4 ;  from Jean-loup Gailly in deflate.c
      5 ;
      6 ;         http://www.zlib.net
      7 ;         http://www.winimage.com/zLibDll
      8 ;         http://www.muppetlabs.com/~breadbox/software/assembly.html
      9 ;
     10 ; For Visual C++ 4.x and higher and ML 6.x and higher
     11 ;   ml.exe is in directory \MASM611C of Win95 DDK
     12 ;   ml.exe is also distributed in http://www.masm32.com/masmdl.htm
     13 ;    and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/
     14 ;
     15 ; this file contain two implementation of longest_match
     16 ;
     17 ;  longest_match_7fff : written 1996 by Gilles Vollant optimized for 
     18 ;            first Pentium. Assume s->w_mask == 0x7fff
     19 ;  longest_match_686 : written by Brian raiter (1998), optimized for Pentium Pro
     20 ;
     21 ;  for using an seembly version of longest_match, you need define ASMV in project
     22 ;  There is two way in using gvmat32.asm
     23 ;
     24 ;  A) Suggested method
     25 ;    if you want include both longest_match_7fff and longest_match_686
     26 ;    compile the asm file running
     27 ;           ml /coff /Zi /Flgvmat32.lst /c gvmat32.asm
     28 ;    and include gvmat32c.c in your project
     29 ;    if you have an old cpu (386,486 or first Pentium) and s->w_mask==0x7fff,
     30 ;        longest_match_7fff will be used
     31 ;    if you have a more modern CPU (Pentium Pro, II and higher)
     32 ;        longest_match_686 will be used
     33 ;    on old cpu with s->w_mask!=0x7fff, longest_match_686 will be used,
     34 ;        but this is not a sitation you'll find often
     35 ;
     36 ;  B) Alternative
     37 ;    if you are not interresed in old cpu performance and want the smaller
     38 ;       binaries possible
     39 ;
     40 ;    compile the asm file running
     41 ;           ml /coff /Zi /c /Flgvmat32.lst /DNOOLDPENTIUMCODE gvmat32.asm
     42 ;    and do not include gvmat32c.c in your project (ou define also 
     43 ;              NOOLDPENTIUMCODE)
     44 ;
     45 ; note : as I known, longest_match_686 is very faster than longest_match_7fff
     46 ;        on pentium Pro/II/III, faster (but less) in P4, but it seem
     47 ;        longest_match_7fff can be faster (very very litte) on AMD Athlon64/K8
     48 ;
     49 ; see below : zlib1222add must be adjuster if you use a zlib version < 1.2.2.2
     50 
     51 ;uInt longest_match_7fff(s, cur_match)
     52 ;    deflate_state *s;
     53 ;    IPos cur_match;                             /* current match */
     54 
     55     NbStack         equ     76
     56     cur_match       equ     dword ptr[esp+NbStack-0]
     57     str_s           equ     dword ptr[esp+NbStack-4]
     58 ; 5 dword on top (ret,ebp,esi,edi,ebx)
     59     adrret          equ     dword ptr[esp+NbStack-8]
     60     pushebp         equ     dword ptr[esp+NbStack-12]
     61     pushedi         equ     dword ptr[esp+NbStack-16]
     62     pushesi         equ     dword ptr[esp+NbStack-20]
     63     pushebx         equ     dword ptr[esp+NbStack-24]
     64 
     65     chain_length    equ     dword ptr [esp+NbStack-28]
     66     limit           equ     dword ptr [esp+NbStack-32]
     67     best_len        equ     dword ptr [esp+NbStack-36]
     68     window          equ     dword ptr [esp+NbStack-40]
     69     prev            equ     dword ptr [esp+NbStack-44]
     70     scan_start      equ      word ptr [esp+NbStack-48]
     71     wmask           equ     dword ptr [esp+NbStack-52]
     72     match_start_ptr equ     dword ptr [esp+NbStack-56]
     73     nice_match      equ     dword ptr [esp+NbStack-60]
     74     scan            equ     dword ptr [esp+NbStack-64]
     75 
     76     windowlen       equ     dword ptr [esp+NbStack-68]
     77     match_start     equ     dword ptr [esp+NbStack-72]
     78     strend          equ     dword ptr [esp+NbStack-76]
     79     NbStackAdd      equ     (NbStack-24)
     80 
     81     .386p
     82 
     83     name    gvmatch
     84     .MODEL  FLAT
     85 
     86 
     87 
     88 ;  all the +zlib1222add offsets are due to the addition of fields
     89 ;  in zlib in the deflate_state structure since the asm code was first written
     90 ;  (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
     91 ;  (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
     92 ;  if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
     93 
     94     zlib1222add         equ     8
     95 
     96 ;  Note : these value are good with a 8 bytes boundary pack structure
     97     dep_chain_length    equ     74h+zlib1222add
     98     dep_window          equ     30h+zlib1222add
     99     dep_strstart        equ     64h+zlib1222add
    100     dep_prev_length     equ     70h+zlib1222add
    101     dep_nice_match      equ     88h+zlib1222add
    102     dep_w_size          equ     24h+zlib1222add
    103     dep_prev            equ     38h+zlib1222add
    104     dep_w_mask          equ     2ch+zlib1222add
    105     dep_good_match      equ     84h+zlib1222add
    106     dep_match_start     equ     68h+zlib1222add
    107     dep_lookahead       equ     6ch+zlib1222add
    108 
    109 
    110 _TEXT                   segment
    111 
    112 IFDEF NOUNDERLINE
    113    IFDEF NOOLDPENTIUMCODE
    114             public  longest_match
    115             public  match_init
    116    ELSE            
    117             public  longest_match_7fff
    118             public  cpudetect32
    119             public  longest_match_686
    120    ENDIF
    121 ELSE
    122    IFDEF NOOLDPENTIUMCODE
    123             public  _longest_match
    124             public  _match_init
    125    ELSE
    126             public  _longest_match_7fff
    127             public  _cpudetect32
    128             public  _longest_match_686
    129    ENDIF
    130 ENDIF
    131 
    132     MAX_MATCH           equ     258
    133     MIN_MATCH           equ     3
    134     MIN_LOOKAHEAD       equ     (MAX_MATCH+MIN_MATCH+1)
    135 
    136 
    137 
    138 IFNDEF NOOLDPENTIUMCODE
    139 IFDEF NOUNDERLINE
    140 longest_match_7fff   proc near
    141 ELSE
    142 _longest_match_7fff  proc near
    143 ENDIF
    144 
    145     mov     edx,[esp+4]
    146 
    147 
    148 
    149     push    ebp
    150     push    edi
    151     push    esi
    152     push    ebx
    153 
    154     sub     esp,NbStackAdd
    155 
    156 ; initialize or check the variables used in match.asm.
    157     mov     ebp,edx
    158 
    159 ; chain_length = s->max_chain_length
    160 ; if (prev_length>=good_match) chain_length >>= 2
    161     mov     edx,[ebp+dep_chain_length]
    162     mov     ebx,[ebp+dep_prev_length]
    163     cmp     [ebp+dep_good_match],ebx
    164     ja      noshr
    165     shr     edx,2
    166 noshr:
    167 ; we increment chain_length because in the asm, the --chain_lenght is in the beginning of the loop
    168     inc     edx
    169     mov     edi,[ebp+dep_nice_match]
    170     mov     chain_length,edx
    171     mov     eax,[ebp+dep_lookahead]
    172     cmp     eax,edi
    173 ; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
    174     jae     nolookaheadnicematch
    175     mov     edi,eax
    176 nolookaheadnicematch:
    177 ; best_len = s->prev_length
    178     mov     best_len,ebx
    179 
    180 ; window = s->window
    181     mov     esi,[ebp+dep_window]
    182     mov     ecx,[ebp+dep_strstart]
    183     mov     window,esi
    184 
    185     mov     nice_match,edi
    186 ; scan = window + strstart
    187     add     esi,ecx
    188     mov     scan,esi
    189 ; dx = *window
    190     mov     dx,word ptr [esi]
    191 ; bx = *(window+best_len-1)
    192     mov     bx,word ptr [esi+ebx-1]
    193     add     esi,MAX_MATCH-1
    194 ; scan_start = *scan
    195     mov     scan_start,dx
    196 ; strend = scan + MAX_MATCH-1
    197     mov     strend,esi
    198 ; bx = scan_end = *(window+best_len-1)
    199 
    200 ;    IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
    201 ;        s->strstart - (IPos)MAX_DIST(s) : NIL;
    202 
    203     mov     esi,[ebp+dep_w_size]
    204     sub     esi,MIN_LOOKAHEAD
    205 ; here esi = MAX_DIST(s)
    206     sub     ecx,esi
    207     ja      nodist
    208     xor     ecx,ecx
    209 nodist:
    210     mov     limit,ecx
    211 
    212 ; prev = s->prev
    213     mov     edx,[ebp+dep_prev]
    214     mov     prev,edx
    215 
    216 ;
    217     mov     edx,dword ptr [ebp+dep_match_start]
    218     mov     bp,scan_start
    219     mov     eax,cur_match
    220     mov     match_start,edx
    221 
    222     mov     edx,window
    223     mov     edi,edx
    224     add     edi,best_len
    225     mov     esi,prev
    226     dec     edi
    227 ; windowlen = window + best_len -1
    228     mov     windowlen,edi
    229 
    230     jmp     beginloop2
    231     align   4
    232 
    233 ; here, in the loop
    234 ;       eax = ax = cur_match
    235 ;       ecx = limit
    236 ;        bx = scan_end
    237 ;        bp = scan_start
    238 ;       edi = windowlen (window + best_len -1)
    239 ;       esi = prev
    240 
    241 
    242 ;// here; chain_length <=16
    243 normalbeg0add16:
    244     add     chain_length,16
    245     jz      exitloop
    246 normalbeg0:
    247     cmp     word ptr[edi+eax],bx
    248     je      normalbeg2noroll
    249 rcontlabnoroll:
    250 ; cur_match = prev[cur_match & wmask]
    251     and     eax,7fffh
    252     mov     ax,word ptr[esi+eax*2]
    253 ; if cur_match > limit, go to exitloop
    254     cmp     ecx,eax
    255     jnb     exitloop
    256 ; if --chain_length != 0, go to exitloop
    257     dec     chain_length
    258     jnz     normalbeg0
    259     jmp     exitloop
    260 
    261 normalbeg2noroll:
    262 ; if (scan_start==*(cur_match+window)) goto normalbeg2
    263     cmp     bp,word ptr[edx+eax]
    264     jne     rcontlabnoroll
    265     jmp     normalbeg2
    266 
    267 contloop3:
    268     mov     edi,windowlen
    269 
    270 ; cur_match = prev[cur_match & wmask]
    271     and     eax,7fffh
    272     mov     ax,word ptr[esi+eax*2]
    273 ; if cur_match > limit, go to exitloop
    274     cmp     ecx,eax
    275 jnbexitloopshort1:
    276     jnb     exitloop
    277 ; if --chain_length != 0, go to exitloop
    278 
    279 
    280 ; begin the main loop
    281 beginloop2:
    282     sub     chain_length,16+1
    283 ; if chain_length <=16, don't use the unrolled loop
    284     jna     normalbeg0add16
    285 
    286 do16:
    287     cmp     word ptr[edi+eax],bx
    288     je      normalbeg2dc0
    289 
    290 maccn   MACRO   lab
    291     and     eax,7fffh
    292     mov     ax,word ptr[esi+eax*2]
    293     cmp     ecx,eax
    294     jnb     exitloop
    295     cmp     word ptr[edi+eax],bx
    296     je      lab
    297     ENDM
    298 
    299 rcontloop0:
    300     maccn   normalbeg2dc1
    301 
    302 rcontloop1:
    303     maccn   normalbeg2dc2
    304 
    305 rcontloop2:
    306     maccn   normalbeg2dc3
    307 
    308 rcontloop3:
    309     maccn   normalbeg2dc4
    310 
    311 rcontloop4:
    312     maccn   normalbeg2dc5
    313 
    314 rcontloop5:
    315     maccn   normalbeg2dc6
    316 
    317 rcontloop6:
    318     maccn   normalbeg2dc7
    319 
    320 rcontloop7:
    321     maccn   normalbeg2dc8
    322 
    323 rcontloop8:
    324     maccn   normalbeg2dc9
    325 
    326 rcontloop9:
    327     maccn   normalbeg2dc10
    328 
    329 rcontloop10:
    330     maccn   short normalbeg2dc11
    331 
    332 rcontloop11:
    333     maccn   short normalbeg2dc12
    334 
    335 rcontloop12:
    336     maccn   short normalbeg2dc13
    337 
    338 rcontloop13:
    339     maccn   short normalbeg2dc14
    340 
    341 rcontloop14:
    342     maccn   short normalbeg2dc15
    343 
    344 rcontloop15:
    345     and     eax,7fffh
    346     mov     ax,word ptr[esi+eax*2]
    347     cmp     ecx,eax
    348     jnb     exitloop
    349 
    350     sub     chain_length,16
    351     ja      do16
    352     jmp     normalbeg0add16
    353 
    354 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    355 
    356 normbeg MACRO   rcontlab,valsub
    357 ; if we are here, we know that *(match+best_len-1) == scan_end
    358     cmp     bp,word ptr[edx+eax]
    359 ; if (match != scan_start) goto rcontlab
    360     jne     rcontlab
    361 ; calculate the good chain_length, and we'll compare scan and match string
    362     add     chain_length,16-valsub
    363     jmp     iseq
    364     ENDM
    365 
    366 
    367 normalbeg2dc11:
    368     normbeg rcontloop11,11
    369 
    370 normalbeg2dc12:
    371     normbeg short rcontloop12,12
    372 
    373 normalbeg2dc13:
    374     normbeg short rcontloop13,13
    375 
    376 normalbeg2dc14:
    377     normbeg short rcontloop14,14
    378 
    379 normalbeg2dc15:
    380     normbeg short rcontloop15,15
    381 
    382 normalbeg2dc10:
    383     normbeg rcontloop10,10
    384 
    385 normalbeg2dc9:
    386     normbeg rcontloop9,9
    387 
    388 normalbeg2dc8:
    389     normbeg rcontloop8,8
    390 
    391 normalbeg2dc7:
    392     normbeg rcontloop7,7
    393 
    394 normalbeg2dc6:
    395     normbeg rcontloop6,6
    396 
    397 normalbeg2dc5:
    398     normbeg rcontloop5,5
    399 
    400 normalbeg2dc4:
    401     normbeg rcontloop4,4
    402 
    403 normalbeg2dc3:
    404     normbeg rcontloop3,3
    405 
    406 normalbeg2dc2:
    407     normbeg rcontloop2,2
    408 
    409 normalbeg2dc1:
    410     normbeg rcontloop1,1
    411 
    412 normalbeg2dc0:
    413     normbeg rcontloop0,0
    414 
    415 
    416 ; we go in normalbeg2 because *(ushf*)(match+best_len-1) == scan_end
    417 
    418 normalbeg2:
    419     mov     edi,window
    420 
    421     cmp     bp,word ptr[edi+eax]
    422     jne     contloop3                   ; if *(ushf*)match != scan_start, continue
    423 
    424 iseq:
    425 ; if we are here, we know that *(match+best_len-1) == scan_end
    426 ; and (match == scan_start)
    427 
    428     mov     edi,edx
    429     mov     esi,scan                    ; esi = scan
    430     add     edi,eax                     ; edi = window + cur_match = match
    431 
    432     mov     edx,[esi+3]                 ; compare manually dword at match+3
    433     xor     edx,[edi+3]                 ; and scan +3
    434 
    435     jz      begincompare                ; if equal, go to long compare
    436 
    437 ; we will determine the unmatch byte and calculate len (in esi)
    438     or      dl,dl
    439     je      eq1rr
    440     mov     esi,3
    441     jmp     trfinval
    442 eq1rr:
    443     or      dx,dx
    444     je      eq1
    445 
    446     mov     esi,4
    447     jmp     trfinval
    448 eq1:
    449     and     edx,0ffffffh
    450     jz      eq11
    451     mov     esi,5
    452     jmp     trfinval
    453 eq11:
    454     mov     esi,6
    455     jmp     trfinval
    456 
    457 begincompare:
    458     ; here we now scan and match begin same
    459     add     edi,6
    460     add     esi,6
    461     mov     ecx,(MAX_MATCH-(2+4))/4     ; scan for at most MAX_MATCH bytes
    462     repe    cmpsd                       ; loop until mismatch
    463 
    464     je      trfin                       ; go to trfin if not unmatch
    465 ; we determine the unmatch byte
    466     sub     esi,4
    467     mov     edx,[edi-4]
    468     xor     edx,[esi]
    469 
    470     or      dl,dl
    471     jnz     trfin
    472     inc     esi
    473 
    474     or      dx,dx
    475     jnz     trfin
    476     inc     esi
    477 
    478     and     edx,0ffffffh
    479     jnz     trfin
    480     inc     esi
    481 
    482 trfin:
    483     sub     esi,scan          ; esi = len
    484 trfinval:
    485 ; here we have finised compare, and esi contain len of equal string
    486     cmp     esi,best_len        ; if len > best_len, go newbestlen
    487     ja      short newbestlen
    488 ; now we restore edx, ecx and esi, for the big loop
    489     mov     esi,prev
    490     mov     ecx,limit
    491     mov     edx,window
    492     jmp     contloop3
    493 
    494 newbestlen:
    495     mov     best_len,esi        ; len become best_len
    496 
    497     mov     match_start,eax     ; save new position as match_start
    498     cmp     esi,nice_match      ; if best_len >= nice_match, exit
    499     jae     exitloop
    500     mov     ecx,scan
    501     mov     edx,window          ; restore edx=window
    502     add     ecx,esi
    503     add     esi,edx
    504 
    505     dec     esi
    506     mov     windowlen,esi       ; windowlen = window + best_len-1
    507     mov     bx,[ecx-1]          ; bx = *(scan+best_len-1) = scan_end
    508 
    509 ; now we restore ecx and esi, for the big loop :
    510     mov     esi,prev
    511     mov     ecx,limit
    512     jmp     contloop3
    513 
    514 exitloop:
    515 ; exit : s->match_start=match_start
    516     mov     ebx,match_start
    517     mov     ebp,str_s
    518     mov     ecx,best_len
    519     mov     dword ptr [ebp+dep_match_start],ebx
    520     mov     eax,dword ptr [ebp+dep_lookahead]
    521     cmp     ecx,eax
    522     ja      minexlo
    523     mov     eax,ecx
    524 minexlo:
    525 ; return min(best_len,s->lookahead)
    526 
    527 ; restore stack and register ebx,esi,edi,ebp
    528     add     esp,NbStackAdd
    529 
    530     pop     ebx
    531     pop     esi
    532     pop     edi
    533     pop     ebp
    534     ret
    535 InfoAuthor:
    536 ; please don't remove this string !
    537 ; Your are free use gvmat32 in any fre or commercial apps if you don't remove the string in the binary!
    538     db     0dh,0ah,"GVMat32 optimised assembly code written 1996-98 by Gilles Vollant",0dh,0ah
    539 
    540 
    541 
    542 IFDEF NOUNDERLINE
    543 longest_match_7fff   endp
    544 ELSE
    545 _longest_match_7fff  endp
    546 ENDIF
    547 
    548 
    549 IFDEF NOUNDERLINE
    550 cpudetect32     proc near
    551 ELSE
    552 _cpudetect32    proc near
    553 ENDIF
    554 
    555     push    ebx
    556 
    557     pushfd                  ; push original EFLAGS
    558     pop     eax             ; get original EFLAGS
    559     mov     ecx, eax        ; save original EFLAGS
    560     xor     eax, 40000h     ; flip AC bit in EFLAGS
    561     push    eax             ; save new EFLAGS value on stack
    562     popfd                   ; replace current EFLAGS value
    563     pushfd                  ; get new EFLAGS
    564     pop     eax             ; store new EFLAGS in EAX
    565     xor     eax, ecx        ; cant toggle AC bit, processor=80386
    566     jz      end_cpu_is_386  ; jump if 80386 processor
    567     push    ecx
    568     popfd                   ; restore AC bit in EFLAGS first
    569 
    570     pushfd
    571     pushfd
    572     pop     ecx
    573 
    574     mov     eax, ecx        ; get original EFLAGS
    575     xor     eax, 200000h    ; flip ID bit in EFLAGS
    576     push    eax             ; save new EFLAGS value on stack
    577     popfd                   ; replace current EFLAGS value
    578     pushfd                  ; get new EFLAGS
    579     pop     eax             ; store new EFLAGS in EAX
    580     popfd                   ; restore original EFLAGS
    581     xor     eax, ecx        ; cant toggle ID bit,
    582     je      is_old_486      ; processor=old
    583 
    584     mov     eax,1
    585     db      0fh,0a2h        ;CPUID
    586 
    587 exitcpudetect:
    588     pop ebx
    589     ret
    590 
    591 end_cpu_is_386:
    592     mov     eax,0300h
    593     jmp     exitcpudetect
    594 
    595 is_old_486:
    596     mov     eax,0400h
    597     jmp     exitcpudetect
    598 
    599 IFDEF NOUNDERLINE
    600 cpudetect32     endp
    601 ELSE
    602 _cpudetect32    endp
    603 ENDIF
    604 ENDIF
    605 
    606 MAX_MATCH       equ     258
    607 MIN_MATCH       equ     3
    608 MIN_LOOKAHEAD   equ     (MAX_MATCH + MIN_MATCH + 1)
    609 MAX_MATCH_8_     equ     ((MAX_MATCH + 7) AND 0FFF0h)
    610 
    611 
    612 ;;; stack frame offsets
    613 
    614 chainlenwmask   equ  esp + 0    ; high word: current chain len
    615                     ; low word: s->wmask
    616 window      equ  esp + 4    ; local copy of s->window
    617 windowbestlen   equ  esp + 8    ; s->window + bestlen
    618 scanstart   equ  esp + 16   ; first two bytes of string
    619 scanend     equ  esp + 12   ; last two bytes of string
    620 scanalign   equ  esp + 20   ; dword-misalignment of string
    621 nicematch   equ  esp + 24   ; a good enough match size
    622 bestlen     equ  esp + 28   ; size of best match so far
    623 scan        equ  esp + 32   ; ptr to string wanting match
    624 
    625 LocalVarsSize   equ 36
    626 ;   saved ebx   byte esp + 36
    627 ;   saved edi   byte esp + 40
    628 ;   saved esi   byte esp + 44
    629 ;   saved ebp   byte esp + 48
    630 ;   return address  byte esp + 52
    631 deflatestate    equ  esp + 56   ; the function arguments
    632 curmatch    equ  esp + 60
    633 
    634 ;;; Offsets for fields in the deflate_state structure. These numbers
    635 ;;; are calculated from the definition of deflate_state, with the
    636 ;;; assumption that the compiler will dword-align the fields. (Thus,
    637 ;;; changing the definition of deflate_state could easily cause this
    638 ;;; program to crash horribly, without so much as a warning at
    639 ;;; compile time. Sigh.)
    640 
    641 dsWSize     equ 36+zlib1222add
    642 dsWMask     equ 44+zlib1222add
    643 dsWindow    equ 48+zlib1222add
    644 dsPrev      equ 56+zlib1222add
    645 dsMatchLen  equ 88+zlib1222add
    646 dsPrevMatch equ 92+zlib1222add
    647 dsStrStart  equ 100+zlib1222add
    648 dsMatchStart    equ 104+zlib1222add
    649 dsLookahead equ 108+zlib1222add
    650 dsPrevLen   equ 112+zlib1222add
    651 dsMaxChainLen   equ 116+zlib1222add
    652 dsGoodMatch equ 132+zlib1222add
    653 dsNiceMatch equ 136+zlib1222add
    654 
    655 
    656 ;;; match.asm -- Pentium-Pro-optimized version of longest_match()
    657 ;;; Written for zlib 1.1.2
    658 ;;; Copyright (C) 1998 Brian Raiter <breadbox (a] muppetlabs.com>
    659 ;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html
    660 ;;;
    661 ;;; This is free software; you can redistribute it and/or modify it
    662 ;;; under the terms of the GNU General Public License.
    663 
    664 ;GLOBAL _longest_match, _match_init
    665 
    666 
    667 ;SECTION    .text
    668 
    669 ;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
    670 
    671 ;_longest_match:
    672 IFDEF NOOLDPENTIUMCODE
    673     IFDEF NOUNDERLINE
    674     longest_match       proc near
    675     ELSE
    676     _longest_match      proc near
    677     ENDIF
    678 ELSE
    679     IFDEF NOUNDERLINE
    680     longest_match_686   proc near
    681     ELSE
    682     _longest_match_686  proc near
    683     ENDIF
    684 ENDIF
    685 
    686 ;;; Save registers that the compiler may be using, and adjust esp to
    687 ;;; make room for our stack frame.
    688 
    689         push    ebp
    690         push    edi
    691         push    esi
    692         push    ebx
    693         sub esp, LocalVarsSize
    694 
    695 ;;; Retrieve the function arguments. ecx will hold cur_match
    696 ;;; throughout the entire function. edx will hold the pointer to the
    697 ;;; deflate_state structure during the function's setup (before
    698 ;;; entering the main loop.
    699 
    700         mov edx, [deflatestate]
    701         mov ecx, [curmatch]
    702 
    703 ;;; uInt wmask = s->w_mask;
    704 ;;; unsigned chain_length = s->max_chain_length;
    705 ;;; if (s->prev_length >= s->good_match) {
    706 ;;;     chain_length >>= 2;
    707 ;;; }
    708 
    709         mov eax, [edx + dsPrevLen]
    710         mov ebx, [edx + dsGoodMatch]
    711         cmp eax, ebx
    712         mov eax, [edx + dsWMask]
    713         mov ebx, [edx + dsMaxChainLen]
    714         jl  LastMatchGood
    715         shr ebx, 2
    716 LastMatchGood:
    717 
    718 ;;; chainlen is decremented once beforehand so that the function can
    719 ;;; use the sign flag instead of the zero flag for the exit test.
    720 ;;; It is then shifted into the high word, to make room for the wmask
    721 ;;; value, which it will always accompany.
    722 
    723         dec ebx
    724         shl ebx, 16
    725         or  ebx, eax
    726         mov [chainlenwmask], ebx
    727 
    728 ;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
    729 
    730         mov eax, [edx + dsNiceMatch]
    731         mov ebx, [edx + dsLookahead]
    732         cmp ebx, eax
    733         jl  LookaheadLess
    734         mov ebx, eax
    735 LookaheadLess:  mov [nicematch], ebx
    736 
    737 ;;; register Bytef *scan = s->window + s->strstart;
    738 
    739         mov esi, [edx + dsWindow]
    740         mov [window], esi
    741         mov ebp, [edx + dsStrStart]
    742         lea edi, [esi + ebp]
    743         mov [scan], edi
    744 
    745 ;;; Determine how many bytes the scan ptr is off from being
    746 ;;; dword-aligned.
    747 
    748         mov eax, edi
    749         neg eax
    750         and eax, 3
    751         mov [scanalign], eax
    752 
    753 ;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
    754 ;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;
    755 
    756         mov eax, [edx + dsWSize]
    757         sub eax, MIN_LOOKAHEAD
    758         sub ebp, eax
    759         jg  LimitPositive
    760         xor ebp, ebp
    761 LimitPositive:
    762 
    763 ;;; int best_len = s->prev_length;
    764 
    765         mov eax, [edx + dsPrevLen]
    766         mov [bestlen], eax
    767 
    768 ;;; Store the sum of s->window + best_len in esi locally, and in esi.
    769 
    770         add esi, eax
    771         mov [windowbestlen], esi
    772 
    773 ;;; register ush scan_start = *(ushf*)scan;
    774 ;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
    775 ;;; Posf *prev = s->prev;
    776 
    777         movzx   ebx, word ptr [edi]
    778         mov [scanstart], ebx
    779         movzx   ebx, word ptr [edi + eax - 1]
    780         mov [scanend], ebx
    781         mov edi, [edx + dsPrev]
    782 
    783 ;;; Jump into the main loop.
    784 
    785         mov edx, [chainlenwmask]
    786         jmp short LoopEntry
    787 
    788 align 4
    789 
    790 ;;; do {
    791 ;;;     match = s->window + cur_match;
    792 ;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
    793 ;;;         *(ushf*)match != scan_start) continue;
    794 ;;;     [...]
    795 ;;; } while ((cur_match = prev[cur_match & wmask]) > limit
    796 ;;;          && --chain_length != 0);
    797 ;;;
    798 ;;; Here is the inner loop of the function. The function will spend the
    799 ;;; majority of its time in this loop, and majority of that time will
    800 ;;; be spent in the first ten instructions.
    801 ;;;
    802 ;;; Within this loop:
    803 ;;; ebx = scanend
    804 ;;; ecx = curmatch
    805 ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
    806 ;;; esi = windowbestlen - i.e., (window + bestlen)
    807 ;;; edi = prev
    808 ;;; ebp = limit
    809 
    810 LookupLoop:
    811         and ecx, edx
    812         movzx   ecx, word ptr [edi + ecx*2]
    813         cmp ecx, ebp
    814         jbe LeaveNow
    815         sub edx, 00010000h
    816         js  LeaveNow
    817 LoopEntry:  movzx   eax, word ptr [esi + ecx - 1]
    818         cmp eax, ebx
    819         jnz LookupLoop
    820         mov eax, [window]
    821         movzx   eax, word ptr [eax + ecx]
    822         cmp eax, [scanstart]
    823         jnz LookupLoop
    824 
    825 ;;; Store the current value of chainlen.
    826 
    827         mov [chainlenwmask], edx
    828 
    829 ;;; Point edi to the string under scrutiny, and esi to the string we
    830 ;;; are hoping to match it up with. In actuality, esi and edi are
    831 ;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
    832 ;;; initialized to -(MAX_MATCH_8 - scanalign).
    833 
    834         mov esi, [window]
    835         mov edi, [scan]
    836         add esi, ecx
    837         mov eax, [scanalign]
    838         mov edx, 0fffffef8h; -(MAX_MATCH_8)
    839         lea edi, [edi + eax + 0108h] ;MAX_MATCH_8]
    840         lea esi, [esi + eax + 0108h] ;MAX_MATCH_8]
    841 
    842 ;;; Test the strings for equality, 8 bytes at a time. At the end,
    843 ;;; adjust edx so that it is offset to the exact byte that mismatched.
    844 ;;;
    845 ;;; We already know at this point that the first three bytes of the
    846 ;;; strings match each other, and they can be safely passed over before
    847 ;;; starting the compare loop. So what this code does is skip over 0-3
    848 ;;; bytes, as much as necessary in order to dword-align the edi
    849 ;;; pointer. (esi will still be misaligned three times out of four.)
    850 ;;;
    851 ;;; It should be confessed that this loop usually does not represent
    852 ;;; much of the total running time. Replacing it with a more
    853 ;;; straightforward "rep cmpsb" would not drastically degrade
    854 ;;; performance.
    855 
    856 LoopCmps:
    857         mov eax, [esi + edx]
    858         xor eax, [edi + edx]
    859         jnz LeaveLoopCmps
    860         mov eax, [esi + edx + 4]
    861         xor eax, [edi + edx + 4]
    862         jnz LeaveLoopCmps4
    863         add edx, 8
    864         jnz LoopCmps
    865         jmp short LenMaximum
    866 LeaveLoopCmps4: add edx, 4
    867 LeaveLoopCmps:  test    eax, 0000FFFFh
    868         jnz LenLower
    869         add edx,  2
    870         shr eax, 16
    871 LenLower:   sub al, 1
    872         adc edx, 0
    873 
    874 ;;; Calculate the length of the match. If it is longer than MAX_MATCH,
    875 ;;; then automatically accept it as the best possible match and leave.
    876 
    877         lea eax, [edi + edx]
    878         mov edi, [scan]
    879         sub eax, edi
    880         cmp eax, MAX_MATCH
    881         jge LenMaximum
    882 
    883 ;;; If the length of the match is not longer than the best match we
    884 ;;; have so far, then forget it and return to the lookup loop.
    885 
    886         mov edx, [deflatestate]
    887         mov ebx, [bestlen]
    888         cmp eax, ebx
    889         jg  LongerMatch
    890         mov esi, [windowbestlen]
    891         mov edi, [edx + dsPrev]
    892         mov ebx, [scanend]
    893         mov edx, [chainlenwmask]
    894         jmp LookupLoop
    895 
    896 ;;;         s->match_start = cur_match;
    897 ;;;         best_len = len;
    898 ;;;         if (len >= nice_match) break;
    899 ;;;         scan_end = *(ushf*)(scan+best_len-1);
    900 
    901 LongerMatch:    mov ebx, [nicematch]
    902         mov [bestlen], eax
    903         mov [edx + dsMatchStart], ecx
    904         cmp eax, ebx
    905         jge LeaveNow
    906         mov esi, [window]
    907         add esi, eax
    908         mov [windowbestlen], esi
    909         movzx   ebx, word ptr [edi + eax - 1]
    910         mov edi, [edx + dsPrev]
    911         mov [scanend], ebx
    912         mov edx, [chainlenwmask]
    913         jmp LookupLoop
    914 
    915 ;;; Accept the current string, with the maximum possible length.
    916 
    917 LenMaximum: mov edx, [deflatestate]
    918         mov dword ptr [bestlen], MAX_MATCH
    919         mov [edx + dsMatchStart], ecx
    920 
    921 ;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
    922 ;;; return s->lookahead;
    923 
    924 LeaveNow:
    925         mov edx, [deflatestate]
    926         mov ebx, [bestlen]
    927         mov eax, [edx + dsLookahead]
    928         cmp ebx, eax
    929         jg  LookaheadRet
    930         mov eax, ebx
    931 LookaheadRet:
    932 
    933 ;;; Restore the stack and return from whence we came.
    934 
    935         add esp, LocalVarsSize
    936         pop ebx
    937         pop esi
    938         pop edi
    939         pop ebp
    940 
    941         ret
    942 ; please don't remove this string !
    943 ; Your can freely use gvmat32 in any free or commercial app if you don't remove the string in the binary!
    944     db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah
    945 
    946 
    947 IFDEF NOOLDPENTIUMCODE
    948     IFDEF NOUNDERLINE
    949     longest_match       endp
    950     ELSE
    951     _longest_match      endp
    952     ENDIF
    953 
    954     IFDEF NOUNDERLINE
    955     match_init      proc near
    956                     ret
    957     match_init      endp
    958     ELSE
    959     _match_init     proc near
    960                     ret
    961     _match_init     endp
    962     ENDIF    
    963 ELSE
    964     IFDEF NOUNDERLINE
    965     longest_match_686   endp
    966     ELSE
    967     _longest_match_686  endp
    968     ENDIF
    969 ENDIF
    970 
    971 _TEXT   ends
    972 end
    973