Home | History | Annotate | Download | only in tests
      1 ;uInt longest_match_x64(
      2 ;    deflate_state *s,
      3 ;    IPos cur_match);                             /* current match */
      4 
      5 ; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86
      6 ; Copyright (C) 1995-2005 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
      7 ;
      8 ; File written by Gilles Vollant, by converting to assembly the longest_match
      9 ;  from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
     10 ;
     11 ;  and by taking inspiration on asm686 with masm, optimised assembly code
     12 ;        from Brian Raiter, written 1998
     13 ;
     14 ;         http://www.zlib.net
     15 ;         http://www.winimage.com/zLibDll
     16 ;         http://www.muppetlabs.com/~breadbox/software/assembly.html
     17 ;
     18 ; to compile this file for infozip Zip, I use option:
     19 ;   ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm
     20 ;
     21 ; to compile this file for zLib, I use option:
     22 ;   ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
     23 ; Be carrefull to adapt zlib1222add below to your version of zLib
     24 ;   (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change
     25 ;    value of zlib1222add later)
     26 ;
     27 ; This file compile with Microsoft Macro Assembler (x64) for AMD64
     28 ;
     29 ;   ml64.exe is given with Visual Studio 2005 and Windows 2003 server DDK
     30 ;
     31 ;   (you can get Windows 2003 server DDK with ml64 and cl for AMD64 from
     32 ;      http://www.microsoft.com/whdc/devtools/ddk/default.mspx for low price)
     33 ;
     34 
     35 
     36 ;uInt longest_match(s, cur_match)
     37 ;    deflate_state *s;
     38 ;    IPos cur_match;                             /* current match */
     39 bits 64
     40 	section .text
     41 	
     42 longest_match:	; PROC
     43 
     44 
     45 ;%define LocalVarsSize  88
     46  %define LocalVarsSize  72
     47 
     48 ; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12
     49 ; free register :  r14,r15
     50 ; register can be saved : rsp
     51 
     52  %define chainlenwmask   rsp + 8 - LocalVarsSize    ; high word: current chain len
     53                                                  ; low word: s->wmask
     54 ;%define window   rsp + xx - LocalVarsSize   ; local copy of s->window ; stored in r10
     55 ;%define windowbestlen   rsp + xx - LocalVarsSize   ; s->window + bestlen , use r10+r11
     56 ;%define scanstart   rsp + xx - LocalVarsSize   ; first two bytes of string ; stored in r12w
     57 ;%define scanend   rsp + xx - LocalVarsSize   ; last two bytes of string use ebx
     58 ;%define scanalign   rsp + xx - LocalVarsSize   ; dword-misalignment of string r13
     59 ;%define bestlen   rsp + xx - LocalVarsSize   ; size of best match so far -> r11d
     60 ;%define scan   rsp + xx - LocalVarsSize   ; ptr to string wanting match -> r9
     61 %ifdef INFOZIP
     62 %else
     63  %define nicematch   (rsp + 16 - LocalVarsSize) ; a good enough match size
     64 %endif
     65 
     66 %define save_rdi   rsp + 24 - LocalVarsSize
     67 %define save_rsi   rsp + 32 - LocalVarsSize
     68 %define save_rbx   rsp + 40 - LocalVarsSize
     69 %define save_rbp   rsp + 48 - LocalVarsSize
     70 %define save_r12   rsp + 56 - LocalVarsSize
     71 %define save_r13   rsp + 64 - LocalVarsSize
     72 ;%define save_r14   rsp + 72 - LocalVarsSize
     73 ;%define save_r15   rsp + 80 - LocalVarsSize
     74 
     75 
     76 
     77 ;  all the +4 offsets are due to the addition of pending_buf_size (in zlib
     78 ;  in the deflate_state structure since the asm code was first written
     79 ;  (if you compile with zlib 1.0.4 or older, remove the +4).
     80 ;  Note : these value are good with a 8 bytes boundary pack structure
     81 
     82 
     83     %define MAX_MATCH      258
     84     %define MIN_MATCH      3
     85     %define MIN_LOOKAHEAD      (MAX_MATCH+MIN_MATCH+1)
     86 
     87 
     88 ;;; Offsets for fields in the deflate_state structure. These numbers
     89 ;;; are calculated from the definition of deflate_state, with the
     90 ;;; assumption that the compiler will dword-align the fields. (Thus,
     91 ;;; changing the definition of deflate_state could easily cause this
     92 ;;; program to crash horribly, without so much as a warning at
     93 ;;; compile time. Sigh.)
     94 
     95 ;  all the +zlib1222add offsets are due to the addition of fields
     96 ;  in zlib in the deflate_state structure since the asm code was first written
     97 ;  (if you compile with zlib 1.0.4 or older, use "%define zlib1222add  (-4)").
     98 ;  (if you compile with zlib between 1.0.5 and 1.2.2.1, use "%define zlib1222add  0").
     99 ;  if you compile with zlib 1.2.2.2 or later , use "%define zlib1222add  8").
    100 
    101 %ifdef INFOZIP
    102 	section .data
    103 
    104 extern    window_size:DWORD
    105 ; WMask ; 7fff
    106 extern    window:BYTE:010040H
    107 extern    prev:WORD:08000H
    108 ; MatchLen : unused
    109 ; PrevMatch : unused
    110 extern    strstart:DWORD
    111 extern    match_start:DWORD
    112 ; Lookahead : ignore
    113 extern    prev_length:DWORD ; PrevLen
    114 extern    max_chain_length:DWORD
    115 extern    good_match:DWORD
    116 extern    nice_match:DWORD
    117 %define prev_ad  OFFSET prev
    118 %define window_ad  OFFSET window
    119 %define nicematch  nice_match
    120 
    121 %define WMask  07fffh
    122 
    123 %else
    124 
    125   %ifndef zlib1222add
    126     %define zlib1222add  8
    127   %endif
    128 %define dsWSize  56+zlib1222add+(zlib1222add/2)
    129 %define dsWMask  64+zlib1222add+(zlib1222add/2)
    130 %define dsWindow  72+zlib1222add
    131 %define dsPrev  88+zlib1222add
    132 %define dsMatchLen  128+zlib1222add
    133 %define dsPrevMatch  132+zlib1222add
    134 %define dsStrStart  140+zlib1222add
    135 %define dsMatchStart  144+zlib1222add
    136 %define dsLookahead  148+zlib1222add
    137 %define dsPrevLen  152+zlib1222add
    138 %define dsMaxChainLen  156+zlib1222add
    139 %define dsGoodMatch  172+zlib1222add
    140 %define dsNiceMatch  176+zlib1222add
    141 
    142 %define window_size  [ rcx + dsWSize]
    143 %define WMask  [ rcx + dsWMask]
    144 %define window_ad  [ rcx + dsWindow]
    145 %define prev_ad  [ rcx + dsPrev]
    146 %define strstart  [ rcx + dsStrStart]
    147 %define match_start  [ rcx + dsMatchStart]
    148 %define Lookahead  [ rcx + dsLookahead] ; 0ffffffffh on infozip
    149 %define prev_length  [ rcx + dsPrevLen]
    150 %define max_chain_length  [ rcx + dsMaxChainLen]
    151 %define good_match  [ rcx + dsGoodMatch]
    152 %define nice_match  [ rcx + dsNiceMatch]
    153 %endif
    154 
    155 ; parameter 1 in r8(deflate state s), param 2 in rdx (cur match)
    156 
    157 ; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
    158 ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
    159 ;
    160 ; All registers must be preserved across the call, except for
    161 ;   rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
    162 
    163 	section .text
    164 
    165 ;;; Save registers that the compiler may be using, and adjust esp to
    166 ;;; make room for our stack frame.
    167 
    168 
    169 ;;; Retrieve the function arguments. r8d will hold cur_match
    170 ;;; throughout the entire function. edx will hold the pointer to the
    171 ;;; deflate_state structure during the function's setup (before
    172 ;;; entering the main loop.
    173 
    174 ; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
    175 
    176 ; this clear high 32 bits of r8, which can be garbage in both r8 and rdx
    177 
    178         mov [save_rdi],rdi
    179         mov [save_rsi],rsi
    180         mov [save_rbx],rbx
    181         mov [save_rbp],rbp
    182 %ifdef INFOZIP
    183         mov r8d,ecx
    184 %else
    185         mov r8d,edx
    186 %endif
    187         mov [save_r12],r12
    188         mov [save_r13],r13
    189 ;        mov [save_r14],r14
    190 ;        mov [save_r15],r15
    191 
    192 
    193 ;;; uInt wmask = s->w_mask;
    194 ;;; unsigned chain_length = s->max_chain_length;
    195 ;;; if (s->prev_length >= s->good_match) {
    196 ;;;     chain_length >>= 2;
    197 ;;; }
    198 
    199         mov edi, prev_length
    200         mov esi, good_match
    201         mov eax, WMask
    202         mov ebx, max_chain_length
    203         cmp edi, esi
    204         jl  LastMatchGood
    205         shr ebx, 2
    206 LastMatchGood:
    207 
    208 ;;; chainlen is decremented once beforehand so that the function can
    209 ;;; use the sign flag instead of the zero flag for the exit test.
    210 ;;; It is then shifted into the high word, to make room for the wmask
    211 ;;; value, which it will always accompany.
    212 
    213         dec ebx
    214         shl ebx, 16
    215         or  ebx, eax
    216 
    217 ;;; on zlib only
    218 ;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
    219 
    220 %ifdef INFOZIP
    221         mov [chainlenwmask], ebx
    222 ; on infozip nice_match = [nice_match]
    223 %else
    224         mov eax, nice_match
    225         mov [chainlenwmask], ebx
    226         mov r10d, Lookahead
    227         cmp r10d, eax
    228         cmovnl r10d, eax
    229         mov [nicematch],r10d
    230 %endif
    231 
    232 ;;; register Bytef *scan = s->window + s->strstart;
    233         mov r10, window_ad
    234         mov ebp, strstart
    235         lea r13, [r10 + rbp]
    236 
    237 ;;; Determine how many bytes the scan ptr is off from being
    238 ;;; dword-aligned.
    239 
    240          mov r9,r13
    241          neg r13
    242          and r13,3
    243 
    244 ;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
    245 ;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;
    246 %ifdef INFOZIP
    247         mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(3+8+1))
    248 %else
    249         mov eax, window_size
    250         sub eax, MIN_LOOKAHEAD
    251 %endif
    252         xor edi,edi
    253         sub ebp, eax
    254 
    255         mov r11d, prev_length
    256 
    257         cmovng ebp,edi
    258 
    259 ;;; int best_len = s->prev_length;
    260 
    261 
    262 ;;; Store the sum of s->window + best_len in esi locally, and in esi.
    263 
    264        lea  rsi,[r10+r11]
    265 
    266 ;;; register ush scan_start = *(ushf*)scan;
    267 ;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
    268 ;;; Posf *prev = s->prev;
    269 
    270         movzx r12d,word [r9]
    271         movzx ebx, word [r9 + r11 - 1]
    272 
    273         mov rdi, prev_ad
    274 
    275 ;;; Jump into the main loop.
    276 
    277         mov edx, [chainlenwmask]
    278 
    279         cmp bx,word [rsi + r8 - 1]
    280         jz  LookupLoopIsZero
    281 
    282 LookupLoop1:
    283         and r8d, edx
    284 
    285         movzx   r8d, word [rdi + r8*2]
    286         cmp r8d, ebp
    287         jbe LeaveNow
    288         sub edx, 00010000h
    289         js  LeaveNow
    290 
    291 LoopEntry1:
    292         cmp bx,word [rsi + r8 - 1]
    293         jz  LookupLoopIsZero
    294 
    295 LookupLoop2:
    296         and r8d, edx
    297 
    298         movzx   r8d, word [rdi + r8*2]
    299         cmp r8d, ebp
    300         jbe LeaveNow
    301         sub edx, 00010000h
    302         js  LeaveNow
    303 
    304 LoopEntry2:
    305         cmp bx,word [rsi + r8 - 1]
    306         jz  LookupLoopIsZero
    307 
    308 LookupLoop4:
    309         and r8d, edx
    310 
    311         movzx   r8d, word [rdi + r8*2]
    312         cmp r8d, ebp
    313         jbe LeaveNow
    314         sub edx, 00010000h
    315         js  LeaveNow
    316 
    317 LoopEntry4:
    318 
    319         cmp bx,word [rsi + r8 - 1]
    320         jnz LookupLoop1
    321         jmp LookupLoopIsZero
    322 
    323 
    324 ;;; do {
    325 ;;;     match = s->window + cur_match;
    326 ;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
    327 ;;;         *(ushf*)match != scan_start) continue;
    328 ;;;     [...]
    329 ;;; } while ((cur_match = prev[cur_match & wmask]) > limit
    330 ;;;          && --chain_length != 0);
    331 ;;;
    332 ;;; Here is the inner loop of the function. The function will spend the
    333 ;;; majority of its time in this loop, and majority of that time will
    334 ;;; be spent in the first ten instructions.
    335 ;;;
    336 ;;; Within this loop:
    337 ;;; ebx = scanend
    338 ;;; r8d = curmatch
    339 ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
    340 ;;; esi = windowbestlen - i.e., (window + bestlen)
    341 ;;; edi = prev
    342 ;;; ebp = limit
    343 
    344 LookupLoop:
    345         and r8d, edx
    346 
    347         movzx   r8d, word [rdi + r8*2]
    348         cmp r8d, ebp
    349         jbe LeaveNow
    350         sub edx, 00010000h
    351         js  LeaveNow
    352 
    353 LoopEntry:
    354 
    355         cmp bx,word [rsi + r8 - 1]
    356         jnz LookupLoop1
    357 LookupLoopIsZero:
    358         cmp     r12w, word [r10 + r8]
    359         jnz LookupLoop1
    360 
    361 
    362 ;;; Store the current value of chainlen.
    363         mov [chainlenwmask], edx
    364 
    365 ;;; Point edi to the string under scrutiny, and esi to the string we
    366 ;;; are hoping to match it up with. In actuality, esi and edi are
    367 ;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
    368 ;;; initialized to -(MAX_MATCH_8 - scanalign).
    369 
    370         lea rsi,[r8+r10]
    371         mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8)
    372         lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8]
    373         lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8]
    374 
    375         prefetcht1 [rsi+rdx]
    376         prefetcht1 [rdi+rdx]
    377 
    378 
    379 ;;; Test the strings %define for ality, 8 bytes at a time. At the end,
    380 ;;; adjust rdx so that it is offset to the exact byte that mismatched.
    381 ;;;
    382 ;;; We already know at this point that the first three bytes of the
    383 ;;; strings match each other, and they can be safely passed over before
    384 ;;; starting the compare loop. So what this code does is skip over 0-3
    385 ;;; bytes, as much as necessary in order to dword-align the edi
    386 ;;; pointer. (rsi will still be misaligned three times out of four.)
    387 ;;;
    388 ;;; It should be confessed that this loop usually does not represent
    389 ;;; much of the total running time. Replacing it with a more
    390 ;;; straightforward "rep cmpsb" would not drastically degrade
    391 ;;; performance.
    392 
    393 LoopCmps:
    394         mov rax, [rsi + rdx]
    395         xor rax, [rdi + rdx]
    396         jnz LeaveLoopCmps
    397 
    398         mov rax, [rsi + rdx + 8]
    399         xor rax, [rdi + rdx + 8]
    400         jnz LeaveLoopCmps8
    401 
    402 
    403         mov rax, [rsi + rdx + 8+8]
    404         xor rax, [rdi + rdx + 8+8]
    405         jnz LeaveLoopCmps16
    406 
    407         add rdx,8+8+8
    408 
    409         jmp short LoopCmps
    410 LeaveLoopCmps16: add rdx,8
    411 LeaveLoopCmps8: add rdx,8
    412 LeaveLoopCmps:
    413 
    414         test    eax, 0000FFFFh
    415         jnz LenLower
    416 
    417         test eax,0ffffffffh
    418 
    419         jnz LenLower32
    420 
    421         add rdx,4
    422         shr rax,32
    423         or ax,ax
    424         jnz LenLower
    425 
    426 LenLower32:
    427         shr eax,16
    428         add rdx,2
    429 LenLower:   sub al, 1
    430         adc rdx, 0
    431 ;;; Calculate the length of the match. If it is longer than MAX_MATCH,
    432 ;;; then automatically accept it as the best possible match and leave.
    433 
    434         lea rax, [rdi + rdx]
    435         sub rax, r9
    436         cmp eax, MAX_MATCH
    437         jge LenMaximum
    438 
    439 ;;; If the length of the match is not longer than the best match we
    440 ;;; have so far, then forget it and return to the lookup loop.
    441 ;///////////////////////////////////
    442 
    443         cmp eax, r11d
    444         jg  LongerMatch
    445 
    446         lea rsi,[r10+r11]
    447 
    448         mov rdi, prev_ad
    449         mov edx, [chainlenwmask]
    450         jmp LookupLoop
    451 
    452 ;;;         s->match_start = cur_match;
    453 ;;;         best_len = len;
    454 ;;;         if (len >= nice_match) break;
    455 ;;;         scan_end = *(ushf*)(scan+best_len-1);
    456 
    457 LongerMatch:
    458         mov r11d, eax
    459         mov match_start, r8d
    460         cmp eax, [nicematch]
    461         jge LeaveNow
    462 
    463         lea rsi,[r10+rax]
    464 
    465         movzx   ebx, word [r9 + rax - 1]
    466         mov rdi, prev_ad
    467         mov edx, [chainlenwmask]
    468         jmp LookupLoop
    469 
    470 ;;; Accept the current string, with the maximum possible length.
    471 
    472 LenMaximum:
    473         mov r11d,MAX_MATCH
    474         mov match_start, r8d
    475 ;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
    476 ;;; return s->lookahead;
    477 
    478 LeaveNow:
    479 %ifdef INFOZIP
    480         mov eax,r11d
    481 %else
    482         mov eax, Lookahead
    483         cmp r11d, eax
    484         cmovng eax, r11d
    485 %endif
    486 
    487 ;;; Restore the stack and return from whence we came.
    488 
    489         mov rsi,[save_rsi]
    490         mov rdi,[save_rdi]
    491         mov rbx,[save_rbx]
    492         mov rbp,[save_rbp]
    493         mov r12,[save_r12]
    494         mov r13,[save_r13]
    495 ;        mov r14,[save_r14]
    496 ;        mov r15,[save_r15]
    497 
    498 
    499         ret 0
    500 ; please don't remove this string !
    501 ; Your can freely use gvmat64 in any free or externercial app
    502 ; but it is far better don't remove the string in the binary!
    503     db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
    504 %if 0
    505 longest_match   ENDP
    506 %endif
    507 
    508 match_init:	; PROC
    509   ret 0
    510  %if 0
    511 match_init ENDP
    512 %endif
    513 
    514 END
    515