Home | History | Annotate | Download | only in masmx64
      1 ;uInt longest_match_x64(
      2 ;    deflate_state *s,
      3 ;    IPos cur_match);                             /* current match */
      4 
      5 ; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86_64
      6 ;  (AMD64 on Athlon 64, Opteron, Phenom
      7 ;     and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
      8 ; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
      9 ;
     10 ; File written by Gilles Vollant, by converting to assembly the longest_match
     11 ;  from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
     12 ;
     13 ;  and by taking inspiration on asm686 with masm, optimised assembly code
     14 ;        from Brian Raiter, written 1998
     15 ;
     16 ;  This software is provided 'as-is', without any express or implied
     17 ;  warranty.  In no event will the authors be held liable for any damages
     18 ;  arising from the use of this software.
     19 ;
     20 ;  Permission is granted to anyone to use this software for any purpose,
     21 ;  including commercial applications, and to alter it and redistribute it
     22 ;  freely, subject to the following restrictions:
     23 ;
     24 ;  1. The origin of this software must not be misrepresented; you must not
     25 ;     claim that you wrote the original software. If you use this software
     26 ;     in a product, an acknowledgment in the product documentation would be
     27 ;     appreciated but is not required.
     28 ;  2. Altered source versions must be plainly marked as such, and must not be
     29 ;     misrepresented as being the original software
     30 ;  3. This notice may not be removed or altered from any source distribution.
     31 ;
     32 ;
     33 ;
     34 ;         http://www.zlib.net
     35 ;         http://www.winimage.com/zLibDll
     36 ;         http://www.muppetlabs.com/~breadbox/software/assembly.html
     37 ;
     38 ; to compile this file for infozip Zip, I use option:
     39 ;   ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm
     40 ;
     41 ; to compile this file for zLib, I use option:
     42 ;   ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
     43 ; Be carrefull to adapt zlib1222add below to your version of zLib
     44 ;   (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change
     45 ;    value of zlib1222add later)
     46 ;
     47 ; This file compile with Microsoft Macro Assembler (x64) for AMD64
     48 ;
     49 ;   ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
     50 ;
     51 ;   (you can get Windows WDK with ml64 for AMD64 from
     52 ;      http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
     53 ;
     54 
     55 
     56 ;uInt longest_match(s, cur_match)
     57 ;    deflate_state *s;
     58 ;    IPos cur_match;                             /* current match */
     59 .code
     60 longest_match PROC
     61 
     62 
     63 ;LocalVarsSize   equ 88
     64  LocalVarsSize   equ 72
     65 
     66 ; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12
     67 ; free register :  r14,r15
     68 ; register can be saved : rsp
     69 
     70  chainlenwmask   equ  rsp + 8 - LocalVarsSize    ; high word: current chain len
     71                                                  ; low word: s->wmask
     72 ;window          equ  rsp + xx - LocalVarsSize   ; local copy of s->window ; stored in r10
     73 ;windowbestlen   equ  rsp + xx - LocalVarsSize   ; s->window + bestlen , use r10+r11
     74 ;scanstart       equ  rsp + xx - LocalVarsSize   ; first two bytes of string ; stored in r12w
     75 ;scanend         equ  rsp + xx - LocalVarsSize   ; last two bytes of string use ebx
     76 ;scanalign       equ  rsp + xx - LocalVarsSize   ; dword-misalignment of string r13
     77 ;bestlen         equ  rsp + xx - LocalVarsSize   ; size of best match so far -> r11d
     78 ;scan            equ  rsp + xx - LocalVarsSize   ; ptr to string wanting match -> r9
     79 IFDEF INFOZIP
     80 ELSE
     81  nicematch       equ  (rsp + 16 - LocalVarsSize) ; a good enough match size
     82 ENDIF
     83 
     84 save_rdi        equ  rsp + 24 - LocalVarsSize
     85 save_rsi        equ  rsp + 32 - LocalVarsSize
     86 save_rbx        equ  rsp + 40 - LocalVarsSize
     87 save_rbp        equ  rsp + 48 - LocalVarsSize
     88 save_r12        equ  rsp + 56 - LocalVarsSize
     89 save_r13        equ  rsp + 64 - LocalVarsSize
     90 ;save_r14        equ  rsp + 72 - LocalVarsSize
     91 ;save_r15        equ  rsp + 80 - LocalVarsSize
     92 
     93 
     94 ; summary of register usage
     95 ; scanend     ebx
     96 ; scanendw    bx
     97 ; chainlenwmask   edx
     98 ; curmatch    rsi
     99 ; curmatchd   esi
    100 ; windowbestlen   r8
    101 ; scanalign   r9
    102 ; scanalignd  r9d
    103 ; window      r10
    104 ; bestlen     r11
    105 ; bestlend    r11d
    106 ; scanstart   r12d
    107 ; scanstartw  r12w
    108 ; scan        r13
    109 ; nicematch   r14d
    110 ; limit       r15
    111 ; limitd      r15d
    112 ; prev        rcx
    113 
    114 ;  all the +4 offsets are due to the addition of pending_buf_size (in zlib
    115 ;  in the deflate_state structure since the asm code was first written
    116 ;  (if you compile with zlib 1.0.4 or older, remove the +4).
    117 ;  Note : these value are good with a 8 bytes boundary pack structure
    118 
    119 
    120     MAX_MATCH           equ     258
    121     MIN_MATCH           equ     3
    122     MIN_LOOKAHEAD       equ     (MAX_MATCH+MIN_MATCH+1)
    123 
    124 
    125 ;;; Offsets for fields in the deflate_state structure. These numbers
    126 ;;; are calculated from the definition of deflate_state, with the
    127 ;;; assumption that the compiler will dword-align the fields. (Thus,
    128 ;;; changing the definition of deflate_state could easily cause this
    129 ;;; program to crash horribly, without so much as a warning at
    130 ;;; compile time. Sigh.)
    131 
    132 ;  all the +zlib1222add offsets are due to the addition of fields
    133 ;  in zlib in the deflate_state structure since the asm code was first written
    134 ;  (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
    135 ;  (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
    136 ;  if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
    137 
    138 
    139 IFDEF INFOZIP
    140 
    141 _DATA   SEGMENT
    142 COMM    window_size:DWORD
    143 ; WMask ; 7fff
    144 COMM    window:BYTE:010040H
    145 COMM    prev:WORD:08000H
    146 ; MatchLen : unused
    147 ; PrevMatch : unused
    148 COMM    strstart:DWORD
    149 COMM    match_start:DWORD
    150 ; Lookahead : ignore
    151 COMM    prev_length:DWORD ; PrevLen
    152 COMM    max_chain_length:DWORD
    153 COMM    good_match:DWORD
    154 COMM    nice_match:DWORD
    155 prev_ad equ OFFSET prev
    156 window_ad equ OFFSET window
    157 nicematch equ nice_match
    158 _DATA ENDS
    159 WMask equ 07fffh
    160 
    161 ELSE
    162 
    163   IFNDEF zlib1222add
    164     zlib1222add equ 8
    165   ENDIF
    166 dsWSize         equ 56+zlib1222add+(zlib1222add/2)
    167 dsWMask         equ 64+zlib1222add+(zlib1222add/2)
    168 dsWindow        equ 72+zlib1222add
    169 dsPrev          equ 88+zlib1222add
    170 dsMatchLen      equ 128+zlib1222add
    171 dsPrevMatch     equ 132+zlib1222add
    172 dsStrStart      equ 140+zlib1222add
    173 dsMatchStart    equ 144+zlib1222add
    174 dsLookahead     equ 148+zlib1222add
    175 dsPrevLen       equ 152+zlib1222add
    176 dsMaxChainLen   equ 156+zlib1222add
    177 dsGoodMatch     equ 172+zlib1222add
    178 dsNiceMatch     equ 176+zlib1222add
    179 
    180 window_size     equ [ rcx + dsWSize]
    181 WMask           equ [ rcx + dsWMask]
    182 window_ad       equ [ rcx + dsWindow]
    183 prev_ad         equ [ rcx + dsPrev]
    184 strstart        equ [ rcx + dsStrStart]
    185 match_start     equ [ rcx + dsMatchStart]
    186 Lookahead       equ [ rcx + dsLookahead] ; 0ffffffffh on infozip
    187 prev_length     equ [ rcx + dsPrevLen]
    188 max_chain_length equ [ rcx + dsMaxChainLen]
    189 good_match      equ [ rcx + dsGoodMatch]
    190 nice_match      equ [ rcx + dsNiceMatch]
    191 ENDIF
    192 
    193 ; parameter 1 in r8(deflate state s), param 2 in rdx (cur match)
    194 
    195 ; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
    196 ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
    197 ;
    198 ; All registers must be preserved across the call, except for
    199 ;   rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
    200 
    201 
    202 
    203 ;;; Save registers that the compiler may be using, and adjust esp to
    204 ;;; make room for our stack frame.
    205 
    206 
    207 ;;; Retrieve the function arguments. r8d will hold cur_match
    208 ;;; throughout the entire function. edx will hold the pointer to the
    209 ;;; deflate_state structure during the function's setup (before
    210 ;;; entering the main loop.
    211 
    212 ; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
    213 
    214 ; this clear high 32 bits of r8, which can be garbage in both r8 and rdx
    215 
    216         mov [save_rdi],rdi
    217         mov [save_rsi],rsi
    218         mov [save_rbx],rbx
    219         mov [save_rbp],rbp
    220 IFDEF INFOZIP
    221         mov r8d,ecx
    222 ELSE
    223         mov r8d,edx
    224 ENDIF
    225         mov [save_r12],r12
    226         mov [save_r13],r13
    227 ;        mov [save_r14],r14
    228 ;        mov [save_r15],r15
    229 
    230 
    231 ;;; uInt wmask = s->w_mask;
    232 ;;; unsigned chain_length = s->max_chain_length;
    233 ;;; if (s->prev_length >= s->good_match) {
    234 ;;;     chain_length >>= 2;
    235 ;;; }
    236 
    237         mov edi, prev_length
    238         mov esi, good_match
    239         mov eax, WMask
    240         mov ebx, max_chain_length
    241         cmp edi, esi
    242         jl  LastMatchGood
    243         shr ebx, 2
    244 LastMatchGood:
    245 
    246 ;;; chainlen is decremented once beforehand so that the function can
    247 ;;; use the sign flag instead of the zero flag for the exit test.
    248 ;;; It is then shifted into the high word, to make room for the wmask
    249 ;;; value, which it will always accompany.
    250 
    251         dec ebx
    252         shl ebx, 16
    253         or  ebx, eax
    254 
    255 ;;; on zlib only
    256 ;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
    257 
    258 IFDEF INFOZIP
    259         mov [chainlenwmask], ebx
    260 ; on infozip nice_match = [nice_match]
    261 ELSE
    262         mov eax, nice_match
    263         mov [chainlenwmask], ebx
    264         mov r10d, Lookahead
    265         cmp r10d, eax
    266         cmovnl r10d, eax
    267         mov [nicematch],r10d
    268 ENDIF
    269 
    270 ;;; register Bytef *scan = s->window + s->strstart;
    271         mov r10, window_ad
    272         mov ebp, strstart
    273         lea r13, [r10 + rbp]
    274 
    275 ;;; Determine how many bytes the scan ptr is off from being
    276 ;;; dword-aligned.
    277 
    278          mov r9,r13
    279          neg r13
    280          and r13,3
    281 
    282 ;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
    283 ;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;
    284 IFDEF INFOZIP
    285         mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(3+8+1))
    286 ELSE
    287         mov eax, window_size
    288         sub eax, MIN_LOOKAHEAD
    289 ENDIF
    290         xor edi,edi
    291         sub ebp, eax
    292 
    293         mov r11d, prev_length
    294 
    295         cmovng ebp,edi
    296 
    297 ;;; int best_len = s->prev_length;
    298 
    299 
    300 ;;; Store the sum of s->window + best_len in esi locally, and in esi.
    301 
    302        lea  rsi,[r10+r11]
    303 
    304 ;;; register ush scan_start = *(ushf*)scan;
    305 ;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
    306 ;;; Posf *prev = s->prev;
    307 
    308         movzx r12d,word ptr [r9]
    309         movzx ebx, word ptr [r9 + r11 - 1]
    310 
    311         mov rdi, prev_ad
    312 
    313 ;;; Jump into the main loop.
    314 
    315         mov edx, [chainlenwmask]
    316 
    317         cmp bx,word ptr [rsi + r8 - 1]
    318         jz  LookupLoopIsZero
    319 
    320 LookupLoop1:
    321         and r8d, edx
    322 
    323         movzx   r8d, word ptr [rdi + r8*2]
    324         cmp r8d, ebp
    325         jbe LeaveNow
    326         sub edx, 00010000h
    327         js  LeaveNow
    328 
    329 LoopEntry1:
    330         cmp bx,word ptr [rsi + r8 - 1]
    331         jz  LookupLoopIsZero
    332 
    333 LookupLoop2:
    334         and r8d, edx
    335 
    336         movzx   r8d, word ptr [rdi + r8*2]
    337         cmp r8d, ebp
    338         jbe LeaveNow
    339         sub edx, 00010000h
    340         js  LeaveNow
    341 
    342 LoopEntry2:
    343         cmp bx,word ptr [rsi + r8 - 1]
    344         jz  LookupLoopIsZero
    345 
    346 LookupLoop4:
    347         and r8d, edx
    348 
    349         movzx   r8d, word ptr [rdi + r8*2]
    350         cmp r8d, ebp
    351         jbe LeaveNow
    352         sub edx, 00010000h
    353         js  LeaveNow
    354 
    355 LoopEntry4:
    356 
    357         cmp bx,word ptr [rsi + r8 - 1]
    358         jnz LookupLoop1
    359         jmp LookupLoopIsZero
    360 
    361 
    362 ;;; do {
    363 ;;;     match = s->window + cur_match;
    364 ;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
    365 ;;;         *(ushf*)match != scan_start) continue;
    366 ;;;     [...]
    367 ;;; } while ((cur_match = prev[cur_match & wmask]) > limit
    368 ;;;          && --chain_length != 0);
    369 ;;;
    370 ;;; Here is the inner loop of the function. The function will spend the
    371 ;;; majority of its time in this loop, and majority of that time will
    372 ;;; be spent in the first ten instructions.
    373 ;;;
    374 ;;; Within this loop:
    375 ;;; ebx = scanend
    376 ;;; r8d = curmatch
    377 ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
    378 ;;; esi = windowbestlen - i.e., (window + bestlen)
    379 ;;; edi = prev
    380 ;;; ebp = limit
    381 
    382 LookupLoop:
    383         and r8d, edx
    384 
    385         movzx   r8d, word ptr [rdi + r8*2]
    386         cmp r8d, ebp
    387         jbe LeaveNow
    388         sub edx, 00010000h
    389         js  LeaveNow
    390 
    391 LoopEntry:
    392 
    393         cmp bx,word ptr [rsi + r8 - 1]
    394         jnz LookupLoop1
    395 LookupLoopIsZero:
    396         cmp     r12w, word ptr [r10 + r8]
    397         jnz LookupLoop1
    398 
    399 
    400 ;;; Store the current value of chainlen.
    401         mov [chainlenwmask], edx
    402 
    403 ;;; Point edi to the string under scrutiny, and esi to the string we
    404 ;;; are hoping to match it up with. In actuality, esi and edi are
    405 ;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
    406 ;;; initialized to -(MAX_MATCH_8 - scanalign).
    407 
    408         lea rsi,[r8+r10]
    409         mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8)
    410         lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8]
    411         lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8]
    412 
    413         prefetcht1 [rsi+rdx]
    414         prefetcht1 [rdi+rdx]
    415 
    416 
    417 ;;; Test the strings for equality, 8 bytes at a time. At the end,
    418 ;;; adjust rdx so that it is offset to the exact byte that mismatched.
    419 ;;;
    420 ;;; We already know at this point that the first three bytes of the
    421 ;;; strings match each other, and they can be safely passed over before
    422 ;;; starting the compare loop. So what this code does is skip over 0-3
    423 ;;; bytes, as much as necessary in order to dword-align the edi
    424 ;;; pointer. (rsi will still be misaligned three times out of four.)
    425 ;;;
    426 ;;; It should be confessed that this loop usually does not represent
    427 ;;; much of the total running time. Replacing it with a more
    428 ;;; straightforward "rep cmpsb" would not drastically degrade
    429 ;;; performance.
    430 
    431 
    432 LoopCmps:
    433         mov rax, [rsi + rdx]
    434         xor rax, [rdi + rdx]
    435         jnz LeaveLoopCmps
    436 
    437         mov rax, [rsi + rdx + 8]
    438         xor rax, [rdi + rdx + 8]
    439         jnz LeaveLoopCmps8
    440 
    441 
    442         mov rax, [rsi + rdx + 8+8]
    443         xor rax, [rdi + rdx + 8+8]
    444         jnz LeaveLoopCmps16
    445 
    446         add rdx,8+8+8
    447 
    448         jnz short LoopCmps
    449         jmp short LenMaximum
    450 LeaveLoopCmps16: add rdx,8
    451 LeaveLoopCmps8: add rdx,8
    452 LeaveLoopCmps:
    453 
    454         test    eax, 0000FFFFh
    455         jnz LenLower
    456 
    457         test eax,0ffffffffh
    458 
    459         jnz LenLower32
    460 
    461         add rdx,4
    462         shr rax,32
    463         or ax,ax
    464         jnz LenLower
    465 
    466 LenLower32:
    467         shr eax,16
    468         add rdx,2
    469 LenLower:   sub al, 1
    470         adc rdx, 0
    471 ;;; Calculate the length of the match. If it is longer than MAX_MATCH,
    472 ;;; then automatically accept it as the best possible match and leave.
    473 
    474         lea rax, [rdi + rdx]
    475         sub rax, r9
    476         cmp eax, MAX_MATCH
    477         jge LenMaximum
    478 
    479 ;;; If the length of the match is not longer than the best match we
    480 ;;; have so far, then forget it and return to the lookup loop.
    481 ;///////////////////////////////////
    482 
    483         cmp eax, r11d
    484         jg  LongerMatch
    485 
    486         lea rsi,[r10+r11]
    487 
    488         mov rdi, prev_ad
    489         mov edx, [chainlenwmask]
    490         jmp LookupLoop
    491 
    492 ;;;         s->match_start = cur_match;
    493 ;;;         best_len = len;
    494 ;;;         if (len >= nice_match) break;
    495 ;;;         scan_end = *(ushf*)(scan+best_len-1);
    496 
    497 LongerMatch:
    498         mov r11d, eax
    499         mov match_start, r8d
    500         cmp eax, [nicematch]
    501         jge LeaveNow
    502 
    503         lea rsi,[r10+rax]
    504 
    505         movzx   ebx, word ptr [r9 + rax - 1]
    506         mov rdi, prev_ad
    507         mov edx, [chainlenwmask]
    508         jmp LookupLoop
    509 
    510 ;;; Accept the current string, with the maximum possible length.
    511 
    512 LenMaximum:
    513         mov r11d,MAX_MATCH
    514         mov match_start, r8d
    515 
    516 ;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
    517 ;;; return s->lookahead;
    518 
    519 LeaveNow:
    520 IFDEF INFOZIP
    521         mov eax,r11d
    522 ELSE
    523         mov eax, Lookahead
    524         cmp r11d, eax
    525         cmovng eax, r11d
    526 ENDIF
    527 
    528 ;;; Restore the stack and return from whence we came.
    529 
    530 
    531         mov rsi,[save_rsi]
    532         mov rdi,[save_rdi]
    533         mov rbx,[save_rbx]
    534         mov rbp,[save_rbp]
    535         mov r12,[save_r12]
    536         mov r13,[save_r13]
    537 ;        mov r14,[save_r14]
    538 ;        mov r15,[save_r15]
    539 
    540 
    541         ret 0
    542 ; please don't remove this string !
    543 ; Your can freely use gvmat64 in any free or commercial app
    544 ; but it is far better don't remove the string in the binary!
    545     db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
    546 longest_match   ENDP
    547 
    548 match_init PROC
    549   ret 0
    550 match_init ENDP
    551 
    552 
    553 END
    554