1 ;uInt longest_match_x64( 2 ; deflate_state *s, 3 ; IPos cur_match); /* current match */ 4 5 ; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86 6 ; Copyright (C) 1995-2005 Jean-loup Gailly, Brian Raiter and Gilles Vollant. 7 ; 8 ; File written by Gilles Vollant, by converting to assembly the longest_match 9 ; from Jean-loup Gailly in deflate.c of zLib and infoZip zip. 10 ; 11 ; and by taking inspiration on asm686 with masm, optimised assembly code 12 ; from Brian Raiter, written 1998 13 ; 14 ; http://www.zlib.net 15 ; http://www.winimage.com/zLibDll 16 ; http://www.muppetlabs.com/~breadbox/software/assembly.html 17 ; 18 ; to compile this file for infozip Zip, I use option: 19 ; ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm 20 ; 21 ; to compile this file for zLib, I use option: 22 ; ml64.exe /Flgvmat64 /c /Zi gvmat64.asm 23 ; Be carrefull to adapt zlib1222add below to your version of zLib 24 ; (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change 25 ; value of zlib1222add later) 26 ; 27 ; This file compile with Microsoft Macro Assembler (x64) for AMD64 28 ; 29 ; ml64.exe is given with Visual Studio 2005 and Windows 2003 server DDK 30 ; 31 ; (you can get Windows 2003 server DDK with ml64 and cl for AMD64 from 32 ; http://www.microsoft.com/whdc/devtools/ddk/default.mspx for low price) 33 ; 34 35 36 ;uInt longest_match(s, cur_match) 37 ; deflate_state *s; 38 ; IPos cur_match; /* current match */ 39 bits 64 40 section .text 41 42 longest_match: ; PROC 43 44 45 ;%define LocalVarsSize 88 46 %define LocalVarsSize 72 47 48 ; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12 49 ; free register : r14,r15 50 ; register can be saved : rsp 51 52 %define chainlenwmask rsp + 8 - LocalVarsSize ; high word: current chain len 53 ; low word: s->wmask 54 ;%define window rsp + xx - LocalVarsSize ; local copy of s->window ; stored in r10 55 ;%define windowbestlen rsp + xx - LocalVarsSize ; s->window + bestlen , use r10+r11 56 ;%define scanstart rsp + xx - LocalVarsSize ; first two bytes of string ; stored in r12w 57 ;%define scanend rsp + xx - LocalVarsSize ; last two bytes of string use ebx 58 ;%define scanalign rsp + xx - LocalVarsSize ; dword-misalignment of string r13 59 ;%define bestlen rsp + xx - LocalVarsSize ; size of best match so far -> r11d 60 ;%define scan rsp + xx - LocalVarsSize ; ptr to string wanting match -> r9 61 %ifdef INFOZIP 62 %else 63 %define nicematch (rsp + 16 - LocalVarsSize) ; a good enough match size 64 %endif 65 66 %define save_rdi rsp + 24 - LocalVarsSize 67 %define save_rsi rsp + 32 - LocalVarsSize 68 %define save_rbx rsp + 40 - LocalVarsSize 69 %define save_rbp rsp + 48 - LocalVarsSize 70 %define save_r12 rsp + 56 - LocalVarsSize 71 %define save_r13 rsp + 64 - LocalVarsSize 72 ;%define save_r14 rsp + 72 - LocalVarsSize 73 ;%define save_r15 rsp + 80 - LocalVarsSize 74 75 76 77 ; all the +4 offsets are due to the addition of pending_buf_size (in zlib 78 ; in the deflate_state structure since the asm code was first written 79 ; (if you compile with zlib 1.0.4 or older, remove the +4). 80 ; Note : these value are good with a 8 bytes boundary pack structure 81 82 83 %define MAX_MATCH 258 84 %define MIN_MATCH 3 85 %define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1) 86 87 88 ;;; Offsets for fields in the deflate_state structure. These numbers 89 ;;; are calculated from the definition of deflate_state, with the 90 ;;; assumption that the compiler will dword-align the fields. (Thus, 91 ;;; changing the definition of deflate_state could easily cause this 92 ;;; program to crash horribly, without so much as a warning at 93 ;;; compile time. Sigh.) 94 95 ; all the +zlib1222add offsets are due to the addition of fields 96 ; in zlib in the deflate_state structure since the asm code was first written 97 ; (if you compile with zlib 1.0.4 or older, use "%define zlib1222add (-4)"). 98 ; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "%define zlib1222add 0"). 99 ; if you compile with zlib 1.2.2.2 or later , use "%define zlib1222add 8"). 100 101 %ifdef INFOZIP 102 section .data 103 104 extern window_size:DWORD 105 ; WMask ; 7fff 106 extern window:BYTE:010040H 107 extern prev:WORD:08000H 108 ; MatchLen : unused 109 ; PrevMatch : unused 110 extern strstart:DWORD 111 extern match_start:DWORD 112 ; Lookahead : ignore 113 extern prev_length:DWORD ; PrevLen 114 extern max_chain_length:DWORD 115 extern good_match:DWORD 116 extern nice_match:DWORD 117 %define prev_ad OFFSET prev 118 %define window_ad OFFSET window 119 %define nicematch nice_match 120 121 %define WMask 07fffh 122 123 %else 124 125 %ifndef zlib1222add 126 %define zlib1222add 8 127 %endif 128 %define dsWSize 56+zlib1222add+(zlib1222add/2) 129 %define dsWMask 64+zlib1222add+(zlib1222add/2) 130 %define dsWindow 72+zlib1222add 131 %define dsPrev 88+zlib1222add 132 %define dsMatchLen 128+zlib1222add 133 %define dsPrevMatch 132+zlib1222add 134 %define dsStrStart 140+zlib1222add 135 %define dsMatchStart 144+zlib1222add 136 %define dsLookahead 148+zlib1222add 137 %define dsPrevLen 152+zlib1222add 138 %define dsMaxChainLen 156+zlib1222add 139 %define dsGoodMatch 172+zlib1222add 140 %define dsNiceMatch 176+zlib1222add 141 142 %define window_size [ rcx + dsWSize] 143 %define WMask [ rcx + dsWMask] 144 %define window_ad [ rcx + dsWindow] 145 %define prev_ad [ rcx + dsPrev] 146 %define strstart [ rcx + dsStrStart] 147 %define match_start [ rcx + dsMatchStart] 148 %define Lookahead [ rcx + dsLookahead] ; 0ffffffffh on infozip 149 %define prev_length [ rcx + dsPrevLen] 150 %define max_chain_length [ rcx + dsMaxChainLen] 151 %define good_match [ rcx + dsGoodMatch] 152 %define nice_match [ rcx + dsNiceMatch] 153 %endif 154 155 ; parameter 1 in r8(deflate state s), param 2 in rdx (cur match) 156 157 ; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and 158 ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp 159 ; 160 ; All registers must be preserved across the call, except for 161 ; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch. 162 163 section .text 164 165 ;;; Save registers that the compiler may be using, and adjust esp to 166 ;;; make room for our stack frame. 167 168 169 ;;; Retrieve the function arguments. r8d will hold cur_match 170 ;;; throughout the entire function. edx will hold the pointer to the 171 ;;; deflate_state structure during the function's setup (before 172 ;;; entering the main loop. 173 174 ; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match) 175 176 ; this clear high 32 bits of r8, which can be garbage in both r8 and rdx 177 178 mov [save_rdi],rdi 179 mov [save_rsi],rsi 180 mov [save_rbx],rbx 181 mov [save_rbp],rbp 182 %ifdef INFOZIP 183 mov r8d,ecx 184 %else 185 mov r8d,edx 186 %endif 187 mov [save_r12],r12 188 mov [save_r13],r13 189 ; mov [save_r14],r14 190 ; mov [save_r15],r15 191 192 193 ;;; uInt wmask = s->w_mask; 194 ;;; unsigned chain_length = s->max_chain_length; 195 ;;; if (s->prev_length >= s->good_match) { 196 ;;; chain_length >>= 2; 197 ;;; } 198 199 mov edi, prev_length 200 mov esi, good_match 201 mov eax, WMask 202 mov ebx, max_chain_length 203 cmp edi, esi 204 jl LastMatchGood 205 shr ebx, 2 206 LastMatchGood: 207 208 ;;; chainlen is decremented once beforehand so that the function can 209 ;;; use the sign flag instead of the zero flag for the exit test. 210 ;;; It is then shifted into the high word, to make room for the wmask 211 ;;; value, which it will always accompany. 212 213 dec ebx 214 shl ebx, 16 215 or ebx, eax 216 217 ;;; on zlib only 218 ;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; 219 220 %ifdef INFOZIP 221 mov [chainlenwmask], ebx 222 ; on infozip nice_match = [nice_match] 223 %else 224 mov eax, nice_match 225 mov [chainlenwmask], ebx 226 mov r10d, Lookahead 227 cmp r10d, eax 228 cmovnl r10d, eax 229 mov [nicematch],r10d 230 %endif 231 232 ;;; register Bytef *scan = s->window + s->strstart; 233 mov r10, window_ad 234 mov ebp, strstart 235 lea r13, [r10 + rbp] 236 237 ;;; Determine how many bytes the scan ptr is off from being 238 ;;; dword-aligned. 239 240 mov r9,r13 241 neg r13 242 and r13,3 243 244 ;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? 245 ;;; s->strstart - (IPos)MAX_DIST(s) : NIL; 246 %ifdef INFOZIP 247 mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(3+8+1)) 248 %else 249 mov eax, window_size 250 sub eax, MIN_LOOKAHEAD 251 %endif 252 xor edi,edi 253 sub ebp, eax 254 255 mov r11d, prev_length 256 257 cmovng ebp,edi 258 259 ;;; int best_len = s->prev_length; 260 261 262 ;;; Store the sum of s->window + best_len in esi locally, and in esi. 263 264 lea rsi,[r10+r11] 265 266 ;;; register ush scan_start = *(ushf*)scan; 267 ;;; register ush scan_end = *(ushf*)(scan+best_len-1); 268 ;;; Posf *prev = s->prev; 269 270 movzx r12d,word [r9] 271 movzx ebx, word [r9 + r11 - 1] 272 273 mov rdi, prev_ad 274 275 ;;; Jump into the main loop. 276 277 mov edx, [chainlenwmask] 278 279 cmp bx,word [rsi + r8 - 1] 280 jz LookupLoopIsZero 281 282 LookupLoop1: 283 and r8d, edx 284 285 movzx r8d, word [rdi + r8*2] 286 cmp r8d, ebp 287 jbe LeaveNow 288 sub edx, 00010000h 289 js LeaveNow 290 291 LoopEntry1: 292 cmp bx,word [rsi + r8 - 1] 293 jz LookupLoopIsZero 294 295 LookupLoop2: 296 and r8d, edx 297 298 movzx r8d, word [rdi + r8*2] 299 cmp r8d, ebp 300 jbe LeaveNow 301 sub edx, 00010000h 302 js LeaveNow 303 304 LoopEntry2: 305 cmp bx,word [rsi + r8 - 1] 306 jz LookupLoopIsZero 307 308 LookupLoop4: 309 and r8d, edx 310 311 movzx r8d, word [rdi + r8*2] 312 cmp r8d, ebp 313 jbe LeaveNow 314 sub edx, 00010000h 315 js LeaveNow 316 317 LoopEntry4: 318 319 cmp bx,word [rsi + r8 - 1] 320 jnz LookupLoop1 321 jmp LookupLoopIsZero 322 323 324 ;;; do { 325 ;;; match = s->window + cur_match; 326 ;;; if (*(ushf*)(match+best_len-1) != scan_end || 327 ;;; *(ushf*)match != scan_start) continue; 328 ;;; [...] 329 ;;; } while ((cur_match = prev[cur_match & wmask]) > limit 330 ;;; && --chain_length != 0); 331 ;;; 332 ;;; Here is the inner loop of the function. The function will spend the 333 ;;; majority of its time in this loop, and majority of that time will 334 ;;; be spent in the first ten instructions. 335 ;;; 336 ;;; Within this loop: 337 ;;; ebx = scanend 338 ;;; r8d = curmatch 339 ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) 340 ;;; esi = windowbestlen - i.e., (window + bestlen) 341 ;;; edi = prev 342 ;;; ebp = limit 343 344 LookupLoop: 345 and r8d, edx 346 347 movzx r8d, word [rdi + r8*2] 348 cmp r8d, ebp 349 jbe LeaveNow 350 sub edx, 00010000h 351 js LeaveNow 352 353 LoopEntry: 354 355 cmp bx,word [rsi + r8 - 1] 356 jnz LookupLoop1 357 LookupLoopIsZero: 358 cmp r12w, word [r10 + r8] 359 jnz LookupLoop1 360 361 362 ;;; Store the current value of chainlen. 363 mov [chainlenwmask], edx 364 365 ;;; Point edi to the string under scrutiny, and esi to the string we 366 ;;; are hoping to match it up with. In actuality, esi and edi are 367 ;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is 368 ;;; initialized to -(MAX_MATCH_8 - scanalign). 369 370 lea rsi,[r8+r10] 371 mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8) 372 lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8] 373 lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8] 374 375 prefetcht1 [rsi+rdx] 376 prefetcht1 [rdi+rdx] 377 378 379 ;;; Test the strings %define for ality, 8 bytes at a time. At the end, 380 ;;; adjust rdx so that it is offset to the exact byte that mismatched. 381 ;;; 382 ;;; We already know at this point that the first three bytes of the 383 ;;; strings match each other, and they can be safely passed over before 384 ;;; starting the compare loop. So what this code does is skip over 0-3 385 ;;; bytes, as much as necessary in order to dword-align the edi 386 ;;; pointer. (rsi will still be misaligned three times out of four.) 387 ;;; 388 ;;; It should be confessed that this loop usually does not represent 389 ;;; much of the total running time. Replacing it with a more 390 ;;; straightforward "rep cmpsb" would not drastically degrade 391 ;;; performance. 392 393 LoopCmps: 394 mov rax, [rsi + rdx] 395 xor rax, [rdi + rdx] 396 jnz LeaveLoopCmps 397 398 mov rax, [rsi + rdx + 8] 399 xor rax, [rdi + rdx + 8] 400 jnz LeaveLoopCmps8 401 402 403 mov rax, [rsi + rdx + 8+8] 404 xor rax, [rdi + rdx + 8+8] 405 jnz LeaveLoopCmps16 406 407 add rdx,8+8+8 408 409 jmp short LoopCmps 410 LeaveLoopCmps16: add rdx,8 411 LeaveLoopCmps8: add rdx,8 412 LeaveLoopCmps: 413 414 test eax, 0000FFFFh 415 jnz LenLower 416 417 test eax,0ffffffffh 418 419 jnz LenLower32 420 421 add rdx,4 422 shr rax,32 423 or ax,ax 424 jnz LenLower 425 426 LenLower32: 427 shr eax,16 428 add rdx,2 429 LenLower: sub al, 1 430 adc rdx, 0 431 ;;; Calculate the length of the match. If it is longer than MAX_MATCH, 432 ;;; then automatically accept it as the best possible match and leave. 433 434 lea rax, [rdi + rdx] 435 sub rax, r9 436 cmp eax, MAX_MATCH 437 jge LenMaximum 438 439 ;;; If the length of the match is not longer than the best match we 440 ;;; have so far, then forget it and return to the lookup loop. 441 ;/////////////////////////////////// 442 443 cmp eax, r11d 444 jg LongerMatch 445 446 lea rsi,[r10+r11] 447 448 mov rdi, prev_ad 449 mov edx, [chainlenwmask] 450 jmp LookupLoop 451 452 ;;; s->match_start = cur_match; 453 ;;; best_len = len; 454 ;;; if (len >= nice_match) break; 455 ;;; scan_end = *(ushf*)(scan+best_len-1); 456 457 LongerMatch: 458 mov r11d, eax 459 mov match_start, r8d 460 cmp eax, [nicematch] 461 jge LeaveNow 462 463 lea rsi,[r10+rax] 464 465 movzx ebx, word [r9 + rax - 1] 466 mov rdi, prev_ad 467 mov edx, [chainlenwmask] 468 jmp LookupLoop 469 470 ;;; Accept the current string, with the maximum possible length. 471 472 LenMaximum: 473 mov r11d,MAX_MATCH 474 mov match_start, r8d 475 ;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; 476 ;;; return s->lookahead; 477 478 LeaveNow: 479 %ifdef INFOZIP 480 mov eax,r11d 481 %else 482 mov eax, Lookahead 483 cmp r11d, eax 484 cmovng eax, r11d 485 %endif 486 487 ;;; Restore the stack and return from whence we came. 488 489 mov rsi,[save_rsi] 490 mov rdi,[save_rdi] 491 mov rbx,[save_rbx] 492 mov rbp,[save_rbp] 493 mov r12,[save_r12] 494 mov r13,[save_r13] 495 ; mov r14,[save_r14] 496 ; mov r15,[save_r15] 497 498 499 ret 0 500 ; please don't remove this string ! 501 ; Your can freely use gvmat64 in any free or externercial app 502 ; but it is far better don't remove the string in the binary! 503 db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0 504 %if 0 505 longest_match ENDP 506 %endif 507 508 match_init: ; PROC 509 ret 0 510 %if 0 511 match_init ENDP 512 %endif 513 514 END 515