1 2 ; match.asm -- Pentium-Pro optimized version of longest_match() 3 ; 4 ; Updated for zlib 1.1.3 and converted to MASM 6.1x 5 ; Copyright (C) 2000 Dan Higdon <hdan (a] kinesoft.com> 6 ; and Chuck Walbourn <chuckw (a] kinesoft.com> 7 ; Corrections by Cosmin Truta <cosmint (a] cs.ubbcluj.ro> 8 ; 9 ; This is free software; you can redistribute it and/or modify it 10 ; under the terms of the GNU General Public License. 11 12 ; Based on match.S 13 ; Written for zlib 1.1.2 14 ; Copyright (C) 1998 Brian Raiter <breadbox (a] muppetlabs.com> 15 ; 16 ; Modified by Gilles Vollant (2005) for add gzhead and gzindex 17 18 .686P 19 .MODEL FLAT 20 21 ;=========================================================================== 22 ; EQUATES 23 ;=========================================================================== 24 25 MAX_MATCH EQU 258 26 MIN_MATCH EQU 3 27 MIN_LOOKAHEAD EQU (MAX_MATCH + MIN_MATCH + 1) 28 MAX_MATCH_8 EQU ((MAX_MATCH + 7) AND (NOT 7)) 29 30 ;=========================================================================== 31 ; STRUCTURES 32 ;=========================================================================== 33 34 ; This STRUCT assumes a 4-byte alignment 35 36 DEFLATE_STATE STRUCT 37 ds_strm dd ? 38 ds_status dd ? 39 ds_pending_buf dd ? 40 ds_pending_buf_size dd ? 41 ds_pending_out dd ? 42 ds_pending dd ? 43 ds_wrap dd ? 44 ; gzhead and gzindex are added in zlib 1.2.2.2 (see deflate.h) 45 ds_gzhead dd ? 46 ds_gzindex dd ? 47 ds_data_type db ? 48 ds_method db ? 49 db ? ; padding 50 db ? ; padding 51 ds_last_flush dd ? 52 ds_w_size dd ? ; used 53 ds_w_bits dd ? 54 ds_w_mask dd ? ; used 55 ds_window dd ? ; used 56 ds_window_size dd ? 57 ds_prev dd ? ; used 58 ds_head dd ? 59 ds_ins_h dd ? 60 ds_hash_size dd ? 61 ds_hash_bits dd ? 62 ds_hash_mask dd ? 63 ds_hash_shift dd ? 64 ds_block_start dd ? 65 ds_match_length dd ? ; used 66 ds_prev_match dd ? ; used 67 ds_match_available dd ? 68 ds_strstart dd ? ; used 69 ds_match_start dd ? ; used 70 ds_lookahead dd ? ; used 71 ds_prev_length dd ? ; used 72 ds_max_chain_length dd ? ; used 73 ds_max_laxy_match dd ? 74 ds_level dd ? 75 ds_strategy dd ? 76 ds_good_match dd ? ; used 77 ds_nice_match dd ? ; used 78 79 ; Don't need anymore of the struct for match 80 DEFLATE_STATE ENDS 81 82 ;=========================================================================== 83 ; CODE 84 ;=========================================================================== 85 _TEXT SEGMENT 86 87 ;--------------------------------------------------------------------------- 88 ; match_init 89 ;--------------------------------------------------------------------------- 90 ALIGN 4 91 PUBLIC _match_init 92 _match_init PROC 93 ; no initialization needed 94 ret 95 _match_init ENDP 96 97 ;--------------------------------------------------------------------------- 98 ; uInt longest_match(deflate_state *deflatestate, IPos curmatch) 99 ;--------------------------------------------------------------------------- 100 ALIGN 4 101 102 PUBLIC _longest_match 103 _longest_match PROC 104 105 ; Since this code uses EBP for a scratch register, the stack frame must 106 ; be manually constructed and referenced relative to the ESP register. 107 108 ; Stack image 109 ; Variables 110 chainlenwmask = 0 ; high word: current chain len 111 ; low word: s->wmask 112 window = 4 ; local copy of s->window 113 windowbestlen = 8 ; s->window + bestlen 114 scanend = 12 ; last two bytes of string 115 scanstart = 16 ; first two bytes of string 116 scanalign = 20 ; dword-misalignment of string 117 nicematch = 24 ; a good enough match size 118 bestlen = 28 ; size of best match so far 119 scan = 32 ; ptr to string wanting match 120 varsize = 36 ; number of bytes (also offset to last saved register) 121 122 ; Saved Registers (actually pushed into place) 123 ebx_save = 36 124 edi_save = 40 125 esi_save = 44 126 ebp_save = 48 127 128 ; Parameters 129 retaddr = 52 130 deflatestate = 56 131 curmatch = 60 132 133 ; Save registers that the compiler may be using 134 push ebp 135 push edi 136 push esi 137 push ebx 138 139 ; Allocate local variable space 140 sub esp,varsize 141 142 ; Retrieve the function arguments. ecx will hold cur_match 143 ; throughout the entire function. edx will hold the pointer to the 144 ; deflate_state structure during the function's setup (before 145 ; entering the main loop). 146 147 mov edx, [esp+deflatestate] 148 ASSUME edx:PTR DEFLATE_STATE 149 150 mov ecx, [esp+curmatch] 151 152 ; uInt wmask = s->w_mask; 153 ; unsigned chain_length = s->max_chain_length; 154 ; if (s->prev_length >= s->good_match) { 155 ; chain_length >>= 2; 156 ; } 157 158 mov eax, [edx].ds_prev_length 159 mov ebx, [edx].ds_good_match 160 cmp eax, ebx 161 mov eax, [edx].ds_w_mask 162 mov ebx, [edx].ds_max_chain_length 163 jl SHORT LastMatchGood 164 shr ebx, 2 165 LastMatchGood: 166 167 ; chainlen is decremented once beforehand so that the function can 168 ; use the sign flag instead of the zero flag for the exit test. 169 ; It is then shifted into the high word, to make room for the wmask 170 ; value, which it will always accompany. 171 172 dec ebx 173 shl ebx, 16 174 or ebx, eax 175 mov [esp+chainlenwmask], ebx 176 177 ; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; 178 179 mov eax, [edx].ds_nice_match 180 mov ebx, [edx].ds_lookahead 181 cmp ebx, eax 182 jl SHORT LookaheadLess 183 mov ebx, eax 184 LookaheadLess: 185 mov [esp+nicematch], ebx 186 187 ;/* register Bytef *scan = s->window + s->strstart; */ 188 189 mov esi, [edx].ds_window 190 mov [esp+window], esi 191 mov ebp, [edx].ds_strstart 192 lea edi, [esi+ebp] 193 mov [esp+scan],edi 194 195 ;/* Determine how many bytes the scan ptr is off from being */ 196 ;/* dword-aligned. */ 197 198 mov eax, edi 199 neg eax 200 and eax, 3 201 mov [esp+scanalign], eax 202 203 ;/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ 204 ;/* s->strstart - (IPos)MAX_DIST(s) : NIL; */ 205 206 mov eax, [edx].ds_w_size 207 sub eax, MIN_LOOKAHEAD 208 sub ebp, eax 209 jg SHORT LimitPositive 210 xor ebp, ebp 211 LimitPositive: 212 213 ;/* int best_len = s->prev_length; */ 214 215 mov eax, [edx].ds_prev_length 216 mov [esp+bestlen], eax 217 218 ;/* Store the sum of s->window + best_len in %esi locally, and in %esi. */ 219 220 add esi, eax 221 mov [esp+windowbestlen], esi 222 223 ;/* register ush scan_start = *(ushf*)scan; */ 224 ;/* register ush scan_end = *(ushf*)(scan+best_len-1); */ 225 ;/* Posf *prev = s->prev; */ 226 227 movzx ebx, WORD PTR[edi] 228 mov [esp+scanstart], ebx 229 movzx ebx, WORD PTR[eax+edi-1] 230 mov [esp+scanend], ebx 231 mov edi, [edx].ds_prev 232 233 ;/* Jump into the main loop. */ 234 235 mov edx, [esp+chainlenwmask] 236 jmp SHORT LoopEntry 237 238 ;/* do { 239 ; * match = s->window + cur_match; 240 ; * if (*(ushf*)(match+best_len-1) != scan_end || 241 ; * *(ushf*)match != scan_start) continue; 242 ; * [...] 243 ; * } while ((cur_match = prev[cur_match & wmask]) > limit 244 ; * && --chain_length != 0); 245 ; * 246 ; * Here is the inner loop of the function. The function will spend the 247 ; * majority of its time in this loop, and majority of that time will 248 ; * be spent in the first ten instructions. 249 ; * 250 ; * Within this loop: 251 ; * %ebx = scanend 252 ; * %ecx = curmatch 253 ; * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) 254 ; * %esi = windowbestlen - i.e., (window + bestlen) 255 ; * %edi = prev 256 ; * %ebp = limit 257 ; */ 258 259 ALIGN 4 260 LookupLoop: 261 and ecx, edx 262 movzx ecx, WORD PTR[edi+ecx*2] 263 cmp ecx, ebp 264 jbe LeaveNow 265 sub edx, 000010000H 266 js LeaveNow 267 268 LoopEntry: 269 movzx eax, WORD PTR[esi+ecx-1] 270 cmp eax, ebx 271 jnz SHORT LookupLoop 272 273 mov eax, [esp+window] 274 movzx eax, WORD PTR[eax+ecx] 275 cmp eax, [esp+scanstart] 276 jnz SHORT LookupLoop 277 278 ;/* Store the current value of chainlen. */ 279 280 mov [esp+chainlenwmask], edx 281 282 ;/* Point %edi to the string under scrutiny, and %esi to the string we */ 283 ;/* are hoping to match it up with. In actuality, %esi and %edi are */ 284 ;/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ 285 ;/* initialized to -(MAX_MATCH_8 - scanalign). */ 286 287 mov esi, [esp+window] 288 mov edi, [esp+scan] 289 add esi, ecx 290 mov eax, [esp+scanalign] 291 mov edx, -MAX_MATCH_8 292 lea edi, [edi+eax+MAX_MATCH_8] 293 lea esi, [esi+eax+MAX_MATCH_8] 294 295 ;/* Test the strings for equality, 8 bytes at a time. At the end, 296 ; * adjust %edx so that it is offset to the exact byte that mismatched. 297 ; * 298 ; * We already know at this point that the first three bytes of the 299 ; * strings match each other, and they can be safely passed over before 300 ; * starting the compare loop. So what this code does is skip over 0-3 301 ; * bytes, as much as necessary in order to dword-align the %edi 302 ; * pointer. (%esi will still be misaligned three times out of four.) 303 ; * 304 ; * It should be confessed that this loop usually does not represent 305 ; * much of the total running time. Replacing it with a more 306 ; * straightforward "rep cmpsb" would not drastically degrade 307 ; * performance. 308 ; */ 309 310 LoopCmps: 311 mov eax, DWORD PTR[esi+edx] 312 xor eax, DWORD PTR[edi+edx] 313 jnz SHORT LeaveLoopCmps 314 315 mov eax, DWORD PTR[esi+edx+4] 316 xor eax, DWORD PTR[edi+edx+4] 317 jnz SHORT LeaveLoopCmps4 318 319 add edx, 8 320 jnz SHORT LoopCmps 321 jmp LenMaximum 322 ALIGN 4 323 324 LeaveLoopCmps4: 325 add edx, 4 326 327 LeaveLoopCmps: 328 test eax, 00000FFFFH 329 jnz SHORT LenLower 330 331 add edx, 2 332 shr eax, 16 333 334 LenLower: 335 sub al, 1 336 adc edx, 0 337 338 ;/* Calculate the length of the match. If it is longer than MAX_MATCH, */ 339 ;/* then automatically accept it as the best possible match and leave. */ 340 341 lea eax, [edi+edx] 342 mov edi, [esp+scan] 343 sub eax, edi 344 cmp eax, MAX_MATCH 345 jge SHORT LenMaximum 346 347 ;/* If the length of the match is not longer than the best match we */ 348 ;/* have so far, then forget it and return to the lookup loop. */ 349 350 mov edx, [esp+deflatestate] 351 mov ebx, [esp+bestlen] 352 cmp eax, ebx 353 jg SHORT LongerMatch 354 mov esi, [esp+windowbestlen] 355 mov edi, [edx].ds_prev 356 mov ebx, [esp+scanend] 357 mov edx, [esp+chainlenwmask] 358 jmp LookupLoop 359 ALIGN 4 360 361 ;/* s->match_start = cur_match; */ 362 ;/* best_len = len; */ 363 ;/* if (len >= nice_match) break; */ 364 ;/* scan_end = *(ushf*)(scan+best_len-1); */ 365 366 LongerMatch: 367 mov ebx, [esp+nicematch] 368 mov [esp+bestlen], eax 369 mov [edx].ds_match_start, ecx 370 cmp eax, ebx 371 jge SHORT LeaveNow 372 mov esi, [esp+window] 373 add esi, eax 374 mov [esp+windowbestlen], esi 375 movzx ebx, WORD PTR[edi+eax-1] 376 mov edi, [edx].ds_prev 377 mov [esp+scanend], ebx 378 mov edx, [esp+chainlenwmask] 379 jmp LookupLoop 380 ALIGN 4 381 382 ;/* Accept the current string, with the maximum possible length. */ 383 384 LenMaximum: 385 mov edx, [esp+deflatestate] 386 mov DWORD PTR[esp+bestlen], MAX_MATCH 387 mov [edx].ds_match_start, ecx 388 389 ;/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ 390 ;/* return s->lookahead; */ 391 392 LeaveNow: 393 mov edx, [esp+deflatestate] 394 mov ebx, [esp+bestlen] 395 mov eax, [edx].ds_lookahead 396 cmp ebx, eax 397 jg SHORT LookaheadRet 398 mov eax, ebx 399 LookaheadRet: 400 401 ; Restore the stack and return from whence we came. 402 403 add esp, varsize 404 pop ebx 405 pop esi 406 pop edi 407 pop ebp 408 ret 409 410 _longest_match ENDP 411 412 _TEXT ENDS 413 END 414