Home | History | Annotate | Download | only in masm686
      1 
      2 ; match.asm -- Pentium-Pro optimized version of longest_match()
      3 ;
      4 ; Updated for zlib 1.1.3 and converted to MASM 6.1x
      5 ; Copyright (C) 2000 Dan Higdon <hdan (a] kinesoft.com>
      6 ;                    and Chuck Walbourn <chuckw (a] kinesoft.com>
      7 ; Corrections by Cosmin Truta <cosmint (a] cs.ubbcluj.ro>
      8 ;
      9 ; This is free software; you can redistribute it and/or modify it
     10 ; under the terms of the GNU General Public License.
     11 
     12 ; Based on match.S
     13 ; Written for zlib 1.1.2
     14 ; Copyright (C) 1998 Brian Raiter <breadbox (a] muppetlabs.com>
     15 ;
     16 ; Modified by Gilles Vollant (2005) for add gzhead and gzindex
     17 
     18 	.686P
     19 	.MODEL	FLAT
     20 
     21 ;===========================================================================
     22 ; EQUATES
     23 ;===========================================================================
     24 
     25 MAX_MATCH	EQU 258
     26 MIN_MATCH	EQU 3
     27 MIN_LOOKAHEAD	EQU (MAX_MATCH + MIN_MATCH + 1)
     28 MAX_MATCH_8	EQU ((MAX_MATCH + 7) AND (NOT 7))
     29 
     30 ;===========================================================================
     31 ; STRUCTURES
     32 ;===========================================================================
     33 
     34 ; This STRUCT assumes a 4-byte alignment
     35 
     36 DEFLATE_STATE	STRUCT
     37 ds_strm			dd ?
     38 ds_status		dd ?
     39 ds_pending_buf		dd ?
     40 ds_pending_buf_size	dd ?
     41 ds_pending_out		dd ?
     42 ds_pending		dd ?
     43 ds_wrap			dd ?
     44 ; gzhead and gzindex are added in zlib 1.2.2.2 (see deflate.h)
     45 ds_gzhead               dd ?
     46 ds_gzindex              dd ?
     47 ds_data_type		db ?
     48 ds_method		db ?
     49 			db ?	; padding
     50 			db ?	; padding
     51 ds_last_flush		dd ?
     52 ds_w_size		dd ?	; used
     53 ds_w_bits		dd ?
     54 ds_w_mask		dd ?	; used
     55 ds_window		dd ?	; used
     56 ds_window_size		dd ?
     57 ds_prev			dd ?	; used
     58 ds_head			dd ?
     59 ds_ins_h		dd ?
     60 ds_hash_size		dd ?
     61 ds_hash_bits		dd ?
     62 ds_hash_mask		dd ?
     63 ds_hash_shift		dd ?
     64 ds_block_start		dd ?
     65 ds_match_length		dd ?	; used
     66 ds_prev_match		dd ?	; used
     67 ds_match_available	dd ?
     68 ds_strstart		dd ?	; used
     69 ds_match_start		dd ?	; used
     70 ds_lookahead		dd ?	; used
     71 ds_prev_length		dd ?	; used
     72 ds_max_chain_length	dd ?	; used
     73 ds_max_laxy_match	dd ?
     74 ds_level		dd ?
     75 ds_strategy		dd ?
     76 ds_good_match		dd ?	; used
     77 ds_nice_match		dd ?	; used
     78 
     79 ; Don't need anymore of the struct for match
     80 DEFLATE_STATE	ENDS
     81 
     82 ;===========================================================================
     83 ; CODE
     84 ;===========================================================================
     85 _TEXT	SEGMENT
     86 
     87 ;---------------------------------------------------------------------------
     88 ; match_init
     89 ;---------------------------------------------------------------------------
     90 	ALIGN	4
     91 PUBLIC	_match_init
     92 _match_init	PROC
     93 	; no initialization needed
     94 	ret
     95 _match_init	ENDP
     96 
     97 ;---------------------------------------------------------------------------
     98 ; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
     99 ;---------------------------------------------------------------------------
    100 	ALIGN	4
    101 
    102 PUBLIC	_longest_match
    103 _longest_match	PROC
    104 
    105 ; Since this code uses EBP for a scratch register, the stack frame must
    106 ; be manually constructed and referenced relative to the ESP register.
    107 
    108 ; Stack image
    109 ; Variables
    110 chainlenwmask	=  0	; high word: current chain len
    111 			; low word: s->wmask
    112 window		=  4	; local copy of s->window
    113 windowbestlen	=  8	; s->window + bestlen
    114 scanend		= 12	; last two bytes of string
    115 scanstart	= 16	; first two bytes of string
    116 scanalign	= 20	; dword-misalignment of string
    117 nicematch	= 24	; a good enough match size
    118 bestlen		= 28	; size of best match so far
    119 scan		= 32	; ptr to string wanting match
    120 varsize		= 36	; number of bytes (also offset to last saved register)
    121 
    122 ; Saved Registers (actually pushed into place)
    123 ebx_save	= 36
    124 edi_save	= 40
    125 esi_save	= 44
    126 ebp_save	= 48
    127 
    128 ; Parameters
    129 retaddr		= 52
    130 deflatestate	= 56
    131 curmatch	= 60
    132 
    133 ; Save registers that the compiler may be using
    134 	push	ebp
    135 	push	edi
    136 	push	esi
    137 	push	ebx
    138 
    139 ; Allocate local variable space
    140 	sub	esp,varsize
    141 
    142 ; Retrieve the function arguments. ecx will hold cur_match
    143 ; throughout the entire function. edx will hold the pointer to the
    144 ; deflate_state structure during the function's setup (before
    145 ; entering the main loop).
    146 
    147 	mov	edx, [esp+deflatestate]
    148 ASSUME	edx:PTR DEFLATE_STATE
    149 
    150 	mov	ecx, [esp+curmatch]
    151 
    152 ; uInt wmask = s->w_mask;
    153 ; unsigned chain_length = s->max_chain_length;
    154 ; if (s->prev_length >= s->good_match) {
    155 ;     chain_length >>= 2;
    156 ; }
    157 
    158 	mov	eax, [edx].ds_prev_length
    159 	mov	ebx, [edx].ds_good_match
    160 	cmp	eax, ebx
    161 	mov	eax, [edx].ds_w_mask
    162 	mov	ebx, [edx].ds_max_chain_length
    163 	jl	SHORT LastMatchGood
    164 	shr	ebx, 2
    165 LastMatchGood:
    166 
    167 ; chainlen is decremented once beforehand so that the function can
    168 ; use the sign flag instead of the zero flag for the exit test.
    169 ; It is then shifted into the high word, to make room for the wmask
    170 ; value, which it will always accompany.
    171 
    172 	dec	ebx
    173 	shl	ebx, 16
    174 	or	ebx, eax
    175 	mov	[esp+chainlenwmask], ebx
    176 
    177 ; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
    178 
    179 	mov	eax, [edx].ds_nice_match
    180 	mov	ebx, [edx].ds_lookahead
    181 	cmp	ebx, eax
    182 	jl	SHORT LookaheadLess
    183 	mov	ebx, eax
    184 LookaheadLess:
    185 	mov	[esp+nicematch], ebx
    186 
    187 ;/* register Bytef *scan = s->window + s->strstart;                     */
    188 
    189 	mov	esi, [edx].ds_window
    190 	mov	[esp+window], esi
    191 	mov	ebp, [edx].ds_strstart
    192 	lea	edi, [esi+ebp]
    193 	mov	[esp+scan],edi
    194 
    195 ;/* Determine how many bytes the scan ptr is off from being             */
    196 ;/* dword-aligned.                                                      */
    197 
    198 	mov	eax, edi
    199 	neg	eax
    200 	and	eax, 3
    201 	mov	[esp+scanalign], eax
    202 
    203 ;/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ?                      */
    204 ;/*     s->strstart - (IPos)MAX_DIST(s) : NIL;                          */
    205 
    206 	mov	eax, [edx].ds_w_size
    207 	sub	eax, MIN_LOOKAHEAD
    208 	sub	ebp, eax
    209 	jg	SHORT LimitPositive
    210 	xor	ebp, ebp
    211 LimitPositive:
    212 
    213 ;/* int best_len = s->prev_length;                                      */
    214 
    215 	mov	eax, [edx].ds_prev_length
    216 	mov	[esp+bestlen], eax
    217 
    218 ;/* Store the sum of s->window + best_len in %esi locally, and in %esi. */
    219 
    220 	add	esi, eax
    221 	mov	[esp+windowbestlen], esi
    222 
    223 ;/* register ush scan_start = *(ushf*)scan;                             */
    224 ;/* register ush scan_end   = *(ushf*)(scan+best_len-1);                */
    225 ;/* Posf *prev = s->prev;                                               */
    226 
    227 	movzx	ebx, WORD PTR[edi]
    228 	mov	[esp+scanstart], ebx
    229 	movzx	ebx, WORD PTR[eax+edi-1]
    230 	mov	[esp+scanend], ebx
    231 	mov	edi, [edx].ds_prev
    232 
    233 ;/* Jump into the main loop.                                            */
    234 
    235 	mov	edx, [esp+chainlenwmask]
    236 	jmp	SHORT LoopEntry
    237 
    238 ;/* do {
    239 ; *     match = s->window + cur_match;
    240 ; *     if (*(ushf*)(match+best_len-1) != scan_end ||
    241 ; *         *(ushf*)match != scan_start) continue;
    242 ; *     [...]
    243 ; * } while ((cur_match = prev[cur_match & wmask]) > limit
    244 ; *          && --chain_length != 0);
    245 ; *
    246 ; * Here is the inner loop of the function. The function will spend the
    247 ; * majority of its time in this loop, and majority of that time will
    248 ; * be spent in the first ten instructions.
    249 ; *
    250 ; * Within this loop:
    251 ; * %ebx = scanend
    252 ; * %ecx = curmatch
    253 ; * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
    254 ; * %esi = windowbestlen - i.e., (window + bestlen)
    255 ; * %edi = prev
    256 ; * %ebp = limit
    257 ; */
    258 
    259 	ALIGN	4
    260 LookupLoop:
    261 	and	ecx, edx
    262 	movzx	ecx, WORD PTR[edi+ecx*2]
    263 	cmp	ecx, ebp
    264 	jbe	LeaveNow
    265 	sub	edx, 000010000H
    266 	js	LeaveNow
    267 
    268 LoopEntry:
    269 	movzx	eax, WORD PTR[esi+ecx-1]
    270 	cmp	eax, ebx
    271 	jnz	SHORT LookupLoop
    272 
    273 	mov	eax, [esp+window]
    274 	movzx	eax, WORD PTR[eax+ecx]
    275 	cmp	eax, [esp+scanstart]
    276 	jnz	SHORT LookupLoop
    277 
    278 ;/* Store the current value of chainlen.                                */
    279 
    280 	mov	[esp+chainlenwmask], edx
    281 
    282 ;/* Point %edi to the string under scrutiny, and %esi to the string we  */
    283 ;/* are hoping to match it up with. In actuality, %esi and %edi are     */
    284 ;/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is     */
    285 ;/* initialized to -(MAX_MATCH_8 - scanalign).                          */
    286 
    287 	mov	esi, [esp+window]
    288 	mov	edi, [esp+scan]
    289 	add	esi, ecx
    290 	mov	eax, [esp+scanalign]
    291 	mov	edx, -MAX_MATCH_8
    292 	lea	edi, [edi+eax+MAX_MATCH_8]
    293 	lea	esi, [esi+eax+MAX_MATCH_8]
    294 
    295 ;/* Test the strings for equality, 8 bytes at a time. At the end,
    296 ; * adjust %edx so that it is offset to the exact byte that mismatched.
    297 ; *
    298 ; * We already know at this point that the first three bytes of the
    299 ; * strings match each other, and they can be safely passed over before
    300 ; * starting the compare loop. So what this code does is skip over 0-3
    301 ; * bytes, as much as necessary in order to dword-align the %edi
    302 ; * pointer. (%esi will still be misaligned three times out of four.)
    303 ; *
    304 ; * It should be confessed that this loop usually does not represent
    305 ; * much of the total running time. Replacing it with a more
    306 ; * straightforward "rep cmpsb" would not drastically degrade
    307 ; * performance.
    308 ; */
    309 
    310 LoopCmps:
    311 	mov	eax, DWORD PTR[esi+edx]
    312 	xor	eax, DWORD PTR[edi+edx]
    313 	jnz	SHORT LeaveLoopCmps
    314 
    315 	mov	eax, DWORD PTR[esi+edx+4]
    316 	xor	eax, DWORD PTR[edi+edx+4]
    317 	jnz	SHORT LeaveLoopCmps4
    318 
    319 	add	edx, 8
    320 	jnz	SHORT LoopCmps
    321 	jmp	LenMaximum
    322 	ALIGN	4
    323 
    324 LeaveLoopCmps4:
    325 	add	edx, 4
    326 
    327 LeaveLoopCmps:
    328 	test	eax, 00000FFFFH
    329 	jnz	SHORT LenLower
    330 
    331 	add	edx, 2
    332 	shr	eax, 16
    333 
    334 LenLower:
    335 	sub	al, 1
    336 	adc	edx, 0
    337 
    338 ;/* Calculate the length of the match. If it is longer than MAX_MATCH,  */
    339 ;/* then automatically accept it as the best possible match and leave.  */
    340 
    341 	lea	eax, [edi+edx]
    342 	mov	edi, [esp+scan]
    343 	sub	eax, edi
    344 	cmp	eax, MAX_MATCH
    345 	jge	SHORT LenMaximum
    346 
    347 ;/* If the length of the match is not longer than the best match we     */
    348 ;/* have so far, then forget it and return to the lookup loop.          */
    349 
    350 	mov	edx, [esp+deflatestate]
    351 	mov	ebx, [esp+bestlen]
    352 	cmp	eax, ebx
    353 	jg	SHORT LongerMatch
    354 	mov	esi, [esp+windowbestlen]
    355 	mov	edi, [edx].ds_prev
    356 	mov	ebx, [esp+scanend]
    357 	mov	edx, [esp+chainlenwmask]
    358 	jmp	LookupLoop
    359 	ALIGN	4
    360 
    361 ;/*         s->match_start = cur_match;                                 */
    362 ;/*         best_len = len;                                             */
    363 ;/*         if (len >= nice_match) break;                               */
    364 ;/*         scan_end = *(ushf*)(scan+best_len-1);                       */
    365 
    366 LongerMatch:
    367 	mov	ebx, [esp+nicematch]
    368 	mov	[esp+bestlen], eax
    369 	mov	[edx].ds_match_start, ecx
    370 	cmp	eax, ebx
    371 	jge	SHORT LeaveNow
    372 	mov	esi, [esp+window]
    373 	add	esi, eax
    374 	mov	[esp+windowbestlen], esi
    375 	movzx	ebx, WORD PTR[edi+eax-1]
    376 	mov	edi, [edx].ds_prev
    377 	mov	[esp+scanend], ebx
    378 	mov	edx, [esp+chainlenwmask]
    379 	jmp	LookupLoop
    380 	ALIGN	4
    381 
    382 ;/* Accept the current string, with the maximum possible length.        */
    383 
    384 LenMaximum:
    385 	mov	edx, [esp+deflatestate]
    386 	mov	DWORD PTR[esp+bestlen], MAX_MATCH
    387 	mov	[edx].ds_match_start, ecx
    388 
    389 ;/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len;          */
    390 ;/* return s->lookahead;                                                */
    391 
    392 LeaveNow:
    393 	mov	edx, [esp+deflatestate]
    394 	mov	ebx, [esp+bestlen]
    395 	mov	eax, [edx].ds_lookahead
    396 	cmp	ebx, eax
    397 	jg	SHORT LookaheadRet
    398 	mov	eax, ebx
    399 LookaheadRet:
    400 
    401 ; Restore the stack and return from whence we came.
    402 
    403 	add	esp, varsize
    404 	pop	ebx
    405 	pop	esi
    406 	pop	edi
    407 	pop	ebp
    408 	ret
    409 
    410 _longest_match	ENDP
    411 
    412 _TEXT	ENDS
    413 END
    414