; inffasx64.asm is a hand-tuned assembler version of inffast.c - fast decoding
; version for AMD64 on Windows using the Microsoft C compiler
;
; inffasx64.asm is automatically converted from the AMD64 portion of inffas86.c
; inffasx64.asm is called by inffas8664.c, which contains more info.
;
; Syntax: MASM (Intel operand order).  Target: x86-64.  ABI: Microsoft x64.


; To compile this file, use
;   ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
;   with Microsoft Macro Assembler (x64) for AMD64
;
; ml64.exe ships with Visual Studio 2005/2008/2010 and the Windows WDK
;
;   (you can get Windows WDK with ml64 for AMD64 from
;      http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
;


; ---------------------------------------------------------------------
; Byte offsets into the state block ("ar") whose address is passed in
; rcx.  The field names follow the comments of the original code.
; NOTE(review): presumably these mirror the structure declared in
; inffas8664.c - verify against that file before changing any of them.
; ---------------------------------------------------------------------
AR_SAVED_RSP	EQU	0	; slot that receives the caller's rsp
AR_SAVED_RBP	EQU	8	; slot that receives the caller's rbp
AR_IN		EQU	16	; in     (pointer)
AR_LAST		EQU	24	; last   (pointer)
AR_OUT		EQU	32	; out    (pointer)
AR_BEG		EQU	40	; beg    (pointer)
AR_END		EQU	48	; end    (pointer)
AR_WINDOW	EQU	56	; window (pointer)
AR_LCODE	EQU	64	; lcode  (pointer to 4-byte entries)
AR_DCODE	EQU	72	; dcode  (pointer to 4-byte entries)
AR_HOLD		EQU	80	; hold   (64-bit bit accumulator)
AR_BITS		EQU	88	; bits   (dword)
AR_WSIZE	EQU	92	; wsize  (dword)
AR_WRITE	EQU	96	; write  (dword)
AR_LMASK	EQU	100	; lmask  (dword)
AR_DMASK	EQU	104	; dmask  (dword)
AR_STATUS	EQU	116	; status (dword), written on exit

; ---------------------------------------------------------------------
; Stack frame of inffas8664fnc, relative to rsp after the prologue.
; ---------------------------------------------------------------------
LOC_AR		EQU	0	; copy of the ar pointer (rcx at entry)
LOC_BEG		EQU	8	; copy of ar.beg
LOC_WINDOW	EQU	16	; copy of ar.window
LOC_WSIZE	EQU	24	; copy of ar.wsize (dword)
LOC_WRITE	EQU	28	; copy of ar.write (dword)
LOCALS_SIZE	EQU	40	; 32 bytes used + 8 so rsp is 16-byte aligned
SAVED_REGS_SIZE	EQU	64	; eight 8-byte pushes in the prologue


.code

;-----------------------------------------------------------------------
; inffas8664fnc
;
; ABI:   Microsoft x64
; In:    rcx = pointer to the ar state block (see AR_* offsets above)
; Out:   no defined register result.  On return the ar fields in, out,
;        bits, hold and status have been written back.  status is
;          0  loop ended because out reached end or in reached last
;          1  a length code with (op & 64) and (op & 32) set was read
;             (end of block)
;          2  invalid literal/length code
;          3  invalid distance code
;          4  distance larger than wsize (too far back)
; Clobb: rax, rcx, rdx, r8, r9, r10, r11, flags (DF is cleared)
;
; All registers must be preserved across the call, except for
;   rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
;
; The Microsoft x64 ABI has no red zone, so the non-volatile registers
; are pushed on a properly allocated frame and rsp always points at
; real stack.  The ar fields that the loop only reads (beg, window,
; wsize, write) are copied into stack locals once, because every
; general purpose register is in use inside the loop.
;
; Register roles inside the decode loop:
;   rsi  = in   (temporarily "from" during a match copy; in parked in r8)
;   rdi  = out
;   r9   = last
;   r10  = end
;   rbp  = lcode
;   r11  = dcode
;   rdx  = hold
;   ebx  = bits (only bl is updated)
;   r12d = lmask
;   r13d = dmask
;   r14d = len
;   r15d = dist
;   rax, rcx, r8 = temporaries
;-----------------------------------------------------------------------
inffas8664fnc PROC FRAME

	push	rbp
	.pushreg rbp
	push	rbx
	.pushreg rbx
	push	rsi
	.pushreg rsi
	push	rdi
	.pushreg rdi
	push	r12
	.pushreg r12
	push	r13
	.pushreg r13
	push	r14
	.pushreg r14
	push	r15
	.pushreg r15
	sub	rsp, LOCALS_SIZE
	.allocstack LOCALS_SIZE
	.endprolog

	; The ar save slots are still filled with the caller's rsp/rbp so
	; that the memory written through ar is the same as before.
	lea	rax, [rsp+LOCALS_SIZE+SAVED_REGS_SIZE]	; rsp as it was at entry
	mov	[rcx+AR_SAVED_RBP], rbp	; rbp is still the caller's value
	mov	[rcx+AR_SAVED_RSP], rax

	; Loop-invariant ar fields go to stack locals.
	mov	[rsp+LOC_AR], rcx
	mov	rax, [rcx+AR_BEG]
	mov	[rsp+LOC_BEG], rax
	mov	rax, [rcx+AR_WINDOW]
	mov	[rsp+LOC_WINDOW], rax
	mov	eax, [rcx+AR_WSIZE]
	mov	[rsp+LOC_WSIZE], eax
	mov	eax, [rcx+AR_WRITE]
	mov	[rsp+LOC_WRITE], eax

	mov	rsi, [rcx+AR_IN]	; rsi  = in
	mov	rdi, [rcx+AR_OUT]	; rdi  = out
	mov	r9, [rcx+AR_LAST]	; r9   = last
	mov	r10, [rcx+AR_END]	; r10  = end
	mov	rbp, [rcx+AR_LCODE]	; rbp  = lcode
	mov	r11, [rcx+AR_DCODE]	; r11  = dcode
	mov	rdx, [rcx+AR_HOLD]	; rdx  = hold
	mov	ebx, [rcx+AR_BITS]	; ebx  = bits
	mov	r12d, [rcx+AR_LMASK]	; r12d = lmask
	mov	r13d, [rcx+AR_DMASK]	; r13d = dmask
					; r14d = len
					; r15d = dist

	cld				; string instructions move forward
	cmp	r10, rdi
	je	L_one_time		; if only one decode left
	cmp	r9, rsi
	jne	L_do_loop

; Single-symbol entry: refill if needed, then decode exactly one length
; code through L_get_length_code_one_time.
L_one_time:
	mov	r8, r12			; r8 = lmask
	cmp	bl, 32
	ja	L_get_length_code_one_time

	lodsd				; eax = *(uint *)in++
	mov	cl, bl			; cl = bits, needed for shifting
	add	bl, 32			; bits += 32
	shl	rax, cl
	or	rdx, rax		; hold |= *((uint *)in)++ << bits
	jmp	L_get_length_code_one_time

ALIGN 4
L_while_test:
	cmp	r10, rdi
	jbe	L_break_loop		; stop when end <= out
	cmp	r9, rsi
	jbe	L_break_loop		; stop when last <= in

L_do_loop:
	mov	r8, r12			; r8 = lmask
	cmp	bl, 32
	ja	L_get_length_code	; if (32 < bits)

	lodsd				; eax = *(uint *)in++
	mov	cl, bl			; cl = bits, needed for shifting
	add	bl, 32			; bits += 32
	shl	rax, cl
	or	rdx, rax		; hold |= *((uint *)in)++ << bits

; A table entry is loaded as one dword: al = op, ah = bits, and the
; upper 16 bits = val.
L_get_length_code:
	and	r8, rdx			; r8 &= hold
	mov	eax, [rbp+r8*4]		; eax = lcode[hold & lmask]

	mov	cl, ah			; cl = this.bits
	sub	bl, ah			; bits -= this.bits
	shr	rdx, cl			; hold >>= this.bits

	test	al, al
	jnz	L_test_for_length_base	; if (op != 0) 45.7%

	mov	r8, r12			; r8 = lmask
	shr	eax, 16			; output this.val char
	stosb

; Second decode without a refill; also the target of L_one_time.
L_get_length_code_one_time:
	and	r8, rdx			; r8 &= hold
	mov	eax, [rbp+r8*4]		; eax = lcode[hold & lmask]

L_dolen:
	mov	cl, ah			; cl = this.bits
	sub	bl, ah			; bits -= this.bits
	shr	rdx, cl			; hold >>= this.bits

	test	al, al
	jnz	L_test_for_length_base	; if (op != 0) 45.7%

	shr	eax, 16			; output this.val char
	stosb
	jmp	L_while_test

ALIGN 4
L_test_for_length_base:
	mov	r14d, eax		; len = this
	shr	r14d, 16		; len = this.val
	mov	cl, al

	test	al, 16
	jz	L_test_for_second_level_length	; if ((op & 16) == 0) 8%
	and	cl, 15			; op &= 15
	jz	L_decode_distance	; if (!op)

L_add_bits_to_len:
	sub	bl, cl
	xor	eax, eax
	inc	eax
	shl	eax, cl
	dec	eax			; (1 << op) - 1
	and	eax, edx		; eax &= hold
	shr	rdx, cl
	add	r14d, eax		; len += hold & mask[op]

L_decode_distance:
	mov	r8, r13			; r8 = dmask
	cmp	bl, 32
	ja	L_get_distance_code	; if (32 < bits)

	lodsd				; eax = *(uint *)in++
	mov	cl, bl			; cl = bits, needed for shifting
	add	bl, 32			; bits += 32
	shl	rax, cl
	or	rdx, rax		; hold |= *((uint *)in)++ << bits

L_get_distance_code:
	and	r8, rdx			; r8 &= hold
	mov	eax, [r11+r8*4]		; eax = dcode[hold & dmask]

L_dodist:
	mov	r15d, eax		; dist = this
	shr	r15d, 16		; dist = this.val
	mov	cl, ah
	sub	bl, ah			; bits -= this.bits
	shr	rdx, cl			; hold >>= this.bits
	mov	cl, al			; cl = this.op

	test	al, 16			; if ((op & 16) == 0)
	jz	L_test_for_second_level_dist
	and	cl, 15			; op &= 15
	jz	L_check_dist_one

L_add_bits_to_dist:
	sub	bl, cl
	xor	eax, eax
	inc	eax
	shl	eax, cl
	dec	eax			; (1 << op) - 1
	and	eax, edx		; eax &= hold
	shr	rdx, cl
	add	r15d, eax		; dist += hold & ((1 << op) - 1)

L_check_window:
	mov	r8, rsi			; park in so rsi can serve as from
	mov	rax, rdi
	sub	rax, [rsp+LOC_BEG]	; nbytes = out - beg

	cmp	eax, r15d
	jb	L_clip_window		; if (dist > nbytes) 4.2%

	mov	ecx, r14d		; ecx = len
	mov	rsi, rdi
	sub	rsi, r15		; from = out - dist

	sar	ecx, 1			; ecx = len / 2, CF = len & 1
	jnc	L_copy_two		; if len % 2 == 0

	rep     movsw
	mov	al, [rsi]		; odd length: copy the last byte
	mov	[rdi], al
	inc	rdi

	mov	rsi, r8			; move in back to rsi, toss from
	jmp	L_while_test

L_copy_two:
	rep     movsw
	mov	rsi, r8			; move in back to rsi, toss from
	jmp	L_while_test

ALIGN 4
L_check_dist_one:
	cmp	r15d, 1			; if dist 1, is a memset
	jne	L_check_window
	cmp	[rsp+LOC_BEG], rdi	; if out == beg, outside window
	je	L_check_window

	mov	ecx, r14d		; ecx = len
	mov	al, [rdi-1]
	mov	ah, al			; ax = previous byte, twice

	sar	ecx, 1			; ecx = len / 2, CF = len & 1
	jnc	L_set_two
	mov	[rdi], al		; odd length: store one byte first
	inc	rdi

L_set_two:
	rep     stosw
	jmp	L_while_test

ALIGN 4
L_test_for_second_level_length:
	test	al, 64
	jnz	L_test_for_end_of_block	; if ((op & 64) != 0)

	xor	eax, eax
	inc	eax
	shl	eax, cl
	dec	eax			; (1 << op) - 1
	and	eax, edx		; eax &= hold
	add	eax, r14d		; eax += len
	mov	eax, [rbp+rax*4]	; eax = lcode[val+(hold&mask[op])]
	jmp	L_dolen

ALIGN 4
L_test_for_second_level_dist:
	test	al, 64
	jnz	L_invalid_distance_code	; if ((op & 64) != 0)

	xor	eax, eax
	inc	eax
	shl	eax, cl
	dec	eax			; (1 << op) - 1
	and	eax, edx		; eax &= hold
	add	eax, r15d		; eax += dist
	mov	eax, [r11+rax*4]	; eax = dcode[val+(hold&mask[op])]
	jmp	L_dodist

; The match starts before beg, so part of it comes from the window.
ALIGN 4
L_clip_window:
	mov	ecx, eax		; ecx = nbytes
	mov	eax, [rsp+LOC_WSIZE]	; eax = wsize, prepare for dist cmp
	neg	ecx			; nbytes = -nbytes

	cmp	eax, r15d
	jb	L_invalid_distance_too_far	; if (dist > wsize)

	add	ecx, r15d		; nbytes = dist - nbytes
	cmp	dword ptr [rsp+LOC_WRITE], 0
	jne	L_wrap_around_window	; if (write != 0)

	mov	rsi, [rsp+LOC_WINDOW]	; from  = window
	sub	eax, ecx		; eax  -= nbytes
	add	rsi, rax		; from += wsize - nbytes

	mov	eax, r14d		; eax = len
	cmp	r14d, ecx
	jbe	L_do_copy		; if (nbytes >= len)

	sub	eax, ecx		; eax -= nbytes
	rep     movsb
	mov	rsi, rdi
	sub	rsi, r15		; from = &out[ -dist ]
	jmp	L_do_copy

ALIGN 4
L_wrap_around_window:
	mov	eax, [rsp+LOC_WRITE]	; eax = write
	cmp	ecx, eax
	jbe	L_contiguous_in_window	; if (write >= nbytes)

	mov	esi, [rsp+LOC_WSIZE]	; from  = wsize
	add	rsi, [rsp+LOC_WINDOW]	; from += window
	add	rsi, rax		; from += write
	sub	rsi, rcx		; from -= nbytes
	sub	ecx, eax		; nbytes -= write

	mov	eax, r14d		; eax = len
	cmp	eax, ecx
	jbe	L_do_copy		; if (nbytes >= len)

	sub	eax, ecx		; len -= nbytes
	rep     movsb
	mov	rsi, [rsp+LOC_WINDOW]	; from = window
	mov	ecx, [rsp+LOC_WRITE]	; nbytes = write
	cmp	eax, ecx
	jbe	L_do_copy		; if (nbytes >= len)

	sub	eax, ecx		; len -= nbytes
	rep     movsb
	mov	rsi, rdi
	sub	rsi, r15		; from = out - dist
	jmp	L_do_copy

ALIGN 4
L_contiguous_in_window:
	mov	rsi, [rsp+LOC_WINDOW]	; rsi = window
	add	rsi, rax
	sub	rsi, rcx		; from += write - nbytes

	mov	eax, r14d		; eax = len
	cmp	eax, ecx
	jbe	L_do_copy		; if (nbytes >= len)

	sub	eax, ecx		; len -= nbytes
	rep     movsb
	mov	rsi, rdi
	sub	rsi, r15		; from = out - dist
	jmp	L_do_copy		; skip the alignment padding

ALIGN 4
L_do_copy:
	mov	ecx, eax		; ecx = len
	rep     movsb

	mov	rsi, r8			; move in back to rsi, toss from
	jmp	L_while_test

; Exit paths: eax carries the status value into the common tail.
L_test_for_end_of_block:
	test	al, 32
	jz	L_invalid_literal_length_code
	mov	eax, 1			; status = 1
	jmp	L_break_loop_with_status

L_invalid_literal_length_code:
	mov	eax, 2			; status = 2
	jmp	L_break_loop_with_status

L_invalid_distance_code:
	mov	eax, 3			; status = 3
	jmp	L_break_loop_with_status

L_invalid_distance_too_far:
	mov	eax, 4			; status = 4
	jmp	L_break_loop_with_status

L_break_loop:
	xor	eax, eax		; status = 0

L_break_loop_with_status:
; put status, in, out, bits, and hold back into ar
	mov	rcx, [rsp+LOC_AR]
	mov	[rcx+AR_STATUS], eax	; status
	mov	[rcx+AR_IN], rsi	; in
	mov	[rcx+AR_OUT], rdi	; out
	mov	[rcx+AR_BITS], ebx	; bits
	mov	[rcx+AR_HOLD], rdx	; hold

	add	rsp, LOCALS_SIZE
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	rdi
	pop	rsi
	pop	rbx
	pop	rbp

	ret	0

; The lines below were carried over as comments from the source this
; file was converted from (apparently an operand/clobber list):
;          :
;          : "m" (ar)
;          : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi",
;            "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
;    );

inffas8664fnc 	ENDP
;_TEXT	ENDS
END
    397