Home | History | Annotate | Download | only in ia32
      1 ;  vim:filetype=nasm ts=8
      2 
      3 ;  libFLAC - Free Lossless Audio Codec library
      4 ;  Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
      5 ;
      6 ;  Redistribution and use in source and binary forms, with or without
      7 ;  modification, are permitted provided that the following conditions
      8 ;  are met:
      9 ;
     10 ;  - Redistributions of source code must retain the above copyright
     11 ;  notice, this list of conditions and the following disclaimer.
     12 ;
     13 ;  - Redistributions in binary form must reproduce the above copyright
     14 ;  notice, this list of conditions and the following disclaimer in the
     15 ;  documentation and/or other materials provided with the distribution.
     16 ;
     17 ;  - Neither the name of the Xiph.org Foundation nor the names of its
     18 ;  contributors may be used to endorse or promote products derived from
     19 ;  this software without specific prior written permission.
     20 ;
     21 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     22 ;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     23 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     24 ;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
     25 ;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     26 ;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     27 ;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     28 ;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     29 ;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     30 ;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     31 ;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     32 
     33 %include "nasm.h"
     34 
     35 	data_section
     36 
     37 cextern FLAC__crc16_table		; unsigned FLAC__crc16_table[256];
     38 cextern bitreader_read_from_client_	; FLAC__bool bitreader_read_from_client_(FLAC__BitReader *br);
     39 
     40 cglobal FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap
     41 
     42 	code_section
     43 
     44 
     45 ; **********************************************************************
     46 ;
     47 ; FLAC__bool FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter)
     48 ;
        ; Assembly version of FLAC__bitreader_read_rice_signed_block(): decodes 'nvals'
        ; values into vals[].  Each value is a unary part (a run of 0 bits ended by a
        ; 1 stop bit) followed by a 'parameter'-bit binary part; the combined unsigned
        ; 'uval' is then folded to a signed int as (uval >> 1) ^ -(uval & 1).
        ;
        ; Returns (in eax) 1 on success, or 0 as soon as bitreader_read_from_client_()
        ; returns 0.  Arguments are read from the stack; ebp, ebx, esi and edi are
        ; saved on entry and restored before returning.
        ;
     49 ; Some details like assertions and other checking are performed by the caller.
     50 	ALIGN 16
     51 cident FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap
     52 
     53 	;ASSERT(0 != br);
     54 	;ASSERT(0 != br->buffer);
     55 	; WATCHOUT: code only works if a brword is 32 bits wide; we can make things much faster with this assertion
     56 	;ASSERT(FLAC__BITS_PER_WORD == 32);
     57 	;ASSERT(parameter < 32);
     58 	; the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it
     59 
     60 	;; peppered throughout the code at major checkpoints are keys like this as to where things are at that point in time
     61 	;; [esp + 16]	unsigned parameter
     62 	;; [esp + 12]	unsigned nvals
     63 	;; [esp + 8]	int vals[]
     64 	;; [esp + 4]	FLAC__BitReader *br
     65 	mov	eax, [esp + 12]		; if(nvals == 0)
     66 	test	eax, eax
     67 	ja	.nvals_gt_0		;   (test clears CF, so ja is taken exactly when nvals != 0)
     68 	mov	eax, 1			;   return true;
     69 	ret
     70 
     71 .nvals_gt_0:
     72 	push	ebp
     73 	push	ebx
     74 	push	esi
     75 	push	edi
     76 	sub	esp, 4			; reserve one dword local at [esp] for ucbits
        	;; argument offsets grew by 20: four saved registers plus the ucbits slot
     77 	;; [esp + 36]	unsigned parameter
     78 	;; [esp + 32]	unsigned nvals
     79 	;; [esp + 28]	int vals[]
     80 	;; [esp + 24]	FLAC__BitReader *br
     81 	;; [esp]	ucbits
     82 	mov	ebp, [esp + 24]		; ebp <- br; br->buffer is the field at offset 0, so [ebp] reads br->buffer
     83 	mov	esi, [ebp + 16]		; esi <- br->consumed_words (aka 'cwords' in the C version)
     84 	mov	ecx, [ebp + 20]		; ecx <- br->consumed_bits  (aka 'cbits'  in the C version)
     85 	xor	edi, edi		; edi <- 0  'uval'
     86 	;; ecx		cbits
     87 	;; esi		cwords
     88 	;; edi		uval
     89 	;; ebp		br
     90 	;; [ebp]	br->buffer
     91 	;; [ebp + 8]	br->words
     92 	;; [ebp + 12]	br->bytes
     93 	;; [ebp + 16]	br->consumed_words
     94 	;; [ebp + 20]	br->consumed_bits
     95 	;; [ebp + 24]	br->read_crc
     96 	;; [ebp + 28]	br->crc16_align
     97 
        					; ucbits counts the buffered bits that have not been consumed yet:
     98 					; ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
     99 	mov	eax, [ebp + 8]		;   eax <- br->words
    100 	sub	eax, esi		;   eax <- br->words-cwords
    101 	shl	eax, 2			;   eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD
    102 	add	eax, [ebp + 12]		;   eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
    103 	shl	eax, 3			;   eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
    104 	sub	eax, ecx		;   eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
    105 	mov	[esp], eax		;   ucbits <- eax
    106 
    107 	ALIGN 16
    108 .val_loop:				; while(1) {
    109 
    110 	;
    111 	; read unary part
    112 	;
        	; Scan for the 1 stop bit starting at bit 'cbits' of br->buffer[cwords],
        	; adding the number of 0 bits skipped to uval.  When the stop bit is the
        	; last bit of the word, the word is folded into br->read_crc and cwords
        	; advances before continuing at .break1.
    113 .unary_loop:				;   while(1) {
    114 	;; ecx		cbits
    115 	;; esi		cwords
    116 	;; edi		uval
    117 	;; ebp		br
    118 	cmp	esi, [ebp + 8]		;     while(cwords < br->words)   /* if we've not consumed up to a partial tail word... */
    119 	jae	near .c1_next1
    120 .c1_loop:				;     {
    121 	mov	ebx, [ebp]		;       ebx <- br->buffer
    122 	mov	eax, [ebx + 4*esi]	;       b = br->buffer[cwords]
    123 	mov	edx, eax		;       edx = br->buffer[cwords] (saved for later use)
    124 	shl	eax, cl 		;       b = br->buffer[cwords] << cbits
    125 	test	eax, eax		;         (still have to test since cbits may be 0, thus ZF not updated for shl eax,0)
    126 	jz	near .c1_next2		;       if(b) {
    127 	bsr	ebx, eax		;         ebx <- index of the highest set bit in b
    128 	not	ebx			;         (~index) & 31 == 31 - index
    129 	and	ebx, 31			;         ebx = 'i' = # of leading 0 bits in 'b' (eax)
    130 	add	ecx, ebx		;         cbits += i;
    131 	add	edi, ebx		;         uval += i;
    132 	add	ecx, byte 1		;         cbits++; /* skip over stop bit */
    133 	test	ecx, ~31		;         [ZF set <=> cbits < 32, i.e. the word is not used up yet]
    134 	jz	near .break1 		;         if(cbits >= FLAC__BITS_PER_WORD) { /* faster way of testing if(cbits == FLAC__BITS_PER_WORD) */
    135 					;           crc16_update_word_(br, br->buffer[cwords]);
        					;		[uval is parked on the stack so edi can hold the table address;
        					;		 ecx (cbits) is reused as scratch and reset to 0 afterwards]
    136 	push	edi			;		[need more registers]
    137 	bswap	edx			;		edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
    138 	mov	ecx, [ebp + 28]		;		ecx <- br->crc16_align
    139 	mov	eax, [ebp + 24]		;		ax <- br->read_crc (a.k.a. crc)
    140 %ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
    141 	mov	edi, _FLAC__crc16_table
    142 %else
    143 	mov	edi, FLAC__crc16_table
    144 %endif
    145 	;; eax (ax)	crc a.k.a. br->read_crc
    146 	;; ebx (bl)	intermediate result index into FLAC__crc16_table[]
    147 	;; ecx		br->crc16_align
    148 	;; edx		byteswapped brword to CRC
    149 	;; esi		cwords
    150 	;; edi		unsigned FLAC__crc16_table[]
    151 	;; ebp		br
        	;; one table-driven step per byte; bits of eax above bit 15 collect
        	;; shifted-out leftovers and are dropped by the movzx at the end
    152 	test	ecx, ecx		;		switch(br->crc16_align) ...
    153 	jnz	.c0b4			;		[br->crc16_align is 0 the vast majority of the time so we optimize the common case]
    154 .c0b0:	xor	dl, ah			;		dl <- (crc>>8)^(word>>24)
    155 	movzx	ebx, dl
    156 	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
    157 	shl	eax, 8			;		ax <- (crc<<8)
    158 	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
    159 .c0b1:	xor	dh, ah			;		dh <- (crc>>8)^((word>>16)&0xff))
    160 	movzx	ebx, dh
    161 	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
    162 	shl	eax, 8			;		ax <- (crc<<8)
    163 	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
    164 	shr	edx, 16			;		bring the remaining two bytes down into dl/dh
    165 .c0b2:	xor	dl, ah			;		dl <- (crc>>8)^((word>>8)&0xff))
    166 	movzx	ebx, dl
    167 	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
    168 	shl	eax, 8			;		ax <- (crc<<8)
    169 	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
    170 .c0b3:	xor	dh, ah			;		dh <- (crc>>8)^(word&0xff)
    171 	movzx	ebx, dh
    172 	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
    173 	shl	eax, 8			;		ax <- (crc<<8)
    174 	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
    175 	movzx	eax, ax			;		keep only the low 16 bits of the crc
    176 	mov	[ebp + 24], eax		;		br->read_crc <- crc
    177 	pop	edi			;		edi <- uval again
    178 
    179 	add	esi, byte 1		;           cwords++;
    180 	xor	ecx, ecx		;           cbits = 0;
    181 					;         }
    182 	jmp	near .break1		;         goto break1;
    183 	;; this section relocated out of the way for performance
        	;; crc16_align != 0: clear it, then enter the byte chain late so the
        	;; leading bytes of this word are left out of the CRC: 8 -> .c0b1 (skip
        	;; one byte), 16 -> .c0b2 (skip two), any other value -> .c0b3 (skip
        	;; three).  .c0b2/.c0b3 expect edx already shifted right by 16.
    184 .c0b4:
    185 	mov	[ebp + 28], dword 0	;		br->crc16_align <- 0
    186 	cmp	ecx, 8
    187 	je	.c0b1
    188 	shr	edx, 16
    189 	cmp	ecx, 16
    190 	je	.c0b2
    191 	jmp	.c0b3
    192 
    193 	;; this section relocated out of the way for performance
        	;; crc16_align != 0 variant for the .c1b* chain below: clear it, then
        	;; enter late: 8 -> .c1b1, 16 -> .c1b2, any other value -> .c1b3.
        	;; .c1b2/.c1b3 expect edx already shifted right by 16.
    194 .c1b4:
    195 	mov	[ebp + 28], dword 0	;		br->crc16_align <- 0
    196 	cmp	ecx, 8
    197 	je	.c1b1
    198 	shr	edx, 16
    199 	cmp	ecx, 16
    200 	je	.c1b2
    201 	jmp	.c1b3
    202 
        	;; reached from .c1_loop when every bit from cbits to the end of the
        	;; current word is 0: count those bits, CRC the word, move to the next
    203 .c1_next2:				;       } else {
    204 	;; ecx		cbits
    205 	;; edx		current brword 'b'
    206 	;; esi		cwords
    207 	;; edi		uval
    208 	;; ebp		br
    209 	add	edi, 32
    210 	sub	edi, ecx		;         uval += FLAC__BITS_PER_WORD - cbits;
    211 					;         crc16_update_word_(br, br->buffer[cwords]);
        					;		[ecx (cbits) is reused as scratch below and reset to 0 afterwards]
    212 	push	edi			;		[need more registers]
    213 	bswap	edx			;		edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
    214 	mov	ecx, [ebp + 28]		;		ecx <- br->crc16_align
    215 	mov	eax, [ebp + 24]		;		ax <- br->read_crc (a.k.a. crc)
    216 %ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
    217 	mov	edi, _FLAC__crc16_table
    218 %else
    219 	mov	edi, FLAC__crc16_table
    220 %endif
    221 	;; eax (ax)	crc a.k.a. br->read_crc
    222 	;; ebx (bl)	intermediate result index into FLAC__crc16_table[]
    223 	;; ecx		br->crc16_align
    224 	;; edx		byteswapped brword to CRC
    225 	;; esi		cwords
    226 	;; edi		unsigned FLAC__crc16_table[]
    227 	;; ebp		br
    228 	test	ecx, ecx		;		switch(br->crc16_align) ...
    229 	jnz	.c1b4			;		[br->crc16_align is 0 the vast majority of the time so we optimize the common case]
    230 .c1b0:	xor	dl, ah			;		dl <- (crc>>8)^(word>>24)
    231 	movzx	ebx, dl
    232 	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
    233 	shl	eax, 8			;		ax <- (crc<<8)
    234 	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
    235 .c1b1:	xor	dh, ah			;		dh <- (crc>>8)^((word>>16)&0xff))
    236 	movzx	ebx, dh
    237 	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
    238 	shl	eax, 8			;		ax <- (crc<<8)
    239 	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
    240 	shr	edx, 16			;		bring the remaining two bytes down into dl/dh
    241 .c1b2:	xor	dl, ah			;		dl <- (crc>>8)^((word>>8)&0xff))
    242 	movzx	ebx, dl
    243 	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
    244 	shl	eax, 8			;		ax <- (crc<<8)
    245 	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
    246 .c1b3:	xor	dh, ah			;		dh <- (crc>>8)^(word&0xff)
    247 	movzx	ebx, dh
    248 	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
    249 	shl	eax, 8			;		ax <- (crc<<8)
    250 	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
    251 	movzx	eax, ax			;		keep only the low 16 bits of the crc
    252 	mov	[ebp + 24], eax		;		br->read_crc <- crc
    253 	pop	edi			;		edi <- uval again
    254 
    255 	add	esi, byte 1		;         cwords++;
    256 	xor	ecx, ecx		;         cbits = 0;
    257 					;         /* didn't find stop bit yet, have to keep going... */
    258 					;       }
    259 
    260 	cmp	esi, [ebp + 8]		;     } while(cwords < br->words)   /* if we've not consumed up to a partial tail word... */
    261 	jb	near .c1_loop
    262 
    263 .c1_next1:
    264 	; at this point we've eaten up all the whole words; have to try
    265 	; reading through any tail bytes before calling the read callback.
    266 	; this is a repeat of the above logic adjusted for the fact we
    267 	; don't have a whole word.  note though if the client is feeding
    268 	; us data a byte at a time (unlikely), br->consumed_bits may not
    269 	; be zero.
        	;
        	; Only the first 'end' = br->bytes*8 bits of the tail word are valid,
        	; so the word is masked down to those bits before it is shifted by
        	; cbits and scanned.  If no stop bit is found, bits [cbits, end) have
        	; all been consumed as zeros and the new bit position is exactly
        	; 'end' -- it is assigned, not added, because cbits may already be
        	; non-zero here (see the note above).
    270 	;; ecx		cbits
    271 	;; esi		cwords
    272 	;; edi		uval
    273 	;; ebp		br
    274 	mov	edx, [ebp + 12]		;     edx <- br->bytes
    275 	test	edx, edx
    276 	jz	.read1			;     if(br->bytes) {  [NOTE: this case is rare so it doesn't have to be all that fast ]
    277 	mov	ebx, [ebp]		;       ebx <- br->buffer
    278 	shl	edx, 3			;       edx <- const unsigned end = br->bytes * 8;
    279 	mov	eax, [ebx + 4*esi]	;       b = br->buffer[cwords]
    280 	xchg	edx, ecx		;       [edx <- cbits , ecx <- end]  (variable shift counts must be in cl)
    281 	mov	ebx, 0xffffffff		;       ebx <- FLAC__WORD_ALL_ONES
    282 	shr	ebx, cl			;       ebx <- FLAC__WORD_ALL_ONES >> end
    283 	not	ebx			;       ebx <- ~(FLAC__WORD_ALL_ONES >> end)
    284 	xchg	edx, ecx		;       [edx <- end , ecx <- cbits]
    285 	and	eax, ebx		;       b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end));
    286 	shl	eax, cl 		;       b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end)) << cbits;
    287 	test	eax, eax		;         (still have to test since cbits may be 0, thus ZF not updated for shl eax,0)
    288 	jz	.c1_next3		;       if(b) {
    289 	bsr	ebx, eax		;         ebx <- index of the highest set bit in b
    290 	not	ebx			;         (~index) & 31 == 31 - index
    291 	and	ebx, 31			;         ebx = 'i' = # of leading 0 bits in 'b' (eax)
    292 	add	ecx, ebx		;         cbits += i;
    293 	add	edi, ebx		;         uval += i;
    294 	add	ecx, byte 1		;         cbits++; /* skip over stop bit */
    295 	jmp	short .break1 		;         goto break1;
    296 .c1_next3:				;       } else {
    297 	sub	edi, ecx
    298 	add	edi, edx		;         uval += end - cbits;
    299 	mov	ecx, edx		;         cbits = end;  /* was 'cbits += end', which overshoots whenever cbits != 0 */
    300 					;         /* didn't find stop bit yet, have to keep going... */
    301 					;       }
    302 					;     }
    303 .read1:
    304 	; flush registers and read; bitreader_read_from_client_() does
    305 	; not touch br->consumed_bits at all but we still need to set
    306 	; it in case it fails and we have to return false.
        	;
        	; cwords is reloaded from br after the call, while cbits is carried
        	; across it on the stack.  ucbits is recomputed before the return
        	; value in eax is tested; none of the instructions in between write
        	; eax, and on failure the ucbits slot is simply dropped at .end.
    307 	;; ecx		cbits
    308 	;; esi		cwords
    309 	;; edi		uval
    310 	;; ebp		br
    311 	mov	[ebp + 16], esi		;     br->consumed_words = cwords;
    312 	mov	[ebp + 20], ecx		;     br->consumed_bits = cbits;
    313 	push	ecx			;     /* save */
    314 	push	ebp			;     /* push br argument */
    315 %ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
    316 	call	_bitreader_read_from_client_
    317 %else
    318 	call	bitreader_read_from_client_
    319 %endif
    320 	pop	edx			;     /* discard, unused */  (removes the br argument pushed above)
    321 	pop	ecx			;     /* restore */          (ecx <- cbits)
    322 	mov	esi, [ebp + 16]		;     cwords = br->consumed_words;
    323 					;     ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
    324 	mov	ebx, [ebp + 8]		;       ebx <- br->words
    325 	sub	ebx, esi		;       ebx <- br->words-cwords
    326 	shl	ebx, 2			;       ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD
    327 	add	ebx, [ebp + 12]		;       ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
    328 	shl	ebx, 3			;       ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
    329 	sub	ebx, ecx		;       ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
    330 	add	ebx, edi		;       ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits + uval
    331 					;           + uval to offset our count by the # of unary bits already
    332 					;           consumed before the read, because we will add these back
    333 					;           in all at once at break1
    334 	mov	[esp], ebx		;       ucbits <- ebx
    335 	test	eax, eax		;     if(!bitreader_read_from_client_(br))
    336 	jnz	near .unary_loop	;     [nonzero return: resume the unary scan with the refilled buffer]
    337 	jmp	.end			;       return false; /* eax (the return value) is already 0 */
    338 					;   } /* end while(1) unary part */
    339 
    340 	ALIGN 16
        	;; the unary part is complete: uval holds the number of 0 bits that
        	;; preceded the stop bit and cbits/cwords point just past the stop bit
    341 .break1:
    342 	;; ecx		cbits
    343 	;; esi		cwords
    344 	;; edi		uval
    345 	;; ebp		br
    346 	;; [esp]	ucbits
    347 	sub	[esp], edi		;   ucbits -= uval;
    348 	sub	dword [esp], byte 1	;   ucbits--; /* account for stop bit */
    349 
    350 	;
    351 	; read binary part
    352 	;
    353 	mov	ebx, [esp + 36]		;   ebx <- parameter
    354 	test	ebx, ebx		;   if(parameter) {
    355 	jz	near .break2		;   [parameter == 0: there is no binary part, uval is already complete]
    356 .read2:
    357 	cmp	[esp], ebx		;     while(ucbits < parameter) {
    358 	jae	.c2_next1		;     [unsigned compare: enough bits are buffered, go extract them]
    359 	; flush registers and read; bitreader_read_from_client_() does
    360 	; not touch br->consumed_bits at all but we still need to set
    361 	; it in case it fails and we have to return false.
        	; ebx (parameter) and edi (uval) are still used after the call
        	; without being reloaded, so the callee is relied on to preserve them.
    362 	mov	[ebp + 16], esi		;       br->consumed_words = cwords;
    363 	mov	[ebp + 20], ecx		;       br->consumed_bits = cbits;
    364 	push	ecx			;       /* save */
    365 	push	ebp			;       /* push br argument */
    366 %ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
    367 	call	_bitreader_read_from_client_
    368 %else
    369 	call	bitreader_read_from_client_
    370 %endif
    371 	pop	edx			;       /* discard, unused */  (removes the br argument pushed above)
    372 	pop	ecx			;       /* restore */          (ecx <- cbits)
    373 	mov	esi, [ebp + 16]		;       cwords = br->consumed_words;
    374 					;       ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
    375 	mov	edx, [ebp + 8]		;         edx <- br->words
    376 	sub	edx, esi		;         edx <- br->words-cwords
    377 	shl	edx, 2			;         edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD
    378 	add	edx, [ebp + 12]		;         edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
    379 	shl	edx, 3			;         edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
    380 	sub	edx, ecx		;         edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
    381 	mov	[esp], edx		;         ucbits <- edx
    382 	test	eax, eax		;       if(!bitreader_read_from_client_(br))
    383 	jnz	.read2			;       [nonzero return: re-check whether enough bits are buffered now]
    384 	jmp	.end			;         return false; /* eax (the return value) is already 0 */
    385 					;     }
    386 .c2_next1:
        	;; extract the 'parameter'-bit binary part when the current word is a
        	;; whole word (cwords < br->words).  With n = 32 - cbits bits left in
        	;; the word: parameter < n takes everything from this word; otherwise
        	;; (.c2_next4) the word is used up, CRC'd, and any remaining
        	;; parameter - n bits come from the start of the next word.
    387 	;; ebx		parameter
    388 	;; ecx		cbits
    389 	;; esi		cwords
    390 	;; edi		uval
    391 	;; ebp		br
    392 	;; [esp]	ucbits
    393 	cmp	esi, [ebp + 8]		;     if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
    394 	jae	near .c2_next2
    395 	test	ecx, ecx		;       if(cbits) {
    396 	jz	near .c2_next3		;         /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
    397 	mov	eax, 32
    398 	mov	edx, [ebp]		;         edx <- br->buffer
    399 	sub	eax, ecx		;         const unsigned n = FLAC__BITS_PER_WORD - cbits;
    400 	mov	edx, [edx + 4*esi]	;         const brword word = br->buffer[cwords];
    401 	cmp	ebx, eax		;         if(parameter < n) {
    402 	jae	.c2_next4
    403 					;           uval <<= parameter;
    404 					;           uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-parameter);
    405 	shl	edx, cl			;           edx <- word << cbits (unread bits now start at bit 31)
    406 	xchg	ebx, ecx		;           ebx <- cbits, ecx <- parameter (shift count must be in cl)
    407 	shld	edi, edx, cl		;           uval <- (uval << parameter) | top 'parameter' bits of edx
    408 	add	ebx, ecx		;           cbits += parameter;
    409 	xchg	ebx, ecx		;           ebx <- parameter, ecx <- cbits
    410 	jmp	.break2			;           goto break2;
    411 					;         }
    412 .c2_next4:
    413 					;         uval <<= n;
    414 					;         uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
    415 %if 1
    416 	rol	edx, cl			;            @@@@@@OPT: may be faster to use rol to save edx so we can restore it for CRC'ing
    417 					;            @@@@@@OPT: or put parameter in ch instead and free up ebx completely again
    418 %else
    419 	shl	edx, cl
    420 %endif
    421 	xchg	eax, ecx		;         eax <- cbits, ecx <- n
    422 	shld	edi, edx, cl		;         uval <- (uval << n) | top n bits of the rotated word
    423 	xchg	eax, ecx		;         eax <- n, ecx <- cbits
    424 %if 1
    425 	ror	edx, cl			;            restored.
    426 %else
    427 	mov	edx, [ebp]
    428 	mov	edx, [edx + 4*esi]
    429 %endif
    430 					;         crc16_update_word_(br, br->buffer[cwords]);
        					;		[uval, parameter and n are parked on the stack; ecx is scratch
        					;		 here and is recomputed as parameter - n afterwards]
    431 	push	edi			;		[need more registers]
    432 	push	ebx			;		[need more registers]
    433 	push	eax			;		[need more registers]
    434 	bswap	edx			;		edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
    435 	mov	ecx, [ebp + 28]		;		ecx <- br->crc16_align
    436 	mov	eax, [ebp + 24]		;		ax <- br->read_crc (a.k.a. crc)
    437 %ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
    438 	mov	edi, _FLAC__crc16_table
    439 %else
    440 	mov	edi, FLAC__crc16_table
    441 %endif
    442 	;; eax (ax)	crc a.k.a. br->read_crc
    443 	;; ebx (bl)	intermediate result index into FLAC__crc16_table[]
    444 	;; ecx		br->crc16_align
    445 	;; edx		byteswapped brword to CRC
    446 	;; esi		cwords
    447 	;; edi		unsigned FLAC__crc16_table[]
    448 	;; ebp		br
    449 	test	ecx, ecx		;		switch(br->crc16_align) ...
    450 	jnz	.c2b4			;		[br->crc16_align is 0 the vast majority of the time so we optimize the common case]
    451 .c2b0:	xor	dl, ah			;		dl <- (crc>>8)^(word>>24)
    452 	movzx	ebx, dl
    453 	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
    454 	shl	eax, 8			;		ax <- (crc<<8)
    455 	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
    456 .c2b1:	xor	dh, ah			;		dh <- (crc>>8)^((word>>16)&0xff))
    457 	movzx	ebx, dh
    458 	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
    459 	shl	eax, 8			;		ax <- (crc<<8)
    460 	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
    461 	shr	edx, 16			;		bring the remaining two bytes down into dl/dh
    462 .c2b2:	xor	dl, ah			;		dl <- (crc>>8)^((word>>8)&0xff))
    463 	movzx	ebx, dl
    464 	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
    465 	shl	eax, 8			;		ax <- (crc<<8)
    466 	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
    467 .c2b3:	xor	dh, ah			;		dh <- (crc>>8)^(word&0xff)
    468 	movzx	ebx, dh
    469 	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
    470 	shl	eax, 8			;		ax <- (crc<<8)
    471 	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
    472 	movzx	eax, ax			;		keep only the low 16 bits of the crc
    473 	mov	[ebp + 24], eax		;		br->read_crc <- crc
    474 	pop	eax			;		eax <- n
    475 	pop	ebx			;		ebx <- parameter
    476 	pop	edi			;		edi <- uval
    477 	add	esi, byte 1		;         cwords++;
    478 	mov	ecx, ebx
    479 	sub	ecx, eax		;         cbits = parameter - n;
    480 	jz	.break2			;         if(cbits) { /* parameter > n, i.e. if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
    481 					;           uval <<= cbits;
    482 					;           uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
    483 	mov	eax, [ebp]
    484 	mov	eax, [eax + 4*esi]
    485 	shld	edi, eax, cl
    486 					;         }
    487 	jmp	.break2			;         goto break2;
    488 
    489 	;; this section relocated out of the way for performance
        	;; crc16_align != 0: clear it, then enter the byte chain late:
        	;; 8 -> .c2b1, 16 -> .c2b2, any other value -> .c2b3.
        	;; .c2b2/.c2b3 expect edx already shifted right by 16.
    490 .c2b4:
    491 	mov	[ebp + 28], dword 0	;		br->crc16_align <- 0
    492 	cmp	ecx, 8
    493 	je	.c2b1
    494 	shr	edx, 16
    495 	cmp	ecx, 16
    496 	je	.c2b2
    497 	jmp	.c2b3
    498 
        	;; whole word with cbits == 0: the binary part is simply the top
        	;; 'parameter' bits of br->buffer[cwords]
    499 .c2_next3:				;       } else {
    500 	mov	ecx, ebx		;         cbits = parameter;
    501 					;         uval <<= cbits;
    502 					;         uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
    503 	mov	eax, [ebp]		;         eax <- br->buffer
    504 	mov	eax, [eax + 4*esi]	;         eax <- br->buffer[cwords]
    505 	shld	edi, eax, cl		;         uval <- (uval << parameter) | top 'parameter' bits of the word
    506 	jmp	.break2			;         goto break2;
    507 					;       }
    508 .c2_next2:				;     } else {
    509 	; in this case we're starting our read at a partial tail word;
    510 	; the reader has guaranteed that we have at least 'parameter'
    511 	; bits available to read, which makes this case simpler.
    512 					;       uval <<= parameter;
    513 					;       if(cbits) {
    514 					;         /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
    515 					;         uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-parameter);
    516 					;         cbits += parameter;
    517 					;         goto break2;
    518 					;       } else {
    519 					;         cbits = parameter;
    520 					;         uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
    521 					;         goto break2;
    522 					;       }
    523 					;       the above is much shorter in assembly:
    524 	mov	eax, [ebp]		;       eax <- br->buffer
    525 	mov	eax, [eax + 4*esi]	;       eax <- br->buffer[cwords]
    526 	shl	eax, cl			;       eax <- br->buffer[cwords] << cbits
    527 	add	ecx, ebx		;       cbits += parameter
    528 	xchg	ebx, ecx		;       ebx <- cbits, ecx <- parameter
    529 	shld	edi, eax, cl		;       uval <<= parameter <<< 'parameter' bits of tail word
    530 	xchg	ebx, ecx		;       ebx <- parameter, ecx <- cbits
    531 					;     }
    532 					;   }
    533 .break2:
        	;; uval now holds the unary count followed by the binary part; ebx is
        	;; still 'parameter' (0 when the binary part was skipped)
    534 	sub	[esp], ebx		;   ucbits -= parameter;
    535 
    536 	;
    537 	; compose the value
    538 	;
        	; fold the unsigned uval into a signed value: even uval -> uval/2,
        	; odd uval -> ~(uval/2).  The vals pointer and the nvals counter are
        	; updated in place in their argument slots on the stack.
    539 	mov	ebx, [esp + 28]		;   ebx <- vals
    540 	mov	edx, edi		;   edx <- uval
    541 	and	edi, 1			;   edi <- uval & 1
    542 	shr	edx, 1			;   edx <- uval >> 1
    543 	neg	edi			;   edi <- -(int)(uval & 1)
    544 	xor	edx, edi		;   edx <- (uval >> 1 ^ -(int)(uval & 1))
    545 	mov	[ebx], edx		;   *vals <- edx
    546 	sub	dword [esp + 32], byte 1	;   --nvals;
    547 	jz	.finished		;   if(nvals == 0) /* jump to finish */
    548 	xor	edi, edi		;   uval = 0;
    549 	add	dword [esp + 28], 4	;   ++vals
    550 	jmp	.val_loop		; }
    551 
        	;; all values decoded: write the bit position back to br and return 1
    552 .finished:
    553 	mov	[ebp + 16], esi		; br->consumed_words = cwords;
    554 	mov	[ebp + 20], ecx		; br->consumed_bits = cbits;
    555 	mov	eax, 1			; return true;
        	;; common exit; the failure paths jump here with eax already 0
    556 .end:
    557 	add	esp, 4			; drop the ucbits local
    558 	pop	edi
    559 	pop	esi
    560 	pop	ebx
    561 	pop	ebp
    562 	ret
    563 
    564 end
    565 
    566 %ifdef OBJ_FORMAT_elf
    567 	section .note.GNU-stack noalloc
    568 %endif
    569