Home | History | Annotate | Download | only in ia32
      1 ;  vim:filetype=nasm ts=8
      2 
      3 ;  libFLAC - Free Lossless Audio Codec library
      4 ;  Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
      5 ;
      6 ;  Redistribution and use in source and binary forms, with or without
      7 ;  modification, are permitted provided that the following conditions
      8 ;  are met:
      9 ;
     10 ;  - Redistributions of source code must retain the above copyright
     11 ;  notice, this list of conditions and the following disclaimer.
     12 ;
     13 ;  - Redistributions in binary form must reproduce the above copyright
     14 ;  notice, this list of conditions and the following disclaimer in the
     15 ;  documentation and/or other materials provided with the distribution.
     16 ;
     17 ;  - Neither the name of the Xiph.org Foundation nor the names of its
     18 ;  contributors may be used to endorse or promote products derived from
     19 ;  this software without specific prior written permission.
     20 ;
     21 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     22 ;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     23 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     24 ;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
     25 ;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     26 ;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     27 ;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     28 ;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     29 ;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     30 ;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     31 ;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     32 
     33 %include "nasm.h"
     34 
     35 	data_section
     36 
     37 cglobal FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov
     38 
     39 	code_section
     40 
     41 ; **********************************************************************
     42 ;
     43 ; unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 *data, unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
     44 ; {
     45 ; 	FLAC__int32 last_error_0 = data[-1];
     46 ; 	FLAC__int32 last_error_1 = data[-1] - data[-2];
     47 ; 	FLAC__int32 last_error_2 = last_error_1 - (data[-2] - data[-3]);
     48 ; 	FLAC__int32 last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]);
     49 ; 	FLAC__int32 error, save;
     50 ; 	FLAC__uint32 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0;
     51 ; 	unsigned i, order;
     52 ;
     53 ; 	for(i = 0; i < data_len; i++) {
     54 ; 		error  = data[i]     ; total_error_0 += local_abs(error);                      save = error;
     55 ; 		error -= last_error_0; total_error_1 += local_abs(error); last_error_0 = save; save = error;
     56 ; 		error -= last_error_1; total_error_2 += local_abs(error); last_error_1 = save; save = error;
     57 ; 		error -= last_error_2; total_error_3 += local_abs(error); last_error_2 = save; save = error;
     58 ; 		error -= last_error_3; total_error_4 += local_abs(error); last_error_3 = save;
     59 ; 	}
     60 ;
     61 ; 	if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
     62 ; 		order = 0;
     63 ; 	else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
     64 ; 		order = 1;
     65 ; 	else if(total_error_2 < min(total_error_3, total_error_4))
     66 ; 		order = 2;
     67 ; 	else if(total_error_3 < total_error_4)
     68 ; 		order = 3;
     69 ; 	else
     70 ; 		order = 4;
     71 ;
     72 ; 	residual_bits_per_sample[0] = (FLAC__float)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
     73 ; 	residual_bits_per_sample[1] = (FLAC__float)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
     74 ; 	residual_bits_per_sample[2] = (FLAC__float)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
     75 ; 	residual_bits_per_sample[3] = (FLAC__float)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
     76 ; 	residual_bits_per_sample[4] = (FLAC__float)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
     77 ;
     78 ; 	return order;
     79 ; }
     80 	ALIGN 16
     81 cident FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov
     82 	; Sums abs(residual) for fixed predictor orders 0..4 over data[0..data_len-1], stores a bits-per-sample estimate for each order, and returns in eax the order with the smallest sum (ties go to the higher order).
     83 	; esp + 36 == data[]		(these offsets are relative to esp after the four pushes and the 16-byte sub below)
     84 	; esp + 40 == data_len
     85 	; esp + 44 == residual_bits_per_sample[]
     86 
     87 	push	ebp				; ebp, ebx, esi and edi are all overwritten below, so save them
     88 	push	ebx
     89 	push	esi
     90 	push	edi
     91 	sub	esp, byte 16			; reserve 16 bytes of scratch (only the low 8 are used below)
     92 	; qword [esp] == temp space for loading FLAC__uint64s to FPU regs
     93 
     94 	; ebx == &data[i]
     95 	; ecx == loop counter (counts down from data_len to 0)
     96 	; ebp == order
     97 	; mm0 == total_error_1:total_error_0
     98 	; mm1 == total_error_2:total_error_3
     99 	; mm2 == :total_error_4
    100 	; mm3 == last_error_1:last_error_0
    101 	; mm4 == last_error_2:last_error_3
    102 
    103 	mov	ecx, [esp + 40]			; ecx = data_len
    104 	test	ecx, ecx
    105 	jz	near .data_len_is_0		; nothing to sum: skip the loop and the FPU work entirely
    106 
    107 	mov	ebx, [esp + 36]			; ebx = data[]
    108 	movd	mm3, [ebx - 4]			; mm3 = 0:last_error_0 (reads data[-1]; the four samples before data[] must be readable)
    109 	movd	mm2, [ebx - 8]			; mm2 = 0:data[-2]
    110 	movd	mm1, [ebx - 12]			; mm1 = 0:data[-3]
    111 	movd	mm0, [ebx - 16]			; mm0 = 0:data[-4]
    112 	movq	mm5, mm3			; mm5 = 0:last_error_0
    113 	psubd	mm5, mm2			; mm5 = 0:last_error_1
    114 	punpckldq	mm3, mm5		; mm3 = last_error_1:last_error_0
    115 	psubd	mm2, mm1			; mm2 = 0:data[-2] - data[-3]
    116 	psubd	mm5, mm2			; mm5 = 0:last_error_2
    117 	movq	mm4, mm5			; mm4 = 0:last_error_2
    118 	psubd	mm4, mm2			; mm4 = 0:last_error_2 - (data[-2] - data[-3])
    119 	paddd	mm4, mm1			; mm4 = 0:last_error_2 - (data[-2] - 2 * data[-3])
    120 	psubd	mm4, mm0			; mm4 = 0:last_error_3
    121 	punpckldq	mm4, mm5		; mm4 = last_error_2:last_error_3
    122 	pxor	mm0, mm0			; mm0 = total_error_1:total_error_0 = 0:0
    123 	pxor	mm1, mm1			; mm1 = total_error_2:total_error_3 = 0:0
    124 	pxor	mm2, mm2			; mm2 = 0:total_error_4 = 0:0
    125 
    126 	ALIGN 16
    127 .loop:
    128 	movd	mm7, [ebx]			; mm7 = 0:error_0
    129 	add	ebx, byte 4			; ebx = &data[i + 1]
    130 	movq	mm6, mm7			; mm6 = 0:error_0
    131 	psubd	mm7, mm3			; mm7 = :error_1
    132 	punpckldq	mm6, mm7		; mm6 = error_1:error_0
    133 	movq	mm5, mm6			; mm5 = error_1:error_0
    134 	movq	mm7, mm6			; mm7 = error_1:error_0
    135 	psubd	mm5, mm3			; mm5 = error_2: (low dword is not used)
    136 	movq	mm3, mm6			; mm3 = error_1:error_0, i.e. last_error_1:last_error_0 for the next iteration
    137 	psrad	mm6, 31				; mm6 = per-dword sign mask (all ones if negative, else 0)
    138 	pxor	mm7, mm6			; abs(x) is computed as (x ^ mask) - mask
    139 	psubd	mm7, mm6			; mm7 = abs(error_1):abs(error_0)
    140 	paddd	mm0, mm7			; mm0 = total_error_1:total_error_0
    141 	movq	mm6, mm5			; mm6 = error_2:
    142 	psubd	mm5, mm4			; mm5 = error_3:
    143 	punpckhdq	mm5, mm6		; mm5 = error_2:error_3
    144 	movq	mm7, mm5			; mm7 = error_2:error_3
    145 	movq	mm6, mm5			; mm6 = error_2:error_3
    146 	psubd	mm5, mm4			; mm5 = :error_4
    147 	movq	mm4, mm6			; mm4 = error_2:error_3, i.e. last_error_2:last_error_3 for the next iteration
    148 	psrad	mm6, 31				; mm6 = per-dword sign mask
    149 	pxor	mm7, mm6
    150 	psubd	mm7, mm6			; mm7 = abs(error_2):abs(error_3)
    151 	paddd	mm1, mm7			; mm1 = total_error_2:total_error_3
    152 	movq	mm6, mm5			; mm6 = :error_4
    153 	psrad	mm5, 31				; mm5 = per-dword sign mask
    154 	pxor	mm6, mm5
    155 	psubd	mm6, mm5			; mm6 = :abs(error_4)
    156 	paddd	mm2, mm6			; mm2 = :total_error_4 (the high dword also accumulates, but is never read)
    157 	
    158 	dec	ecx				; one sample consumed
    159 	jnz	short .loop
    160 
    161 ; 	if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
    162 ; 		order = 0;
    163 ; 	else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
    164 ; 		order = 1;
    165 ; 	else if(total_error_2 < min(total_error_3, total_error_4))
    166 ; 		order = 2;
    167 ; 	else if(total_error_3 < total_error_4)
    168 ; 		order = 3;
    169 ; 	else
    170 ; 		order = 4;
    171 	movq	mm3, mm0			; mm3 = total_error_1:total_error_0
    172 	movd	edi, mm2			; edi = total_error_4
    173 	movd	esi, mm1			; esi = total_error_3
    174 	movd	eax, mm0			; eax = total_error_0
    175 	punpckhdq	mm1, mm1		; mm1 = total_error_2:total_error_2
    176 	punpckhdq	mm3, mm3		; mm3 = total_error_1:total_error_1
    177 	movd	edx, mm1			; edx = total_error_2
    178 	movd	ecx, mm3			; ecx = total_error_1
    179 
    180 	xor	ebx, ebx			; ebx = candidate order, incremented before each comparison
    181 	xor	ebp, ebp			; ebp = order = 0
    182 	inc	ebx				; candidate = 1
    183 	cmp	ecx, eax			; unsigned compare; both cmovs below consume these flags (cmov does not modify flags)
    184 	cmovb	eax, ecx			; eax = min(total_error_0, total_error_1)
    185 	cmovbe	ebp, ebx			; order = 1 if total_error_1 <= total_error_0 (a tie selects the higher order, matching the strict '<' above)
    186 	inc	ebx				; candidate = 2
    187 	cmp	edx, eax
    188 	cmovb	eax, edx			; eax = min(total_error_0, total_error_1, total_error_2)
    189 	cmovbe	ebp, ebx			; order = 2 if total_error_2 <= running minimum
    190 	inc	ebx				; candidate = 3
    191 	cmp	esi, eax
    192 	cmovb	eax, esi			; eax = min(total_error_0, total_error_1, total_error_2, total_error_3)
    193 	cmovbe	ebp, ebx			; order = 3 if total_error_3 <= running minimum
    194 	inc	ebx				; candidate = 4
    195 	cmp	edi, eax
    196 	cmovb	eax, edi			; eax = min(total_error_0, total_error_1, total_error_2, total_error_3, total_error_4)
    197 	cmovbe	ebp, ebx			; order = 4 if total_error_4 <= running minimum
    198 	movd	ebx, mm0			; ebx = total_error_0 (reloaded because eax now holds the minimum)
    199 	emms					; done with MMX; x87 instructions are used from here on
    200 
    201 	; 	residual_bits_per_sample[0] = (FLAC__float)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
    202 	; 	residual_bits_per_sample[1] = (FLAC__float)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
    203 	; 	residual_bits_per_sample[2] = (FLAC__float)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
    204 	; 	residual_bits_per_sample[3] = (FLAC__float)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
    205 	; 	residual_bits_per_sample[4] = (FLAC__float)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
    206 	xor	eax, eax			; eax = 0: serves as the high dword of each 64-bit temp and as the bit pattern stored for 0.0
    207 	fild	dword [esp + 40]		; ST = data_len (NOTE: assumes data_len is <2gigs)
    208 .rbps_0:
    209 	test	ebx, ebx
    210 	jz	.total_error_0_is_0
    211 	fld1					; ST = 1.0 data_len
    212 	mov	[esp], ebx
    213 	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_0
    214 	mov	ebx, [esp + 44]			; ebx = residual_bits_per_sample[] (total_error_0 is already in the temp)
    215 	fild	qword [esp]			; ST = total_error_0 1.0 data_len
    216 	fdiv	st2				; ST = total_error_0/data_len 1.0 data_len
    217 	fldln2					; ST = ln2 total_error_0/data_len 1.0 data_len
    218 	fmulp	st1				; ST = ln2*total_error_0/data_len 1.0 data_len
    219 	fyl2x					; ST = log2(ln2*total_error_0/data_len) data_len
    220 	fstp	dword [ebx]			; residual_bits_per_sample[0] = log2(ln2*total_error_0/data_len)   ST = data_len
    221 	jmp	short .rbps_1
    222 .total_error_0_is_0:
    223 	mov	ebx, [esp + 44]			; ebx = residual_bits_per_sample[]
    224 	mov	[ebx], eax			; residual_bits_per_sample[0] = 0.0
    225 .rbps_1:
    226 	test	ecx, ecx
    227 	jz	.total_error_1_is_0
    228 	fld1					; ST = 1.0 data_len
    229 	mov	[esp], ecx
    230 	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_1
    231 	fild	qword [esp]			; ST = total_error_1 1.0 data_len
    232 	fdiv	st2				; ST = total_error_1/data_len 1.0 data_len
    233 	fldln2					; ST = ln2 total_error_1/data_len 1.0 data_len
    234 	fmulp	st1				; ST = ln2*total_error_1/data_len 1.0 data_len
    235 	fyl2x					; ST = log2(ln2*total_error_1/data_len) data_len
    236 	fstp	dword [ebx + 4]			; residual_bits_per_sample[1] = log2(ln2*total_error_1/data_len)   ST = data_len
    237 	jmp	short .rbps_2
    238 .total_error_1_is_0:
    239 	mov	[ebx + 4], eax			; residual_bits_per_sample[1] = 0.0
    240 .rbps_2:
    241 	test	edx, edx
    242 	jz	.total_error_2_is_0
    243 	fld1					; ST = 1.0 data_len
    244 	mov	[esp], edx
    245 	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_2
    246 	fild	qword [esp]			; ST = total_error_2 1.0 data_len
    247 	fdiv	st2				; ST = total_error_2/data_len 1.0 data_len
    248 	fldln2					; ST = ln2 total_error_2/data_len 1.0 data_len
    249 	fmulp	st1				; ST = ln2*total_error_2/data_len 1.0 data_len
    250 	fyl2x					; ST = log2(ln2*total_error_2/data_len) data_len
    251 	fstp	dword [ebx + 8]			; residual_bits_per_sample[2] = log2(ln2*total_error_2/data_len)   ST = data_len
    252 	jmp	short .rbps_3
    253 .total_error_2_is_0:
    254 	mov	[ebx + 8], eax			; residual_bits_per_sample[2] = 0.0
    255 .rbps_3:
    256 	test	esi, esi
    257 	jz	.total_error_3_is_0
    258 	fld1					; ST = 1.0 data_len
    259 	mov	[esp], esi
    260 	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_3
    261 	fild	qword [esp]			; ST = total_error_3 1.0 data_len
    262 	fdiv	st2				; ST = total_error_3/data_len 1.0 data_len
    263 	fldln2					; ST = ln2 total_error_3/data_len 1.0 data_len
    264 	fmulp	st1				; ST = ln2*total_error_3/data_len 1.0 data_len
    265 	fyl2x					; ST = log2(ln2*total_error_3/data_len) data_len
    266 	fstp	dword [ebx + 12]		; residual_bits_per_sample[3] = log2(ln2*total_error_3/data_len)   ST = data_len
    267 	jmp	short .rbps_4
    268 .total_error_3_is_0:
    269 	mov	[ebx + 12], eax			; residual_bits_per_sample[3] = 0.0
    270 .rbps_4:
    271 	test	edi, edi
    272 	jz	.total_error_4_is_0
    273 	fld1					; ST = 1.0 data_len
    274 	mov	[esp], edi
    275 	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_4
    276 	fild	qword [esp]			; ST = total_error_4 1.0 data_len
    277 	fdiv	st2				; ST = total_error_4/data_len 1.0 data_len
    278 	fldln2					; ST = ln2 total_error_4/data_len 1.0 data_len
    279 	fmulp	st1				; ST = ln2*total_error_4/data_len 1.0 data_len
    280 	fyl2x					; ST = log2(ln2*total_error_4/data_len) data_len
    281 	fstp	dword [ebx + 16]		; residual_bits_per_sample[4] = log2(ln2*total_error_4/data_len)   ST = data_len
    282 	jmp	short .rbps_end
    283 .total_error_4_is_0:
    284 	mov	[ebx + 16], eax			; residual_bits_per_sample[4] = 0.0
    285 .rbps_end:
    286 	fstp	st0				; ST = [empty]
    287 	jmp	short .end
    288 .data_len_is_0:
    289 	; data_len == 0, so residual_bits_per_sample[*] = 0.0
    290 	xor	ebp, ebp			; ebp = 0, stored below as the bit pattern of 0.0
    291 	mov	edi, [esp + 44]			; edi = residual_bits_per_sample[]
    292 	mov	[edi], ebp
    293 	mov	[edi + 4], ebp
    294 	mov	[edi + 8], ebp
    295 	mov	[edi + 12], ebp
    296 	mov	[edi + 16], ebp
    297 	add	ebp, byte 4			; order = 4
    298 
    299 .end:
    300 	mov	eax, ebp			; return order
    301 	add	esp, byte 16			; release the scratch space reserved in the prologue
    302 	pop	edi				; restore saved registers in reverse push order
    303 	pop	esi
    304 	pop	ebx
    305 	pop	ebp
    306 	ret
    307 
    308 end
    309 
    310 %ifdef OBJ_FORMAT_elf
    311        section .note.GNU-stack noalloc
    312 %endif
    313