Home | History | Annotate | Download | only in dsp
      1 /* Copyright 2013 The Chromium OS Authors. All rights reserved.
      2  * Use of this source code is governed by a BSD-style license that can be
      3  * found in the LICENSE file.
      4  */
      6 #include "dsp_util.h"
      8 #ifndef max
      9 #define max(a, b) ({ __typeof__(a) _a = (a);	\
     10 			__typeof__(b) _b = (b);	\
     11 			_a > _b ? _a : _b; })
     12 #endif
     14 #ifndef min
     15 #define min(a, b) ({ __typeof__(a) _a = (a);	\
     16 			__typeof__(b) _b = (b);	\
     17 			_a < _b ? _a : _b; })
     18 #endif
     20 #undef deinterleave_stereo
     21 #undef interleave_stereo
     23 /* Converts shorts in range of -32768 to 32767 to floats in range of
     24  * -1.0f to 1.0f.
     25  * scvtf instruction accepts fixed point ints, so sxtl is used to lengthen
     26  * shorts to int with sign extension.
     27  */
     28 #ifdef __aarch64__
/* AArch64 version: splits interleaved stereo int16 samples into two float
 * channel buffers, scaling every sample by 1/32768.
 *
 * input:   interleaved samples L0 R0 L1 R1 ...; 2 * frames values are read.
 * output1: receives the first sample of every frame (L) as a float.
 * output2: receives the second sample of every frame (R) as a float.
 * frames:  number of stereo frames to convert.
 */
static void deinterleave_stereo(int16_t *input, float *output1,
				float *output2, int frames)
{
	/* chunk counts the 8-frame groups handled by the vector loop; the
	 * low three bits of frames are left for the scalar loop at the end.
	 */
	int chunk = frames >> 3;
	frames &= 7;
	/* Process 8 frames (16 samples) each loop. */
	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
	if (chunk) {
		__asm__ __volatile__ (
			"1:                                         \n"
			/* De-interleaving load: v2 = L0..L7, v3 = R0..R7.
			 * input advances by 32 bytes.
			 */
			"ld2  {v2.8h, v3.8h}, [%[input]], #32       \n"
			/* Count down; the flags are consumed by b.ne below. */
			"subs %w[chunk], %w[chunk], #1              \n"
			/* Sign-extend the 16-bit lanes to 32 bits.  v2 is
			 * overwritten only after both of its halves have
			 * been read into v0 and v1.
			 */
			"sxtl   v0.4s, v2.4h                        \n"
			"sxtl2  v1.4s, v2.8h                        \n"
			"sxtl   v2.4s, v3.4h                        \n"
			"sxtl2  v3.4s, v3.8h                        \n"
			/* Fixed point (15 fractional bits) to float, which
			 * is the same as dividing by 32768.
			 */
			"scvtf  v0.4s, v0.4s, #15                   \n"
			"scvtf  v1.4s, v1.4s, #15                   \n"
			"scvtf  v2.4s, v2.4s, #15                   \n"
			"scvtf  v3.4s, v3.4s, #15                   \n"
			/* Store 8 floats per channel, advancing each output
			 * pointer by 32 bytes.
			 */
			"st1    {v0.4s, v1.4s}, [%[output1]], #32   \n"
			"st1    {v2.4s, v3.4s}, [%[output2]], #32   \n"
			"b.ne   1b                                  \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input]"+r"(input),
			  [output1]"+r"(output1),
			  [output2]"+r"(output2)
			: /* input */
			: /* clobber */
			  "v0", "v1", "v2", "v3", "memory", "cc"
			);
	}

	/* The remaining samples. */
	while (frames--) {
		*output1++ = *input++ / 32768.0f;
		*output2++ = *input++ / 32768.0f;
	}
}
     69 #define deinterleave_stereo deinterleave_stereo
     71 /* Converts floats in range of -1.0f to 1.0f to shorts in range of
     72  * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away
     73  * from zero.
     74  * Rounding is achieved by using fcvtas instruction. (a = away)
     75  * The float scaled to a range of -32768 to 32767 by adding 15 to the exponent.
     76  * Add to exponent is equivalent to multiply for exponent range of 0 to 239,
     77  * which is 2.59 * 10^33.  A signed saturating add (sqadd) limits exponents
     78  * from 240 to 255 to clamp to 255.
     79  * For very large values, beyond +/- 2 billion, fcvtas will clamp the result
     80  * to the min or max value that fits an int.
     81  * For other values, sqxtn clamps the output to -32768 to 32767 range.
     82  */
     83 static void interleave_stereo(float *input1, float *input2,
     84 			      int16_t *output, int frames)
     85 {
     86 	/* Process 4 frames (8 samples) each loop. */
     87 	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
     88 	int chunk = frames >> 2;
     89 	frames &= 3;
     91 	if (chunk) {
     92 		__asm__ __volatile__ (
     93 			"dup    v2.4s, %w[scale]                    \n"
     94 			"1:                                         \n"
     95 			"ld1    {v0.4s}, [%[input1]], #16           \n"
     96 			"ld1    {v1.4s}, [%[input2]], #16           \n"
     97 			"subs   %w[chunk], %w[chunk], #1            \n"
     98 			"sqadd  v0.4s, v0.4s, v2.4s                 \n"
     99 			"sqadd  v1.4s, v1.4s, v2.4s                 \n"
    100 			"fcvtas v0.4s, v0.4s                        \n"
    101 			"fcvtas v1.4s, v1.4s                        \n"
    102 			"sqxtn  v0.4h, v0.4s                        \n"
    103 			"sqxtn  v1.4h, v1.4s                        \n"
    104 			"st2    {v0.4h, v1.4h}, [%[output]], #16    \n"
    105 			"b.ne   1b                                  \n"
    106 			: /* output */
    107 			  [chunk]"+r"(chunk),
    108 			  [input1]"+r"(input1),
    109 			  [input2]"+r"(input2),
    110 			  [output]"+r"(output)
    111 			: /* input */
    112 			  [scale]"r"(15 << 23)
    113 			: /* clobber */
    114 			  "v0", "v1", "v2", "memory", "cc"
    115 			);
    116 	}
    118 	/* The remaining samples */
    119 	while (frames--) {
    120 		float f;
    121 		f = *input1++ * 32768.0f;
    122 		f += (f >= 0) ? 0.5f : -0.5f;
    123 		*output++ = max(-32768, min(32767, (int)(f)));
    124 		f = *input2++ * 32768.0f;
    125 		f += (f >= 0) ? 0.5f : -0.5f;
    126 		*output++ = max(-32768, min(32767, (int)(f)));
    127 	}
    128 }
    129 #define interleave_stereo interleave_stereo
    130 #endif
    132 #ifdef __ARM_NEON__
    133 #include <arm_neon.h>
/* ARM NEON version: splits interleaved stereo int16 samples into two float
 * channel buffers, scaling every sample by 1/32768.
 *
 * input:   interleaved samples L0 R0 L1 R1 ...; 2 * frames values are read.
 * output1: receives the first sample of every frame (L) as a float.
 * output2: receives the second sample of every frame (R) as a float.
 * frames:  number of stereo frames to convert.
 */
static void deinterleave_stereo(int16_t *input, float *output1,
				float *output2, int frames)
{
	/* Process 8 frames (16 samples) each loop. */
	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
	int chunk = frames >> 3;
	frames &= 7;
	if (chunk) {
		__asm__ __volatile__ (
			"1:					    \n"
			/* De-interleaving load of 16 samples: d0-d1 hold
			 * the L samples and d2-d3 the R samples.
			 */
			"vld2.16 {d0-d3}, [%[input]]!		    \n"
			/* Count down; the flags are consumed by bne below. */
			"subs %[chunk], #1			    \n"
			/* Widen to 32 bits, highest register first: q1
			 * aliases d2/d3 and q0 aliases d0/d1, so each source
			 * must be read before the q register covering it is
			 * written.
			 */
			"vmovl.s16 q3, d3			    \n"
			"vmovl.s16 q2, d2			    \n"
			"vmovl.s16 q1, d1			    \n"
			"vmovl.s16 q0, d0			    \n"
			/* Fixed point (15 fractional bits) to float, which
			 * is the same as dividing by 32768.
			 */
			"vcvt.f32.s32 q3, q3, #15		    \n"
			"vcvt.f32.s32 q2, q2, #15		    \n"
			"vcvt.f32.s32 q1, q1, #15		    \n"
			"vcvt.f32.s32 q0, q0, #15		    \n"
			/* q2-q3 (d4-d7) are the R floats, q0-q1 (d0-d3) the
			 * L floats.
			 */
			"vst1.32 {d4-d7}, [%[output2]]!		    \n"
			"vst1.32 {d0-d3}, [%[output1]]!		    \n"
			"bne 1b					    \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input]"+r"(input),
			  [output1]"+r"(output1),
			  [output2]"+r"(output2)
			: /* input */
			: /* clobber */
			  "q0", "q1", "q2", "q3", "memory", "cc"
			);
	}

	/* The remaining samples. */
	while (frames--) {
		*output1++ = *input++ / 32768.0f;
		*output2++ = *input++ / 32768.0f;
	}
}
    175 #define deinterleave_stereo deinterleave_stereo
    177 /* Converts floats in range of -1.0f to 1.0f to shorts in range of
    178  * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away
    179  * from zero.
    180  * Rounding is achieved by adding 0.5 or -0.5 adjusted for fixed point
    181  * precision, and then converting float to fixed point using vcvt instruction
    182  * which truncated toward zero.
    183  * For very large values, beyond +/- 2 billion, vcvt will clamp the result
    184  * to the min or max value that fits an int.
    185  * For other values, vqmovn clamps the output to -32768 to 32767 range.
    186  */
    187 static void interleave_stereo(float *input1, float *input2,
    188 			      int16_t *output, int frames)
    189 {
    190 	/* Process 4 frames (8 samples) each loop. */
    191 	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
    192 	float32x4_t pos = vdupq_n_f32(0.5f / 32768.0f);
    193 	float32x4_t neg = vdupq_n_f32(-0.5f / 32768.0f);
    194 	int chunk = frames >> 2;
    195 	frames &= 3;
    197 	if (chunk) {
    198 		__asm__ __volatile__ (
    199 			"veor q0, q0, q0			    \n"
    200 			"1:					    \n"
    201 			"vld1.32 {d2-d3}, [%[input1]]!		    \n"
    202 			"vld1.32 {d4-d5}, [%[input2]]!		    \n"
    203 			"subs %[chunk], #1			    \n"
    204 			/* We try to round to the nearest number by adding 0.5
    205 			 * to positive input, and adding -0.5 to the negative
    206 			 * input, then truncate.
    207 			 */
    208 			"vcgt.f32 q3, q1, q0			    \n"
    209 			"vcgt.f32 q4, q2, q0			    \n"
    210 			"vbsl q3, %q[pos], %q[neg]		    \n"
    211 			"vbsl q4, %q[pos], %q[neg]		    \n"
    212 			"vadd.f32 q1, q1, q3			    \n"
    213 			"vadd.f32 q2, q2, q4			    \n"
    214 			"vcvt.s32.f32 q1, q1, #15		    \n"
    215 			"vcvt.s32.f32 q2, q2, #15		    \n"
    216 			"vqmovn.s32 d2, q1			    \n"
    217 			"vqmovn.s32 d3, q2			    \n"
    218 			"vst2.16 {d2-d3}, [%[output]]!		    \n"
    219 			"bne 1b					    \n"
    220 			: /* output */
    221 			  [chunk]"+r"(chunk),
    222 			  [input1]"+r"(input1),
    223 			  [input2]"+r"(input2),
    224 			  [output]"+r"(output)
    225 			: /* input */
    226 			  [pos]"w"(pos),
    227 			  [neg]"w"(neg)
    228 			: /* clobber */
    229 			  "q0", "q1", "q2", "q3", "q4", "memory", "cc"
    230 			);
    231 	}
    233 	/* The remaining samples */
    234 	while (frames--) {
    235 		float f;
    236 		f = *input1++ * 32768.0f;
    237 		f += (f >= 0) ? 0.5f : -0.5f;
    238 		*output++ = max(-32768, min(32767, (int)(f)));
    239 		f = *input2++ * 32768.0f;
    240 		f += (f >= 0) ? 0.5f : -0.5f;
    241 		*output++ = max(-32768, min(32767, (int)(f)));
    242 	}
    243 }
    244 #define interleave_stereo interleave_stereo
    245 #endif
    247 #ifdef __SSE3__
    248 #include <emmintrin.h>
/* Converts shorts in range of -32768 to 32767 to floats in range of
 * -1.0f to 1.0f.
 * pslld and psrad shifts are used to isolate the low and high word, but
 * each in a different range:
 * The low word is shifted to the high bits in range 0x80000000 .. 0x7fff0000.
 * The high word is shifted to the low bits and sign-extended (psrad is an
 * arithmetic shift), giving the range 0xffff8000 .. 0x00007fff.
 * cvtdq2ps converts ints to floats as is.
 * mulps is used to normalize the range of the low and high words, adjusting
 * for high and low words being in different range.
 *
 * input:   interleaved samples L0 R0 L1 R1 ...; 2 * frames values are read.
 * output1: receives the first sample of every frame (L) as a float.
 * output2: receives the second sample of every frame (R) as a float.
 * frames:  number of stereo frames to convert.
 */
static void deinterleave_stereo(int16_t *input, float *output1,
				float *output2, int frames)
{
	/* Process 8 frames (16 samples) each loop. */
	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
	int chunk = frames >> 3;
	frames &= 7;
	if (chunk) {
		__asm__ __volatile__ (
			"1:                                         \n"
			/* Unaligned loads of 2 x 16 bytes (8 frames). */
			"lddqu (%[input]), %%xmm0                   \n"
			"lddqu 16(%[input]), %%xmm1                 \n"
			"add $32, %[input]                          \n"
			/* Keep a copy: xmm0/xmm1 become the low words,
			 * xmm2/xmm3 the high words of each 32-bit lane.
			 */
			"movdqa %%xmm0, %%xmm2                      \n"
			"movdqa %%xmm1, %%xmm3                      \n"
			/* Low word moved to the top 16 bits (value << 16). */
			"pslld $16, %%xmm0                          \n"
			"pslld $16, %%xmm1                          \n"
			/* High word moved down with sign extension. */
			"psrad $16, %%xmm2                          \n"
			"psrad $16, %%xmm3                          \n"
			"cvtdq2ps %%xmm0, %%xmm0                    \n"
			"cvtdq2ps %%xmm1, %%xmm1                    \n"
			"cvtdq2ps %%xmm2, %%xmm2                    \n"
			"cvtdq2ps %%xmm3, %%xmm3                    \n"
			/* The low words carry an extra factor of 2^16, so
			 * they are scaled by 2^-31; the high words by 2^-15.
			 */
			"mulps %[scale_2_n31], %%xmm0               \n"
			"mulps %[scale_2_n31], %%xmm1               \n"
			"mulps %[scale_2_n15], %%xmm2               \n"
			"mulps %[scale_2_n15], %%xmm3               \n"
			/* Unaligned stores: low words to output1, high words
			 * to output2, 8 floats each.
			 */
			"movdqu %%xmm0, (%[output1])                \n"
			"movdqu %%xmm1, 16(%[output1])              \n"
			"movdqu %%xmm2, (%[output2])                \n"
			"movdqu %%xmm3, 16(%[output2])              \n"
			"add $32, %[output1]                        \n"
			"add $32, %[output2]                        \n"
			"sub $1, %[chunk]                           \n"
			"jnz 1b                                     \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input]"+r"(input),
			  [output1]"+r"(output1),
			  [output2]"+r"(output2)
			: /* input */
			  [scale_2_n31]"x"(_mm_set1_ps(1.0f/(1<<15)/(1<<16))),
			  [scale_2_n15]"x"(_mm_set1_ps(1.0f/(1<<15)))
			: /* clobber */
			  "xmm0", "xmm1", "xmm2", "xmm3", "memory", "cc"
			);
	}

	/* The remaining samples. */
	while (frames--) {
		*output1++ = *input++ / 32768.0f;
		*output2++ = *input++ / 32768.0f;
	}
}
    314 #define deinterleave_stereo deinterleave_stereo
    316 /* Converts floats in range of -1.0f to 1.0f to shorts in range of
    317  * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding to
    318  * even.
    319  * For very large values, beyond +/- 2 billion, cvtps2dq will produce
    320  * 0x80000000 and packssdw will clamp -32768.
    321  */
    322 static void interleave_stereo(float *input1, float *input2,
    323 			      int16_t *output, int frames)
    324 {
    325 	/* Process 4 frames (8 samples) each loop. */
    326 	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
    327 	int chunk = frames >> 2;
    328 	frames &= 3;
    330 	if (chunk) {
    331 		__asm__ __volatile__ (
    332 			"1:                                         \n"
    333 			"lddqu (%[input1]), %%xmm0                  \n"
    334 			"lddqu (%[input2]), %%xmm2                  \n"
    335 			"add $16, %[input1]                         \n"
    336 			"add $16, %[input2]                         \n"
    337 			"movaps %%xmm0, %%xmm1                      \n"
    338 			"unpcklps %%xmm2, %%xmm0                    \n"
    339 			"unpckhps %%xmm2, %%xmm1                    \n"
    340 			"paddsw %[scale_2_15], %%xmm0               \n"
    341 			"paddsw %[scale_2_15], %%xmm1               \n"
    342 			"cvtps2dq %%xmm0, %%xmm0                    \n"
    343 			"cvtps2dq %%xmm1, %%xmm1                    \n"
    344 			"packssdw %%xmm1, %%xmm0                    \n"
    345 			"movdqu %%xmm0, (%[output])                 \n"
    346 			"add $16, %[output]                         \n"
    347 			"sub $1, %[chunk]                           \n"
    348 			"jnz 1b                                     \n"
    349 			: /* output */
    350 			  [chunk]"+r"(chunk),
    351 			  [input1]"+r"(input1),
    352 			  [input2]"+r"(input2),
    353 			  [output]"+r"(output)
    354 			: /* input */
    355 			  [scale_2_15]"x"(_mm_set1_epi32(15 << 23)),
    356 			  [clamp_large]"x"(_mm_set1_ps(32767.0f))
    357 			: /* clobber */
    358 			  "xmm0", "xmm1", "xmm2", "memory", "cc"
    359 			);
    360 	}
    362 	/* The remaining samples */
    363 	while (frames--) {
    364 		float f;
    365 		f = *input1++ * 32768.0f;
    366 		f += (f >= 0) ? 0.5f : -0.5f;
    367 		*output++ = max(-32768, min(32767, (int)(f)));
    368 		f = *input2++ * 32768.0f;
    369 		f += (f >= 0) ? 0.5f : -0.5f;
    370 		*output++ = max(-32768, min(32767, (int)(f)));
    371 	}
    372 }
    373 #define interleave_stereo interleave_stereo
    374 #endif
    376 void dsp_util_deinterleave(int16_t *input, float *const *output, int channels,
    377 			   int frames)
    378 {
    379 	float *output_ptr[channels];
    380 	int i, j;
    382 #ifdef deinterleave_stereo
    383 	if (channels == 2) {
    384 		deinterleave_stereo(input, output[0], output[1], frames);
    385 		return;
    386 	}
    387 #endif
    389 	for (i = 0; i < channels; i++)
    390 		output_ptr[i] = output[i];
    392 	for (i = 0; i < frames; i++)
    393 		for (j = 0; j < channels; j++)
    394 			*(output_ptr[j]++) = *input++ / 32768.0f;
    395 }
    397 void dsp_util_interleave(float *const *input, int16_t *output, int channels,
    398 			 int frames)
    399 {
    400 	float *input_ptr[channels];
    401 	int i, j;
    403 #ifdef interleave_stereo
    404 	if (channels == 2) {
    405 		interleave_stereo(input[0], input[1], output, frames);
    406 		return;
    407 	}
    408 #endif
    410 	for (i = 0; i < channels; i++)
    411 		input_ptr[i] = input[i];
    413 	for (i = 0; i < frames; i++)
    414 		for (j = 0; j < channels; j++) {
    415 			float f = *(input_ptr[j]++) * 32768.0f;
    416 			f += (f >= 0) ? 0.5f : -0.5f;
    417 			*output++ = max(-32768, min(32767, (int)(f)));
    418 		}
    419 }
    421 void dsp_enable_flush_denormal_to_zero()
    422 {
    423 #if defined(__i386__) || defined(__x86_64__)
    424 	unsigned int mxcsr;
    425 	mxcsr = __builtin_ia32_stmxcsr();
    426 	__builtin_ia32_ldmxcsr(mxcsr | 0x8040);
    427 #elif defined(__aarch64__)
    428 	uint64_t cw;
    429 	__asm__ __volatile__ (
    430 		"mrs    %0, fpcr			    \n"
    431 		"orr    %0, %0, #0x1000000		    \n"
    432 		"msr    fpcr, %0			    \n"
    433 		"isb					    \n"
    434 		: "=r"(cw) :: "memory");
    435 #elif defined(__arm__)
    436 	uint32_t cw;
    437 	__asm__ __volatile__ (
    438 		"vmrs   %0, fpscr			    \n"
    439 		"orr    %0, %0, #0x1000000		    \n"
    440 		"vmsr   fpscr, %0			    \n"
    441 		: "=r"(cw) :: "memory");
    442 #else
    443 #warning "Don't know how to disable denorms. Performace may suffer."
    444 #endif
    445 }