Home | History | Annotate | Download | only in dsp
      1 /* Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
      2  * Use of this source code is governed by a BSD-style license that can be
      3  * found in the LICENSE file.
      4  */
      5 
      6 #include "dsp_util.h"
      7 
      8 #ifndef max
      9 #define max(a, b) ({ __typeof__(a) _a = (a);	\
     10 			__typeof__(b) _b = (b);	\
     11 			_a > _b ? _a : _b; })
     12 #endif
     13 
     14 #ifndef min
     15 #define min(a, b) ({ __typeof__(a) _a = (a);	\
     16 			__typeof__(b) _b = (b);	\
     17 			_a < _b ? _a : _b; })
     18 #endif
     19 
     20 #undef deinterleave_stereo
     21 #undef interleave_stereo
     22 
     23 #ifdef __ARM_NEON__
     24 #include <arm_neon.h>
     25 
     26 static void deinterleave_stereo(int16_t *input, float *output1,
     27 				float *output2, int frames)
     28 {
     29 	/* Process 8 frames (16 samples) each loop. */
     30 	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
     31 	int chunk = frames >> 3;
     32 	frames &= 7;
     33 	if (chunk) {
     34 		__asm__ __volatile__ (
     35 			"1:					    \n"
     36 			"vld2.16 {d0-d3}, [%[input]]!		    \n"
     37 			"subs %[chunk], #1			    \n"
     38 			"vmovl.s16 q3, d3			    \n"
     39 			"vmovl.s16 q2, d2			    \n"
     40 			"vmovl.s16 q1, d1			    \n"
     41 			"vmovl.s16 q0, d0			    \n"
     42 			"vcvt.f32.s32 q3, q3, #15		    \n"
     43 			"vcvt.f32.s32 q2, q2, #15		    \n"
     44 			"vcvt.f32.s32 q1, q1, #15		    \n"
     45 			"vcvt.f32.s32 q0, q0, #15		    \n"
     46 			"vst1.32 {d4-d7}, [%[output2]]!		    \n"
     47 			"vst1.32 {d0-d3}, [%[output1]]!		    \n"
     48 			"bne 1b					    \n"
     49 			: /* output */
     50 			  [chunk]"+r"(chunk),
     51 			  [input]"+r"(input),
     52 			  [output1]"+r"(output1),
     53 			  [output2]"+r"(output2)
     54 			: /* input */
     55 			: /* clobber */
     56 			  "q0", "q1", "q2", "q3", "memory", "cc"
     57 			);
     58 	}
     59 
     60 	/* The remaining samples. */
     61 	while (frames--) {
     62 		*output1++ = *input++ / 32768.0f;
     63 		*output2++ = *input++ / 32768.0f;
     64 	}
     65 }
     66 #define deinterleave_stereo deinterleave_stereo
     67 
     68 static void interleave_stereo(float *input1, float *input2,
     69 			      int16_t *output, int frames)
     70 {
     71 	/* Process 4 frames (8 samples) each loop. */
     72 	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
     73 	float32x4_t pos = vdupq_n_f32(0.5f / 32768.0f);
     74 	float32x4_t neg = vdupq_n_f32(-0.5f / 32768.0f);
     75 	int chunk = frames >> 2;
     76 	frames &= 3;
     77 
     78 	if (chunk) {
     79 		__asm__ __volatile__ (
     80 			"veor q0, q0, q0			    \n"
     81 			"1:					    \n"
     82 			"vld1.32 {d2-d3}, [%[input1]]!		    \n"
     83 			"vld1.32 {d4-d5}, [%[input2]]!		    \n"
     84 			"subs %[chunk], #1			    \n"
     85 			/* We try to round to the nearest number by adding 0.5
     86 			 * to positive input, and adding -0.5 to the negative
     87 			 * input, then truncate.
     88 			 */
     89 			"vcgt.f32 q3, q1, q0			    \n"
     90 			"vcgt.f32 q4, q2, q0			    \n"
     91 			"vbsl q3, %q[pos], %q[neg]		    \n"
     92 			"vbsl q4, %q[pos], %q[neg]		    \n"
     93 			"vadd.f32 q1, q1, q3			    \n"
     94 			"vadd.f32 q2, q2, q4			    \n"
     95 			"vcvt.s32.f32 q1, q1, #15		    \n"
     96 			"vcvt.s32.f32 q2, q2, #15		    \n"
     97 			"vqmovn.s32 d2, q1			    \n"
     98 			"vqmovn.s32 d3, q2			    \n"
     99 			"vst2.16 {d2-d3}, [%[output]]!		    \n"
    100 			"bne 1b					    \n"
    101 			: /* output */
    102 			  "=r"(chunk),
    103 			  "=r"(input1),
    104 			  "=r"(input2),
    105 			  "=r"(output)
    106 			: /* input */
    107 			  [chunk]"0"(chunk),
    108 			  [input1]"1"(input1),
    109 			  [input2]"2"(input2),
    110 			  [output]"3"(output),
    111 			  [pos]"w"(pos),
    112 			  [neg]"w"(neg)
    113 			: /* clobber */
    114 			  "q0", "q1", "q2", "q3", "q4", "memory", "cc"
    115 			);
    116 	}
    117 
    118 	/* The remaining samples */
    119 	while (frames--) {
    120 		float f;
    121 		f = *input1++;
    122 		f += (f > 0) ? (0.5f / 32768.0f) : (-0.5f / 32768.0f);
    123 		*output++ = max(-32768, min(32767, (int)(f * 32768.0f)));
    124 		f = *input2++;
    125 		f += (f > 0) ? (0.5f / 32768.0f) : (-0.5f / 32768.0f);
    126 		*output++ = max(-32768, min(32767, (int)(f * 32768.0f)));
    127 	}
    128 }
    129 #define interleave_stereo interleave_stereo
    130 
    131 #endif
    132 
    133 #ifdef __SSE3__
    134 #include <emmintrin.h>
    135 
    136 static void deinterleave_stereo(int16_t *input, float *output1,
    137 				float *output2, int frames)
    138 {
    139 	/* Process 8 frames (16 samples) each loop. */
    140 	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
    141 	int chunk = frames >> 3;
    142 	frames &= 7;
    143 	if (chunk) {
    144 		__asm__ __volatile__ (
    145 			"1:                                         \n"
    146 			"lddqu (%[input]), %%xmm0                   \n"
    147 			"lddqu 16(%[input]), %%xmm1                 \n"
    148 			"add $32, %[input]                          \n"
    149 			"movdqa %%xmm0, %%xmm2                      \n"
    150 			"movdqa %%xmm1, %%xmm3                      \n"
    151 			"pslld $16, %%xmm0                          \n"
    152 			"pslld $16, %%xmm1                          \n"
    153 			"psrad $16, %%xmm2                          \n"
    154 			"psrad $16, %%xmm3                          \n"
    155 			"cvtdq2ps %%xmm0, %%xmm0                    \n"
    156 			"cvtdq2ps %%xmm1, %%xmm1                    \n"
    157 			"cvtdq2ps %%xmm2, %%xmm2                    \n"
    158 			"cvtdq2ps %%xmm3, %%xmm3                    \n"
    159 			"mulps %[scale_2_n31], %%xmm0               \n"
    160 			"mulps %[scale_2_n31], %%xmm1               \n"
    161 			"mulps %[scale_2_n15], %%xmm2               \n"
    162 			"mulps %[scale_2_n15], %%xmm3               \n"
    163 			"movdqu %%xmm0, (%[output1])                \n"
    164 			"movdqu %%xmm1, 16(%[output1])              \n"
    165 			"movdqu %%xmm2, (%[output2])                \n"
    166 			"movdqu %%xmm3, 16(%[output2])              \n"
    167 			"add $32, %[output1]                        \n"
    168 			"add $32, %[output2]                        \n"
    169 			"sub $1, %[chunk]                           \n"
    170 			"jnz 1b                                     \n"
    171 			: /* output */
    172 			  [chunk]"+r"(chunk),
    173 			  [input]"+r"(input),
    174 			  [output1]"+r"(output1),
    175 			  [output2]"+r"(output2)
    176 			: /* input */
    177 			  [scale_2_n31]"x"(_mm_set1_ps(1.0f/(1<<15)/(1<<16))),
    178 			  [scale_2_n15]"x"(_mm_set1_ps(1.0f/(1<<15)))
    179 			: /* clobber */
    180 			  "xmm0", "xmm1", "xmm2", "xmm3", "memory", "cc"
    181 			);
    182 	}
    183 
    184 	/* The remaining samples. */
    185 	while (frames--) {
    186 		*output1++ = *input++ / 32768.0f;
    187 		*output2++ = *input++ / 32768.0f;
    188 	}
    189 }
    190 #define deinterleave_stereo deinterleave_stereo
    191 
    192 static void interleave_stereo(float *input1, float *input2,
    193 			      int16_t *output, int frames)
    194 {
    195 	/* Process 4 frames (8 samples) each loop. */
    196 	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
    197 	int chunk = frames >> 2;
    198 	frames &= 3;
    199 
    200 	if (chunk) {
    201 		__asm__ __volatile__ (
    202 			"1:                                         \n"
    203 			"lddqu (%[input1]), %%xmm0                  \n"
    204 			"lddqu (%[input2]), %%xmm2                  \n"
    205 			"movaps %%xmm0, %%xmm1                      \n"
    206 			"unpcklps %%xmm2, %%xmm0                    \n"
    207 			"unpckhps %%xmm2, %%xmm1                    \n"
    208 			"add $16, %[input1]                         \n"
    209 			"add $16, %[input2]                         \n"
    210 			"mulps %[scale_2_15], %%xmm0                \n"
    211 			"mulps %[scale_2_15], %%xmm1                \n"
    212 			"cvtps2dq %%xmm0, %%xmm0                    \n"
    213 			"cvtps2dq %%xmm1, %%xmm1                    \n"
    214 			"packssdw %%xmm1, %%xmm0                    \n"
    215 			"movdqu %%xmm0, (%[output])                 \n"
    216 			"add $16, %[output]                         \n"
    217 			"sub $1, %[chunk]                           \n"
    218 			"jnz 1b                                     \n"
    219 			: /* output */
    220 			  "=r"(chunk),
    221 			  "=r"(input1),
    222 			  "=r"(input2),
    223 			  "=r"(output)
    224 			: /* input */
    225 			  [chunk]"0"(chunk),
    226 			  [input1]"1"(input1),
    227 			  [input2]"2"(input2),
    228 			  [output]"3"(output),
    229 			  [scale_2_15]"x"(_mm_set1_ps(1.0f*(1<<15)))
    230 			: /* clobber */
    231 			  "xmm0", "xmm1", "xmm2", "memory", "cc"
    232 			);
    233 	}
    234 
    235 	/* The remaining samples */
    236 	while (frames--) {
    237 		float f;
    238 		f = *input1++;
    239 		f += (f > 0) ? (0.5f / 32768.0f) : (-0.5f / 32768.0f);
    240 		*output++ = max(-32768, min(32767, (int)(f * 32768.0f)));
    241 		f = *input2++;
    242 		f += (f > 0) ? (0.5f / 32768.0f) : (-0.5f / 32768.0f);
    243 		*output++ = max(-32768, min(32767, (int)(f * 32768.0f)));
    244 	}
    245 }
    246 #define interleave_stereo interleave_stereo
    247 
    248 #endif
    249 
    250 void dsp_util_deinterleave(int16_t *input, float *const *output, int channels,
    251 			   int frames)
    252 {
    253 	float *output_ptr[channels];
    254 	int i, j;
    255 
    256 #ifdef deinterleave_stereo
    257 	if (channels == 2) {
    258 		deinterleave_stereo(input, output[0], output[1], frames);
    259 		return;
    260 	}
    261 #endif
    262 
    263 	for (i = 0; i < channels; i++)
    264 		output_ptr[i] = output[i];
    265 
    266 	for (i = 0; i < frames; i++)
    267 		for (j = 0; j < channels; j++)
    268 			*(output_ptr[j]++) = *input++ / 32768.0f;
    269 }
    270 
    271 void dsp_util_interleave(float *const *input, int16_t *output, int channels,
    272 			 int frames)
    273 {
    274 	float *input_ptr[channels];
    275 	int i, j;
    276 
    277 #ifdef interleave_stereo
    278 	if (channels == 2) {
    279 		interleave_stereo(input[0], input[1], output, frames);
    280 		return;
    281 	}
    282 #endif
    283 
    284 	for (i = 0; i < channels; i++)
    285 		input_ptr[i] = input[i];
    286 
    287 	for (i = 0; i < frames; i++)
    288 		for (j = 0; j < channels; j++) {
    289 			int16_t i16;
    290 			float f = *(input_ptr[j]++) * 32768.0f;
    291 			if (f > 32767)
    292 				i16 = 32767;
    293 			else if (f < -32768)
    294 				i16 = -32768;
    295 			else
    296 				i16 = (int16_t) (f > 0 ? f + 0.5f : f - 0.5f);
    297 			*output++ = i16;
    298 		}
    299 }
    300 
    301 void dsp_enable_flush_denormal_to_zero()
    302 {
    303 #if defined(__i386__) || defined(__x86_64__)
    304 	unsigned int mxcsr;
    305 	mxcsr = __builtin_ia32_stmxcsr();
    306 	__builtin_ia32_ldmxcsr(mxcsr | 0x8040);
    307 #elif defined(__arm__)
    308 	int cw;
    309 	__asm__ __volatile__ ("mrc p10, 7, %0, cr1, cr0, 0" : "=r" (cw));
    310 	__asm__ __volatile__ ("mcr p10, 7, %0, cr1, cr0, 0" : : "r" (cw | (1 << 24)));
    311 #else
    312 #warning "Don't know how to disable denorms. Performace may suffer."
    313 #endif
    314 }
    315