1 /* Copyright (c) 2013 The Chromium OS Authors. All rights reserved. 2 * Use of this source code is governed by a BSD-style license that can be 3 * found in the LICENSE file. 4 */ 5 6 #include "dsp_util.h" 7 8 #ifndef max 9 #define max(a, b) ({ __typeof__(a) _a = (a); \ 10 __typeof__(b) _b = (b); \ 11 _a > _b ? _a : _b; }) 12 #endif 13 14 #ifndef min 15 #define min(a, b) ({ __typeof__(a) _a = (a); \ 16 __typeof__(b) _b = (b); \ 17 _a < _b ? _a : _b; }) 18 #endif 19 20 #undef deinterleave_stereo 21 #undef interleave_stereo 22 23 #ifdef __ARM_NEON__ 24 #include <arm_neon.h> 25 26 static void deinterleave_stereo(int16_t *input, float *output1, 27 float *output2, int frames) 28 { 29 /* Process 8 frames (16 samples) each loop. */ 30 /* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */ 31 int chunk = frames >> 3; 32 frames &= 7; 33 if (chunk) { 34 __asm__ __volatile__ ( 35 "1: \n" 36 "vld2.16 {d0-d3}, [%[input]]! \n" 37 "subs %[chunk], #1 \n" 38 "vmovl.s16 q3, d3 \n" 39 "vmovl.s16 q2, d2 \n" 40 "vmovl.s16 q1, d1 \n" 41 "vmovl.s16 q0, d0 \n" 42 "vcvt.f32.s32 q3, q3, #15 \n" 43 "vcvt.f32.s32 q2, q2, #15 \n" 44 "vcvt.f32.s32 q1, q1, #15 \n" 45 "vcvt.f32.s32 q0, q0, #15 \n" 46 "vst1.32 {d4-d7}, [%[output2]]! \n" 47 "vst1.32 {d0-d3}, [%[output1]]! \n" 48 "bne 1b \n" 49 : /* output */ 50 [chunk]"+r"(chunk), 51 [input]"+r"(input), 52 [output1]"+r"(output1), 53 [output2]"+r"(output2) 54 : /* input */ 55 : /* clobber */ 56 "q0", "q1", "q2", "q3", "memory", "cc" 57 ); 58 } 59 60 /* The remaining samples. */ 61 while (frames--) { 62 *output1++ = *input++ / 32768.0f; 63 *output2++ = *input++ / 32768.0f; 64 } 65 } 66 #define deinterleave_stereo deinterleave_stereo 67 68 static void interleave_stereo(float *input1, float *input2, 69 int16_t *output, int frames) 70 { 71 /* Process 4 frames (8 samples) each loop. */ 72 /* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */ 73 float32x4_t pos = vdupq_n_f32(0.5f / 32768.0f); 74 float32x4_t neg = vdupq_n_f32(-0.5f / 32768.0f); 75 int chunk = frames >> 2; 76 frames &= 3; 77 78 if (chunk) { 79 __asm__ __volatile__ ( 80 "veor q0, q0, q0 \n" 81 "1: \n" 82 "vld1.32 {d2-d3}, [%[input1]]! \n" 83 "vld1.32 {d4-d5}, [%[input2]]! \n" 84 "subs %[chunk], #1 \n" 85 /* We try to round to the nearest number by adding 0.5 86 * to positive input, and adding -0.5 to the negative 87 * input, then truncate. 88 */ 89 "vcgt.f32 q3, q1, q0 \n" 90 "vcgt.f32 q4, q2, q0 \n" 91 "vbsl q3, %q[pos], %q[neg] \n" 92 "vbsl q4, %q[pos], %q[neg] \n" 93 "vadd.f32 q1, q1, q3 \n" 94 "vadd.f32 q2, q2, q4 \n" 95 "vcvt.s32.f32 q1, q1, #15 \n" 96 "vcvt.s32.f32 q2, q2, #15 \n" 97 "vqmovn.s32 d2, q1 \n" 98 "vqmovn.s32 d3, q2 \n" 99 "vst2.16 {d2-d3}, [%[output]]! \n" 100 "bne 1b \n" 101 : /* output */ 102 "=r"(chunk), 103 "=r"(input1), 104 "=r"(input2), 105 "=r"(output) 106 : /* input */ 107 [chunk]"0"(chunk), 108 [input1]"1"(input1), 109 [input2]"2"(input2), 110 [output]"3"(output), 111 [pos]"w"(pos), 112 [neg]"w"(neg) 113 : /* clobber */ 114 "q0", "q1", "q2", "q3", "q4", "memory", "cc" 115 ); 116 } 117 118 /* The remaining samples */ 119 while (frames--) { 120 float f; 121 f = *input1++; 122 f += (f > 0) ? (0.5f / 32768.0f) : (-0.5f / 32768.0f); 123 *output++ = max(-32768, min(32767, (int)(f * 32768.0f))); 124 f = *input2++; 125 f += (f > 0) ? (0.5f / 32768.0f) : (-0.5f / 32768.0f); 126 *output++ = max(-32768, min(32767, (int)(f * 32768.0f))); 127 } 128 } 129 #define interleave_stereo interleave_stereo 130 131 #endif 132 133 #ifdef __SSE3__ 134 #include <emmintrin.h> 135 136 static void deinterleave_stereo(int16_t *input, float *output1, 137 float *output2, int frames) 138 { 139 /* Process 8 frames (16 samples) each loop. */ 140 /* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */ 141 int chunk = frames >> 3; 142 frames &= 7; 143 if (chunk) { 144 __asm__ __volatile__ ( 145 "1: \n" 146 "lddqu (%[input]), %%xmm0 \n" 147 "lddqu 16(%[input]), %%xmm1 \n" 148 "add $32, %[input] \n" 149 "movdqa %%xmm0, %%xmm2 \n" 150 "movdqa %%xmm1, %%xmm3 \n" 151 "pslld $16, %%xmm0 \n" 152 "pslld $16, %%xmm1 \n" 153 "psrad $16, %%xmm2 \n" 154 "psrad $16, %%xmm3 \n" 155 "cvtdq2ps %%xmm0, %%xmm0 \n" 156 "cvtdq2ps %%xmm1, %%xmm1 \n" 157 "cvtdq2ps %%xmm2, %%xmm2 \n" 158 "cvtdq2ps %%xmm3, %%xmm3 \n" 159 "mulps %[scale_2_n31], %%xmm0 \n" 160 "mulps %[scale_2_n31], %%xmm1 \n" 161 "mulps %[scale_2_n15], %%xmm2 \n" 162 "mulps %[scale_2_n15], %%xmm3 \n" 163 "movdqu %%xmm0, (%[output1]) \n" 164 "movdqu %%xmm1, 16(%[output1]) \n" 165 "movdqu %%xmm2, (%[output2]) \n" 166 "movdqu %%xmm3, 16(%[output2]) \n" 167 "add $32, %[output1] \n" 168 "add $32, %[output2] \n" 169 "sub $1, %[chunk] \n" 170 "jnz 1b \n" 171 : /* output */ 172 [chunk]"+r"(chunk), 173 [input]"+r"(input), 174 [output1]"+r"(output1), 175 [output2]"+r"(output2) 176 : /* input */ 177 [scale_2_n31]"x"(_mm_set1_ps(1.0f/(1<<15)/(1<<16))), 178 [scale_2_n15]"x"(_mm_set1_ps(1.0f/(1<<15))) 179 : /* clobber */ 180 "xmm0", "xmm1", "xmm2", "xmm3", "memory", "cc" 181 ); 182 } 183 184 /* The remaining samples. */ 185 while (frames--) { 186 *output1++ = *input++ / 32768.0f; 187 *output2++ = *input++ / 32768.0f; 188 } 189 } 190 #define deinterleave_stereo deinterleave_stereo 191 192 static void interleave_stereo(float *input1, float *input2, 193 int16_t *output, int frames) 194 { 195 /* Process 4 frames (8 samples) each loop. */ 196 /* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */ 197 int chunk = frames >> 2; 198 frames &= 3; 199 200 if (chunk) { 201 __asm__ __volatile__ ( 202 "1: \n" 203 "lddqu (%[input1]), %%xmm0 \n" 204 "lddqu (%[input2]), %%xmm2 \n" 205 "movaps %%xmm0, %%xmm1 \n" 206 "unpcklps %%xmm2, %%xmm0 \n" 207 "unpckhps %%xmm2, %%xmm1 \n" 208 "add $16, %[input1] \n" 209 "add $16, %[input2] \n" 210 "mulps %[scale_2_15], %%xmm0 \n" 211 "mulps %[scale_2_15], %%xmm1 \n" 212 "cvtps2dq %%xmm0, %%xmm0 \n" 213 "cvtps2dq %%xmm1, %%xmm1 \n" 214 "packssdw %%xmm1, %%xmm0 \n" 215 "movdqu %%xmm0, (%[output]) \n" 216 "add $16, %[output] \n" 217 "sub $1, %[chunk] \n" 218 "jnz 1b \n" 219 : /* output */ 220 "=r"(chunk), 221 "=r"(input1), 222 "=r"(input2), 223 "=r"(output) 224 : /* input */ 225 [chunk]"0"(chunk), 226 [input1]"1"(input1), 227 [input2]"2"(input2), 228 [output]"3"(output), 229 [scale_2_15]"x"(_mm_set1_ps(1.0f*(1<<15))) 230 : /* clobber */ 231 "xmm0", "xmm1", "xmm2", "memory", "cc" 232 ); 233 } 234 235 /* The remaining samples */ 236 while (frames--) { 237 float f; 238 f = *input1++; 239 f += (f > 0) ? (0.5f / 32768.0f) : (-0.5f / 32768.0f); 240 *output++ = max(-32768, min(32767, (int)(f * 32768.0f))); 241 f = *input2++; 242 f += (f > 0) ? (0.5f / 32768.0f) : (-0.5f / 32768.0f); 243 *output++ = max(-32768, min(32767, (int)(f * 32768.0f))); 244 } 245 } 246 #define interleave_stereo interleave_stereo 247 248 #endif 249 250 void dsp_util_deinterleave(int16_t *input, float *const *output, int channels, 251 int frames) 252 { 253 float *output_ptr[channels]; 254 int i, j; 255 256 #ifdef deinterleave_stereo 257 if (channels == 2) { 258 deinterleave_stereo(input, output[0], output[1], frames); 259 return; 260 } 261 #endif 262 263 for (i = 0; i < channels; i++) 264 output_ptr[i] = output[i]; 265 266 for (i = 0; i < frames; i++) 267 for (j = 0; j < channels; j++) 268 *(output_ptr[j]++) = *input++ / 32768.0f; 269 } 270 271 void dsp_util_interleave(float *const *input, int16_t *output, int channels, 272 int frames) 273 { 274 float *input_ptr[channels]; 275 int i, j; 276 277 #ifdef interleave_stereo 278 if (channels == 2) { 279 interleave_stereo(input[0], input[1], output, frames); 280 return; 281 } 282 #endif 283 284 for (i = 0; i < channels; i++) 285 input_ptr[i] = input[i]; 286 287 for (i = 0; i < frames; i++) 288 for (j = 0; j < channels; j++) { 289 int16_t i16; 290 float f = *(input_ptr[j]++) * 32768.0f; 291 if (f > 32767) 292 i16 = 32767; 293 else if (f < -32768) 294 i16 = -32768; 295 else 296 i16 = (int16_t) (f > 0 ? f + 0.5f : f - 0.5f); 297 *output++ = i16; 298 } 299 } 300 301 void dsp_enable_flush_denormal_to_zero() 302 { 303 #if defined(__i386__) || defined(__x86_64__) 304 unsigned int mxcsr; 305 mxcsr = __builtin_ia32_stmxcsr(); 306 __builtin_ia32_ldmxcsr(mxcsr | 0x8040); 307 #elif defined(__arm__) 308 int cw; 309 __asm__ __volatile__ ("mrc p10, 7, %0, cr1, cr0, 0" : "=r" (cw)); 310 __asm__ __volatile__ ("mcr p10, 7, %0, cr1, cr0, 0" : : "r" (cw | (1 << 24))); 311 #else 312 #warning "Don't know how to disable denorms. Performace may suffer." 313 #endif 314 } 315