1 /* Copyright 2013 The Chromium OS Authors. All rights reserved. 2 * Use of this source code is governed by a BSD-style license that can be 3 * found in the LICENSE file. 4 */ 5 6 #include "dsp_util.h" 7 8 #ifndef max 9 #define max(a, b) ({ __typeof__(a) _a = (a); \ 10 __typeof__(b) _b = (b); \ 11 _a > _b ? _a : _b; }) 12 #endif 13 14 #ifndef min 15 #define min(a, b) ({ __typeof__(a) _a = (a); \ 16 __typeof__(b) _b = (b); \ 17 _a < _b ? _a : _b; }) 18 #endif 19 20 #undef deinterleave_stereo 21 #undef interleave_stereo 22 23 /* Converts shorts in range of -32768 to 32767 to floats in range of 24 * -1.0f to 1.0f. 25 * scvtf instruction accepts fixed point ints, so sxtl is used to lengthen 26 * shorts to int with sign extension. 27 */ 28 #ifdef __aarch64__ 29 static void deinterleave_stereo(int16_t *input, float *output1, 30 float *output2, int frames) 31 { 32 int chunk = frames >> 3; 33 frames &= 7; 34 /* Process 8 frames (16 samples) each loop. */ 35 /* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */ 36 if (chunk) { 37 __asm__ __volatile__ ( 38 "1: \n" 39 "ld2 {v2.8h, v3.8h}, [%[input]], #32 \n" 40 "subs %w[chunk], %w[chunk], #1 \n" 41 "sxtl v0.4s, v2.4h \n" 42 "sxtl2 v1.4s, v2.8h \n" 43 "sxtl v2.4s, v3.4h \n" 44 "sxtl2 v3.4s, v3.8h \n" 45 "scvtf v0.4s, v0.4s, #15 \n" 46 "scvtf v1.4s, v1.4s, #15 \n" 47 "scvtf v2.4s, v2.4s, #15 \n" 48 "scvtf v3.4s, v3.4s, #15 \n" 49 "st1 {v0.4s, v1.4s}, [%[output1]], #32 \n" 50 "st1 {v2.4s, v3.4s}, [%[output2]], #32 \n" 51 "b.ne 1b \n" 52 : /* output */ 53 [chunk]"+r"(chunk), 54 [input]"+r"(input), 55 [output1]"+r"(output1), 56 [output2]"+r"(output2) 57 : /* input */ 58 : /* clobber */ 59 "v0", "v1", "v2", "v3", "memory", "cc" 60 ); 61 } 62 63 /* The remaining samples. */ 64 while (frames--) { 65 *output1++ = *input++ / 32768.0f; 66 *output2++ = *input++ / 32768.0f; 67 } 68 } 69 #define deinterleave_stereo deinterleave_stereo 70 71 /* Converts floats in range of -1.0f to 1.0f to shorts in range of 72 * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away 73 * from zero. 74 * Rounding is achieved by using fcvtas instruction. (a = away) 75 * The float scaled to a range of -32768 to 32767 by adding 15 to the exponent. 76 * Add to exponent is equivalent to multiply for exponent range of 0 to 239, 77 * which is 2.59 * 10^33. A signed saturating add (sqadd) limits exponents 78 * from 240 to 255 to clamp to 255. 79 * For very large values, beyond +/- 2 billion, fcvtas will clamp the result 80 * to the min or max value that fits an int. 81 * For other values, sqxtn clamps the output to -32768 to 32767 range. 82 */ 83 static void interleave_stereo(float *input1, float *input2, 84 int16_t *output, int frames) 85 { 86 /* Process 4 frames (8 samples) each loop. */ 87 /* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */ 88 int chunk = frames >> 2; 89 frames &= 3; 90 91 if (chunk) { 92 __asm__ __volatile__ ( 93 "dup v2.4s, %w[scale] \n" 94 "1: \n" 95 "ld1 {v0.4s}, [%[input1]], #16 \n" 96 "ld1 {v1.4s}, [%[input2]], #16 \n" 97 "subs %w[chunk], %w[chunk], #1 \n" 98 "sqadd v0.4s, v0.4s, v2.4s \n" 99 "sqadd v1.4s, v1.4s, v2.4s \n" 100 "fcvtas v0.4s, v0.4s \n" 101 "fcvtas v1.4s, v1.4s \n" 102 "sqxtn v0.4h, v0.4s \n" 103 "sqxtn v1.4h, v1.4s \n" 104 "st2 {v0.4h, v1.4h}, [%[output]], #16 \n" 105 "b.ne 1b \n" 106 : /* output */ 107 [chunk]"+r"(chunk), 108 [input1]"+r"(input1), 109 [input2]"+r"(input2), 110 [output]"+r"(output) 111 : /* input */ 112 [scale]"r"(15 << 23) 113 : /* clobber */ 114 "v0", "v1", "v2", "memory", "cc" 115 ); 116 } 117 118 /* The remaining samples */ 119 while (frames--) { 120 float f; 121 f = *input1++ * 32768.0f; 122 f += (f >= 0) ? 0.5f : -0.5f; 123 *output++ = max(-32768, min(32767, (int)(f))); 124 f = *input2++ * 32768.0f; 125 f += (f >= 0) ? 0.5f : -0.5f; 126 *output++ = max(-32768, min(32767, (int)(f))); 127 } 128 } 129 #define interleave_stereo interleave_stereo 130 #endif 131 132 #ifdef __ARM_NEON__ 133 #include <arm_neon.h> 134 135 static void deinterleave_stereo(int16_t *input, float *output1, 136 float *output2, int frames) 137 { 138 /* Process 8 frames (16 samples) each loop. */ 139 /* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */ 140 int chunk = frames >> 3; 141 frames &= 7; 142 if (chunk) { 143 __asm__ __volatile__ ( 144 "1: \n" 145 "vld2.16 {d0-d3}, [%[input]]! \n" 146 "subs %[chunk], #1 \n" 147 "vmovl.s16 q3, d3 \n" 148 "vmovl.s16 q2, d2 \n" 149 "vmovl.s16 q1, d1 \n" 150 "vmovl.s16 q0, d0 \n" 151 "vcvt.f32.s32 q3, q3, #15 \n" 152 "vcvt.f32.s32 q2, q2, #15 \n" 153 "vcvt.f32.s32 q1, q1, #15 \n" 154 "vcvt.f32.s32 q0, q0, #15 \n" 155 "vst1.32 {d4-d7}, [%[output2]]! \n" 156 "vst1.32 {d0-d3}, [%[output1]]! \n" 157 "bne 1b \n" 158 : /* output */ 159 [chunk]"+r"(chunk), 160 [input]"+r"(input), 161 [output1]"+r"(output1), 162 [output2]"+r"(output2) 163 : /* input */ 164 : /* clobber */ 165 "q0", "q1", "q2", "q3", "memory", "cc" 166 ); 167 } 168 169 /* The remaining samples. */ 170 while (frames--) { 171 *output1++ = *input++ / 32768.0f; 172 *output2++ = *input++ / 32768.0f; 173 } 174 } 175 #define deinterleave_stereo deinterleave_stereo 176 177 /* Converts floats in range of -1.0f to 1.0f to shorts in range of 178 * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away 179 * from zero. 180 * Rounding is achieved by adding 0.5 or -0.5 adjusted for fixed point 181 * precision, and then converting float to fixed point using vcvt instruction 182 * which truncated toward zero. 183 * For very large values, beyond +/- 2 billion, vcvt will clamp the result 184 * to the min or max value that fits an int. 185 * For other values, vqmovn clamps the output to -32768 to 32767 range. 186 */ 187 static void interleave_stereo(float *input1, float *input2, 188 int16_t *output, int frames) 189 { 190 /* Process 4 frames (8 samples) each loop. */ 191 /* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */ 192 float32x4_t pos = vdupq_n_f32(0.5f / 32768.0f); 193 float32x4_t neg = vdupq_n_f32(-0.5f / 32768.0f); 194 int chunk = frames >> 2; 195 frames &= 3; 196 197 if (chunk) { 198 __asm__ __volatile__ ( 199 "veor q0, q0, q0 \n" 200 "1: \n" 201 "vld1.32 {d2-d3}, [%[input1]]! \n" 202 "vld1.32 {d4-d5}, [%[input2]]! \n" 203 "subs %[chunk], #1 \n" 204 /* We try to round to the nearest number by adding 0.5 205 * to positive input, and adding -0.5 to the negative 206 * input, then truncate. 207 */ 208 "vcgt.f32 q3, q1, q0 \n" 209 "vcgt.f32 q4, q2, q0 \n" 210 "vbsl q3, %q[pos], %q[neg] \n" 211 "vbsl q4, %q[pos], %q[neg] \n" 212 "vadd.f32 q1, q1, q3 \n" 213 "vadd.f32 q2, q2, q4 \n" 214 "vcvt.s32.f32 q1, q1, #15 \n" 215 "vcvt.s32.f32 q2, q2, #15 \n" 216 "vqmovn.s32 d2, q1 \n" 217 "vqmovn.s32 d3, q2 \n" 218 "vst2.16 {d2-d3}, [%[output]]! \n" 219 "bne 1b \n" 220 : /* output */ 221 [chunk]"+r"(chunk), 222 [input1]"+r"(input1), 223 [input2]"+r"(input2), 224 [output]"+r"(output) 225 : /* input */ 226 [pos]"w"(pos), 227 [neg]"w"(neg) 228 : /* clobber */ 229 "q0", "q1", "q2", "q3", "q4", "memory", "cc" 230 ); 231 } 232 233 /* The remaining samples */ 234 while (frames--) { 235 float f; 236 f = *input1++ * 32768.0f; 237 f += (f >= 0) ? 0.5f : -0.5f; 238 *output++ = max(-32768, min(32767, (int)(f))); 239 f = *input2++ * 32768.0f; 240 f += (f >= 0) ? 0.5f : -0.5f; 241 *output++ = max(-32768, min(32767, (int)(f))); 242 } 243 } 244 #define interleave_stereo interleave_stereo 245 #endif 246 247 #ifdef __SSE3__ 248 #include <emmintrin.h> 249 250 /* Converts shorts in range of -32768 to 32767 to floats in range of 251 * -1.0f to 1.0f. 252 * pslld and psrad shifts are used to isolate the low and high word, but 253 * each in a different range: 254 * The low word is shifted to the high bits in range 0x80000000 .. 0x7fff0000. 255 * The high word is shifted to the low bits in range 0x00008000 .. 0x00007fff. 256 * cvtdq2ps converts ints to floats as is. 257 * mulps is used to normalize the range of the low and high words, adjusting 258 * for high and low words being in different range. 259 */ 260 static void deinterleave_stereo(int16_t *input, float *output1, 261 float *output2, int frames) 262 { 263 /* Process 8 frames (16 samples) each loop. */ 264 /* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */ 265 int chunk = frames >> 3; 266 frames &= 7; 267 if (chunk) { 268 __asm__ __volatile__ ( 269 "1: \n" 270 "lddqu (%[input]), %%xmm0 \n" 271 "lddqu 16(%[input]), %%xmm1 \n" 272 "add $32, %[input] \n" 273 "movdqa %%xmm0, %%xmm2 \n" 274 "movdqa %%xmm1, %%xmm3 \n" 275 "pslld $16, %%xmm0 \n" 276 "pslld $16, %%xmm1 \n" 277 "psrad $16, %%xmm2 \n" 278 "psrad $16, %%xmm3 \n" 279 "cvtdq2ps %%xmm0, %%xmm0 \n" 280 "cvtdq2ps %%xmm1, %%xmm1 \n" 281 "cvtdq2ps %%xmm2, %%xmm2 \n" 282 "cvtdq2ps %%xmm3, %%xmm3 \n" 283 "mulps %[scale_2_n31], %%xmm0 \n" 284 "mulps %[scale_2_n31], %%xmm1 \n" 285 "mulps %[scale_2_n15], %%xmm2 \n" 286 "mulps %[scale_2_n15], %%xmm3 \n" 287 "movdqu %%xmm0, (%[output1]) \n" 288 "movdqu %%xmm1, 16(%[output1]) \n" 289 "movdqu %%xmm2, (%[output2]) \n" 290 "movdqu %%xmm3, 16(%[output2]) \n" 291 "add $32, %[output1] \n" 292 "add $32, %[output2] \n" 293 "sub $1, %[chunk] \n" 294 "jnz 1b \n" 295 : /* output */ 296 [chunk]"+r"(chunk), 297 [input]"+r"(input), 298 [output1]"+r"(output1), 299 [output2]"+r"(output2) 300 : /* input */ 301 [scale_2_n31]"x"(_mm_set1_ps(1.0f/(1<<15)/(1<<16))), 302 [scale_2_n15]"x"(_mm_set1_ps(1.0f/(1<<15))) 303 : /* clobber */ 304 "xmm0", "xmm1", "xmm2", "xmm3", "memory", "cc" 305 ); 306 } 307 308 /* The remaining samples. */ 309 while (frames--) { 310 *output1++ = *input++ / 32768.0f; 311 *output2++ = *input++ / 32768.0f; 312 } 313 } 314 #define deinterleave_stereo deinterleave_stereo 315 316 /* Converts floats in range of -1.0f to 1.0f to shorts in range of 317 * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding to 318 * even. 319 * For very large values, beyond +/- 2 billion, cvtps2dq will produce 320 * 0x80000000 and packssdw will clamp -32768. 321 */ 322 static void interleave_stereo(float *input1, float *input2, 323 int16_t *output, int frames) 324 { 325 /* Process 4 frames (8 samples) each loop. */ 326 /* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */ 327 int chunk = frames >> 2; 328 frames &= 3; 329 330 if (chunk) { 331 __asm__ __volatile__ ( 332 "1: \n" 333 "lddqu (%[input1]), %%xmm0 \n" 334 "lddqu (%[input2]), %%xmm2 \n" 335 "add $16, %[input1] \n" 336 "add $16, %[input2] \n" 337 "movaps %%xmm0, %%xmm1 \n" 338 "unpcklps %%xmm2, %%xmm0 \n" 339 "unpckhps %%xmm2, %%xmm1 \n" 340 "paddsw %[scale_2_15], %%xmm0 \n" 341 "paddsw %[scale_2_15], %%xmm1 \n" 342 "cvtps2dq %%xmm0, %%xmm0 \n" 343 "cvtps2dq %%xmm1, %%xmm1 \n" 344 "packssdw %%xmm1, %%xmm0 \n" 345 "movdqu %%xmm0, (%[output]) \n" 346 "add $16, %[output] \n" 347 "sub $1, %[chunk] \n" 348 "jnz 1b \n" 349 : /* output */ 350 [chunk]"+r"(chunk), 351 [input1]"+r"(input1), 352 [input2]"+r"(input2), 353 [output]"+r"(output) 354 : /* input */ 355 [scale_2_15]"x"(_mm_set1_epi32(15 << 23)), 356 [clamp_large]"x"(_mm_set1_ps(32767.0f)) 357 : /* clobber */ 358 "xmm0", "xmm1", "xmm2", "memory", "cc" 359 ); 360 } 361 362 /* The remaining samples */ 363 while (frames--) { 364 float f; 365 f = *input1++ * 32768.0f; 366 f += (f >= 0) ? 0.5f : -0.5f; 367 *output++ = max(-32768, min(32767, (int)(f))); 368 f = *input2++ * 32768.0f; 369 f += (f >= 0) ? 0.5f : -0.5f; 370 *output++ = max(-32768, min(32767, (int)(f))); 371 } 372 } 373 #define interleave_stereo interleave_stereo 374 #endif 375 376 void dsp_util_deinterleave(int16_t *input, float *const *output, int channels, 377 int frames) 378 { 379 float *output_ptr[channels]; 380 int i, j; 381 382 #ifdef deinterleave_stereo 383 if (channels == 2) { 384 deinterleave_stereo(input, output[0], output[1], frames); 385 return; 386 } 387 #endif 388 389 for (i = 0; i < channels; i++) 390 output_ptr[i] = output[i]; 391 392 for (i = 0; i < frames; i++) 393 for (j = 0; j < channels; j++) 394 *(output_ptr[j]++) = *input++ / 32768.0f; 395 } 396 397 void dsp_util_interleave(float *const *input, int16_t *output, int channels, 398 int frames) 399 { 400 float *input_ptr[channels]; 401 int i, j; 402 403 #ifdef interleave_stereo 404 if (channels == 2) { 405 interleave_stereo(input[0], input[1], output, frames); 406 return; 407 } 408 #endif 409 410 for (i = 0; i < channels; i++) 411 input_ptr[i] = input[i]; 412 413 for (i = 0; i < frames; i++) 414 for (j = 0; j < channels; j++) { 415 float f = *(input_ptr[j]++) * 32768.0f; 416 f += (f >= 0) ? 0.5f : -0.5f; 417 *output++ = max(-32768, min(32767, (int)(f))); 418 } 419 } 420 421 void dsp_enable_flush_denormal_to_zero() 422 { 423 #if defined(__i386__) || defined(__x86_64__) 424 unsigned int mxcsr; 425 mxcsr = __builtin_ia32_stmxcsr(); 426 __builtin_ia32_ldmxcsr(mxcsr | 0x8040); 427 #elif defined(__aarch64__) 428 uint64_t cw; 429 __asm__ __volatile__ ( 430 "mrs %0, fpcr \n" 431 "orr %0, %0, #0x1000000 \n" 432 "msr fpcr, %0 \n" 433 "isb \n" 434 : "=r"(cw) :: "memory"); 435 #elif defined(__arm__) 436 uint32_t cw; 437 __asm__ __volatile__ ( 438 "vmrs %0, fpscr \n" 439 "orr %0, %0, #0x1000000 \n" 440 "vmsr fpscr, %0 \n" 441 : "=r"(cw) :: "memory"); 442 #else 443 #warning "Don't know how to disable denorms. Performace may suffer." 444 #endif 445 } 446