1 /* Copyright (c) 2013 The Chromium OS Authors. All rights reserved. 2 * Use of this source code is governed by a BSD-style license that can be 3 * found in the LICENSE file. 4 */ 5 6 /* Copyright (C) 2011 Google Inc. All rights reserved. 7 * Use of this source code is governed by a BSD-style license that can be 8 * found in the LICENSE.WEBKIT file. 9 */ 10 11 #include <stdio.h> 12 #include <stdlib.h> 13 #include <string.h> 14 15 #include "drc_math.h" 16 #include "drc_kernel.h" 17 18 #define MAX_PRE_DELAY_FRAMES 1024U 19 #define MAX_PRE_DELAY_FRAMES_MASK (MAX_PRE_DELAY_FRAMES - 1) 20 #define DEFAULT_PRE_DELAY_FRAMES 256U 21 #define DIVISION_FRAMES 32U 22 #define DIVISION_FRAMES_MASK (DIVISION_FRAMES - 1) 23 24 #define assert_on_compile(e) ((void)sizeof(char[1 - 2 * !(e)])) 25 #define assert_on_compile_is_power_of_2(n) \ 26 assert_on_compile((n) != 0 && (((n) & ((n) - 1)) == 0)) 27 28 const float uninitialized_value = -1; 29 static int drc_math_initialized; 30 31 void dk_init(struct drc_kernel *dk, float sample_rate) 32 { 33 unsigned int i; 34 35 if (!drc_math_initialized) { 36 drc_math_initialized = 1; 37 drc_math_init(); 38 } 39 40 dk->sample_rate = sample_rate; 41 dk->detector_average = 0; 42 dk->compressor_gain = 1; 43 dk->enabled = 0; 44 dk->processed = 0; 45 dk->last_pre_delay_frames = DEFAULT_PRE_DELAY_FRAMES; 46 dk->pre_delay_read_index = 0; 47 dk->pre_delay_write_index = DEFAULT_PRE_DELAY_FRAMES; 48 dk->max_attack_compression_diff_db = -INFINITY; 49 dk->ratio = uninitialized_value; 50 dk->slope = uninitialized_value; 51 dk->linear_threshold = uninitialized_value; 52 dk->db_threshold = uninitialized_value; 53 dk->db_knee = uninitialized_value; 54 dk->knee_threshold = uninitialized_value; 55 dk->ratio_base = uninitialized_value; 56 dk->K = uninitialized_value; 57 58 assert_on_compile_is_power_of_2(DIVISION_FRAMES); 59 assert_on_compile(DIVISION_FRAMES % 4 == 0); 60 /* Allocate predelay buffers */ 61 assert_on_compile_is_power_of_2(MAX_PRE_DELAY_FRAMES); 62 for (i = 0; i < DRC_NUM_CHANNELS; i++) { 63 size_t size = sizeof(float) * MAX_PRE_DELAY_FRAMES; 64 dk->pre_delay_buffers[i] = (float *)calloc(1, size); 65 } 66 } 67 68 void dk_free(struct drc_kernel *dk) 69 { 70 unsigned int i; 71 for (i = 0; i < DRC_NUM_CHANNELS; ++i) 72 free(dk->pre_delay_buffers[i]); 73 } 74 75 /* Sets the pre-delay (lookahead) buffer size */ 76 static void set_pre_delay_time(struct drc_kernel *dk, float pre_delay_time) 77 { 78 unsigned int i; 79 /* Re-configure look-ahead section pre-delay if delay time has 80 * changed. */ 81 unsigned pre_delay_frames = pre_delay_time * dk->sample_rate; 82 pre_delay_frames = min(pre_delay_frames, MAX_PRE_DELAY_FRAMES - 1); 83 84 /* Make pre_delay_frames multiplies of DIVISION_FRAMES. This way we 85 * won't split a division of samples into two blocks of memory, so it is 86 * easier to process. This may make the actual delay time slightly less 87 * than the specified value, but the difference is less than 1ms. */ 88 pre_delay_frames &= ~DIVISION_FRAMES_MASK; 89 90 /* We need at least one division buffer, so the incoming data won't 91 * overwrite the output data */ 92 pre_delay_frames = max(pre_delay_frames, DIVISION_FRAMES); 93 94 if (dk->last_pre_delay_frames != pre_delay_frames) { 95 dk->last_pre_delay_frames = pre_delay_frames; 96 for (i = 0; i < DRC_NUM_CHANNELS; ++i) { 97 size_t size = sizeof(float) * MAX_PRE_DELAY_FRAMES; 98 memset(dk->pre_delay_buffers[i], 0, size); 99 } 100 101 dk->pre_delay_read_index = 0; 102 dk->pre_delay_write_index = pre_delay_frames; 103 } 104 } 105 106 /* Exponential curve for the knee. It is 1st derivative matched at 107 * dk->linear_threshold and asymptotically approaches the value 108 * dk->linear_threshold + 1 / k. 109 * 110 * This is used only when calculating the static curve, not used when actually 111 * compress the input data (knee_curveK below is used instead). 112 */ 113 static float knee_curve(struct drc_kernel *dk, float x, float k) 114 { 115 /* Linear up to threshold. */ 116 if (x < dk->linear_threshold) 117 return x; 118 119 return dk->linear_threshold + 120 (1 - knee_expf(-k * (x - dk->linear_threshold))) / k; 121 } 122 123 /* Approximate 1st derivative with input and output expressed in dB. This slope 124 * is equal to the inverse of the compression "ratio". In other words, a 125 * compression ratio of 20 would be a slope of 1/20. 126 */ 127 static float slope_at(struct drc_kernel *dk, float x, float k) 128 { 129 if (x < dk->linear_threshold) 130 return 1; 131 132 float x2 = x * 1.001; 133 134 float x_db = linear_to_decibels(x); 135 float x2Db = linear_to_decibels(x2); 136 137 float y_db = linear_to_decibels(knee_curve(dk, x, k)); 138 float y2Db = linear_to_decibels(knee_curve(dk, x2, k)); 139 140 float m = (y2Db - y_db) / (x2Db - x_db); 141 142 return m; 143 } 144 145 static float k_at_slope(struct drc_kernel *dk, float desired_slope) 146 { 147 float x_db = dk->db_threshold + dk->db_knee; 148 float x = decibels_to_linear(x_db); 149 150 /* Approximate k given initial values. */ 151 float minK = 0.1; 152 float maxK = 10000; 153 float k = 5; 154 unsigned int i; 155 156 for (i = 0; i < 15; ++i) { 157 /* A high value for k will more quickly asymptotically approach 158 * a slope of 0. */ 159 float slope = slope_at(dk, x, k); 160 161 if (slope < desired_slope) { 162 /* k is too high. */ 163 maxK = k; 164 } else { 165 /* k is too low. */ 166 minK = k; 167 } 168 169 /* Re-calculate based on geometric mean. */ 170 k = sqrtf(minK * maxK); 171 } 172 173 return k; 174 } 175 176 static void update_static_curve_parameters(struct drc_kernel *dk, 177 float db_threshold, 178 float db_knee, float ratio) 179 { 180 if (db_threshold != dk->db_threshold || db_knee != dk->db_knee || 181 ratio != dk->ratio) { 182 /* Threshold and knee. */ 183 dk->db_threshold = db_threshold; 184 dk->linear_threshold = decibels_to_linear(db_threshold); 185 dk->db_knee = db_knee; 186 187 /* Compute knee parameters. */ 188 dk->ratio = ratio; 189 dk->slope = 1 / dk->ratio; 190 191 float k = k_at_slope(dk, 1 / dk->ratio); 192 dk->K = k; 193 /* See knee_curveK() for details */ 194 dk->knee_alpha = dk->linear_threshold + 1 / k; 195 dk->knee_beta = -expf(k * dk->linear_threshold) / k; 196 197 dk->knee_threshold = decibels_to_linear(db_threshold + db_knee); 198 /* See volume_gain() for details */ 199 float y0 = knee_curve(dk, dk->knee_threshold, k); 200 dk->ratio_base = y0 * powf(dk->knee_threshold, -dk->slope); 201 } 202 } 203 204 /* This is the knee part of the compression curve. Returns the output level 205 * given the input level x. */ 206 static float knee_curveK(struct drc_kernel *dk, float x) 207 { 208 /* The formula in knee_curveK is dk->linear_threshold + 209 * (1 - expf(-k * (x - dk->linear_threshold))) / k 210 * which simplifies to (alpha + beta * expf(gamma)) 211 * where alpha = dk->linear_threshold + 1 / k 212 * beta = -expf(k * dk->linear_threshold) / k 213 * gamma = -k * x 214 */ 215 return dk->knee_alpha + dk->knee_beta * knee_expf(-dk->K * x); 216 } 217 218 /* Full compression curve with constant ratio after knee. Returns the ratio of 219 * output and input signal. */ 220 static float volume_gain(struct drc_kernel *dk, float x) 221 { 222 float y; 223 224 if (x < dk->knee_threshold) { 225 if (x < dk->linear_threshold) 226 return 1; 227 y = knee_curveK(dk, x) / x; 228 } else { 229 /* Constant ratio after knee. 230 * log(y/y0) = s * log(x/x0) 231 * => y = y0 * (x/x0)^s 232 * => y = [y0 * (1/x0)^s] * x^s 233 * => y = dk->ratio_base * x^s 234 * => y/x = dk->ratio_base * x^(s - 1) 235 * => y/x = dk->ratio_base * e^(log(x) * (s - 1)) 236 */ 237 y = dk->ratio_base * knee_expf(logf(x) * (dk->slope - 1)); 238 } 239 240 return y; 241 } 242 243 void dk_set_parameters(struct drc_kernel *dk, 244 float db_threshold, 245 float db_knee, 246 float ratio, 247 float attack_time, 248 float release_time, 249 float pre_delay_time, 250 float db_post_gain, 251 float releaseZone1, 252 float releaseZone2, 253 float releaseZone3, 254 float releaseZone4) 255 { 256 float sample_rate = dk->sample_rate; 257 258 update_static_curve_parameters(dk, db_threshold, db_knee, ratio); 259 260 /* Makeup gain. */ 261 float full_range_gain = volume_gain(dk, 1); 262 float full_range_makeup_gain = 1 / full_range_gain; 263 264 /* Empirical/perceptual tuning. */ 265 full_range_makeup_gain = powf(full_range_makeup_gain, 0.6f); 266 267 dk->master_linear_gain = decibels_to_linear(db_post_gain) * 268 full_range_makeup_gain; 269 270 /* Attack parameters. */ 271 attack_time = max(0.001f, attack_time); 272 dk->attack_frames = attack_time * sample_rate; 273 274 /* Release parameters. */ 275 float release_frames = sample_rate * release_time; 276 277 /* Detector release time. */ 278 float sat_release_time = 0.0025f; 279 float sat_release_frames = sat_release_time * sample_rate; 280 dk->sat_release_frames_inv_neg = -1 / sat_release_frames; 281 dk->sat_release_rate_at_neg_two_db = 282 decibels_to_linear(-2 * dk->sat_release_frames_inv_neg) - 1; 283 284 /* Create a smooth function which passes through four points. 285 * Polynomial of the form y = a + b*x + c*x^2 + d*x^3 + e*x^4 286 */ 287 float y1 = release_frames * releaseZone1; 288 float y2 = release_frames * releaseZone2; 289 float y3 = release_frames * releaseZone3; 290 float y4 = release_frames * releaseZone4; 291 292 /* All of these coefficients were derived for 4th order polynomial curve 293 * fitting where the y values match the evenly spaced x values as 294 * follows: (y1 : x == 0, y2 : x == 1, y3 : x == 2, y4 : x == 3) 295 */ 296 dk->kA = 0.9999999999999998f*y1 + 1.8432219684323923e-16f*y2 297 - 1.9373394351676423e-16f*y3 + 8.824516011816245e-18f*y4; 298 dk->kB = -1.5788320352845888f*y1 + 2.3305837032074286f*y2 299 - 0.9141194204840429f*y3 + 0.1623677525612032f*y4; 300 dk->kC = 0.5334142869106424f*y1 - 1.272736789213631f*y2 301 + 0.9258856042207512f*y3 - 0.18656310191776226f*y4; 302 dk->kD = 0.08783463138207234f*y1 - 0.1694162967925622f*y2 303 + 0.08588057951595272f*y3 - 0.00429891410546283f*y4; 304 dk->kE = -0.042416883008123074f*y1 + 0.1115693827987602f*y2 305 - 0.09764676325265872f*y3 + 0.028494263462021576f*y4; 306 307 /* x ranges from 0 -> 3 0 1 2 3 308 * -15 -10 -5 0db 309 * 310 * y calculates adaptive release frames depending on the amount of 311 * compression. 312 */ 313 set_pre_delay_time(dk, pre_delay_time); 314 } 315 316 void dk_set_enabled(struct drc_kernel *dk, int enabled) 317 { 318 dk->enabled = enabled; 319 } 320 321 /* Updates the envelope_rate used for the next division */ 322 static void dk_update_envelope(struct drc_kernel *dk) 323 { 324 const float kA = dk->kA; 325 const float kB = dk->kB; 326 const float kC = dk->kC; 327 const float kD = dk->kD; 328 const float kE = dk->kE; 329 const float attack_frames = dk->attack_frames; 330 331 /* Calculate desired gain */ 332 float desired_gain = dk->detector_average; 333 334 /* Pre-warp so we get desired_gain after sin() warp below. */ 335 float scaled_desired_gain = warp_asinf(desired_gain); 336 337 /* Deal with envelopes */ 338 339 /* envelope_rate is the rate we slew from current compressor level to 340 * the desired level. The exact rate depends on if we're attacking or 341 * releasing and by how much. 342 */ 343 float envelope_rate; 344 345 int is_releasing = scaled_desired_gain > dk->compressor_gain; 346 347 /* compression_diff_db is the difference between current compression 348 * level and the desired level. */ 349 float compression_diff_db = linear_to_decibels( 350 dk->compressor_gain / scaled_desired_gain); 351 352 if (is_releasing) { 353 /* Release mode - compression_diff_db should be negative dB */ 354 dk->max_attack_compression_diff_db = -INFINITY; 355 356 /* Fix gremlins. */ 357 if (isbadf(compression_diff_db)) 358 compression_diff_db = -1; 359 360 /* Adaptive release - higher compression (lower 361 * compression_diff_db) releases faster. Contain within range: 362 * -12 -> 0 then scale to go from 0 -> 3 363 */ 364 float x = compression_diff_db; 365 x = max(-12.0f, x); 366 x = min(0.0f, x); 367 x = 0.25f * (x + 12); 368 369 /* Compute adaptive release curve using 4th order polynomial. 370 * Normal values for the polynomial coefficients would create a 371 * monotonically increasing function. 372 */ 373 float x2 = x * x; 374 float x3 = x2 * x; 375 float x4 = x2 * x2; 376 float release_frames = kA + kB * x + kC * x2 + kD * x3 + 377 kE * x4; 378 379 #define kSpacingDb 5 380 float db_per_frame = kSpacingDb / release_frames; 381 envelope_rate = decibels_to_linear(db_per_frame); 382 } else { 383 /* Attack mode - compression_diff_db should be positive dB */ 384 385 /* Fix gremlins. */ 386 if (isbadf(compression_diff_db)) 387 compression_diff_db = 1; 388 389 /* As long as we're still in attack mode, use a rate based off 390 * the largest compression_diff_db we've encountered so far. 391 */ 392 dk->max_attack_compression_diff_db = max( 393 dk->max_attack_compression_diff_db, 394 compression_diff_db); 395 396 float eff_atten_diff_db = 397 max(0.5f, dk->max_attack_compression_diff_db); 398 399 float x = 0.25f / eff_atten_diff_db; 400 envelope_rate = 1 - powf(x, 1 / attack_frames); 401 } 402 403 dk->envelope_rate = envelope_rate; 404 dk->scaled_desired_gain = scaled_desired_gain; 405 } 406 407 /* For a division of frames, take the absolute values of left channel and right 408 * channel, store the maximum of them in output. */ 409 #ifdef __ARM_NEON__ 410 #include <arm_neon.h> 411 static inline void max_abs_division(float *output, float *data0, float *data1) 412 { 413 float32x4_t x, y; 414 int count = DIVISION_FRAMES / 4; 415 416 __asm__ __volatile__( 417 "1: \n" 418 "vld1.32 {%e[x],%f[x]}, [%[data0]]! \n" 419 "vld1.32 {%e[y],%f[y]}, [%[data1]]! \n" 420 "vabs.f32 %q[x], %q[x] \n" 421 "vabs.f32 %q[y], %q[y] \n" 422 "vmax.f32 %q[x], %q[y] \n" 423 "vst1.32 {%e[x],%f[x]}, [%[output]]! \n" 424 "subs %[count], #1 \n" 425 "bne 1b \n" 426 : /* output */ 427 "=r"(data0), 428 "=r"(data1), 429 "=r"(output), 430 "=r"(count), 431 [x]"=&w"(x), 432 [y]"=&w"(y) 433 : /* input */ 434 [data0]"0"(data0), 435 [data1]"1"(data1), 436 [output]"2"(output), 437 [count]"3"(count) 438 : /* clobber */ 439 "memory", "cc" 440 ); 441 } 442 #elif defined(__SSE3__) 443 #include <emmintrin.h> 444 static inline void max_abs_division(float *output, float *data0, float *data1) 445 { 446 __m128 x, y; 447 int count = DIVISION_FRAMES / 4; 448 449 __asm__ __volatile__( 450 "1: \n" 451 "lddqu (%[data0]), %[x] \n" 452 "lddqu (%[data1]), %[y] \n" 453 "andps %[mask], %[x] \n" 454 "andps %[mask], %[y] \n" 455 "maxps %[y], %[x] \n" 456 "movdqu %[x], (%[output]) \n" 457 "add $16, %[data0] \n" 458 "add $16, %[data1] \n" 459 "add $16, %[output] \n" 460 "sub $1, %[count] \n" 461 "jnz 1b \n" 462 : /* output */ 463 [data0]"+r"(data0), 464 [data1]"+r"(data1), 465 [output]"+r"(output), 466 [count]"+r"(count), 467 [x]"=&x"(x), 468 [y]"=&x"(y) 469 : /* input */ 470 [mask]"x"(_mm_set1_epi32(0x7fffffff)) 471 : /* clobber */ 472 "memory", "cc" 473 ); 474 } 475 #else 476 static inline void max_abs_division(float *output, float *data0, float *data1) 477 { 478 unsigned int i; 479 for (i = 0; i < DIVISION_FRAMES; i++) 480 output[i] = fmaxf(fabsf(data0[i]), fabsf(data1[i])); 481 } 482 #endif 483 484 /* Update detector_average from the last input division. */ 485 static void dk_update_detector_average(struct drc_kernel *dk) 486 { 487 float abs_input_array[DIVISION_FRAMES]; 488 const float sat_release_frames_inv_neg = dk->sat_release_frames_inv_neg; 489 const float sat_release_rate_at_neg_two_db = 490 dk->sat_release_rate_at_neg_two_db; 491 float detector_average = dk->detector_average; 492 unsigned int div_start, i; 493 494 /* Calculate the start index of the last input division */ 495 if (dk->pre_delay_write_index == 0) { 496 div_start = MAX_PRE_DELAY_FRAMES - DIVISION_FRAMES; 497 } else { 498 div_start = dk->pre_delay_write_index - DIVISION_FRAMES; 499 } 500 501 /* The max abs value across all channels for this frame */ 502 max_abs_division(abs_input_array, 503 &dk->pre_delay_buffers[0][div_start], 504 &dk->pre_delay_buffers[1][div_start]); 505 506 for (i = 0; i < DIVISION_FRAMES; i++) { 507 /* Compute compression amount from un-delayed signal */ 508 float abs_input = abs_input_array[i]; 509 510 /* Calculate shaped power on undelayed input. Put through 511 * shaping curve. This is linear up to the threshold, then 512 * enters a "knee" portion followed by the "ratio" portion. The 513 * transition from the threshold to the knee is smooth (1st 514 * derivative matched). The transition from the knee to the 515 * ratio portion is smooth (1st derivative matched). 516 */ 517 float gain = volume_gain(dk, abs_input); 518 int is_release = (gain > detector_average); 519 if (is_release) { 520 if (gain > NEG_TWO_DB) { 521 detector_average += (gain - detector_average) * 522 sat_release_rate_at_neg_two_db; 523 } else { 524 float gain_db = linear_to_decibels(gain); 525 float db_per_frame = gain_db * 526 sat_release_frames_inv_neg; 527 float sat_release_rate = 528 decibels_to_linear(db_per_frame) - 1; 529 detector_average += (gain - detector_average) * 530 sat_release_rate; 531 } 532 } else { 533 detector_average = gain; 534 } 535 536 /* Fix gremlins. */ 537 if (isbadf(detector_average)) 538 detector_average = 1.0f; 539 else 540 detector_average = min(detector_average, 1.0f); 541 } 542 543 dk->detector_average = detector_average; 544 } 545 546 /* Calculate compress_gain from the envelope and apply total_gain to compress 547 * the next output division. */ 548 #if defined(__ARM_NEON__) 549 #include <arm_neon.h> 550 static void dk_compress_output(struct drc_kernel *dk) 551 { 552 const float master_linear_gain = dk->master_linear_gain; 553 const float envelope_rate = dk->envelope_rate; 554 const float scaled_desired_gain = dk->scaled_desired_gain; 555 const float compressor_gain = dk->compressor_gain; 556 unsigned const int div_start = dk->pre_delay_read_index; 557 float *ptr_left = &dk->pre_delay_buffers[0][div_start]; 558 float *ptr_right = &dk->pre_delay_buffers[1][div_start]; 559 int count = DIVISION_FRAMES / 4; 560 561 /* See warp_sinf() for the details for the constants. */ 562 const float32x4_t A7 = vdupq_n_f32(-4.3330336920917034149169921875e-3f); 563 const float32x4_t A5 = vdupq_n_f32(7.9434238374233245849609375e-2f); 564 const float32x4_t A3 = vdupq_n_f32(-0.645892798900604248046875f); 565 const float32x4_t A1 = vdupq_n_f32(1.5707910060882568359375f); 566 567 /* Exponential approach to desired gain. */ 568 if (envelope_rate < 1) { 569 float c = compressor_gain - scaled_desired_gain; 570 float r = 1 - envelope_rate; 571 float32x4_t x0 = {c*r, c*r*r, c*r*r*r, c*r*r*r*r}; 572 float32x4_t x, x2, x4, left, right, tmp1, tmp2; 573 574 __asm__ __volatile( 575 "b 2f \n" 576 "1: \n" 577 "vmul.f32 %q[x0], %q[r4] \n" 578 "2: \n" 579 "vld1.32 {%e[left],%f[left]}, [%[ptr_left]] \n" 580 "vld1.32 {%e[right],%f[right]}, [%[ptr_right]] \n" 581 "vadd.f32 %q[x], %q[x0], %q[base] \n" 582 /* Calculate warp_sin() for four values in x. */ 583 "vmul.f32 %q[x2], %q[x], %q[x] \n" 584 "vmov.f32 %q[tmp1], %q[A5] \n" 585 "vmov.f32 %q[tmp2], %q[A1] \n" 586 "vmul.f32 %q[x4], %q[x2], %q[x2] \n" 587 "vmla.f32 %q[tmp1], %q[A7], %q[x2] \n" 588 "vmla.f32 %q[tmp2], %q[A3], %q[x2] \n" 589 "vmla.f32 %q[tmp2], %q[tmp1], %q[x4] \n" 590 "vmul.f32 %q[tmp2], %q[tmp2], %q[x] \n" 591 /* Now tmp2 contains the result of warp_sin(). */ 592 "vmul.f32 %q[tmp2], %q[tmp2], %q[g] \n" 593 "vmul.f32 %q[left], %q[tmp2] \n" 594 "vmul.f32 %q[right], %q[tmp2] \n" 595 "vst1.32 {%e[left],%f[left]}, [%[ptr_left]]! \n" 596 "vst1.32 {%e[right],%f[right]}, [%[ptr_right]]! \n" 597 "subs %[count], #1 \n" 598 "bne 1b \n" 599 : /* output */ 600 "=r"(count), 601 "=r"(ptr_left), 602 "=r"(ptr_right), 603 "=w"(x0), 604 [x]"=&w"(x), 605 [x2]"=&w"(x2), 606 [x4]"=&w"(x4), 607 [left]"=&w"(left), 608 [right]"=&w"(right), 609 [tmp1]"=&w"(tmp1), 610 [tmp2]"=&w"(tmp2) 611 : /* input */ 612 [count]"0"(count), 613 [ptr_left]"1"(ptr_left), 614 [ptr_right]"2"(ptr_right), 615 [x0]"3"(x0), 616 [A1]"w"(A1), 617 [A3]"w"(A3), 618 [A5]"w"(A5), 619 [A7]"w"(A7), 620 [base]"w"(vdupq_n_f32(scaled_desired_gain)), 621 [r4]"w"(vdupq_n_f32(r*r*r*r)), 622 [g]"w"(vdupq_n_f32(master_linear_gain)) 623 : /* clobber */ 624 "memory", "cc" 625 ); 626 dk->compressor_gain = x[3]; 627 } else { 628 float c = compressor_gain; 629 float r = envelope_rate; 630 float32x4_t x = {c*r, c*r*r, c*r*r*r, c*r*r*r*r}; 631 float32x4_t x2, x4, left, right, tmp1, tmp2; 632 633 __asm__ __volatile( 634 "b 2f \n" 635 "1: \n" 636 "vmul.f32 %q[x], %q[r4] \n" 637 "2: \n" 638 "vld1.32 {%e[left],%f[left]}, [%[ptr_left]] \n" 639 "vld1.32 {%e[right],%f[right]}, [%[ptr_right]] \n" 640 "vmin.f32 %q[x], %q[one] \n" 641 /* Calculate warp_sin() for four values in x. */ 642 "vmul.f32 %q[x2], %q[x], %q[x] \n" 643 "vmov.f32 %q[tmp1], %q[A5] \n" 644 "vmov.f32 %q[tmp2], %q[A1] \n" 645 "vmul.f32 %q[x4], %q[x2], %q[x2] \n" 646 "vmla.f32 %q[tmp1], %q[A7], %q[x2] \n" 647 "vmla.f32 %q[tmp2], %q[A3], %q[x2] \n" 648 "vmla.f32 %q[tmp2], %q[tmp1], %q[x4] \n" 649 "vmul.f32 %q[tmp2], %q[tmp2], %q[x] \n" 650 /* Now tmp2 contains the result of warp_sin(). */ 651 "vmul.f32 %q[tmp2], %q[tmp2], %q[g] \n" 652 "vmul.f32 %q[left], %q[tmp2] \n" 653 "vmul.f32 %q[right], %q[tmp2] \n" 654 "vst1.32 {%e[left],%f[left]}, [%[ptr_left]]! \n" 655 "vst1.32 {%e[right],%f[right]}, [%[ptr_right]]! \n" 656 "subs %[count], #1 \n" 657 "bne 1b \n" 658 : /* output */ 659 "=r"(count), 660 "=r"(ptr_left), 661 "=r"(ptr_right), 662 "=w"(x), 663 [x2]"=&w"(x2), 664 [x4]"=&w"(x4), 665 [left]"=&w"(left), 666 [right]"=&w"(right), 667 [tmp1]"=&w"(tmp1), 668 [tmp2]"=&w"(tmp2) 669 : /* input */ 670 [count]"0"(count), 671 [ptr_left]"1"(ptr_left), 672 [ptr_right]"2"(ptr_right), 673 [x]"3"(x), 674 [A1]"w"(A1), 675 [A3]"w"(A3), 676 [A5]"w"(A5), 677 [A7]"w"(A7), 678 [one]"w"(vdupq_n_f32(1)), 679 [r4]"w"(vdupq_n_f32(r*r*r*r)), 680 [g]"w"(vdupq_n_f32(master_linear_gain)) 681 : /* clobber */ 682 "memory", "cc" 683 ); 684 dk->compressor_gain = x[3]; 685 } 686 } 687 #elif defined(__SSE3__) && defined(__x86_64__) 688 #include <emmintrin.h> 689 static void dk_compress_output(struct drc_kernel *dk) 690 { 691 const float master_linear_gain = dk->master_linear_gain; 692 const float envelope_rate = dk->envelope_rate; 693 const float scaled_desired_gain = dk->scaled_desired_gain; 694 const float compressor_gain = dk->compressor_gain; 695 const int div_start = dk->pre_delay_read_index; 696 float *ptr_left = &dk->pre_delay_buffers[0][div_start]; 697 float *ptr_right = &dk->pre_delay_buffers[1][div_start]; 698 int count = DIVISION_FRAMES / 4; 699 700 /* See warp_sinf() for the details for the constants. */ 701 const __m128 A7 = _mm_set1_ps(-4.3330336920917034149169921875e-3f); 702 const __m128 A5 = _mm_set1_ps(7.9434238374233245849609375e-2f); 703 const __m128 A3 = _mm_set1_ps(-0.645892798900604248046875f); 704 const __m128 A1 = _mm_set1_ps(1.5707910060882568359375f); 705 706 /* Exponential approach to desired gain. */ 707 if (envelope_rate < 1) { 708 float c = compressor_gain - scaled_desired_gain; 709 float r = 1 - envelope_rate; 710 __m128 x0 = {c*r, c*r*r, c*r*r*r, c*r*r*r*r}; 711 __m128 x, x2, x4, left, right, tmp1, tmp2; 712 713 __asm__ __volatile( 714 "jmp 2f \n" 715 "1: \n" 716 "mulps %[r4], %[x0] \n" 717 "2: \n" 718 "lddqu (%[ptr_left]), %[left] \n" 719 "lddqu (%[ptr_right]), %[right] \n" 720 "movaps %[x0], %[x] \n" 721 "addps %[base], %[x] \n" 722 /* Calculate warp_sin() for four values in x. */ 723 "movaps %[x], %[x2] \n" 724 "mulps %[x], %[x2] \n" 725 "movaps %[x2], %[x4] \n" 726 "movaps %[x2], %[tmp1] \n" 727 "movaps %[x2], %[tmp2] \n" 728 "mulps %[x2], %[x4] \n" 729 "mulps %[A7], %[tmp1] \n" 730 "mulps %[A3], %[tmp2] \n" 731 "addps %[A5], %[tmp1] \n" 732 "addps %[A1], %[tmp2] \n" 733 "mulps %[x4], %[tmp1] \n" 734 "addps %[tmp1], %[tmp2] \n" 735 "mulps %[x], %[tmp2] \n" 736 /* Now tmp2 contains the result of warp_sin(). */ 737 "mulps %[g], %[tmp2] \n" 738 "mulps %[tmp2], %[left] \n" 739 "mulps %[tmp2], %[right] \n" 740 "movdqu %[left], (%[ptr_left]) \n" 741 "movdqu %[right], (%[ptr_right]) \n" 742 "add $16, %[ptr_left] \n" 743 "add $16, %[ptr_right] \n" 744 "sub $1, %[count] \n" 745 "jne 1b \n" 746 : /* output */ 747 "=r"(count), 748 "=r"(ptr_left), 749 "=r"(ptr_right), 750 "=x"(x0), 751 [x]"=&x"(x), 752 [x2]"=&x"(x2), 753 [x4]"=&x"(x4), 754 [left]"=&x"(left), 755 [right]"=&x"(right), 756 [tmp1]"=&x"(tmp1), 757 [tmp2]"=&x"(tmp2) 758 : /* input */ 759 [count]"0"(count), 760 [ptr_left]"1"(ptr_left), 761 [ptr_right]"2"(ptr_right), 762 [x0]"3"(x0), 763 [A1]"x"(A1), 764 [A3]"x"(A3), 765 [A5]"x"(A5), 766 [A7]"x"(A7), 767 [base]"x"(_mm_set1_ps(scaled_desired_gain)), 768 [r4]"x"(_mm_set1_ps(r*r*r*r)), 769 [g]"x"(_mm_set1_ps(master_linear_gain)) 770 : /* clobber */ 771 "memory", "cc" 772 ); 773 dk->compressor_gain = x[3]; 774 } else { 775 /* See warp_sinf() for the details for the constants. */ 776 __m128 A7 = _mm_set1_ps(-4.3330336920917034149169921875e-3f); 777 __m128 A5 = _mm_set1_ps(7.9434238374233245849609375e-2f); 778 __m128 A3 = _mm_set1_ps(-0.645892798900604248046875f); 779 __m128 A1 = _mm_set1_ps(1.5707910060882568359375f); 780 781 float c = compressor_gain; 782 float r = envelope_rate; 783 __m128 x = {c*r, c*r*r, c*r*r*r, c*r*r*r*r}; 784 __m128 x2, x4, left, right, tmp1, tmp2; 785 786 __asm__ __volatile( 787 "jmp 2f \n" 788 "1: \n" 789 "mulps %[r4], %[x] \n" 790 "2: \n" 791 "lddqu (%[ptr_left]), %[left] \n" 792 "lddqu (%[ptr_right]), %[right] \n" 793 "minps %[one], %[x] \n" 794 /* Calculate warp_sin() for four values in x. */ 795 "movaps %[x], %[x2] \n" 796 "mulps %[x], %[x2] \n" 797 "movaps %[x2], %[x4] \n" 798 "movaps %[x2], %[tmp1] \n" 799 "movaps %[x2], %[tmp2] \n" 800 "mulps %[x2], %[x4] \n" 801 "mulps %[A7], %[tmp1] \n" 802 "mulps %[A3], %[tmp2] \n" 803 "addps %[A5], %[tmp1] \n" 804 "addps %[A1], %[tmp2] \n" 805 "mulps %[x4], %[tmp1] \n" 806 "addps %[tmp1], %[tmp2] \n" 807 "mulps %[x], %[tmp2] \n" 808 /* Now tmp2 contains the result of warp_sin(). */ 809 "mulps %[g], %[tmp2] \n" 810 "mulps %[tmp2], %[left] \n" 811 "mulps %[tmp2], %[right] \n" 812 "movdqu %[left], (%[ptr_left]) \n" 813 "movdqu %[right], (%[ptr_right]) \n" 814 "add $16, %[ptr_left] \n" 815 "add $16, %[ptr_right] \n" 816 "sub $1, %[count] \n" 817 "jne 1b \n" 818 : /* output */ 819 "=r"(count), 820 "=r"(ptr_left), 821 "=r"(ptr_right), 822 "=x"(x), 823 [x2]"=&x"(x2), 824 [x4]"=&x"(x4), 825 [left]"=&x"(left), 826 [right]"=&x"(right), 827 [tmp1]"=&x"(tmp1), 828 [tmp2]"=&x"(tmp2) 829 : /* input */ 830 [count]"0"(count), 831 [ptr_left]"1"(ptr_left), 832 [ptr_right]"2"(ptr_right), 833 [x]"3"(x), 834 [A1]"x"(A1), 835 [A3]"x"(A3), 836 [A5]"x"(A5), 837 [A7]"x"(A7), 838 [one]"x"(_mm_set1_ps(1)), 839 [r4]"x"(_mm_set1_ps(r*r*r*r)), 840 [g]"x"(_mm_set1_ps(master_linear_gain)) 841 : /* clobber */ 842 "memory", "cc" 843 ); 844 dk->compressor_gain = x[3]; 845 } 846 } 847 #else 848 static void dk_compress_output(struct drc_kernel *dk) 849 { 850 const float master_linear_gain = dk->master_linear_gain; 851 const float envelope_rate = dk->envelope_rate; 852 const float scaled_desired_gain = dk->scaled_desired_gain; 853 const float compressor_gain = dk->compressor_gain; 854 const int div_start = dk->pre_delay_read_index; 855 float *ptr_left = &dk->pre_delay_buffers[0][div_start]; 856 float *ptr_right = &dk->pre_delay_buffers[1][div_start]; 857 unsigned int count = DIVISION_FRAMES / 4; 858 859 unsigned int i, j; 860 861 /* Exponential approach to desired gain. */ 862 if (envelope_rate < 1) { 863 /* Attack - reduce gain to desired. */ 864 float c = compressor_gain - scaled_desired_gain; 865 float base = scaled_desired_gain; 866 float r = 1 - envelope_rate; 867 float x[4] = {c*r, c*r*r, c*r*r*r, c*r*r*r*r}; 868 float r4 = r*r*r*r; 869 870 i = 0; 871 while (1) { 872 for (j = 0; j < 4; j++) { 873 /* Warp pre-compression gain to smooth out sharp 874 * exponential transition points. 875 */ 876 float post_warp_compressor_gain = 877 warp_sinf(x[j] + base); 878 879 /* Calculate total gain using master gain. */ 880 float total_gain = master_linear_gain * 881 post_warp_compressor_gain; 882 883 /* Apply final gain. */ 884 *ptr_left++ *= total_gain; 885 *ptr_right++ *= total_gain; 886 } 887 888 if (++i == count) 889 break; 890 891 for (j = 0; j < 4; j++) 892 x[j] = x[j] * r4; 893 } 894 895 dk->compressor_gain = x[3] + base; 896 } else { 897 /* Release - exponentially increase gain to 1.0 */ 898 float c = compressor_gain; 899 float r = envelope_rate; 900 float x[4] = {c*r, c*r*r, c*r*r*r, c*r*r*r*r}; 901 float r4 = r*r*r*r; 902 903 i = 0; 904 while (1) { 905 for (j = 0; j < 4; j++) { 906 /* Warp pre-compression gain to smooth out sharp 907 * exponential transition points. 908 */ 909 float post_warp_compressor_gain = 910 warp_sinf(x[j]); 911 912 /* Calculate total gain using master gain. */ 913 float total_gain = master_linear_gain * 914 post_warp_compressor_gain; 915 916 /* Apply final gain. */ 917 *ptr_left++ *= total_gain; 918 *ptr_right++ *= total_gain; 919 } 920 921 if (++i == count) 922 break; 923 924 for (j = 0; j < 4; j++) 925 x[j] = min(1.0f, x[j] * r4); 926 } 927 928 dk->compressor_gain = x[3]; 929 } 930 } 931 #endif 932 933 /* After one complete divison of samples have been received (and one divison of 934 * samples have been output), we calculate shaped power average 935 * (detector_average) from the input division, update envelope parameters from 936 * detector_average, then prepare the next output division by applying the 937 * envelope to compress the samples. 938 */ 939 static void dk_process_one_division(struct drc_kernel *dk) 940 { 941 dk_update_detector_average(dk); 942 dk_update_envelope(dk); 943 dk_compress_output(dk); 944 } 945 946 /* Copy the input data to the pre-delay buffer, and copy the output data back to 947 * the input buffer */ 948 static void dk_copy_fragment(struct drc_kernel *dk, float *data_channels[], 949 unsigned frame_index, int frames_to_process) 950 { 951 int write_index = dk->pre_delay_write_index; 952 int read_index = dk->pre_delay_read_index; 953 int j; 954 955 for (j = 0; j < DRC_NUM_CHANNELS; ++j) { 956 memcpy(&dk->pre_delay_buffers[j][write_index], 957 &data_channels[j][frame_index], 958 frames_to_process * sizeof(float)); 959 memcpy(&data_channels[j][frame_index], 960 &dk->pre_delay_buffers[j][read_index], 961 frames_to_process * sizeof(float)); 962 } 963 964 dk->pre_delay_write_index = (write_index + frames_to_process) & 965 MAX_PRE_DELAY_FRAMES_MASK; 966 dk->pre_delay_read_index = (read_index + frames_to_process) & 967 MAX_PRE_DELAY_FRAMES_MASK; 968 } 969 970 /* Delay the input sample only and don't do other processing. This is used when 971 * the kernel is disabled. We want to do this to match the processing delay in 972 * kernels of other bands. 973 */ 974 static void dk_process_delay_only(struct drc_kernel *dk, float *data_channels[], 975 unsigned count) 976 { 977 int read_index = dk->pre_delay_read_index; 978 int write_index = dk->pre_delay_write_index; 979 unsigned int i = 0; 980 981 while (i < count) { 982 unsigned int j; 983 unsigned int small = min(read_index, write_index); 984 unsigned int large = max(read_index, write_index); 985 /* chunk is the minimum of readable samples in contiguous 986 * buffer, writable samples in contiguous buffer, and the 987 * available input samples. */ 988 unsigned int chunk = min(large - small, 989 MAX_PRE_DELAY_FRAMES - large); 990 chunk = min(chunk, count - i); 991 for (j = 0; j < DRC_NUM_CHANNELS; ++j) { 992 memcpy(&dk->pre_delay_buffers[j][write_index], 993 &data_channels[j][i], 994 chunk * sizeof(float)); 995 memcpy(&data_channels[j][i], 996 &dk->pre_delay_buffers[j][read_index], 997 chunk * sizeof(float)); 998 } 999 read_index = (read_index + chunk) & MAX_PRE_DELAY_FRAMES_MASK; 1000 write_index = (write_index + chunk) & MAX_PRE_DELAY_FRAMES_MASK; 1001 i += chunk; 1002 } 1003 1004 dk->pre_delay_read_index = read_index; 1005 dk->pre_delay_write_index = write_index; 1006 } 1007 1008 void dk_process(struct drc_kernel *dk, float *data_channels[], unsigned count) 1009 { 1010 unsigned int i = 0; 1011 int fragment; 1012 1013 if (!dk->enabled) { 1014 dk_process_delay_only(dk, data_channels, count); 1015 return; 1016 } 1017 1018 if (!dk->processed) { 1019 dk_update_envelope(dk); 1020 dk_compress_output(dk); 1021 dk->processed = 1; 1022 } 1023 1024 int offset = dk->pre_delay_write_index & DIVISION_FRAMES_MASK; 1025 while (i < count) { 1026 fragment = min(DIVISION_FRAMES - offset, count - i); 1027 dk_copy_fragment(dk, data_channels, i, fragment); 1028 i += fragment; 1029 offset = (offset + fragment) & DIVISION_FRAMES_MASK; 1030 1031 /* Process the input division (32 frames). */ 1032 if (offset == 0) 1033 dk_process_one_division(dk); 1034 } 1035 } 1036