1 /* Copyright (c) 2013 The Chromium OS Authors. All rights reserved. 2 * Use of this source code is governed by a BSD-style license that can be 3 * found in the LICENSE file. 4 */ 5 6 /* Copyright (C) 2011 Google Inc. All rights reserved. 7 * Use of this source code is governed by a BSD-style license that can be 8 * found in the LICENSE.WEBKIT file. 9 */ 10 11 #include <stdio.h> 12 #include <stdlib.h> 13 #include <string.h> 14 15 #include "drc_math.h" 16 #include "drc_kernel.h" 17 18 #define MAX_PRE_DELAY_FRAMES 1024 19 #define MAX_PRE_DELAY_FRAMES_MASK (MAX_PRE_DELAY_FRAMES - 1) 20 #define DEFAULT_PRE_DELAY_FRAMES 256 21 #define DIVISION_FRAMES 32 22 #define DIVISION_FRAMES_MASK (DIVISION_FRAMES - 1) 23 24 #define assert_on_compile(e) ((void)sizeof(char[1 - 2 * !(e)])) 25 #define assert_on_compile_is_power_of_2(n) \ 26 assert_on_compile((n) != 0 && (((n) & ((n) - 1)) == 0)) 27 28 const float uninitialized_value = -1; 29 static int drc_math_initialized; 30 31 void dk_init(struct drc_kernel *dk, float sample_rate) 32 { 33 int i; 34 35 if (!drc_math_initialized) { 36 drc_math_initialized = 1; 37 drc_math_init(); 38 } 39 40 dk->sample_rate = sample_rate; 41 dk->detector_average = 0; 42 dk->compressor_gain = 1; 43 dk->enabled = 0; 44 dk->processed = 0; 45 dk->last_pre_delay_frames = DEFAULT_PRE_DELAY_FRAMES; 46 dk->pre_delay_read_index = 0; 47 dk->pre_delay_write_index = DEFAULT_PRE_DELAY_FRAMES; 48 dk->max_attack_compression_diff_db = -INFINITY; 49 dk->ratio = uninitialized_value; 50 dk->slope = uninitialized_value; 51 dk->linear_threshold = uninitialized_value; 52 dk->db_threshold = uninitialized_value; 53 dk->db_knee = uninitialized_value; 54 dk->knee_threshold = uninitialized_value; 55 dk->ratio_base = uninitialized_value; 56 dk->K = uninitialized_value; 57 58 assert_on_compile_is_power_of_2(DIVISION_FRAMES); 59 assert_on_compile(DIVISION_FRAMES % 4 == 0); 60 /* Allocate predelay buffers */ 61 assert_on_compile_is_power_of_2(MAX_PRE_DELAY_FRAMES); 62 for (i = 0; i < DRC_NUM_CHANNELS; i++) { 63 size_t size = sizeof(float) * MAX_PRE_DELAY_FRAMES; 64 dk->pre_delay_buffers[i] = (float *)calloc(1, size); 65 } 66 } 67 68 void dk_free(struct drc_kernel *dk) 69 { 70 int i; 71 for (i = 0; i < DRC_NUM_CHANNELS; ++i) 72 free(dk->pre_delay_buffers[i]); 73 } 74 75 /* Sets the pre-delay (lookahead) buffer size */ 76 static void set_pre_delay_time(struct drc_kernel *dk, float pre_delay_time) 77 { 78 int i; 79 /* Re-configure look-ahead section pre-delay if delay time has 80 * changed. */ 81 unsigned pre_delay_frames = pre_delay_time * dk->sample_rate; 82 pre_delay_frames = min(pre_delay_frames, MAX_PRE_DELAY_FRAMES - 1); 83 84 /* Make pre_delay_frames multiplies of DIVISION_FRAMES. This way we 85 * won't split a division of samples into two blocks of memory, so it is 86 * easier to process. This may make the actual delay time slightly less 87 * than the specified value, but the difference is less than 1ms. */ 88 pre_delay_frames &= ~DIVISION_FRAMES_MASK; 89 90 /* We need at least one division buffer, so the incoming data won't 91 * overwrite the output data */ 92 pre_delay_frames = max(pre_delay_frames, DIVISION_FRAMES); 93 94 if (dk->last_pre_delay_frames != pre_delay_frames) { 95 dk->last_pre_delay_frames = pre_delay_frames; 96 for (i = 0; i < DRC_NUM_CHANNELS; ++i) { 97 size_t size = sizeof(float) * MAX_PRE_DELAY_FRAMES; 98 memset(dk->pre_delay_buffers[i], 0, size); 99 } 100 101 dk->pre_delay_read_index = 0; 102 dk->pre_delay_write_index = pre_delay_frames; 103 } 104 } 105 106 /* Exponential curve for the knee. It is 1st derivative matched at 107 * dk->linear_threshold and asymptotically approaches the value 108 * dk->linear_threshold + 1 / k. 109 * 110 * This is used only when calculating the static curve, not used when actually 111 * compress the input data (knee_curveK below is used instead). 112 */ 113 static float knee_curve(struct drc_kernel *dk, float x, float k) 114 { 115 /* Linear up to threshold. */ 116 if (x < dk->linear_threshold) 117 return x; 118 119 return dk->linear_threshold + 120 (1 - knee_expf(-k * (x - dk->linear_threshold))) / k; 121 } 122 123 /* Approximate 1st derivative with input and output expressed in dB. This slope 124 * is equal to the inverse of the compression "ratio". In other words, a 125 * compression ratio of 20 would be a slope of 1/20. 126 */ 127 static float slope_at(struct drc_kernel *dk, float x, float k) 128 { 129 if (x < dk->linear_threshold) 130 return 1; 131 132 float x2 = x * 1.001; 133 134 float x_db = linear_to_decibels(x); 135 float x2Db = linear_to_decibels(x2); 136 137 float y_db = linear_to_decibels(knee_curve(dk, x, k)); 138 float y2Db = linear_to_decibels(knee_curve(dk, x2, k)); 139 140 float m = (y2Db - y_db) / (x2Db - x_db); 141 142 return m; 143 } 144 145 static float k_at_slope(struct drc_kernel *dk, float desired_slope) 146 { 147 float x_db = dk->db_threshold + dk->db_knee; 148 float x = decibels_to_linear(x_db); 149 150 /* Approximate k given initial values. */ 151 float minK = 0.1; 152 float maxK = 10000; 153 float k = 5; 154 int i; 155 156 for (i = 0; i < 15; ++i) { 157 /* A high value for k will more quickly asymptotically approach 158 * a slope of 0. */ 159 float slope = slope_at(dk, x, k); 160 161 if (slope < desired_slope) { 162 /* k is too high. */ 163 maxK = k; 164 } else { 165 /* k is too low. */ 166 minK = k; 167 } 168 169 /* Re-calculate based on geometric mean. */ 170 k = sqrtf(minK * maxK); 171 } 172 173 return k; 174 } 175 176 static void update_static_curve_parameters(struct drc_kernel *dk, 177 float db_threshold, 178 float db_knee, float ratio) 179 { 180 if (db_threshold != dk->db_threshold || db_knee != dk->db_knee || 181 ratio != dk->ratio) { 182 /* Threshold and knee. */ 183 dk->db_threshold = db_threshold; 184 dk->linear_threshold = decibels_to_linear(db_threshold); 185 dk->db_knee = db_knee; 186 187 /* Compute knee parameters. */ 188 dk->ratio = ratio; 189 dk->slope = 1 / dk->ratio; 190 191 float k = k_at_slope(dk, 1 / dk->ratio); 192 dk->K = k; 193 /* See knee_curveK() for details */ 194 dk->knee_alpha = dk->linear_threshold + 1 / k; 195 dk->knee_beta = -expf(k * dk->linear_threshold) / k; 196 197 dk->knee_threshold = decibels_to_linear(db_threshold + db_knee); 198 /* See volume_gain() for details */ 199 float y0 = knee_curve(dk, dk->knee_threshold, k); 200 dk->ratio_base = y0 * powf(dk->knee_threshold, -dk->slope); 201 } 202 } 203 204 /* This is the knee part of the compression curve. Returns the output level 205 * given the input level x. */ 206 static float knee_curveK(struct drc_kernel *dk, float x) 207 { 208 /* The formula in knee_curveK is dk->linear_threshold + 209 * (1 - expf(-k * (x - dk->linear_threshold))) / k 210 * which simplifies to (alpha + beta * expf(gamma)) 211 * where alpha = dk->linear_threshold + 1 / k 212 * beta = -expf(k * dk->linear_threshold) / k 213 * gamma = -k * x 214 */ 215 return dk->knee_alpha + dk->knee_beta * knee_expf(-dk->K * x); 216 } 217 218 /* Full compression curve with constant ratio after knee. Returns the ratio of 219 * output and input signal. */ 220 static float volume_gain(struct drc_kernel *dk, float x) 221 { 222 float y; 223 224 if (x < dk->knee_threshold) { 225 if (x < dk->linear_threshold) 226 return 1; 227 y = knee_curveK(dk, x) / x; 228 } else { 229 /* Constant ratio after knee. 230 * log(y/y0) = s * log(x/x0) 231 * => y = y0 * (x/x0)^s 232 * => y = [y0 * (1/x0)^s] * x^s 233 * => y = dk->ratio_base * x^s 234 * => y/x = dk->ratio_base * x^(s - 1) 235 * => y/x = dk->ratio_base * e^(log(x) * (s - 1)) 236 */ 237 y = dk->ratio_base * knee_expf(logf(x) * (dk->slope - 1)); 238 } 239 240 return y; 241 } 242 243 void dk_set_parameters(struct drc_kernel *dk, 244 float db_threshold, 245 float db_knee, 246 float ratio, 247 float attack_time, 248 float release_time, 249 float pre_delay_time, 250 float db_post_gain, 251 float releaseZone1, 252 float releaseZone2, 253 float releaseZone3, 254 float releaseZone4) 255 { 256 float sample_rate = dk->sample_rate; 257 258 update_static_curve_parameters(dk, db_threshold, db_knee, ratio); 259 260 /* Makeup gain. */ 261 float full_range_gain = volume_gain(dk, 1); 262 float full_range_makeup_gain = 1 / full_range_gain; 263 264 /* Empirical/perceptual tuning. */ 265 full_range_makeup_gain = powf(full_range_makeup_gain, 0.6f); 266 267 dk->master_linear_gain = decibels_to_linear(db_post_gain) * 268 full_range_makeup_gain; 269 270 /* Attack parameters. */ 271 attack_time = max(0.001f, attack_time); 272 dk->attack_frames = attack_time * sample_rate; 273 274 /* Release parameters. */ 275 float release_frames = sample_rate * release_time; 276 277 /* Detector release time. */ 278 float sat_release_time = 0.0025f; 279 float sat_release_frames = sat_release_time * sample_rate; 280 dk->sat_release_frames_inv_neg = -1 / sat_release_frames; 281 dk->sat_release_rate_at_neg_two_db = 282 decibels_to_linear(-2 * dk->sat_release_frames_inv_neg) - 1; 283 284 /* Create a smooth function which passes through four points. 285 * Polynomial of the form y = a + b*x + c*x^2 + d*x^3 + e*x^4 286 */ 287 float y1 = release_frames * releaseZone1; 288 float y2 = release_frames * releaseZone2; 289 float y3 = release_frames * releaseZone3; 290 float y4 = release_frames * releaseZone4; 291 292 /* All of these coefficients were derived for 4th order polynomial curve 293 * fitting where the y values match the evenly spaced x values as 294 * follows: (y1 : x == 0, y2 : x == 1, y3 : x == 2, y4 : x == 3) 295 */ 296 dk->kA = 0.9999999999999998f*y1 + 1.8432219684323923e-16f*y2 297 - 1.9373394351676423e-16f*y3 + 8.824516011816245e-18f*y4; 298 dk->kB = -1.5788320352845888f*y1 + 2.3305837032074286f*y2 299 - 0.9141194204840429f*y3 + 0.1623677525612032f*y4; 300 dk->kC = 0.5334142869106424f*y1 - 1.272736789213631f*y2 301 + 0.9258856042207512f*y3 - 0.18656310191776226f*y4; 302 dk->kD = 0.08783463138207234f*y1 - 0.1694162967925622f*y2 303 + 0.08588057951595272f*y3 - 0.00429891410546283f*y4; 304 dk->kE = -0.042416883008123074f*y1 + 0.1115693827987602f*y2 305 - 0.09764676325265872f*y3 + 0.028494263462021576f*y4; 306 307 /* x ranges from 0 -> 3 0 1 2 3 308 * -15 -10 -5 0db 309 * 310 * y calculates adaptive release frames depending on the amount of 311 * compression. 312 */ 313 set_pre_delay_time(dk, pre_delay_time); 314 } 315 316 void dk_set_enabled(struct drc_kernel *dk, int enabled) 317 { 318 dk->enabled = enabled; 319 } 320 321 /* Updates the envelope_rate used for the next division */ 322 static void dk_update_envelope(struct drc_kernel *dk) 323 { 324 const float kA = dk->kA; 325 const float kB = dk->kB; 326 const float kC = dk->kC; 327 const float kD = dk->kD; 328 const float kE = dk->kE; 329 const float attack_frames = dk->attack_frames; 330 331 /* Calculate desired gain */ 332 float desired_gain = dk->detector_average; 333 334 /* Pre-warp so we get desired_gain after sin() warp below. */ 335 float scaled_desired_gain = warp_asinf(desired_gain); 336 337 /* Deal with envelopes */ 338 339 /* envelope_rate is the rate we slew from current compressor level to 340 * the desired level. The exact rate depends on if we're attacking or 341 * releasing and by how much. 342 */ 343 float envelope_rate; 344 345 int is_releasing = scaled_desired_gain > dk->compressor_gain; 346 347 /* compression_diff_db is the difference between current compression 348 * level and the desired level. */ 349 float compression_diff_db = linear_to_decibels( 350 dk->compressor_gain / scaled_desired_gain); 351 352 if (is_releasing) { 353 /* Release mode - compression_diff_db should be negative dB */ 354 dk->max_attack_compression_diff_db = -INFINITY; 355 356 /* Fix gremlins. */ 357 if (isbadf(compression_diff_db)) 358 compression_diff_db = -1; 359 360 /* Adaptive release - higher compression (lower 361 * compression_diff_db) releases faster. Contain within range: 362 * -12 -> 0 then scale to go from 0 -> 3 363 */ 364 float x = compression_diff_db; 365 x = max(-12.0f, x); 366 x = min(0.0f, x); 367 x = 0.25f * (x + 12); 368 369 /* Compute adaptive release curve using 4th order polynomial. 370 * Normal values for the polynomial coefficients would create a 371 * monotonically increasing function. 372 */ 373 float x2 = x * x; 374 float x3 = x2 * x; 375 float x4 = x2 * x2; 376 float release_frames = kA + kB * x + kC * x2 + kD * x3 + 377 kE * x4; 378 379 #define kSpacingDb 5 380 float db_per_frame = kSpacingDb / release_frames; 381 envelope_rate = decibels_to_linear(db_per_frame); 382 } else { 383 /* Attack mode - compression_diff_db should be positive dB */ 384 385 /* Fix gremlins. */ 386 if (isbadf(compression_diff_db)) 387 compression_diff_db = 1; 388 389 /* As long as we're still in attack mode, use a rate based off 390 * the largest compression_diff_db we've encountered so far. 391 */ 392 dk->max_attack_compression_diff_db = max( 393 dk->max_attack_compression_diff_db, 394 compression_diff_db); 395 396 float eff_atten_diff_db = 397 max(0.5f, dk->max_attack_compression_diff_db); 398 399 float x = 0.25f / eff_atten_diff_db; 400 envelope_rate = 1 - powf(x, 1 / attack_frames); 401 } 402 403 dk->envelope_rate = envelope_rate; 404 dk->scaled_desired_gain = scaled_desired_gain; 405 } 406 407 /* For a division of frames, take the absolute values of left channel and right 408 * channel, store the maximum of them in output. */ 409 #if defined(__aarch64__) 410 static inline void max_abs_division(float *output, 411 const float *data0, const float *data1) 412 { 413 int count = DIVISION_FRAMES / 4; 414 415 __asm__ __volatile__( 416 "1: \n" 417 "ld1 {v0.4s}, [%[data0]], #16 \n" 418 "ld1 {v1.4s}, [%[data1]], #16 \n" 419 "fabs v0.4s, v0.4s \n" 420 "fabs v1.4s, v1.4s \n" 421 "fmax v0.4s, v0.4s, v1.4s \n" 422 "st1 {v0.4s}, [%[output]], #16 \n" 423 "subs %w[count], %w[count], #1 \n" 424 "b.ne 1b \n" 425 : /* output */ 426 [data0]"+r"(data0), 427 [data1]"+r"(data1), 428 [output]"+r"(output), 429 [count]"+r"(count) 430 : /* input */ 431 : /* clobber */ 432 "v0", "v1", "memory", "cc" 433 ); 434 } 435 #elif defined(__ARM_NEON__) 436 static inline void max_abs_division(float *output, 437 const float *data0, const float *data1) 438 { 439 int count = DIVISION_FRAMES / 4; 440 441 __asm__ __volatile__( 442 "1: \n" 443 "vld1.32 {q0}, [%[data0]]! \n" 444 "vld1.32 {q1}, [%[data1]]! \n" 445 "vabs.f32 q0, q0 \n" 446 "vabs.f32 q1, q1 \n" 447 "vmax.f32 q0, q1 \n" 448 "vst1.32 {q0}, [%[output]]! \n" 449 "subs %[count], #1 \n" 450 "bne 1b \n" 451 : /* output */ 452 [data0]"+r"(data0), 453 [data1]"+r"(data1), 454 [output]"+r"(output), 455 [count]"+r"(count) 456 : /* input */ 457 : /* clobber */ 458 "q0", "q1", "memory", "cc" 459 ); 460 } 461 #elif defined(__SSE3__) 462 #include <emmintrin.h> 463 static inline void max_abs_division(float *output, 464 const float *data0, const float *data1) 465 { 466 __m128 x, y; 467 int count = DIVISION_FRAMES / 4; 468 469 __asm__ __volatile__( 470 "1: \n" 471 "lddqu (%[data0]), %[x] \n" 472 "lddqu (%[data1]), %[y] \n" 473 "andps %[mask], %[x] \n" 474 "andps %[mask], %[y] \n" 475 "maxps %[y], %[x] \n" 476 "movdqu %[x], (%[output]) \n" 477 "add $16, %[data0] \n" 478 "add $16, %[data1] \n" 479 "add $16, %[output] \n" 480 "sub $1, %[count] \n" 481 "jnz 1b \n" 482 : /* output */ 483 [data0]"+r"(data0), 484 [data1]"+r"(data1), 485 [output]"+r"(output), 486 [count]"+r"(count), 487 [x]"=&x"(x), 488 [y]"=&x"(y) 489 : /* input */ 490 [mask]"x"(_mm_set1_epi32(0x7fffffff)) 491 : /* clobber */ 492 "memory", "cc" 493 ); 494 } 495 #else 496 static inline void max_abs_division(float *output, 497 const float *data0, const float *data1) 498 { 499 int i; 500 for (i = 0; i < DIVISION_FRAMES; i++) 501 output[i] = fmaxf(fabsf(data0[i]), fabsf(data1[i])); 502 } 503 #endif 504 505 /* Update detector_average from the last input division. */ 506 static void dk_update_detector_average(struct drc_kernel *dk) 507 { 508 float abs_input_array[DIVISION_FRAMES]; 509 const float sat_release_frames_inv_neg = dk->sat_release_frames_inv_neg; 510 const float sat_release_rate_at_neg_two_db = 511 dk->sat_release_rate_at_neg_two_db; 512 float detector_average = dk->detector_average; 513 int div_start, i; 514 515 /* Calculate the start index of the last input division */ 516 if (dk->pre_delay_write_index == 0) { 517 div_start = MAX_PRE_DELAY_FRAMES - DIVISION_FRAMES; 518 } else { 519 div_start = dk->pre_delay_write_index - DIVISION_FRAMES; 520 } 521 522 /* The max abs value across all channels for this frame */ 523 max_abs_division(abs_input_array, 524 &dk->pre_delay_buffers[0][div_start], 525 &dk->pre_delay_buffers[1][div_start]); 526 527 for (i = 0; i < DIVISION_FRAMES; i++) { 528 /* Compute compression amount from un-delayed signal */ 529 float abs_input = abs_input_array[i]; 530 531 /* Calculate shaped power on undelayed input. Put through 532 * shaping curve. This is linear up to the threshold, then 533 * enters a "knee" portion followed by the "ratio" portion. The 534 * transition from the threshold to the knee is smooth (1st 535 * derivative matched). The transition from the knee to the 536 * ratio portion is smooth (1st derivative matched). 537 */ 538 float gain = volume_gain(dk, abs_input); 539 int is_release = (gain > detector_average); 540 if (is_release) { 541 if (gain > NEG_TWO_DB) { 542 detector_average += (gain - detector_average) * 543 sat_release_rate_at_neg_two_db; 544 } else { 545 float gain_db = linear_to_decibels(gain); 546 float db_per_frame = gain_db * 547 sat_release_frames_inv_neg; 548 float sat_release_rate = 549 decibels_to_linear(db_per_frame) - 1; 550 detector_average += (gain - detector_average) * 551 sat_release_rate; 552 } 553 } else { 554 detector_average = gain; 555 } 556 557 /* Fix gremlins. */ 558 if (isbadf(detector_average)) 559 detector_average = 1.0f; 560 else 561 detector_average = min(detector_average, 1.0f); 562 } 563 564 dk->detector_average = detector_average; 565 } 566 567 /* Calculate compress_gain from the envelope and apply total_gain to compress 568 * the next output division. */ 569 /* TODO(fbarchard): Port to aarch64 */ 570 #if defined(__ARM_NEON__) 571 #include <arm_neon.h> 572 static void dk_compress_output(struct drc_kernel *dk) 573 { 574 const float master_linear_gain = dk->master_linear_gain; 575 const float envelope_rate = dk->envelope_rate; 576 const float scaled_desired_gain = dk->scaled_desired_gain; 577 const float compressor_gain = dk->compressor_gain; 578 const int div_start = dk->pre_delay_read_index; 579 float *ptr_left = &dk->pre_delay_buffers[0][div_start]; 580 float *ptr_right = &dk->pre_delay_buffers[1][div_start]; 581 int count = DIVISION_FRAMES / 4; 582 583 /* See warp_sinf() for the details for the constants. */ 584 const float32x4_t A7 = vdupq_n_f32(-4.3330336920917034149169921875e-3f); 585 const float32x4_t A5 = vdupq_n_f32(7.9434238374233245849609375e-2f); 586 const float32x4_t A3 = vdupq_n_f32(-0.645892798900604248046875f); 587 const float32x4_t A1 = vdupq_n_f32(1.5707910060882568359375f); 588 589 /* Exponential approach to desired gain. */ 590 if (envelope_rate < 1) { 591 float c = compressor_gain - scaled_desired_gain; 592 float r = 1 - envelope_rate; 593 float32x4_t x0 = {c*r, c*r*r, c*r*r*r, c*r*r*r*r}; 594 float32x4_t x, x2, x4, left, right, tmp1, tmp2; 595 596 __asm__ __volatile( 597 "b 2f \n" 598 "1: \n" 599 "vmul.f32 %q[x0], %q[r4] \n" 600 "2: \n" 601 "vld1.32 {%e[left],%f[left]}, [%[ptr_left]] \n" 602 "vld1.32 {%e[right],%f[right]}, [%[ptr_right]] \n" 603 "vadd.f32 %q[x], %q[x0], %q[base] \n" 604 /* Calculate warp_sin() for four values in x. */ 605 "vmul.f32 %q[x2], %q[x], %q[x] \n" 606 "vmov.f32 %q[tmp1], %q[A5] \n" 607 "vmov.f32 %q[tmp2], %q[A1] \n" 608 "vmul.f32 %q[x4], %q[x2], %q[x2] \n" 609 "vmla.f32 %q[tmp1], %q[A7], %q[x2] \n" 610 "vmla.f32 %q[tmp2], %q[A3], %q[x2] \n" 611 "vmla.f32 %q[tmp2], %q[tmp1], %q[x4] \n" 612 "vmul.f32 %q[tmp2], %q[tmp2], %q[x] \n" 613 /* Now tmp2 contains the result of warp_sin(). */ 614 "vmul.f32 %q[tmp2], %q[tmp2], %q[g] \n" 615 "vmul.f32 %q[left], %q[tmp2] \n" 616 "vmul.f32 %q[right], %q[tmp2] \n" 617 "vst1.32 {%e[left],%f[left]}, [%[ptr_left]]! \n" 618 "vst1.32 {%e[right],%f[right]}, [%[ptr_right]]! \n" 619 "subs %[count], #1 \n" 620 "bne 1b \n" 621 : /* output */ 622 "=r"(count), 623 "=r"(ptr_left), 624 "=r"(ptr_right), 625 "=w"(x0), 626 [x]"=&w"(x), 627 [x2]"=&w"(x2), 628 [x4]"=&w"(x4), 629 [left]"=&w"(left), 630 [right]"=&w"(right), 631 [tmp1]"=&w"(tmp1), 632 [tmp2]"=&w"(tmp2) 633 : /* input */ 634 [count]"0"(count), 635 [ptr_left]"1"(ptr_left), 636 [ptr_right]"2"(ptr_right), 637 [x0]"3"(x0), 638 [A1]"w"(A1), 639 [A3]"w"(A3), 640 [A5]"w"(A5), 641 [A7]"w"(A7), 642 [base]"w"(vdupq_n_f32(scaled_desired_gain)), 643 [r4]"w"(vdupq_n_f32(r*r*r*r)), 644 [g]"w"(vdupq_n_f32(master_linear_gain)) 645 : /* clobber */ 646 "memory", "cc" 647 ); 648 dk->compressor_gain = x[3]; 649 } else { 650 float c = compressor_gain; 651 float r = envelope_rate; 652 float32x4_t x = {c*r, c*r*r, c*r*r*r, c*r*r*r*r}; 653 float32x4_t x2, x4, left, right, tmp1, tmp2; 654 655 __asm__ __volatile( 656 "b 2f \n" 657 "1: \n" 658 "vmul.f32 %q[x], %q[r4] \n" 659 "2: \n" 660 "vld1.32 {%e[left],%f[left]}, [%[ptr_left]] \n" 661 "vld1.32 {%e[right],%f[right]}, [%[ptr_right]] \n" 662 "vmin.f32 %q[x], %q[one] \n" 663 /* Calculate warp_sin() for four values in x. */ 664 "vmul.f32 %q[x2], %q[x], %q[x] \n" 665 "vmov.f32 %q[tmp1], %q[A5] \n" 666 "vmov.f32 %q[tmp2], %q[A1] \n" 667 "vmul.f32 %q[x4], %q[x2], %q[x2] \n" 668 "vmla.f32 %q[tmp1], %q[A7], %q[x2] \n" 669 "vmla.f32 %q[tmp2], %q[A3], %q[x2] \n" 670 "vmla.f32 %q[tmp2], %q[tmp1], %q[x4] \n" 671 "vmul.f32 %q[tmp2], %q[tmp2], %q[x] \n" 672 /* Now tmp2 contains the result of warp_sin(). */ 673 "vmul.f32 %q[tmp2], %q[tmp2], %q[g] \n" 674 "vmul.f32 %q[left], %q[tmp2] \n" 675 "vmul.f32 %q[right], %q[tmp2] \n" 676 "vst1.32 {%e[left],%f[left]}, [%[ptr_left]]! \n" 677 "vst1.32 {%e[right],%f[right]}, [%[ptr_right]]! \n" 678 "subs %[count], #1 \n" 679 "bne 1b \n" 680 : /* output */ 681 "=r"(count), 682 "=r"(ptr_left), 683 "=r"(ptr_right), 684 "=w"(x), 685 [x2]"=&w"(x2), 686 [x4]"=&w"(x4), 687 [left]"=&w"(left), 688 [right]"=&w"(right), 689 [tmp1]"=&w"(tmp1), 690 [tmp2]"=&w"(tmp2) 691 : /* input */ 692 [count]"0"(count), 693 [ptr_left]"1"(ptr_left), 694 [ptr_right]"2"(ptr_right), 695 [x]"3"(x), 696 [A1]"w"(A1), 697 [A3]"w"(A3), 698 [A5]"w"(A5), 699 [A7]"w"(A7), 700 [one]"w"(vdupq_n_f32(1)), 701 [r4]"w"(vdupq_n_f32(r*r*r*r)), 702 [g]"w"(vdupq_n_f32(master_linear_gain)) 703 : /* clobber */ 704 "memory", "cc" 705 ); 706 dk->compressor_gain = x[3]; 707 } 708 } 709 #elif defined(__SSE3__) && defined(__x86_64__) 710 #include <emmintrin.h> 711 static void dk_compress_output(struct drc_kernel *dk) 712 { 713 const float master_linear_gain = dk->master_linear_gain; 714 const float envelope_rate = dk->envelope_rate; 715 const float scaled_desired_gain = dk->scaled_desired_gain; 716 const float compressor_gain = dk->compressor_gain; 717 const int div_start = dk->pre_delay_read_index; 718 float *ptr_left = &dk->pre_delay_buffers[0][div_start]; 719 float *ptr_right = &dk->pre_delay_buffers[1][div_start]; 720 int count = DIVISION_FRAMES / 4; 721 722 /* See warp_sinf() for the details for the constants. */ 723 const __m128 A7 = _mm_set1_ps(-4.3330336920917034149169921875e-3f); 724 const __m128 A5 = _mm_set1_ps(7.9434238374233245849609375e-2f); 725 const __m128 A3 = _mm_set1_ps(-0.645892798900604248046875f); 726 const __m128 A1 = _mm_set1_ps(1.5707910060882568359375f); 727 728 /* Exponential approach to desired gain. */ 729 if (envelope_rate < 1) { 730 float c = compressor_gain - scaled_desired_gain; 731 float r = 1 - envelope_rate; 732 __m128 x0 = {c*r, c*r*r, c*r*r*r, c*r*r*r*r}; 733 __m128 x, x2, x4, left, right, tmp1, tmp2; 734 735 __asm__ __volatile( 736 "jmp 2f \n" 737 "1: \n" 738 "mulps %[r4], %[x0] \n" 739 "2: \n" 740 "lddqu (%[ptr_left]), %[left] \n" 741 "lddqu (%[ptr_right]), %[right] \n" 742 "movaps %[x0], %[x] \n" 743 "addps %[base], %[x] \n" 744 /* Calculate warp_sin() for four values in x. */ 745 "movaps %[x], %[x2] \n" 746 "mulps %[x], %[x2] \n" 747 "movaps %[x2], %[x4] \n" 748 "movaps %[x2], %[tmp1] \n" 749 "movaps %[x2], %[tmp2] \n" 750 "mulps %[x2], %[x4] \n" 751 "mulps %[A7], %[tmp1] \n" 752 "mulps %[A3], %[tmp2] \n" 753 "addps %[A5], %[tmp1] \n" 754 "addps %[A1], %[tmp2] \n" 755 "mulps %[x4], %[tmp1] \n" 756 "addps %[tmp1], %[tmp2] \n" 757 "mulps %[x], %[tmp2] \n" 758 /* Now tmp2 contains the result of warp_sin(). */ 759 "mulps %[g], %[tmp2] \n" 760 "mulps %[tmp2], %[left] \n" 761 "mulps %[tmp2], %[right] \n" 762 "movdqu %[left], (%[ptr_left]) \n" 763 "movdqu %[right], (%[ptr_right]) \n" 764 "add $16, %[ptr_left] \n" 765 "add $16, %[ptr_right] \n" 766 "sub $1, %[count] \n" 767 "jne 1b \n" 768 : /* output */ 769 "=r"(count), 770 "=r"(ptr_left), 771 "=r"(ptr_right), 772 "=x"(x0), 773 [x]"=&x"(x), 774 [x2]"=&x"(x2), 775 [x4]"=&x"(x4), 776 [left]"=&x"(left), 777 [right]"=&x"(right), 778 [tmp1]"=&x"(tmp1), 779 [tmp2]"=&x"(tmp2) 780 : /* input */ 781 [count]"0"(count), 782 [ptr_left]"1"(ptr_left), 783 [ptr_right]"2"(ptr_right), 784 [x0]"3"(x0), 785 [A1]"x"(A1), 786 [A3]"x"(A3), 787 [A5]"x"(A5), 788 [A7]"x"(A7), 789 [base]"x"(_mm_set1_ps(scaled_desired_gain)), 790 [r4]"x"(_mm_set1_ps(r*r*r*r)), 791 [g]"x"(_mm_set1_ps(master_linear_gain)) 792 : /* clobber */ 793 "memory", "cc" 794 ); 795 dk->compressor_gain = x[3]; 796 } else { 797 /* See warp_sinf() for the details for the constants. */ 798 __m128 A7 = _mm_set1_ps(-4.3330336920917034149169921875e-3f); 799 __m128 A5 = _mm_set1_ps(7.9434238374233245849609375e-2f); 800 __m128 A3 = _mm_set1_ps(-0.645892798900604248046875f); 801 __m128 A1 = _mm_set1_ps(1.5707910060882568359375f); 802 803 float c = compressor_gain; 804 float r = envelope_rate; 805 __m128 x = {c*r, c*r*r, c*r*r*r, c*r*r*r*r}; 806 __m128 x2, x4, left, right, tmp1, tmp2; 807 808 __asm__ __volatile( 809 "jmp 2f \n" 810 "1: \n" 811 "mulps %[r4], %[x] \n" 812 "2: \n" 813 "lddqu (%[ptr_left]), %[left] \n" 814 "lddqu (%[ptr_right]), %[right] \n" 815 "minps %[one], %[x] \n" 816 /* Calculate warp_sin() for four values in x. */ 817 "movaps %[x], %[x2] \n" 818 "mulps %[x], %[x2] \n" 819 "movaps %[x2], %[x4] \n" 820 "movaps %[x2], %[tmp1] \n" 821 "movaps %[x2], %[tmp2] \n" 822 "mulps %[x2], %[x4] \n" 823 "mulps %[A7], %[tmp1] \n" 824 "mulps %[A3], %[tmp2] \n" 825 "addps %[A5], %[tmp1] \n" 826 "addps %[A1], %[tmp2] \n" 827 "mulps %[x4], %[tmp1] \n" 828 "addps %[tmp1], %[tmp2] \n" 829 "mulps %[x], %[tmp2] \n" 830 /* Now tmp2 contains the result of warp_sin(). */ 831 "mulps %[g], %[tmp2] \n" 832 "mulps %[tmp2], %[left] \n" 833 "mulps %[tmp2], %[right] \n" 834 "movdqu %[left], (%[ptr_left]) \n" 835 "movdqu %[right], (%[ptr_right]) \n" 836 "add $16, %[ptr_left] \n" 837 "add $16, %[ptr_right] \n" 838 "sub $1, %[count] \n" 839 "jne 1b \n" 840 : /* output */ 841 "=r"(count), 842 "=r"(ptr_left), 843 "=r"(ptr_right), 844 "=x"(x), 845 [x2]"=&x"(x2), 846 [x4]"=&x"(x4), 847 [left]"=&x"(left), 848 [right]"=&x"(right), 849 [tmp1]"=&x"(tmp1), 850 [tmp2]"=&x"(tmp2) 851 : /* input */ 852 [count]"0"(count), 853 [ptr_left]"1"(ptr_left), 854 [ptr_right]"2"(ptr_right), 855 [x]"3"(x), 856 [A1]"x"(A1), 857 [A3]"x"(A3), 858 [A5]"x"(A5), 859 [A7]"x"(A7), 860 [one]"x"(_mm_set1_ps(1)), 861 [r4]"x"(_mm_set1_ps(r*r*r*r)), 862 [g]"x"(_mm_set1_ps(master_linear_gain)) 863 : /* clobber */ 864 "memory", "cc" 865 ); 866 dk->compressor_gain = x[3]; 867 } 868 } 869 #else 870 static void dk_compress_output(struct drc_kernel *dk) 871 { 872 const float master_linear_gain = dk->master_linear_gain; 873 const float envelope_rate = dk->envelope_rate; 874 const float scaled_desired_gain = dk->scaled_desired_gain; 875 const float compressor_gain = dk->compressor_gain; 876 const int div_start = dk->pre_delay_read_index; 877 float *ptr_left = &dk->pre_delay_buffers[0][div_start]; 878 float *ptr_right = &dk->pre_delay_buffers[1][div_start]; 879 int count = DIVISION_FRAMES / 4; 880 881 int i, j; 882 883 /* Exponential approach to desired gain. */ 884 if (envelope_rate < 1) { 885 /* Attack - reduce gain to desired. */ 886 float c = compressor_gain - scaled_desired_gain; 887 float base = scaled_desired_gain; 888 float r = 1 - envelope_rate; 889 float x[4] = {c*r, c*r*r, c*r*r*r, c*r*r*r*r}; 890 float r4 = r*r*r*r; 891 892 i = 0; 893 while (1) { 894 for (j = 0; j < 4; j++) { 895 /* Warp pre-compression gain to smooth out sharp 896 * exponential transition points. 897 */ 898 float post_warp_compressor_gain = 899 warp_sinf(x[j] + base); 900 901 /* Calculate total gain using master gain. */ 902 float total_gain = master_linear_gain * 903 post_warp_compressor_gain; 904 905 /* Apply final gain. */ 906 *ptr_left++ *= total_gain; 907 *ptr_right++ *= total_gain; 908 } 909 910 if (++i == count) 911 break; 912 913 for (j = 0; j < 4; j++) 914 x[j] = x[j] * r4; 915 } 916 917 dk->compressor_gain = x[3] + base; 918 } else { 919 /* Release - exponentially increase gain to 1.0 */ 920 float c = compressor_gain; 921 float r = envelope_rate; 922 float x[4] = {c*r, c*r*r, c*r*r*r, c*r*r*r*r}; 923 float r4 = r*r*r*r; 924 925 i = 0; 926 while (1) { 927 for (j = 0; j < 4; j++) { 928 /* Warp pre-compression gain to smooth out sharp 929 * exponential transition points. 930 */ 931 float post_warp_compressor_gain = 932 warp_sinf(x[j]); 933 934 /* Calculate total gain using master gain. */ 935 float total_gain = master_linear_gain * 936 post_warp_compressor_gain; 937 938 /* Apply final gain. */ 939 *ptr_left++ *= total_gain; 940 *ptr_right++ *= total_gain; 941 } 942 943 if (++i == count) 944 break; 945 946 for (j = 0; j < 4; j++) 947 x[j] = min(1.0f, x[j] * r4); 948 } 949 950 dk->compressor_gain = x[3]; 951 } 952 } 953 #endif 954 955 /* After one complete divison of samples have been received (and one divison of 956 * samples have been output), we calculate shaped power average 957 * (detector_average) from the input division, update envelope parameters from 958 * detector_average, then prepare the next output division by applying the 959 * envelope to compress the samples. 960 */ 961 static void dk_process_one_division(struct drc_kernel *dk) 962 { 963 dk_update_detector_average(dk); 964 dk_update_envelope(dk); 965 dk_compress_output(dk); 966 } 967 968 /* Copy the input data to the pre-delay buffer, and copy the output data back to 969 * the input buffer */ 970 static void dk_copy_fragment(struct drc_kernel *dk, float *data_channels[], 971 unsigned frame_index, int frames_to_process) 972 { 973 int write_index = dk->pre_delay_write_index; 974 int read_index = dk->pre_delay_read_index; 975 int j; 976 977 for (j = 0; j < DRC_NUM_CHANNELS; ++j) { 978 memcpy(&dk->pre_delay_buffers[j][write_index], 979 &data_channels[j][frame_index], 980 frames_to_process * sizeof(float)); 981 memcpy(&data_channels[j][frame_index], 982 &dk->pre_delay_buffers[j][read_index], 983 frames_to_process * sizeof(float)); 984 } 985 986 dk->pre_delay_write_index = (write_index + frames_to_process) & 987 MAX_PRE_DELAY_FRAMES_MASK; 988 dk->pre_delay_read_index = (read_index + frames_to_process) & 989 MAX_PRE_DELAY_FRAMES_MASK; 990 } 991 992 /* Delay the input sample only and don't do other processing. This is used when 993 * the kernel is disabled. We want to do this to match the processing delay in 994 * kernels of other bands. 995 */ 996 static void dk_process_delay_only(struct drc_kernel *dk, float *data_channels[], 997 unsigned count) 998 { 999 int read_index = dk->pre_delay_read_index; 1000 int write_index = dk->pre_delay_write_index; 1001 int i = 0; 1002 1003 while (i < count) { 1004 int j; 1005 int small = min(read_index, write_index); 1006 int large = max(read_index, write_index); 1007 /* chunk is the minimum of readable samples in contiguous 1008 * buffer, writable samples in contiguous buffer, and the 1009 * available input samples. */ 1010 int chunk = min(large - small, MAX_PRE_DELAY_FRAMES - large); 1011 chunk = min(chunk, count - i); 1012 for (j = 0; j < DRC_NUM_CHANNELS; ++j) { 1013 memcpy(&dk->pre_delay_buffers[j][write_index], 1014 &data_channels[j][i], 1015 chunk * sizeof(float)); 1016 memcpy(&data_channels[j][i], 1017 &dk->pre_delay_buffers[j][read_index], 1018 chunk * sizeof(float)); 1019 } 1020 read_index = (read_index + chunk) & MAX_PRE_DELAY_FRAMES_MASK; 1021 write_index = (write_index + chunk) & MAX_PRE_DELAY_FRAMES_MASK; 1022 i += chunk; 1023 } 1024 1025 dk->pre_delay_read_index = read_index; 1026 dk->pre_delay_write_index = write_index; 1027 } 1028 1029 void dk_process(struct drc_kernel *dk, float *data_channels[], unsigned count) 1030 { 1031 int i = 0; 1032 int fragment; 1033 1034 if (!dk->enabled) { 1035 dk_process_delay_only(dk, data_channels, count); 1036 return; 1037 } 1038 1039 if (!dk->processed) { 1040 dk_update_envelope(dk); 1041 dk_compress_output(dk); 1042 dk->processed = 1; 1043 } 1044 1045 int offset = dk->pre_delay_write_index & DIVISION_FRAMES_MASK; 1046 while (i < count) { 1047 fragment = min(DIVISION_FRAMES - offset, count - i); 1048 dk_copy_fragment(dk, data_channels, i, fragment); 1049 i += fragment; 1050 offset = (offset + fragment) & DIVISION_FRAMES_MASK; 1051 1052 /* Process the input division (32 frames). */ 1053 if (offset == 0) 1054 dk_process_one_division(dk); 1055 } 1056 } 1057