Home | History | Annotate | Download | only in dsp
      1 /* Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
      2  * Use of this source code is governed by a BSD-style license that can be
      3  * found in the LICENSE file.
      4  */
      5 
      6 /* Copyright (C) 2011 Google Inc. All rights reserved.
      7  * Use of this source code is governed by a BSD-style license that can be
      8  * found in the LICENSE.WEBKIT file.
      9  */
     10 
     11 #include <stdio.h>
     12 #include <stdlib.h>
     13 #include <string.h>
     14 
     15 #include "drc_math.h"
     16 #include "drc_kernel.h"
     17 
     18 #define MAX_PRE_DELAY_FRAMES 1024
     19 #define MAX_PRE_DELAY_FRAMES_MASK (MAX_PRE_DELAY_FRAMES - 1)
     20 #define DEFAULT_PRE_DELAY_FRAMES 256
     21 #define DIVISION_FRAMES 32
     22 #define DIVISION_FRAMES_MASK (DIVISION_FRAMES - 1)
     23 
     24 #define assert_on_compile(e) ((void)sizeof(char[1 - 2 * !(e)]))
     25 #define assert_on_compile_is_power_of_2(n) \
     26 	assert_on_compile((n) != 0 && (((n) & ((n) - 1)) == 0))
     27 
     28 const float uninitialized_value = -1;
     29 static int drc_math_initialized;
     30 
     31 void dk_init(struct drc_kernel *dk, float sample_rate)
     32 {
     33 	int i;
     34 
     35 	if (!drc_math_initialized) {
     36 		drc_math_initialized = 1;
     37 		drc_math_init();
     38 	}
     39 
     40 	dk->sample_rate = sample_rate;
     41 	dk->detector_average = 0;
     42 	dk->compressor_gain = 1;
     43 	dk->enabled = 0;
     44 	dk->processed = 0;
     45 	dk->last_pre_delay_frames = DEFAULT_PRE_DELAY_FRAMES;
     46 	dk->pre_delay_read_index = 0;
     47 	dk->pre_delay_write_index = DEFAULT_PRE_DELAY_FRAMES;
     48 	dk->max_attack_compression_diff_db = -INFINITY;
     49 	dk->ratio = uninitialized_value;
     50 	dk->slope = uninitialized_value;
     51 	dk->linear_threshold = uninitialized_value;
     52 	dk->db_threshold = uninitialized_value;
     53 	dk->db_knee = uninitialized_value;
     54 	dk->knee_threshold = uninitialized_value;
     55 	dk->ratio_base = uninitialized_value;
     56 	dk->K = uninitialized_value;
     57 
     58 	assert_on_compile_is_power_of_2(DIVISION_FRAMES);
     59 	assert_on_compile(DIVISION_FRAMES % 4 == 0);
     60 	/* Allocate predelay buffers */
     61 	assert_on_compile_is_power_of_2(MAX_PRE_DELAY_FRAMES);
     62 	for (i = 0; i < DRC_NUM_CHANNELS; i++) {
     63 		size_t size = sizeof(float) * MAX_PRE_DELAY_FRAMES;
     64 		dk->pre_delay_buffers[i] = (float *)calloc(1, size);
     65 	}
     66 }
     67 
     68 void dk_free(struct drc_kernel *dk)
     69 {
     70 	int i;
     71 	for (i = 0; i < DRC_NUM_CHANNELS; ++i)
     72 		free(dk->pre_delay_buffers[i]);
     73 }
     74 
     75 /* Sets the pre-delay (lookahead) buffer size */
     76 static void set_pre_delay_time(struct drc_kernel *dk, float pre_delay_time)
     77 {
     78 	int i;
     79 	/* Re-configure look-ahead section pre-delay if delay time has
     80 	 * changed. */
     81 	unsigned pre_delay_frames = pre_delay_time * dk->sample_rate;
     82 	pre_delay_frames = min(pre_delay_frames, MAX_PRE_DELAY_FRAMES - 1);
     83 
     84 	/* Make pre_delay_frames multiplies of DIVISION_FRAMES. This way we
     85 	 * won't split a division of samples into two blocks of memory, so it is
     86 	 * easier to process. This may make the actual delay time slightly less
     87 	 * than the specified value, but the difference is less than 1ms. */
     88 	pre_delay_frames &= ~DIVISION_FRAMES_MASK;
     89 
     90 	/* We need at least one division buffer, so the incoming data won't
     91 	 * overwrite the output data */
     92 	pre_delay_frames = max(pre_delay_frames, DIVISION_FRAMES);
     93 
     94 	if (dk->last_pre_delay_frames != pre_delay_frames) {
     95 		dk->last_pre_delay_frames = pre_delay_frames;
     96 		for (i = 0; i < DRC_NUM_CHANNELS; ++i) {
     97 			size_t size = sizeof(float) * MAX_PRE_DELAY_FRAMES;
     98 			memset(dk->pre_delay_buffers[i], 0, size);
     99 		}
    100 
    101 		dk->pre_delay_read_index = 0;
    102 		dk->pre_delay_write_index = pre_delay_frames;
    103 	}
    104 }
    105 
    106 /* Exponential curve for the knee.  It is 1st derivative matched at
    107  * dk->linear_threshold and asymptotically approaches the value
    108  * dk->linear_threshold + 1 / k.
    109  *
    110  * This is used only when calculating the static curve, not used when actually
    111  * compress the input data (knee_curveK below is used instead).
    112  */
    113 static float knee_curve(struct drc_kernel *dk, float x, float k)
    114 {
    115 	/* Linear up to threshold. */
    116 	if (x < dk->linear_threshold)
    117 		return x;
    118 
    119 	return dk->linear_threshold +
    120 		(1 - knee_expf(-k * (x - dk->linear_threshold))) / k;
    121 }
    122 
    123 /* Approximate 1st derivative with input and output expressed in dB.  This slope
    124  * is equal to the inverse of the compression "ratio".  In other words, a
    125  * compression ratio of 20 would be a slope of 1/20.
    126  */
    127 static float slope_at(struct drc_kernel *dk, float x, float k)
    128 {
    129 	if (x < dk->linear_threshold)
    130 		return 1;
    131 
    132 	float x2 = x * 1.001;
    133 
    134 	float x_db = linear_to_decibels(x);
    135 	float x2Db = linear_to_decibels(x2);
    136 
    137 	float y_db = linear_to_decibels(knee_curve(dk, x, k));
    138 	float y2Db = linear_to_decibels(knee_curve(dk, x2, k));
    139 
    140 	float m = (y2Db - y_db) / (x2Db - x_db);
    141 
    142 	return m;
    143 }
    144 
    145 static float k_at_slope(struct drc_kernel *dk, float desired_slope)
    146 {
    147 	float x_db = dk->db_threshold + dk->db_knee;
    148 	float x = decibels_to_linear(x_db);
    149 
    150 	/* Approximate k given initial values. */
    151 	float minK = 0.1;
    152 	float maxK = 10000;
    153 	float k = 5;
    154 	int i;
    155 
    156 	for (i = 0; i < 15; ++i) {
    157 		/* A high value for k will more quickly asymptotically approach
    158 		 * a slope of 0. */
    159 		float slope = slope_at(dk, x, k);
    160 
    161 		if (slope < desired_slope) {
    162 			/* k is too high. */
    163 			maxK = k;
    164 		} else {
    165 			/* k is too low. */
    166 			minK = k;
    167 		}
    168 
    169 		/* Re-calculate based on geometric mean. */
    170 		k = sqrtf(minK * maxK);
    171 	}
    172 
    173 	return k;
    174 }
    175 
    176 static void update_static_curve_parameters(struct drc_kernel *dk,
    177 					   float db_threshold,
    178 					   float db_knee, float ratio)
    179 {
    180 	if (db_threshold != dk->db_threshold || db_knee != dk->db_knee ||
    181 	    ratio != dk->ratio) {
    182 		/* Threshold and knee. */
    183 		dk->db_threshold = db_threshold;
    184 		dk->linear_threshold = decibels_to_linear(db_threshold);
    185 		dk->db_knee = db_knee;
    186 
    187 		/* Compute knee parameters. */
    188 		dk->ratio = ratio;
    189 		dk->slope = 1 / dk->ratio;
    190 
    191 		float k = k_at_slope(dk, 1 / dk->ratio);
    192 		dk->K = k;
    193 		/* See knee_curveK() for details */
    194 		dk->knee_alpha = dk->linear_threshold + 1 / k;
    195 		dk->knee_beta = -expf(k * dk->linear_threshold) / k;
    196 
    197 		dk->knee_threshold = decibels_to_linear(db_threshold + db_knee);
    198 		/* See volume_gain() for details */
    199 		float y0 = knee_curve(dk, dk->knee_threshold, k);
    200 		dk->ratio_base = y0 * powf(dk->knee_threshold, -dk->slope);
    201 	}
    202 }
    203 
    204 /* This is the knee part of the compression curve. Returns the output level
    205  * given the input level x. */
    206 static float knee_curveK(struct drc_kernel *dk, float x)
    207 {
    208 	/* The formula in knee_curveK is dk->linear_threshold +
    209 	 * (1 - expf(-k * (x - dk->linear_threshold))) / k
    210 	 * which simplifies to (alpha + beta * expf(gamma))
    211 	 * where alpha = dk->linear_threshold + 1 / k
    212 	 *	 beta = -expf(k * dk->linear_threshold) / k
    213 	 *	 gamma = -k * x
    214 	 */
    215 	return dk->knee_alpha + dk->knee_beta * knee_expf(-dk->K * x);
    216 }
    217 
    218 /* Full compression curve with constant ratio after knee. Returns the ratio of
    219  * output and input signal. */
    220 static float volume_gain(struct drc_kernel *dk, float x)
    221 {
    222 	float y;
    223 
    224 	if (x < dk->knee_threshold) {
    225 		if (x < dk->linear_threshold)
    226 			return 1;
    227 		y = knee_curveK(dk, x) / x;
    228 	} else {
    229 		/* Constant ratio after knee.
    230 		 * log(y/y0) = s * log(x/x0)
    231 		 * => y = y0 * (x/x0)^s
    232 		 * => y = [y0 * (1/x0)^s] * x^s
    233 		 * => y = dk->ratio_base * x^s
    234 		 * => y/x = dk->ratio_base * x^(s - 1)
    235 		 * => y/x = dk->ratio_base * e^(log(x) * (s - 1))
    236 		 */
    237 		y = dk->ratio_base * knee_expf(logf(x) * (dk->slope - 1));
    238 	}
    239 
    240 	return y;
    241 }
    242 
    243 void dk_set_parameters(struct drc_kernel *dk,
    244 		       float db_threshold,
    245 		       float db_knee,
    246 		       float ratio,
    247 		       float attack_time,
    248 		       float release_time,
    249 		       float pre_delay_time,
    250 		       float db_post_gain,
    251 		       float releaseZone1,
    252 		       float releaseZone2,
    253 		       float releaseZone3,
    254 		       float releaseZone4)
    255 {
    256 	float sample_rate = dk->sample_rate;
    257 
    258 	update_static_curve_parameters(dk, db_threshold, db_knee, ratio);
    259 
    260 	/* Makeup gain. */
    261 	float full_range_gain = volume_gain(dk, 1);
    262 	float full_range_makeup_gain = 1 / full_range_gain;
    263 
    264 	/* Empirical/perceptual tuning. */
    265 	full_range_makeup_gain = powf(full_range_makeup_gain, 0.6f);
    266 
    267 	dk->master_linear_gain = decibels_to_linear(db_post_gain) *
    268 		full_range_makeup_gain;
    269 
    270 	/* Attack parameters. */
    271 	attack_time = max(0.001f, attack_time);
    272 	dk->attack_frames = attack_time * sample_rate;
    273 
    274 	/* Release parameters. */
    275 	float release_frames = sample_rate * release_time;
    276 
    277 	/* Detector release time. */
    278 	float sat_release_time = 0.0025f;
    279 	float sat_release_frames = sat_release_time * sample_rate;
    280 	dk->sat_release_frames_inv_neg = -1 / sat_release_frames;
    281 	dk->sat_release_rate_at_neg_two_db =
    282 		decibels_to_linear(-2 * dk->sat_release_frames_inv_neg) - 1;
    283 
    284 	/* Create a smooth function which passes through four points.
    285 	 * Polynomial of the form y = a + b*x + c*x^2 + d*x^3 + e*x^4
    286 	 */
    287 	float y1 = release_frames * releaseZone1;
    288 	float y2 = release_frames * releaseZone2;
    289 	float y3 = release_frames * releaseZone3;
    290 	float y4 = release_frames * releaseZone4;
    291 
    292 	/* All of these coefficients were derived for 4th order polynomial curve
    293 	 * fitting where the y values match the evenly spaced x values as
    294 	 * follows: (y1 : x == 0, y2 : x == 1, y3 : x == 2, y4 : x == 3)
    295 	 */
    296 	dk->kA = 0.9999999999999998f*y1 + 1.8432219684323923e-16f*y2
    297 		- 1.9373394351676423e-16f*y3 + 8.824516011816245e-18f*y4;
    298 	dk->kB = -1.5788320352845888f*y1 + 2.3305837032074286f*y2
    299 		- 0.9141194204840429f*y3 + 0.1623677525612032f*y4;
    300 	dk->kC = 0.5334142869106424f*y1 - 1.272736789213631f*y2
    301 		+ 0.9258856042207512f*y3 - 0.18656310191776226f*y4;
    302 	dk->kD = 0.08783463138207234f*y1 - 0.1694162967925622f*y2
    303 		+ 0.08588057951595272f*y3 - 0.00429891410546283f*y4;
    304 	dk->kE = -0.042416883008123074f*y1 + 0.1115693827987602f*y2
    305 		- 0.09764676325265872f*y3 + 0.028494263462021576f*y4;
    306 
    307 	/* x ranges from 0 -> 3	      0	   1	2   3
    308 	 *			     -15  -10  -5   0db
    309 	 *
    310 	 * y calculates adaptive release frames depending on the amount of
    311 	 * compression.
    312 	 */
    313 	set_pre_delay_time(dk, pre_delay_time);
    314 }
    315 
    316 void dk_set_enabled(struct drc_kernel *dk, int enabled)
    317 {
    318 	dk->enabled = enabled;
    319 }
    320 
    321 /* Updates the envelope_rate used for the next division */
    322 static void dk_update_envelope(struct drc_kernel *dk)
    323 {
    324 	const float kA = dk->kA;
    325 	const float kB = dk->kB;
    326 	const float kC = dk->kC;
    327 	const float kD = dk->kD;
    328 	const float kE = dk->kE;
    329 	const float attack_frames = dk->attack_frames;
    330 
    331 	/* Calculate desired gain */
    332 	float desired_gain = dk->detector_average;
    333 
    334 	/* Pre-warp so we get desired_gain after sin() warp below. */
    335 	float scaled_desired_gain = warp_asinf(desired_gain);
    336 
    337 	/* Deal with envelopes */
    338 
    339 	/* envelope_rate is the rate we slew from current compressor level to
    340 	 * the desired level.  The exact rate depends on if we're attacking or
    341 	 * releasing and by how much.
    342 	 */
    343 	float envelope_rate;
    344 
    345 	int is_releasing = scaled_desired_gain > dk->compressor_gain;
    346 
    347 	/* compression_diff_db is the difference between current compression
    348 	 * level and the desired level. */
    349 	float compression_diff_db = linear_to_decibels(
    350 		dk->compressor_gain / scaled_desired_gain);
    351 
    352 	if (is_releasing) {
    353 		/* Release mode - compression_diff_db should be negative dB */
    354 		dk->max_attack_compression_diff_db = -INFINITY;
    355 
    356 		/* Fix gremlins. */
    357 		if (isbadf(compression_diff_db))
    358 			compression_diff_db = -1;
    359 
    360 		/* Adaptive release - higher compression (lower
    361 		 * compression_diff_db) releases faster. Contain within range:
    362 		 * -12 -> 0 then scale to go from 0 -> 3
    363 		 */
    364 		float x = compression_diff_db;
    365 		x = max(-12.0f, x);
    366 		x = min(0.0f, x);
    367 		x = 0.25f * (x + 12);
    368 
    369 		/* Compute adaptive release curve using 4th order polynomial.
    370 		 * Normal values for the polynomial coefficients would create a
    371 		 * monotonically increasing function.
    372 		 */
    373 		float x2 = x * x;
    374 		float x3 = x2 * x;
    375 		float x4 = x2 * x2;
    376 		float release_frames = kA + kB * x + kC * x2 + kD * x3 +
    377 			kE * x4;
    378 
    379 #define kSpacingDb 5
    380 		float db_per_frame = kSpacingDb / release_frames;
    381 		envelope_rate = decibels_to_linear(db_per_frame);
    382 	} else {
    383 		/* Attack mode - compression_diff_db should be positive dB */
    384 
    385 		/* Fix gremlins. */
    386 		if (isbadf(compression_diff_db))
    387 			compression_diff_db = 1;
    388 
    389 		/* As long as we're still in attack mode, use a rate based off
    390 		 * the largest compression_diff_db we've encountered so far.
    391 		 */
    392 		dk->max_attack_compression_diff_db = max(
    393 			dk->max_attack_compression_diff_db,
    394 			compression_diff_db);
    395 
    396 		float eff_atten_diff_db =
    397 			max(0.5f, dk->max_attack_compression_diff_db);
    398 
    399 		float x = 0.25f / eff_atten_diff_db;
    400 		envelope_rate = 1 - powf(x, 1 / attack_frames);
    401 	}
    402 
    403 	dk->envelope_rate = envelope_rate;
    404 	dk->scaled_desired_gain = scaled_desired_gain;
    405 }
    406 
    407 /* For a division of frames, take the absolute values of left channel and right
    408  * channel, store the maximum of them in output. */
    409 #if defined(__aarch64__)
    410 static inline void max_abs_division(float *output,
    411 				    const float *data0, const float *data1)
    412 {
    413 	int count = DIVISION_FRAMES / 4;
    414 
    415 	__asm__ __volatile__(
    416 		"1:                                     \n"
    417 		"ld1 {v0.4s}, [%[data0]], #16           \n"
    418 		"ld1 {v1.4s}, [%[data1]], #16           \n"
    419 		"fabs v0.4s, v0.4s                      \n"
    420 		"fabs v1.4s, v1.4s                      \n"
    421 		"fmax v0.4s, v0.4s, v1.4s               \n"
    422 		"st1 {v0.4s}, [%[output]], #16          \n"
    423 		"subs %w[count], %w[count], #1          \n"
    424 		"b.ne 1b                                \n"
    425 		: /* output */
    426 		  [data0]"+r"(data0),
    427 		  [data1]"+r"(data1),
    428 		  [output]"+r"(output),
    429 		  [count]"+r"(count)
    430 		: /* input */
    431 		: /* clobber */
    432 		  "v0", "v1", "memory", "cc"
    433 		);
    434 }
    435 #elif defined(__ARM_NEON__)
    436 static inline void max_abs_division(float *output,
    437 				    const float *data0, const float *data1)
    438 {
    439 	int count = DIVISION_FRAMES / 4;
    440 
    441 	__asm__ __volatile__(
    442 		"1:                                     \n"
    443 		"vld1.32 {q0}, [%[data0]]!              \n"
    444 		"vld1.32 {q1}, [%[data1]]!              \n"
    445 		"vabs.f32 q0, q0                        \n"
    446 		"vabs.f32 q1, q1                        \n"
    447 		"vmax.f32 q0, q1                        \n"
    448 		"vst1.32 {q0}, [%[output]]!             \n"
    449 		"subs %[count], #1                      \n"
    450 		"bne 1b                                 \n"
    451 		: /* output */
    452 		  [data0]"+r"(data0),
    453 		  [data1]"+r"(data1),
    454 		  [output]"+r"(output),
    455 		  [count]"+r"(count)
    456 		: /* input */
    457 		: /* clobber */
    458 		  "q0", "q1", "memory", "cc"
    459 		);
    460 }
    461 #elif defined(__SSE3__)
    462 #include <emmintrin.h>
    463 static inline void max_abs_division(float *output,
    464 				    const float *data0, const float *data1)
    465 {
    466 	__m128 x, y;
    467 	int count = DIVISION_FRAMES / 4;
    468 
    469 	__asm__ __volatile__(
    470 		"1:                                     \n"
    471 		"lddqu (%[data0]), %[x]                 \n"
    472 		"lddqu (%[data1]), %[y]                 \n"
    473 		"andps %[mask], %[x]                    \n"
    474 		"andps %[mask], %[y]                    \n"
    475 		"maxps %[y], %[x]                       \n"
    476 		"movdqu %[x], (%[output])               \n"
    477 		"add $16, %[data0]                      \n"
    478 		"add $16, %[data1]                      \n"
    479 		"add $16, %[output]                     \n"
    480 		"sub $1, %[count]                       \n"
    481 		"jnz 1b                                 \n"
    482 		: /* output */
    483 		  [data0]"+r"(data0),
    484 		  [data1]"+r"(data1),
    485 		  [output]"+r"(output),
    486 		  [count]"+r"(count),
    487 		  [x]"=&x"(x),
    488 		  [y]"=&x"(y)
    489 		: /* input */
    490 		  [mask]"x"(_mm_set1_epi32(0x7fffffff))
    491 		: /* clobber */
    492 		  "memory", "cc"
    493 		);
    494 }
    495 #else
    496 static inline void max_abs_division(float *output,
    497 				    const float *data0, const float *data1)
    498 {
    499 	int i;
    500 	for (i = 0; i < DIVISION_FRAMES; i++)
    501 		output[i] = fmaxf(fabsf(data0[i]), fabsf(data1[i]));
    502 }
    503 #endif
    504 
    505 /* Update detector_average from the last input division. */
    506 static void dk_update_detector_average(struct drc_kernel *dk)
    507 {
    508 	float abs_input_array[DIVISION_FRAMES];
    509 	const float sat_release_frames_inv_neg = dk->sat_release_frames_inv_neg;
    510 	const float sat_release_rate_at_neg_two_db =
    511 		dk->sat_release_rate_at_neg_two_db;
    512 	float detector_average = dk->detector_average;
    513 	int div_start, i;
    514 
    515 	/* Calculate the start index of the last input division */
    516 	if (dk->pre_delay_write_index == 0) {
    517 		div_start = MAX_PRE_DELAY_FRAMES - DIVISION_FRAMES;
    518 	} else {
    519 		div_start = dk->pre_delay_write_index - DIVISION_FRAMES;
    520 	}
    521 
    522 	/* The max abs value across all channels for this frame */
    523 	max_abs_division(abs_input_array,
    524 			 &dk->pre_delay_buffers[0][div_start],
    525 			 &dk->pre_delay_buffers[1][div_start]);
    526 
    527 	for (i = 0; i < DIVISION_FRAMES; i++) {
    528 		/* Compute compression amount from un-delayed signal */
    529 		float abs_input = abs_input_array[i];
    530 
    531 		/* Calculate shaped power on undelayed input.  Put through
    532 		 * shaping curve. This is linear up to the threshold, then
    533 		 * enters a "knee" portion followed by the "ratio" portion. The
    534 		 * transition from the threshold to the knee is smooth (1st
    535 		 * derivative matched). The transition from the knee to the
    536 		 * ratio portion is smooth (1st derivative matched).
    537 		 */
    538 		float gain = volume_gain(dk, abs_input);
    539 		int is_release = (gain > detector_average);
    540 		if (is_release) {
    541 			if (gain > NEG_TWO_DB) {
    542 				detector_average += (gain - detector_average) *
    543 					sat_release_rate_at_neg_two_db;
    544 			} else {
    545 				float gain_db = linear_to_decibels(gain);
    546 				float db_per_frame = gain_db *
    547 					sat_release_frames_inv_neg;
    548 				float sat_release_rate =
    549 					decibels_to_linear(db_per_frame) - 1;
    550 				detector_average += (gain - detector_average) *
    551 					sat_release_rate;
    552 			}
    553 		} else {
    554 			detector_average = gain;
    555 		}
    556 
    557 		/* Fix gremlins. */
    558 		if (isbadf(detector_average))
    559 			detector_average = 1.0f;
    560 		else
    561 			detector_average = min(detector_average, 1.0f);
    562 	}
    563 
    564 	dk->detector_average = detector_average;
    565 }
    566 
    567 /* Calculate compress_gain from the envelope and apply total_gain to compress
    568  * the next output division. */
    569 /* TODO(fbarchard): Port to aarch64 */
    570 #if defined(__ARM_NEON__)
    571 #include <arm_neon.h>
    572 static void dk_compress_output(struct drc_kernel *dk)
    573 {
    574 	const float master_linear_gain = dk->master_linear_gain;
    575 	const float envelope_rate = dk->envelope_rate;
    576 	const float scaled_desired_gain = dk->scaled_desired_gain;
    577 	const float compressor_gain = dk->compressor_gain;
    578 	const int div_start = dk->pre_delay_read_index;
    579 	float *ptr_left = &dk->pre_delay_buffers[0][div_start];
    580 	float *ptr_right = &dk->pre_delay_buffers[1][div_start];
    581 	int count = DIVISION_FRAMES / 4;
    582 
    583 	/* See warp_sinf() for the details for the constants. */
    584 	const float32x4_t A7 = vdupq_n_f32(-4.3330336920917034149169921875e-3f);
    585 	const float32x4_t A5 = vdupq_n_f32(7.9434238374233245849609375e-2f);
    586 	const float32x4_t A3 = vdupq_n_f32(-0.645892798900604248046875f);
    587 	const float32x4_t A1 = vdupq_n_f32(1.5707910060882568359375f);
    588 
    589 	/* Exponential approach to desired gain. */
    590 	if (envelope_rate < 1) {
    591 		float c = compressor_gain - scaled_desired_gain;
    592 		float r = 1 - envelope_rate;
    593 		float32x4_t x0 = {c*r, c*r*r, c*r*r*r, c*r*r*r*r};
    594 		float32x4_t x, x2, x4, left, right, tmp1, tmp2;
    595 
    596 		__asm__ __volatile(
    597 			"b 2f                                               \n"
    598 			"1:                                                 \n"
    599 			"vmul.f32 %q[x0], %q[r4]                            \n"
    600 			"2:                                                 \n"
    601 			"vld1.32 {%e[left],%f[left]}, [%[ptr_left]]         \n"
    602 			"vld1.32 {%e[right],%f[right]}, [%[ptr_right]]      \n"
    603 			"vadd.f32 %q[x], %q[x0], %q[base]                   \n"
    604 			/* Calculate warp_sin() for four values in x. */
    605 			"vmul.f32 %q[x2], %q[x], %q[x]                      \n"
    606 			"vmov.f32 %q[tmp1], %q[A5]                          \n"
    607 			"vmov.f32 %q[tmp2], %q[A1]                          \n"
    608 			"vmul.f32 %q[x4], %q[x2], %q[x2]                    \n"
    609 			"vmla.f32 %q[tmp1], %q[A7], %q[x2]                  \n"
    610 			"vmla.f32 %q[tmp2], %q[A3], %q[x2]                  \n"
    611 			"vmla.f32 %q[tmp2], %q[tmp1], %q[x4]                \n"
    612 			"vmul.f32 %q[tmp2], %q[tmp2], %q[x]                 \n"
    613 			/* Now tmp2 contains the result of warp_sin(). */
    614 			"vmul.f32 %q[tmp2], %q[tmp2], %q[g]                 \n"
    615 			"vmul.f32 %q[left], %q[tmp2]                        \n"
    616 			"vmul.f32 %q[right], %q[tmp2]                       \n"
    617 			"vst1.32 {%e[left],%f[left]}, [%[ptr_left]]!        \n"
    618 			"vst1.32 {%e[right],%f[right]}, [%[ptr_right]]!     \n"
    619 			"subs %[count], #1                                  \n"
    620 			"bne 1b                                             \n"
    621 			: /* output */
    622 			  "=r"(count),
    623 			  "=r"(ptr_left),
    624 			  "=r"(ptr_right),
    625 			  "=w"(x0),
    626 			  [x]"=&w"(x),
    627 			  [x2]"=&w"(x2),
    628 			  [x4]"=&w"(x4),
    629 			  [left]"=&w"(left),
    630 			  [right]"=&w"(right),
    631 			  [tmp1]"=&w"(tmp1),
    632 			  [tmp2]"=&w"(tmp2)
    633 			: /* input */
    634 			  [count]"0"(count),
    635 			  [ptr_left]"1"(ptr_left),
    636 			  [ptr_right]"2"(ptr_right),
    637 			  [x0]"3"(x0),
    638 			  [A1]"w"(A1),
    639 			  [A3]"w"(A3),
    640 			  [A5]"w"(A5),
    641 			  [A7]"w"(A7),
    642 			  [base]"w"(vdupq_n_f32(scaled_desired_gain)),
    643 			  [r4]"w"(vdupq_n_f32(r*r*r*r)),
    644 			  [g]"w"(vdupq_n_f32(master_linear_gain))
    645 			: /* clobber */
    646 			  "memory", "cc"
    647 			);
    648 		dk->compressor_gain = x[3];
    649 	} else {
    650 		float c = compressor_gain;
    651 		float r = envelope_rate;
    652 		float32x4_t x = {c*r, c*r*r, c*r*r*r, c*r*r*r*r};
    653 		float32x4_t x2, x4, left, right, tmp1, tmp2;
    654 
    655 		__asm__ __volatile(
    656 			"b 2f                                               \n"
    657 			"1:                                                 \n"
    658 			"vmul.f32 %q[x], %q[r4]                             \n"
    659 			"2:                                                 \n"
    660 			"vld1.32 {%e[left],%f[left]}, [%[ptr_left]]         \n"
    661 			"vld1.32 {%e[right],%f[right]}, [%[ptr_right]]      \n"
    662 			"vmin.f32 %q[x], %q[one]                            \n"
    663 			/* Calculate warp_sin() for four values in x. */
    664 			"vmul.f32 %q[x2], %q[x], %q[x]                      \n"
    665 			"vmov.f32 %q[tmp1], %q[A5]                          \n"
    666 			"vmov.f32 %q[tmp2], %q[A1]                          \n"
    667 			"vmul.f32 %q[x4], %q[x2], %q[x2]                    \n"
    668 			"vmla.f32 %q[tmp1], %q[A7], %q[x2]                  \n"
    669 			"vmla.f32 %q[tmp2], %q[A3], %q[x2]                  \n"
    670 			"vmla.f32 %q[tmp2], %q[tmp1], %q[x4]                \n"
    671 			"vmul.f32 %q[tmp2], %q[tmp2], %q[x]                 \n"
    672 			/* Now tmp2 contains the result of warp_sin(). */
    673 			"vmul.f32 %q[tmp2], %q[tmp2], %q[g]                 \n"
    674 			"vmul.f32 %q[left], %q[tmp2]                        \n"
    675 			"vmul.f32 %q[right], %q[tmp2]                       \n"
    676 			"vst1.32 {%e[left],%f[left]}, [%[ptr_left]]!        \n"
    677 			"vst1.32 {%e[right],%f[right]}, [%[ptr_right]]!     \n"
    678 			"subs %[count], #1                                  \n"
    679 			"bne 1b                                             \n"
    680 			: /* output */
    681 			  "=r"(count),
    682 			  "=r"(ptr_left),
    683 			  "=r"(ptr_right),
    684 			  "=w"(x),
    685 			  [x2]"=&w"(x2),
    686 			  [x4]"=&w"(x4),
    687 			  [left]"=&w"(left),
    688 			  [right]"=&w"(right),
    689 			  [tmp1]"=&w"(tmp1),
    690 			  [tmp2]"=&w"(tmp2)
    691 			: /* input */
    692 			  [count]"0"(count),
    693 			  [ptr_left]"1"(ptr_left),
    694 			  [ptr_right]"2"(ptr_right),
    695 			  [x]"3"(x),
    696 			  [A1]"w"(A1),
    697 			  [A3]"w"(A3),
    698 			  [A5]"w"(A5),
    699 			  [A7]"w"(A7),
    700 			  [one]"w"(vdupq_n_f32(1)),
    701 			  [r4]"w"(vdupq_n_f32(r*r*r*r)),
    702 			  [g]"w"(vdupq_n_f32(master_linear_gain))
    703 			: /* clobber */
    704 			  "memory", "cc"
    705 			);
    706 		dk->compressor_gain = x[3];
    707 	}
    708 }
    709 #elif defined(__SSE3__) && defined(__x86_64__)
    710 #include <emmintrin.h>
    711 static void dk_compress_output(struct drc_kernel *dk)
    712 {
    713 	const float master_linear_gain = dk->master_linear_gain;
    714 	const float envelope_rate = dk->envelope_rate;
    715 	const float scaled_desired_gain = dk->scaled_desired_gain;
    716 	const float compressor_gain = dk->compressor_gain;
    717 	const int div_start = dk->pre_delay_read_index;
    718 	float *ptr_left = &dk->pre_delay_buffers[0][div_start];
    719 	float *ptr_right = &dk->pre_delay_buffers[1][div_start];
    720 	int count = DIVISION_FRAMES / 4;
    721 
    722 	/* See warp_sinf() for the details for the constants. */
    723 	const __m128 A7 = _mm_set1_ps(-4.3330336920917034149169921875e-3f);
    724 	const __m128 A5 = _mm_set1_ps(7.9434238374233245849609375e-2f);
    725 	const __m128 A3 = _mm_set1_ps(-0.645892798900604248046875f);
    726 	const __m128 A1 = _mm_set1_ps(1.5707910060882568359375f);
    727 
    728 	/* Exponential approach to desired gain. */
    729 	if (envelope_rate < 1) {
    730 		float c = compressor_gain - scaled_desired_gain;
    731 		float r = 1 - envelope_rate;
    732 		__m128 x0 = {c*r, c*r*r, c*r*r*r, c*r*r*r*r};
    733 		__m128 x, x2, x4, left, right, tmp1, tmp2;
    734 
    735 		__asm__ __volatile(
    736 			"jmp 2f                                     \n"
    737 			"1:                                         \n"
    738 			"mulps %[r4], %[x0]                         \n"
    739 			"2:                                         \n"
    740 			"lddqu (%[ptr_left]), %[left]               \n"
    741 			"lddqu (%[ptr_right]), %[right]             \n"
    742 			"movaps %[x0], %[x]                         \n"
    743 			"addps %[base], %[x]                        \n"
    744 			/* Calculate warp_sin() for four values in x. */
    745 			"movaps %[x], %[x2]                         \n"
    746 			"mulps %[x], %[x2]                          \n"
    747 			"movaps %[x2], %[x4]                        \n"
    748 			"movaps %[x2], %[tmp1]                      \n"
    749 			"movaps %[x2], %[tmp2]                      \n"
    750 			"mulps %[x2], %[x4]                         \n"
    751 			"mulps %[A7], %[tmp1]                       \n"
    752 			"mulps %[A3], %[tmp2]                       \n"
    753 			"addps %[A5], %[tmp1]                       \n"
    754 			"addps %[A1], %[tmp2]                       \n"
    755 			"mulps %[x4], %[tmp1]                       \n"
    756 			"addps %[tmp1], %[tmp2]                     \n"
    757 			"mulps %[x], %[tmp2]                        \n"
    758 			/* Now tmp2 contains the result of warp_sin(). */
    759 			"mulps %[g], %[tmp2]                        \n"
    760 			"mulps %[tmp2], %[left]                     \n"
    761 			"mulps %[tmp2], %[right]                    \n"
    762 			"movdqu %[left], (%[ptr_left])              \n"
    763 			"movdqu %[right], (%[ptr_right])            \n"
    764 			"add $16, %[ptr_left]                       \n"
    765 			"add $16, %[ptr_right]                      \n"
    766 			"sub $1, %[count]                           \n"
    767 			"jne 1b                                     \n"
    768 			: /* output */
    769 			  "=r"(count),
    770 			  "=r"(ptr_left),
    771 			  "=r"(ptr_right),
    772 			  "=x"(x0),
    773 			  [x]"=&x"(x),
    774 			  [x2]"=&x"(x2),
    775 			  [x4]"=&x"(x4),
    776 			  [left]"=&x"(left),
    777 			  [right]"=&x"(right),
    778 			  [tmp1]"=&x"(tmp1),
    779 			  [tmp2]"=&x"(tmp2)
    780 			: /* input */
    781 			  [count]"0"(count),
    782 			  [ptr_left]"1"(ptr_left),
    783 			  [ptr_right]"2"(ptr_right),
    784 			  [x0]"3"(x0),
    785 			  [A1]"x"(A1),
    786 			  [A3]"x"(A3),
    787 			  [A5]"x"(A5),
    788 			  [A7]"x"(A7),
    789 			  [base]"x"(_mm_set1_ps(scaled_desired_gain)),
    790 			  [r4]"x"(_mm_set1_ps(r*r*r*r)),
    791 			  [g]"x"(_mm_set1_ps(master_linear_gain))
    792 			: /* clobber */
    793 			  "memory", "cc"
    794 			);
    795 		dk->compressor_gain = x[3];
    796 	} else {
    797 		/* See warp_sinf() for the details for the constants. */
    798 		__m128 A7 = _mm_set1_ps(-4.3330336920917034149169921875e-3f);
    799 		__m128 A5 = _mm_set1_ps(7.9434238374233245849609375e-2f);
    800 		__m128 A3 = _mm_set1_ps(-0.645892798900604248046875f);
    801 		__m128 A1 = _mm_set1_ps(1.5707910060882568359375f);
    802 
    803 		float c = compressor_gain;
    804 		float r = envelope_rate;
    805 		__m128 x = {c*r, c*r*r, c*r*r*r, c*r*r*r*r};
    806 		__m128 x2, x4, left, right, tmp1, tmp2;
    807 
    808 		__asm__ __volatile(
    809 			"jmp 2f                                     \n"
    810 			"1:                                         \n"
    811 			"mulps %[r4], %[x]                          \n"
    812 			"2:                                         \n"
    813 			"lddqu (%[ptr_left]), %[left]               \n"
    814 			"lddqu (%[ptr_right]), %[right]             \n"
    815 			"minps %[one], %[x]                         \n"
    816 			/* Calculate warp_sin() for four values in x. */
    817 			"movaps %[x], %[x2]                         \n"
    818 			"mulps %[x], %[x2]                          \n"
    819 			"movaps %[x2], %[x4]                        \n"
    820 			"movaps %[x2], %[tmp1]                      \n"
    821 			"movaps %[x2], %[tmp2]                      \n"
    822 			"mulps %[x2], %[x4]                         \n"
    823 			"mulps %[A7], %[tmp1]                       \n"
    824 			"mulps %[A3], %[tmp2]                       \n"
    825 			"addps %[A5], %[tmp1]                       \n"
    826 			"addps %[A1], %[tmp2]                       \n"
    827 			"mulps %[x4], %[tmp1]                       \n"
    828 			"addps %[tmp1], %[tmp2]                     \n"
    829 			"mulps %[x], %[tmp2]                        \n"
    830 			/* Now tmp2 contains the result of warp_sin(). */
    831 			"mulps %[g], %[tmp2]                        \n"
    832 			"mulps %[tmp2], %[left]                     \n"
    833 			"mulps %[tmp2], %[right]                    \n"
    834 			"movdqu %[left], (%[ptr_left])              \n"
    835 			"movdqu %[right], (%[ptr_right])            \n"
    836 			"add $16, %[ptr_left]                       \n"
    837 			"add $16, %[ptr_right]                      \n"
    838 			"sub $1, %[count]                           \n"
    839 			"jne 1b                                     \n"
    840 			: /* output */
    841 			  "=r"(count),
    842 			  "=r"(ptr_left),
    843 			  "=r"(ptr_right),
    844 			  "=x"(x),
    845 			  [x2]"=&x"(x2),
    846 			  [x4]"=&x"(x4),
    847 			  [left]"=&x"(left),
    848 			  [right]"=&x"(right),
    849 			  [tmp1]"=&x"(tmp1),
    850 			  [tmp2]"=&x"(tmp2)
    851 			: /* input */
    852 			  [count]"0"(count),
    853 			  [ptr_left]"1"(ptr_left),
    854 			  [ptr_right]"2"(ptr_right),
    855 			  [x]"3"(x),
    856 			  [A1]"x"(A1),
    857 			  [A3]"x"(A3),
    858 			  [A5]"x"(A5),
    859 			  [A7]"x"(A7),
    860 			  [one]"x"(_mm_set1_ps(1)),
    861 			  [r4]"x"(_mm_set1_ps(r*r*r*r)),
    862 			  [g]"x"(_mm_set1_ps(master_linear_gain))
    863 			: /* clobber */
    864 			  "memory", "cc"
    865 			);
    866 		dk->compressor_gain = x[3];
    867 	}
    868 }
    869 #else
    870 static void dk_compress_output(struct drc_kernel *dk)
    871 {
    872 	const float master_linear_gain = dk->master_linear_gain;
    873 	const float envelope_rate = dk->envelope_rate;
    874 	const float scaled_desired_gain = dk->scaled_desired_gain;
    875 	const float compressor_gain = dk->compressor_gain;
    876 	const int div_start = dk->pre_delay_read_index;
    877 	float *ptr_left = &dk->pre_delay_buffers[0][div_start];
    878 	float *ptr_right = &dk->pre_delay_buffers[1][div_start];
    879 	int count = DIVISION_FRAMES / 4;
    880 
    881 	int i, j;
    882 
    883 	/* Exponential approach to desired gain. */
    884 	if (envelope_rate < 1) {
    885 		/* Attack - reduce gain to desired. */
    886 		float c = compressor_gain - scaled_desired_gain;
    887 		float base = scaled_desired_gain;
    888 		float r = 1 - envelope_rate;
    889 		float x[4] = {c*r, c*r*r, c*r*r*r, c*r*r*r*r};
    890 		float r4 = r*r*r*r;
    891 
    892 		i = 0;
    893 		while (1) {
    894 			for (j = 0; j < 4; j++) {
    895 				/* Warp pre-compression gain to smooth out sharp
    896 				 * exponential transition points.
    897 				 */
    898 				float post_warp_compressor_gain =
    899 					warp_sinf(x[j] + base);
    900 
    901 				/* Calculate total gain using master gain. */
    902 				float total_gain = master_linear_gain *
    903 					post_warp_compressor_gain;
    904 
    905 				/* Apply final gain. */
    906 				*ptr_left++ *= total_gain;
    907 				*ptr_right++ *= total_gain;
    908 			}
    909 
    910 			if (++i == count)
    911 				break;
    912 
    913 			for (j = 0; j < 4; j++)
    914 				x[j] = x[j] * r4;
    915 		}
    916 
    917 		dk->compressor_gain = x[3] + base;
    918 	} else {
    919 		/* Release - exponentially increase gain to 1.0 */
    920 		float c = compressor_gain;
    921 		float r = envelope_rate;
    922 		float x[4] = {c*r, c*r*r, c*r*r*r, c*r*r*r*r};
    923 		float r4 = r*r*r*r;
    924 
    925 		i = 0;
    926 		while (1) {
    927 			for (j = 0; j < 4; j++) {
    928 				/* Warp pre-compression gain to smooth out sharp
    929 				 * exponential transition points.
    930 				 */
    931 				float post_warp_compressor_gain =
    932 					warp_sinf(x[j]);
    933 
    934 				/* Calculate total gain using master gain. */
    935 				float total_gain = master_linear_gain *
    936 					post_warp_compressor_gain;
    937 
    938 				/* Apply final gain. */
    939 				*ptr_left++ *= total_gain;
    940 				*ptr_right++ *= total_gain;
    941 			}
    942 
    943 			if (++i == count)
    944 				break;
    945 
    946 			for (j = 0; j < 4; j++)
    947 				x[j] = min(1.0f, x[j] * r4);
    948 		}
    949 
    950 		dk->compressor_gain = x[3];
    951 	}
    952 }
    953 #endif
    954 
    955 /* After one complete divison of samples have been received (and one divison of
    956  * samples have been output), we calculate shaped power average
    957  * (detector_average) from the input division, update envelope parameters from
    958  * detector_average, then prepare the next output division by applying the
    959  * envelope to compress the samples.
    960  */
    961 static void dk_process_one_division(struct drc_kernel *dk)
    962 {
    963 	dk_update_detector_average(dk);
    964 	dk_update_envelope(dk);
    965 	dk_compress_output(dk);
    966 }
    967 
    968 /* Copy the input data to the pre-delay buffer, and copy the output data back to
    969  * the input buffer */
    970 static void dk_copy_fragment(struct drc_kernel *dk, float *data_channels[],
    971 			     unsigned frame_index, int frames_to_process)
    972 {
    973 	int write_index = dk->pre_delay_write_index;
    974 	int read_index = dk->pre_delay_read_index;
    975 	int j;
    976 
    977 	for (j = 0; j < DRC_NUM_CHANNELS; ++j) {
    978 		memcpy(&dk->pre_delay_buffers[j][write_index],
    979 		       &data_channels[j][frame_index],
    980 		       frames_to_process * sizeof(float));
    981 		memcpy(&data_channels[j][frame_index],
    982 		       &dk->pre_delay_buffers[j][read_index],
    983 		       frames_to_process * sizeof(float));
    984 	}
    985 
    986 	dk->pre_delay_write_index = (write_index + frames_to_process) &
    987 		MAX_PRE_DELAY_FRAMES_MASK;
    988 	dk->pre_delay_read_index = (read_index + frames_to_process) &
    989 		MAX_PRE_DELAY_FRAMES_MASK;
    990 }
    991 
    992 /* Delay the input sample only and don't do other processing. This is used when
    993  * the kernel is disabled. We want to do this to match the processing delay in
    994  * kernels of other bands.
    995  */
    996 static void dk_process_delay_only(struct drc_kernel *dk, float *data_channels[],
    997 				  unsigned count)
    998 {
    999 	int read_index = dk->pre_delay_read_index;
   1000 	int write_index = dk->pre_delay_write_index;
   1001 	int i = 0;
   1002 
   1003 	while (i < count) {
   1004 		int j;
   1005 		int small = min(read_index, write_index);
   1006 		int large = max(read_index, write_index);
   1007 		/* chunk is the minimum of readable samples in contiguous
   1008 		 * buffer, writable samples in contiguous buffer, and the
   1009 		 * available input samples. */
   1010 		int chunk = min(large - small, MAX_PRE_DELAY_FRAMES - large);
   1011 		chunk = min(chunk, count - i);
   1012 		for (j = 0; j < DRC_NUM_CHANNELS; ++j) {
   1013 			memcpy(&dk->pre_delay_buffers[j][write_index],
   1014 			       &data_channels[j][i],
   1015 			       chunk * sizeof(float));
   1016 			memcpy(&data_channels[j][i],
   1017 			       &dk->pre_delay_buffers[j][read_index],
   1018 			       chunk * sizeof(float));
   1019 		}
   1020 		read_index = (read_index + chunk) & MAX_PRE_DELAY_FRAMES_MASK;
   1021 		write_index = (write_index + chunk) & MAX_PRE_DELAY_FRAMES_MASK;
   1022 		i += chunk;
   1023 	}
   1024 
   1025 	dk->pre_delay_read_index = read_index;
   1026 	dk->pre_delay_write_index = write_index;
   1027 }
   1028 
   1029 void dk_process(struct drc_kernel *dk, float *data_channels[], unsigned count)
   1030 {
   1031 	int i = 0;
   1032 	int fragment;
   1033 
   1034 	if (!dk->enabled) {
   1035 		dk_process_delay_only(dk, data_channels, count);
   1036 		return;
   1037 	}
   1038 
   1039 	if (!dk->processed) {
   1040 		dk_update_envelope(dk);
   1041 		dk_compress_output(dk);
   1042 		dk->processed = 1;
   1043 	}
   1044 
   1045 	int offset = dk->pre_delay_write_index & DIVISION_FRAMES_MASK;
   1046 	while (i < count) {
   1047 		fragment = min(DIVISION_FRAMES - offset, count - i);
   1048 		dk_copy_fragment(dk, data_channels, i, fragment);
   1049 		i += fragment;
   1050 		offset = (offset + fragment) & DIVISION_FRAMES_MASK;
   1051 
   1052 		/* Process the input division (32 frames). */
   1053 		if (offset == 0)
   1054 			dk_process_one_division(dk);
   1055 	}
   1056 }
   1057