1 /* 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 /* 12 * The core AEC algorithm, neon version of speed-critical functions. 13 * 14 * Based on aec_core_sse2.c. 15 */ 16 17 #include <arm_neon.h> 18 #include <math.h> 19 #include <string.h> // memset 20 21 #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 22 #include "webrtc/modules/audio_processing/aec/aec_common.h" 23 #include "webrtc/modules/audio_processing/aec/aec_core_internal.h" 24 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" 25 26 enum { kShiftExponentIntoTopMantissa = 8 }; 27 enum { kFloatExponentShift = 23 }; 28 29 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { 30 return aRe * bRe - aIm * bIm; 31 } 32 33 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { 34 return aRe * bIm + aIm * bRe; 35 } 36 37 static void FilterFarNEON(AecCore* aec, float yf[2][PART_LEN1]) { 38 int i; 39 const int num_partitions = aec->num_partitions; 40 for (i = 0; i < num_partitions; i++) { 41 int j; 42 int xPos = (i + aec->xfBufBlockPos) * PART_LEN1; 43 int pos = i * PART_LEN1; 44 // Check for wrap 45 if (i + aec->xfBufBlockPos >= num_partitions) { 46 xPos -= num_partitions * PART_LEN1; 47 } 48 49 // vectorized code (four at once) 50 for (j = 0; j + 3 < PART_LEN1; j += 4) { 51 const float32x4_t xfBuf_re = vld1q_f32(&aec->xfBuf[0][xPos + j]); 52 const float32x4_t xfBuf_im = vld1q_f32(&aec->xfBuf[1][xPos + j]); 53 const float32x4_t wfBuf_re = vld1q_f32(&aec->wfBuf[0][pos + j]); 54 const float32x4_t wfBuf_im = vld1q_f32(&aec->wfBuf[1][pos + j]); 55 const float32x4_t yf_re = vld1q_f32(&yf[0][j]); 56 const float32x4_t yf_im = vld1q_f32(&yf[1][j]); 57 const float32x4_t a = vmulq_f32(xfBuf_re, wfBuf_re); 58 const float32x4_t e = vmlsq_f32(a, xfBuf_im, wfBuf_im); 59 const float32x4_t c = vmulq_f32(xfBuf_re, wfBuf_im); 60 const float32x4_t f = vmlaq_f32(c, xfBuf_im, wfBuf_re); 61 const float32x4_t g = vaddq_f32(yf_re, e); 62 const float32x4_t h = vaddq_f32(yf_im, f); 63 vst1q_f32(&yf[0][j], g); 64 vst1q_f32(&yf[1][j], h); 65 } 66 // scalar code for the remaining items. 67 for (; j < PART_LEN1; j++) { 68 yf[0][j] += MulRe(aec->xfBuf[0][xPos + j], 69 aec->xfBuf[1][xPos + j], 70 aec->wfBuf[0][pos + j], 71 aec->wfBuf[1][pos + j]); 72 yf[1][j] += MulIm(aec->xfBuf[0][xPos + j], 73 aec->xfBuf[1][xPos + j], 74 aec->wfBuf[0][pos + j], 75 aec->wfBuf[1][pos + j]); 76 } 77 } 78 } 79 80 static float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) { 81 int i; 82 float32x4_t x = vrecpeq_f32(b); 83 // from arm documentation 84 // The Newton-Raphson iteration: 85 // x[n+1] = x[n] * (2 - d * x[n]) 86 // converges to (1/d) if x0 is the result of VRECPE applied to d. 87 // 88 // Note: The precision did not improve after 2 iterations. 89 for (i = 0; i < 2; i++) { 90 x = vmulq_f32(vrecpsq_f32(b, x), x); 91 } 92 // a/b = a*(1/b) 93 return vmulq_f32(a, x); 94 } 95 96 static float32x4_t vsqrtq_f32(float32x4_t s) { 97 int i; 98 float32x4_t x = vrsqrteq_f32(s); 99 100 // Code to handle sqrt(0). 101 // If the input to sqrtf() is zero, a zero will be returned. 102 // If the input to vrsqrteq_f32() is zero, positive infinity is returned. 103 const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000); 104 // check for divide by zero 105 const uint32x4_t div_by_zero = vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(x)); 106 // zero out the positive infinity results 107 x = vreinterpretq_f32_u32(vandq_u32(vmvnq_u32(div_by_zero), 108 vreinterpretq_u32_f32(x))); 109 // from arm documentation 110 // The Newton-Raphson iteration: 111 // x[n+1] = x[n] * (3 - d * (x[n] * x[n])) / 2) 112 // converges to (1/d) if x0 is the result of VRSQRTE applied to d. 113 // 114 // Note: The precision did not improve after 2 iterations. 115 for (i = 0; i < 2; i++) { 116 x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x), s), x); 117 } 118 // sqrt(s) = s * 1/sqrt(s) 119 return vmulq_f32(s, x);; 120 } 121 122 static void ScaleErrorSignalNEON(AecCore* aec, float ef[2][PART_LEN1]) { 123 const float mu = aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu; 124 const float error_threshold = aec->extended_filter_enabled ? 125 kExtendedErrorThreshold : aec->normal_error_threshold; 126 const float32x4_t k1e_10f = vdupq_n_f32(1e-10f); 127 const float32x4_t kMu = vmovq_n_f32(mu); 128 const float32x4_t kThresh = vmovq_n_f32(error_threshold); 129 int i; 130 // vectorized code (four at once) 131 for (i = 0; i + 3 < PART_LEN1; i += 4) { 132 const float32x4_t xPow = vld1q_f32(&aec->xPow[i]); 133 const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]); 134 const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]); 135 const float32x4_t xPowPlus = vaddq_f32(xPow, k1e_10f); 136 float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus); 137 float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus); 138 const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re); 139 const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im); 140 const float32x4_t absEf = vsqrtq_f32(ef_sum2); 141 const uint32x4_t bigger = vcgtq_f32(absEf, kThresh); 142 const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f); 143 const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus); 144 uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv)); 145 uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv)); 146 uint32x4_t ef_re_u32 = vandq_u32(vmvnq_u32(bigger), 147 vreinterpretq_u32_f32(ef_re)); 148 uint32x4_t ef_im_u32 = vandq_u32(vmvnq_u32(bigger), 149 vreinterpretq_u32_f32(ef_im)); 150 ef_re_if = vandq_u32(bigger, ef_re_if); 151 ef_im_if = vandq_u32(bigger, ef_im_if); 152 ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if); 153 ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if); 154 ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu); 155 ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu); 156 vst1q_f32(&ef[0][i], ef_re); 157 vst1q_f32(&ef[1][i], ef_im); 158 } 159 // scalar code for the remaining items. 160 for (; i < PART_LEN1; i++) { 161 float abs_ef; 162 ef[0][i] /= (aec->xPow[i] + 1e-10f); 163 ef[1][i] /= (aec->xPow[i] + 1e-10f); 164 abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]); 165 166 if (abs_ef > error_threshold) { 167 abs_ef = error_threshold / (abs_ef + 1e-10f); 168 ef[0][i] *= abs_ef; 169 ef[1][i] *= abs_ef; 170 } 171 172 // Stepsize factor 173 ef[0][i] *= mu; 174 ef[1][i] *= mu; 175 } 176 } 177 178 static void FilterAdaptationNEON(AecCore* aec, 179 float* fft, 180 float ef[2][PART_LEN1]) { 181 int i; 182 const int num_partitions = aec->num_partitions; 183 for (i = 0; i < num_partitions; i++) { 184 int xPos = (i + aec->xfBufBlockPos) * PART_LEN1; 185 int pos = i * PART_LEN1; 186 int j; 187 // Check for wrap 188 if (i + aec->xfBufBlockPos >= num_partitions) { 189 xPos -= num_partitions * PART_LEN1; 190 } 191 192 // Process the whole array... 193 for (j = 0; j < PART_LEN; j += 4) { 194 // Load xfBuf and ef. 195 const float32x4_t xfBuf_re = vld1q_f32(&aec->xfBuf[0][xPos + j]); 196 const float32x4_t xfBuf_im = vld1q_f32(&aec->xfBuf[1][xPos + j]); 197 const float32x4_t ef_re = vld1q_f32(&ef[0][j]); 198 const float32x4_t ef_im = vld1q_f32(&ef[1][j]); 199 // Calculate the product of conjugate(xfBuf) by ef. 200 // re(conjugate(a) * b) = aRe * bRe + aIm * bIm 201 // im(conjugate(a) * b)= aRe * bIm - aIm * bRe 202 const float32x4_t a = vmulq_f32(xfBuf_re, ef_re); 203 const float32x4_t e = vmlaq_f32(a, xfBuf_im, ef_im); 204 const float32x4_t c = vmulq_f32(xfBuf_re, ef_im); 205 const float32x4_t f = vmlsq_f32(c, xfBuf_im, ef_re); 206 // Interleave real and imaginary parts. 207 const float32x4x2_t g_n_h = vzipq_f32(e, f); 208 // Store 209 vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]); 210 vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]); 211 } 212 // ... and fixup the first imaginary entry. 213 fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN], 214 -aec->xfBuf[1][xPos + PART_LEN], 215 ef[0][PART_LEN], 216 ef[1][PART_LEN]); 217 218 aec_rdft_inverse_128(fft); 219 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); 220 221 // fft scaling 222 { 223 const float scale = 2.0f / PART_LEN2; 224 const float32x4_t scale_ps = vmovq_n_f32(scale); 225 for (j = 0; j < PART_LEN; j += 4) { 226 const float32x4_t fft_ps = vld1q_f32(&fft[j]); 227 const float32x4_t fft_scale = vmulq_f32(fft_ps, scale_ps); 228 vst1q_f32(&fft[j], fft_scale); 229 } 230 } 231 aec_rdft_forward_128(fft); 232 233 { 234 const float wt1 = aec->wfBuf[1][pos]; 235 aec->wfBuf[0][pos + PART_LEN] += fft[1]; 236 for (j = 0; j < PART_LEN; j += 4) { 237 float32x4_t wtBuf_re = vld1q_f32(&aec->wfBuf[0][pos + j]); 238 float32x4_t wtBuf_im = vld1q_f32(&aec->wfBuf[1][pos + j]); 239 const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]); 240 const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]); 241 const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4); 242 wtBuf_re = vaddq_f32(wtBuf_re, fft_re_im.val[0]); 243 wtBuf_im = vaddq_f32(wtBuf_im, fft_re_im.val[1]); 244 245 vst1q_f32(&aec->wfBuf[0][pos + j], wtBuf_re); 246 vst1q_f32(&aec->wfBuf[1][pos + j], wtBuf_im); 247 } 248 aec->wfBuf[1][pos] = wt1; 249 } 250 } 251 } 252 253 static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) { 254 // a^b = exp2(b * log2(a)) 255 // exp2(x) and log2(x) are calculated using polynomial approximations. 256 float32x4_t log2_a, b_log2_a, a_exp_b; 257 258 // Calculate log2(x), x = a. 259 { 260 // To calculate log2(x), we decompose x like this: 261 // x = y * 2^n 262 // n is an integer 263 // y is in the [1.0, 2.0) range 264 // 265 // log2(x) = log2(y) + n 266 // n can be evaluated by playing with float representation. 267 // log2(y) in a small range can be approximated, this code uses an order 268 // five polynomial approximation. The coefficients have been 269 // estimated with the Remez algorithm and the resulting 270 // polynomial has a maximum relative error of 0.00086%. 271 272 // Compute n. 273 // This is done by masking the exponent, shifting it into the top bit of 274 // the mantissa, putting eight into the biased exponent (to shift/ 275 // compensate the fact that the exponent has been shifted in the top/ 276 // fractional part and finally getting rid of the implicit leading one 277 // from the mantissa by substracting it out. 278 const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000); 279 const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000); 280 const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000); 281 const uint32x4_t two_n = vandq_u32(vreinterpretq_u32_f32(a), 282 vec_float_exponent_mask); 283 const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa); 284 const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent); 285 const float32x4_t n = 286 vsubq_f32(vreinterpretq_f32_u32(n_0), 287 vreinterpretq_f32_u32(vec_implicit_leading_one)); 288 // Compute y. 289 const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF); 290 const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000); 291 const uint32x4_t mantissa = vandq_u32(vreinterpretq_u32_f32(a), 292 vec_mantissa_mask); 293 const float32x4_t y = 294 vreinterpretq_f32_u32(vorrq_u32(mantissa, 295 vec_zero_biased_exponent_is_one)); 296 // Approximate log2(y) ~= (y - 1) * pol5(y). 297 // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 298 const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f); 299 const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f); 300 const float32x4_t C3 = vdupq_n_f32(-1.2315303f); 301 const float32x4_t C2 = vdupq_n_f32(2.5988452f); 302 const float32x4_t C1 = vdupq_n_f32(-3.3241990f); 303 const float32x4_t C0 = vdupq_n_f32(3.1157899f); 304 float32x4_t pol5_y = C5; 305 pol5_y = vmlaq_f32(C4, y, pol5_y); 306 pol5_y = vmlaq_f32(C3, y, pol5_y); 307 pol5_y = vmlaq_f32(C2, y, pol5_y); 308 pol5_y = vmlaq_f32(C1, y, pol5_y); 309 pol5_y = vmlaq_f32(C0, y, pol5_y); 310 const float32x4_t y_minus_one = 311 vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one)); 312 const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y); 313 314 // Combine parts. 315 log2_a = vaddq_f32(n, log2_y); 316 } 317 318 // b * log2(a) 319 b_log2_a = vmulq_f32(b, log2_a); 320 321 // Calculate exp2(x), x = b * log2(a). 322 { 323 // To calculate 2^x, we decompose x like this: 324 // x = n + y 325 // n is an integer, the value of x - 0.5 rounded down, therefore 326 // y is in the [0.5, 1.5) range 327 // 328 // 2^x = 2^n * 2^y 329 // 2^n can be evaluated by playing with float representation. 330 // 2^y in a small range can be approximated, this code uses an order two 331 // polynomial approximation. The coefficients have been estimated 332 // with the Remez algorithm and the resulting polynomial has a 333 // maximum relative error of 0.17%. 334 // To avoid over/underflow, we reduce the range of input to ]-127, 129]. 335 const float32x4_t max_input = vdupq_n_f32(129.f); 336 const float32x4_t min_input = vdupq_n_f32(-126.99999f); 337 const float32x4_t x_min = vminq_f32(b_log2_a, max_input); 338 const float32x4_t x_max = vmaxq_f32(x_min, min_input); 339 // Compute n. 340 const float32x4_t half = vdupq_n_f32(0.5f); 341 const float32x4_t x_minus_half = vsubq_f32(x_max, half); 342 const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half); 343 344 // Compute 2^n. 345 const int32x4_t float_exponent_bias = vdupq_n_s32(127); 346 const int32x4_t two_n_exponent = 347 vaddq_s32(x_minus_half_floor, float_exponent_bias); 348 const float32x4_t two_n = 349 vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift)); 350 // Compute y. 351 const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor)); 352 353 // Approximate 2^y ~= C2 * y^2 + C1 * y + C0. 354 const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f); 355 const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f); 356 const float32x4_t C0 = vdupq_n_f32(1.0017247f); 357 float32x4_t exp2_y = C2; 358 exp2_y = vmlaq_f32(C1, y, exp2_y); 359 exp2_y = vmlaq_f32(C0, y, exp2_y); 360 361 // Combine parts. 362 a_exp_b = vmulq_f32(exp2_y, two_n); 363 } 364 365 return a_exp_b; 366 } 367 368 static void OverdriveAndSuppressNEON(AecCore* aec, 369 float hNl[PART_LEN1], 370 const float hNlFb, 371 float efw[2][PART_LEN1]) { 372 int i; 373 const float32x4_t vec_hNlFb = vmovq_n_f32(hNlFb); 374 const float32x4_t vec_one = vdupq_n_f32(1.0f); 375 const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f); 376 const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm); 377 378 // vectorized code (four at once) 379 for (i = 0; i + 3 < PART_LEN1; i += 4) { 380 // Weight subbands 381 float32x4_t vec_hNl = vld1q_f32(&hNl[i]); 382 const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]); 383 const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb); 384 const float32x4_t vec_weightCurve_hNlFb = vmulq_f32(vec_weightCurve, 385 vec_hNlFb); 386 const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve); 387 const float32x4_t vec_one_weightCurve_hNl = vmulq_f32(vec_one_weightCurve, 388 vec_hNl); 389 const uint32x4_t vec_if0 = vandq_u32(vmvnq_u32(bigger), 390 vreinterpretq_u32_f32(vec_hNl)); 391 const float32x4_t vec_one_weightCurve_add = 392 vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl); 393 const uint32x4_t vec_if1 = 394 vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add)); 395 396 vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1)); 397 398 { 399 const float32x4_t vec_overDriveCurve = 400 vld1q_f32(&WebRtcAec_overDriveCurve[i]); 401 const float32x4_t vec_overDriveSm_overDriveCurve = 402 vmulq_f32(vec_overDriveSm, vec_overDriveCurve); 403 vec_hNl = vpowq_f32(vec_hNl, vec_overDriveSm_overDriveCurve); 404 vst1q_f32(&hNl[i], vec_hNl); 405 } 406 407 // Suppress error signal 408 { 409 float32x4_t vec_efw_re = vld1q_f32(&efw[0][i]); 410 float32x4_t vec_efw_im = vld1q_f32(&efw[1][i]); 411 vec_efw_re = vmulq_f32(vec_efw_re, vec_hNl); 412 vec_efw_im = vmulq_f32(vec_efw_im, vec_hNl); 413 414 // Ooura fft returns incorrect sign on imaginary component. It matters 415 // here because we are making an additive change with comfort noise. 416 vec_efw_im = vmulq_f32(vec_efw_im, vec_minus_one); 417 vst1q_f32(&efw[0][i], vec_efw_re); 418 vst1q_f32(&efw[1][i], vec_efw_im); 419 } 420 } 421 422 // scalar code for the remaining items. 423 for (; i < PART_LEN1; i++) { 424 // Weight subbands 425 if (hNl[i] > hNlFb) { 426 hNl[i] = WebRtcAec_weightCurve[i] * hNlFb + 427 (1 - WebRtcAec_weightCurve[i]) * hNl[i]; 428 } 429 430 hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]); 431 432 // Suppress error signal 433 efw[0][i] *= hNl[i]; 434 efw[1][i] *= hNl[i]; 435 436 // Ooura fft returns incorrect sign on imaginary component. It matters 437 // here because we are making an additive change with comfort noise. 438 efw[1][i] *= -1; 439 } 440 } 441 442 static int PartitionDelay(const AecCore* aec) { 443 // Measures the energy in each filter partition and returns the partition with 444 // highest energy. 445 // TODO(bjornv): Spread computational cost by computing one partition per 446 // block? 447 float wfEnMax = 0; 448 int i; 449 int delay = 0; 450 451 for (i = 0; i < aec->num_partitions; i++) { 452 int j; 453 int pos = i * PART_LEN1; 454 float wfEn = 0; 455 float32x4_t vec_wfEn = vdupq_n_f32(0.0f); 456 // vectorized code (four at once) 457 for (j = 0; j + 3 < PART_LEN1; j += 4) { 458 const float32x4_t vec_wfBuf0 = vld1q_f32(&aec->wfBuf[0][pos + j]); 459 const float32x4_t vec_wfBuf1 = vld1q_f32(&aec->wfBuf[1][pos + j]); 460 vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf0, vec_wfBuf0); 461 vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf1, vec_wfBuf1); 462 } 463 { 464 float32x2_t vec_total; 465 // A B C D 466 vec_total = vpadd_f32(vget_low_f32(vec_wfEn), vget_high_f32(vec_wfEn)); 467 // A+B C+D 468 vec_total = vpadd_f32(vec_total, vec_total); 469 // A+B+C+D A+B+C+D 470 wfEn = vget_lane_f32(vec_total, 0); 471 } 472 473 // scalar code for the remaining items. 474 for (; j < PART_LEN1; j++) { 475 wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] + 476 aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j]; 477 } 478 479 if (wfEn > wfEnMax) { 480 wfEnMax = wfEn; 481 delay = i; 482 } 483 } 484 return delay; 485 } 486 487 // Updates the following smoothed Power Spectral Densities (PSD): 488 // - sd : near-end 489 // - se : residual echo 490 // - sx : far-end 491 // - sde : cross-PSD of near-end and residual echo 492 // - sxd : cross-PSD of near-end and far-end 493 // 494 // In addition to updating the PSDs, also the filter diverge state is determined 495 // upon actions are taken. 496 static void SmoothedPSD(AecCore* aec, 497 float efw[2][PART_LEN1], 498 float dfw[2][PART_LEN1], 499 float xfw[2][PART_LEN1]) { 500 // Power estimate smoothing coefficients. 501 const float* ptrGCoh = aec->extended_filter_enabled 502 ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] 503 : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; 504 int i; 505 float sdSum = 0, seSum = 0; 506 const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD); 507 float32x4_t vec_sdSum = vdupq_n_f32(0.0f); 508 float32x4_t vec_seSum = vdupq_n_f32(0.0f); 509 510 for (i = 0; i + 3 < PART_LEN1; i += 4) { 511 const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]); 512 const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]); 513 const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]); 514 const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]); 515 const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]); 516 const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]); 517 float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]); 518 float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]); 519 float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]); 520 float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0); 521 float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0); 522 float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0); 523 524 vec_dfw_sumsq = vmlaq_f32(vec_dfw_sumsq, vec_dfw1, vec_dfw1); 525 vec_efw_sumsq = vmlaq_f32(vec_efw_sumsq, vec_efw1, vec_efw1); 526 vec_xfw_sumsq = vmlaq_f32(vec_xfw_sumsq, vec_xfw1, vec_xfw1); 527 vec_xfw_sumsq = vmaxq_f32(vec_xfw_sumsq, vec_15); 528 vec_sd = vmlaq_n_f32(vec_sd, vec_dfw_sumsq, ptrGCoh[1]); 529 vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]); 530 vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]); 531 532 vst1q_f32(&aec->sd[i], vec_sd); 533 vst1q_f32(&aec->se[i], vec_se); 534 vst1q_f32(&aec->sx[i], vec_sx); 535 536 { 537 float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]); 538 float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0); 539 float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1); 540 vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]); 541 vec_sde.val[1] = vmulq_n_f32(vec_sde.val[1], ptrGCoh[0]); 542 vec_dfwefw0011 = vmlaq_f32(vec_dfwefw0011, vec_dfw1, vec_efw1); 543 vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0); 544 vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]); 545 vec_sde.val[1] = vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]); 546 vst2q_f32(&aec->sde[i][0], vec_sde); 547 } 548 549 { 550 float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]); 551 float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0); 552 float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1); 553 vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]); 554 vec_sxd.val[1] = vmulq_n_f32(vec_sxd.val[1], ptrGCoh[0]); 555 vec_dfwxfw0011 = vmlaq_f32(vec_dfwxfw0011, vec_dfw1, vec_xfw1); 556 vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0); 557 vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]); 558 vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]); 559 vst2q_f32(&aec->sxd[i][0], vec_sxd); 560 } 561 562 vec_sdSum = vaddq_f32(vec_sdSum, vec_sd); 563 vec_seSum = vaddq_f32(vec_seSum, vec_se); 564 } 565 { 566 float32x2_t vec_sdSum_total; 567 float32x2_t vec_seSum_total; 568 // A B C D 569 vec_sdSum_total = vpadd_f32(vget_low_f32(vec_sdSum), 570 vget_high_f32(vec_sdSum)); 571 vec_seSum_total = vpadd_f32(vget_low_f32(vec_seSum), 572 vget_high_f32(vec_seSum)); 573 // A+B C+D 574 vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total); 575 vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total); 576 // A+B+C+D A+B+C+D 577 sdSum = vget_lane_f32(vec_sdSum_total, 0); 578 seSum = vget_lane_f32(vec_seSum_total, 0); 579 } 580 581 // scalar code for the remaining items. 582 for (; i < PART_LEN1; i++) { 583 aec->sd[i] = ptrGCoh[0] * aec->sd[i] + 584 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); 585 aec->se[i] = ptrGCoh[0] * aec->se[i] + 586 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); 587 // We threshold here to protect against the ill-effects of a zero farend. 588 // The threshold is not arbitrarily chosen, but balances protection and 589 // adverse interaction with the algorithm's tuning. 590 // TODO(bjornv): investigate further why this is so sensitive. 591 aec->sx[i] = 592 ptrGCoh[0] * aec->sx[i] + 593 ptrGCoh[1] * WEBRTC_SPL_MAX( 594 xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], 595 WebRtcAec_kMinFarendPSD); 596 597 aec->sde[i][0] = 598 ptrGCoh[0] * aec->sde[i][0] + 599 ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); 600 aec->sde[i][1] = 601 ptrGCoh[0] * aec->sde[i][1] + 602 ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); 603 604 aec->sxd[i][0] = 605 ptrGCoh[0] * aec->sxd[i][0] + 606 ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]); 607 aec->sxd[i][1] = 608 ptrGCoh[0] * aec->sxd[i][1] + 609 ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]); 610 611 sdSum += aec->sd[i]; 612 seSum += aec->se[i]; 613 } 614 615 // Divergent filter safeguard. 616 aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum; 617 618 if (aec->divergeState) 619 memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1); 620 621 // Reset if error is significantly larger than nearend (13 dB). 622 if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum)) 623 memset(aec->wfBuf, 0, sizeof(aec->wfBuf)); 624 } 625 626 // Window time domain data to be used by the fft. 627 __inline static void WindowData(float* x_windowed, const float* x) { 628 int i; 629 for (i = 0; i < PART_LEN; i += 4) { 630 const float32x4_t vec_Buf1 = vld1q_f32(&x[i]); 631 const float32x4_t vec_Buf2 = vld1q_f32(&x[PART_LEN + i]); 632 const float32x4_t vec_sqrtHanning = vld1q_f32(&WebRtcAec_sqrtHanning[i]); 633 // A B C D 634 float32x4_t vec_sqrtHanning_rev = 635 vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]); 636 // B A D C 637 vec_sqrtHanning_rev = vrev64q_f32(vec_sqrtHanning_rev); 638 // D C B A 639 vec_sqrtHanning_rev = vcombine_f32(vget_high_f32(vec_sqrtHanning_rev), 640 vget_low_f32(vec_sqrtHanning_rev)); 641 vst1q_f32(&x_windowed[i], vmulq_f32(vec_Buf1, vec_sqrtHanning)); 642 vst1q_f32(&x_windowed[PART_LEN + i], 643 vmulq_f32(vec_Buf2, vec_sqrtHanning_rev)); 644 } 645 } 646 647 // Puts fft output data into a complex valued array. 648 __inline static void StoreAsComplex(const float* data, 649 float data_complex[2][PART_LEN1]) { 650 int i; 651 for (i = 0; i < PART_LEN; i += 4) { 652 const float32x4x2_t vec_data = vld2q_f32(&data[2 * i]); 653 vst1q_f32(&data_complex[0][i], vec_data.val[0]); 654 vst1q_f32(&data_complex[1][i], vec_data.val[1]); 655 } 656 // fix beginning/end values 657 data_complex[1][0] = 0; 658 data_complex[1][PART_LEN] = 0; 659 data_complex[0][0] = data[0]; 660 data_complex[0][PART_LEN] = data[1]; 661 } 662 663 static void SubbandCoherenceNEON(AecCore* aec, 664 float efw[2][PART_LEN1], 665 float xfw[2][PART_LEN1], 666 float* fft, 667 float* cohde, 668 float* cohxd) { 669 float dfw[2][PART_LEN1]; 670 int i; 671 672 if (aec->delayEstCtr == 0) 673 aec->delayIdx = PartitionDelay(aec); 674 675 // Use delayed far. 676 memcpy(xfw, 677 aec->xfwBuf + aec->delayIdx * PART_LEN1, 678 sizeof(xfw[0][0]) * 2 * PART_LEN1); 679 680 // Windowed near fft 681 WindowData(fft, aec->dBuf); 682 aec_rdft_forward_128(fft); 683 StoreAsComplex(fft, dfw); 684 685 // Windowed error fft 686 WindowData(fft, aec->eBuf); 687 aec_rdft_forward_128(fft); 688 StoreAsComplex(fft, efw); 689 690 SmoothedPSD(aec, efw, dfw, xfw); 691 692 { 693 const float32x4_t vec_1eminus10 = vdupq_n_f32(1e-10f); 694 695 // Subband coherence 696 for (i = 0; i + 3 < PART_LEN1; i += 4) { 697 const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]); 698 const float32x4_t vec_se = vld1q_f32(&aec->se[i]); 699 const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]); 700 const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se); 701 const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx); 702 float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]); 703 float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]); 704 float32x4_t vec_cohde = vmulq_f32(vec_sde.val[0], vec_sde.val[0]); 705 float32x4_t vec_cohxd = vmulq_f32(vec_sxd.val[0], vec_sxd.val[0]); 706 vec_cohde = vmlaq_f32(vec_cohde, vec_sde.val[1], vec_sde.val[1]); 707 vec_cohde = vdivq_f32(vec_cohde, vec_sdse); 708 vec_cohxd = vmlaq_f32(vec_cohxd, vec_sxd.val[1], vec_sxd.val[1]); 709 vec_cohxd = vdivq_f32(vec_cohxd, vec_sdsx); 710 711 vst1q_f32(&cohde[i], vec_cohde); 712 vst1q_f32(&cohxd[i], vec_cohxd); 713 } 714 } 715 // scalar code for the remaining items. 716 for (; i < PART_LEN1; i++) { 717 cohde[i] = 718 (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) / 719 (aec->sd[i] * aec->se[i] + 1e-10f); 720 cohxd[i] = 721 (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) / 722 (aec->sx[i] * aec->sd[i] + 1e-10f); 723 } 724 } 725 726 void WebRtcAec_InitAec_neon(void) { 727 WebRtcAec_FilterFar = FilterFarNEON; 728 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalNEON; 729 WebRtcAec_FilterAdaptation = FilterAdaptationNEON; 730 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON; 731 WebRtcAec_SubbandCoherence = SubbandCoherenceNEON; 732 } 733 734