1 /* 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "webrtc/modules/audio_processing/ns/nsx_core.h" 12 13 #include <arm_neon.h> 14 #include <assert.h> 15 16 // Constants to compensate for shifting signal log(2^shifts). 17 const int16_t WebRtcNsx_kLogTable[9] = { 18 0, 177, 355, 532, 710, 887, 1065, 1242, 1420 19 }; 20 21 const int16_t WebRtcNsx_kCounterDiv[201] = { 22 32767, 16384, 10923, 8192, 6554, 5461, 4681, 4096, 3641, 3277, 2979, 2731, 23 2521, 2341, 2185, 2048, 1928, 1820, 1725, 1638, 1560, 1489, 1425, 1365, 1311, 24 1260, 1214, 1170, 1130, 1092, 1057, 1024, 993, 964, 936, 910, 886, 862, 840, 25 819, 799, 780, 762, 745, 728, 712, 697, 683, 669, 655, 643, 630, 618, 607, 26 596, 585, 575, 565, 555, 546, 537, 529, 520, 512, 504, 496, 489, 482, 475, 27 468, 462, 455, 449, 443, 437, 431, 426, 420, 415, 410, 405, 400, 395, 390, 28 386, 381, 377, 372, 368, 364, 360, 356, 352, 349, 345, 341, 338, 334, 331, 29 328, 324, 321, 318, 315, 312, 309, 306, 303, 301, 298, 295, 293, 290, 287, 30 285, 282, 280, 278, 275, 273, 271, 269, 266, 264, 262, 260, 258, 256, 254, 31 252, 250, 248, 246, 245, 243, 241, 239, 237, 236, 234, 232, 231, 229, 228, 32 226, 224, 223, 221, 220, 218, 217, 216, 214, 213, 211, 210, 209, 207, 206, 33 205, 204, 202, 201, 200, 199, 197, 196, 195, 194, 193, 192, 191, 189, 188, 34 187, 186, 185, 184, 183, 182, 181, 180, 179, 178, 177, 176, 175, 174, 173, 35 172, 172, 171, 170, 169, 168, 167, 166, 165, 165, 164, 163 36 }; 37 38 const int16_t WebRtcNsx_kLogTableFrac[256] = { 39 0, 1, 3, 4, 6, 7, 9, 10, 11, 13, 14, 16, 17, 18, 20, 21, 40 22, 24, 25, 26, 28, 29, 30, 32, 33, 34, 36, 37, 38, 40, 41, 42, 41 44, 45, 46, 47, 49, 50, 51, 52, 54, 55, 56, 57, 59, 60, 61, 62, 42 63, 65, 66, 67, 68, 69, 71, 72, 73, 74, 75, 77, 78, 79, 80, 81, 43 82, 84, 85, 86, 87, 88, 89, 90, 92, 93, 94, 95, 96, 97, 98, 99, 44 100, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 116, 45 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 46 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 47 147, 148, 149, 150, 151, 152, 153, 154, 155, 155, 156, 157, 158, 159, 160, 48 161, 162, 163, 164, 165, 166, 167, 168, 169, 169, 170, 171, 172, 173, 174, 49 175, 176, 177, 178, 178, 179, 180, 181, 182, 183, 184, 185, 185, 186, 187, 50 188, 189, 190, 191, 192, 192, 193, 194, 195, 196, 197, 198, 198, 199, 200, 51 201, 202, 203, 203, 204, 205, 206, 207, 208, 208, 209, 210, 211, 212, 212, 52 213, 214, 215, 216, 216, 217, 218, 219, 220, 220, 221, 222, 223, 224, 224, 53 225, 226, 227, 228, 228, 229, 230, 231, 231, 232, 233, 234, 234, 235, 236, 54 237, 238, 238, 239, 240, 241, 241, 242, 243, 244, 244, 245, 246, 247, 247, 55 248, 249, 249, 250, 251, 252, 252, 253, 254, 255, 255 56 }; 57 58 // Update the noise estimation information. 59 static void UpdateNoiseEstimateNeon(NoiseSuppressionFixedC* inst, int offset) { 60 const int16_t kExp2Const = 11819; // Q13 61 int16_t* ptr_noiseEstLogQuantile = NULL; 62 int16_t* ptr_noiseEstQuantile = NULL; 63 int16x4_t kExp2Const16x4 = vdup_n_s16(kExp2Const); 64 int32x4_t twentyOne32x4 = vdupq_n_s32(21); 65 int32x4_t constA32x4 = vdupq_n_s32(0x1fffff); 66 int32x4_t constB32x4 = vdupq_n_s32(0x200000); 67 68 int16_t tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset, 69 inst->magnLen); 70 71 // Guarantee a Q-domain as high as possible and still fit in int16 72 inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(kExp2Const, 73 tmp16, 74 21); 75 76 int32x4_t qNoise32x4 = vdupq_n_s32(inst->qNoise); 77 78 for (ptr_noiseEstLogQuantile = &inst->noiseEstLogQuantile[offset], 79 ptr_noiseEstQuantile = &inst->noiseEstQuantile[0]; 80 ptr_noiseEstQuantile < &inst->noiseEstQuantile[inst->magnLen - 3]; 81 ptr_noiseEstQuantile += 4, ptr_noiseEstLogQuantile += 4) { 82 83 // tmp32no2 = kExp2Const * inst->noiseEstLogQuantile[offset + i]; 84 int16x4_t v16x4 = vld1_s16(ptr_noiseEstLogQuantile); 85 int32x4_t v32x4B = vmull_s16(v16x4, kExp2Const16x4); 86 87 // tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac 88 int32x4_t v32x4A = vandq_s32(v32x4B, constA32x4); 89 v32x4A = vorrq_s32(v32x4A, constB32x4); 90 91 // tmp16 = (int16_t)(tmp32no2 >> 21); 92 v32x4B = vshrq_n_s32(v32x4B, 21); 93 94 // tmp16 -= 21;// shift 21 to get result in Q0 95 v32x4B = vsubq_s32(v32x4B, twentyOne32x4); 96 97 // tmp16 += (int16_t) inst->qNoise; 98 // shift to get result in Q(qNoise) 99 v32x4B = vaddq_s32(v32x4B, qNoise32x4); 100 101 // if (tmp16 < 0) { 102 // tmp32no1 >>= -tmp16; 103 // } else { 104 // tmp32no1 <<= tmp16; 105 // } 106 v32x4B = vshlq_s32(v32x4A, v32x4B); 107 108 // tmp16 = WebRtcSpl_SatW32ToW16(tmp32no1); 109 v16x4 = vqmovn_s32(v32x4B); 110 111 //inst->noiseEstQuantile[i] = tmp16; 112 vst1_s16(ptr_noiseEstQuantile, v16x4); 113 } 114 115 // Last iteration: 116 117 // inst->quantile[i]=exp(inst->lquantile[offset+i]); 118 // in Q21 119 int32_t tmp32no2 = kExp2Const * *ptr_noiseEstLogQuantile; 120 int32_t tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac 121 122 tmp16 = (int16_t)(tmp32no2 >> 21); 123 tmp16 -= 21;// shift 21 to get result in Q0 124 tmp16 += (int16_t) inst->qNoise; //shift to get result in Q(qNoise) 125 if (tmp16 < 0) { 126 tmp32no1 >>= -tmp16; 127 } else { 128 tmp32no1 <<= tmp16; 129 } 130 *ptr_noiseEstQuantile = WebRtcSpl_SatW32ToW16(tmp32no1); 131 } 132 133 // Noise Estimation 134 void WebRtcNsx_NoiseEstimationNeon(NoiseSuppressionFixedC* inst, 135 uint16_t* magn, 136 uint32_t* noise, 137 int16_t* q_noise) { 138 int16_t lmagn[HALF_ANAL_BLOCKL], counter, countDiv; 139 int16_t countProd, delta, zeros, frac; 140 int16_t log2, tabind, logval, tmp16, tmp16no1, tmp16no2; 141 const int16_t log2_const = 22713; 142 const int16_t width_factor = 21845; 143 144 size_t i, s, offset; 145 146 tabind = inst->stages - inst->normData; 147 assert(tabind < 9); 148 assert(tabind > -9); 149 if (tabind < 0) { 150 logval = -WebRtcNsx_kLogTable[-tabind]; 151 } else { 152 logval = WebRtcNsx_kLogTable[tabind]; 153 } 154 155 int16x8_t logval_16x8 = vdupq_n_s16(logval); 156 157 // lmagn(i)=log(magn(i))=log(2)*log2(magn(i)) 158 // magn is in Q(-stages), and the real lmagn values are: 159 // real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages) 160 // lmagn in Q8 161 for (i = 0; i < inst->magnLen; i++) { 162 if (magn[i]) { 163 zeros = WebRtcSpl_NormU32((uint32_t)magn[i]); 164 frac = (int16_t)((((uint32_t)magn[i] << zeros) 165 & 0x7FFFFFFF) >> 23); 166 assert(frac < 256); 167 // log2(magn(i)) 168 log2 = (int16_t)(((31 - zeros) << 8) 169 + WebRtcNsx_kLogTableFrac[frac]); 170 // log2(magn(i))*log(2) 171 lmagn[i] = (int16_t)((log2 * log2_const) >> 15); 172 // + log(2^stages) 173 lmagn[i] += logval; 174 } else { 175 lmagn[i] = logval; 176 } 177 } 178 179 int16x4_t Q3_16x4 = vdup_n_s16(3); 180 int16x8_t WIDTHQ8_16x8 = vdupq_n_s16(WIDTH_Q8); 181 int16x8_t WIDTHFACTOR_16x8 = vdupq_n_s16(width_factor); 182 183 int16_t factor = FACTOR_Q7; 184 if (inst->blockIndex < END_STARTUP_LONG) 185 factor = FACTOR_Q7_STARTUP; 186 187 // Loop over simultaneous estimates 188 for (s = 0; s < SIMULT; s++) { 189 offset = s * inst->magnLen; 190 191 // Get counter values from state 192 counter = inst->noiseEstCounter[s]; 193 assert(counter < 201); 194 countDiv = WebRtcNsx_kCounterDiv[counter]; 195 countProd = (int16_t)(counter * countDiv); 196 197 // quant_est(...) 198 int16_t deltaBuff[8]; 199 int16x4_t tmp16x4_0; 200 int16x4_t tmp16x4_1; 201 int16x4_t countDiv_16x4 = vdup_n_s16(countDiv); 202 int16x8_t countProd_16x8 = vdupq_n_s16(countProd); 203 int16x8_t tmp16x8_0 = vdupq_n_s16(countDiv); 204 int16x8_t prod16x8 = vqrdmulhq_s16(WIDTHFACTOR_16x8, tmp16x8_0); 205 int16x8_t tmp16x8_1; 206 int16x8_t tmp16x8_2; 207 int16x8_t tmp16x8_3; 208 uint16x8_t tmp16x8_4; 209 int32x4_t tmp32x4; 210 211 for (i = 0; i + 7 < inst->magnLen; i += 8) { 212 // Compute delta. 213 // Smaller step size during startup. This prevents from using 214 // unrealistic values causing overflow. 215 tmp16x8_0 = vdupq_n_s16(factor); 216 vst1q_s16(deltaBuff, tmp16x8_0); 217 218 int j; 219 for (j = 0; j < 8; j++) { 220 if (inst->noiseEstDensity[offset + i + j] > 512) { 221 // Get values for deltaBuff by shifting intead of dividing. 222 int factor = WebRtcSpl_NormW16(inst->noiseEstDensity[offset + i + j]); 223 deltaBuff[j] = (int16_t)(FACTOR_Q16 >> (14 - factor)); 224 } 225 } 226 227 // Update log quantile estimate 228 229 // tmp16 = (int16_t)((delta * countDiv) >> 14); 230 tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[0]), countDiv_16x4); 231 tmp16x4_1 = vshrn_n_s32(tmp32x4, 14); 232 tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[4]), countDiv_16x4); 233 tmp16x4_0 = vshrn_n_s32(tmp32x4, 14); 234 tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // Keep for several lines. 235 236 // prepare for the "if" branch 237 // tmp16 += 2; 238 // tmp16_1 = (Word16)(tmp16>>2); 239 tmp16x8_1 = vrshrq_n_s16(tmp16x8_0, 2); 240 241 // inst->noiseEstLogQuantile[offset+i] + tmp16_1; 242 tmp16x8_2 = vld1q_s16(&inst->noiseEstLogQuantile[offset + i]); // Keep 243 tmp16x8_1 = vaddq_s16(tmp16x8_2, tmp16x8_1); // Keep for several lines 244 245 // Prepare for the "else" branch 246 // tmp16 += 1; 247 // tmp16_1 = (Word16)(tmp16>>1); 248 tmp16x8_0 = vrshrq_n_s16(tmp16x8_0, 1); 249 250 // tmp16_2 = (int16_t)((tmp16_1 * 3) >> 1); 251 tmp32x4 = vmull_s16(vget_low_s16(tmp16x8_0), Q3_16x4); 252 tmp16x4_1 = vshrn_n_s32(tmp32x4, 1); 253 254 // tmp16_2 = (int16_t)((tmp16_1 * 3) >> 1); 255 tmp32x4 = vmull_s16(vget_high_s16(tmp16x8_0), Q3_16x4); 256 tmp16x4_0 = vshrn_n_s32(tmp32x4, 1); 257 258 // inst->noiseEstLogQuantile[offset + i] - tmp16_2; 259 tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // keep 260 tmp16x8_0 = vsubq_s16(tmp16x8_2, tmp16x8_0); 261 262 // logval is the smallest fixed point representation we can have. Values 263 // below that will correspond to values in the interval [0, 1], which 264 // can't possibly occur. 265 tmp16x8_0 = vmaxq_s16(tmp16x8_0, logval_16x8); 266 267 // Do the if-else branches: 268 tmp16x8_3 = vld1q_s16(&lmagn[i]); // keep for several lines 269 tmp16x8_4 = vcgtq_s16(tmp16x8_3, tmp16x8_2); 270 tmp16x8_2 = vbslq_s16(tmp16x8_4, tmp16x8_1, tmp16x8_0); 271 vst1q_s16(&inst->noiseEstLogQuantile[offset + i], tmp16x8_2); 272 273 // Update density estimate 274 // tmp16_1 + tmp16_2 275 tmp16x8_1 = vld1q_s16(&inst->noiseEstDensity[offset + i]); 276 tmp16x8_0 = vqrdmulhq_s16(tmp16x8_1, countProd_16x8); 277 tmp16x8_0 = vaddq_s16(tmp16x8_0, prod16x8); 278 279 // lmagn[i] - inst->noiseEstLogQuantile[offset + i] 280 tmp16x8_3 = vsubq_s16(tmp16x8_3, tmp16x8_2); 281 tmp16x8_3 = vabsq_s16(tmp16x8_3); 282 tmp16x8_4 = vcgtq_s16(WIDTHQ8_16x8, tmp16x8_3); 283 tmp16x8_1 = vbslq_s16(tmp16x8_4, tmp16x8_0, tmp16x8_1); 284 vst1q_s16(&inst->noiseEstDensity[offset + i], tmp16x8_1); 285 } // End loop over magnitude spectrum 286 287 // Last iteration over magnitude spectrum: 288 // compute delta 289 if (inst->noiseEstDensity[offset + i] > 512) { 290 // Get values for deltaBuff by shifting intead of dividing. 291 int factor = WebRtcSpl_NormW16(inst->noiseEstDensity[offset + i]); 292 delta = (int16_t)(FACTOR_Q16 >> (14 - factor)); 293 } else { 294 delta = FACTOR_Q7; 295 if (inst->blockIndex < END_STARTUP_LONG) { 296 // Smaller step size during startup. This prevents from using 297 // unrealistic values causing overflow. 298 delta = FACTOR_Q7_STARTUP; 299 } 300 } 301 // update log quantile estimate 302 tmp16 = (int16_t)((delta * countDiv) >> 14); 303 if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) { 304 // +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2 305 // CounterDiv=1/(inst->counter[s]+1) in Q15 306 tmp16 += 2; 307 inst->noiseEstLogQuantile[offset + i] += tmp16 / 4; 308 } else { 309 tmp16 += 1; 310 // *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2 311 // TODO(bjornv): investigate why we need to truncate twice. 312 tmp16no2 = (int16_t)((tmp16 / 2) * 3 / 2); 313 inst->noiseEstLogQuantile[offset + i] -= tmp16no2; 314 if (inst->noiseEstLogQuantile[offset + i] < logval) { 315 // logval is the smallest fixed point representation we can have. 316 // Values below that will correspond to values in the interval 317 // [0, 1], which can't possibly occur. 318 inst->noiseEstLogQuantile[offset + i] = logval; 319 } 320 } 321 322 // update density estimate 323 if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i]) 324 < WIDTH_Q8) { 325 tmp16no1 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( 326 inst->noiseEstDensity[offset + i], countProd, 15); 327 tmp16no2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( 328 width_factor, countDiv, 15); 329 inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2; 330 } 331 332 333 if (counter >= END_STARTUP_LONG) { 334 inst->noiseEstCounter[s] = 0; 335 if (inst->blockIndex >= END_STARTUP_LONG) { 336 UpdateNoiseEstimateNeon(inst, offset); 337 } 338 } 339 inst->noiseEstCounter[s]++; 340 341 } // end loop over simultaneous estimates 342 343 // Sequentially update the noise during startup 344 if (inst->blockIndex < END_STARTUP_LONG) { 345 UpdateNoiseEstimateNeon(inst, offset); 346 } 347 348 for (i = 0; i < inst->magnLen; i++) { 349 noise[i] = (uint32_t)(inst->noiseEstQuantile[i]); // Q(qNoise) 350 } 351 (*q_noise) = (int16_t)inst->qNoise; 352 } 353 354 // Filter the data in the frequency domain, and create spectrum. 355 void WebRtcNsx_PrepareSpectrumNeon(NoiseSuppressionFixedC* inst, 356 int16_t* freq_buf) { 357 assert(inst->magnLen % 8 == 1); 358 assert(inst->anaLen2 % 16 == 0); 359 360 // (1) Filtering. 361 362 // Fixed point C code for the next block is as follows: 363 // for (i = 0; i < inst->magnLen; i++) { 364 // inst->real[i] = (int16_t)((inst->real[i] * 365 // (int16_t)(inst->noiseSupFilter[i])) >> 14); // Q(normData-stages) 366 // inst->imag[i] = (int16_t)((inst->imag[i] * 367 // (int16_t)(inst->noiseSupFilter[i])) >> 14); // Q(normData-stages) 368 // } 369 370 int16_t* preal = &inst->real[0]; 371 int16_t* pimag = &inst->imag[0]; 372 int16_t* pns_filter = (int16_t*)&inst->noiseSupFilter[0]; 373 int16_t* pimag_end = pimag + inst->magnLen - 4; 374 375 while (pimag < pimag_end) { 376 int16x8_t real = vld1q_s16(preal); 377 int16x8_t imag = vld1q_s16(pimag); 378 int16x8_t ns_filter = vld1q_s16(pns_filter); 379 380 int32x4_t tmp_r_0 = vmull_s16(vget_low_s16(real), vget_low_s16(ns_filter)); 381 int32x4_t tmp_i_0 = vmull_s16(vget_low_s16(imag), vget_low_s16(ns_filter)); 382 int32x4_t tmp_r_1 = vmull_s16(vget_high_s16(real), 383 vget_high_s16(ns_filter)); 384 int32x4_t tmp_i_1 = vmull_s16(vget_high_s16(imag), 385 vget_high_s16(ns_filter)); 386 387 int16x4_t result_r_0 = vshrn_n_s32(tmp_r_0, 14); 388 int16x4_t result_i_0 = vshrn_n_s32(tmp_i_0, 14); 389 int16x4_t result_r_1 = vshrn_n_s32(tmp_r_1, 14); 390 int16x4_t result_i_1 = vshrn_n_s32(tmp_i_1, 14); 391 392 vst1q_s16(preal, vcombine_s16(result_r_0, result_r_1)); 393 vst1q_s16(pimag, vcombine_s16(result_i_0, result_i_1)); 394 preal += 8; 395 pimag += 8; 396 pns_filter += 8; 397 } 398 399 // Filter the last element 400 *preal = (int16_t)((*preal * *pns_filter) >> 14); 401 *pimag = (int16_t)((*pimag * *pns_filter) >> 14); 402 403 // (2) Create spectrum. 404 405 // Fixed point C code for the rest of the function is as follows: 406 // freq_buf[0] = inst->real[0]; 407 // freq_buf[1] = -inst->imag[0]; 408 // for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) { 409 // freq_buf[j] = inst->real[i]; 410 // freq_buf[j + 1] = -inst->imag[i]; 411 // } 412 // freq_buf[inst->anaLen] = inst->real[inst->anaLen2]; 413 // freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2]; 414 415 preal = &inst->real[0]; 416 pimag = &inst->imag[0]; 417 pimag_end = pimag + inst->anaLen2; 418 int16_t * freq_buf_start = freq_buf; 419 while (pimag < pimag_end) { 420 // loop unroll 421 int16x8x2_t real_imag_0; 422 int16x8x2_t real_imag_1; 423 real_imag_0.val[1] = vld1q_s16(pimag); 424 real_imag_0.val[0] = vld1q_s16(preal); 425 preal += 8; 426 pimag += 8; 427 real_imag_1.val[1] = vld1q_s16(pimag); 428 real_imag_1.val[0] = vld1q_s16(preal); 429 preal += 8; 430 pimag += 8; 431 432 real_imag_0.val[1] = vnegq_s16(real_imag_0.val[1]); 433 real_imag_1.val[1] = vnegq_s16(real_imag_1.val[1]); 434 vst2q_s16(freq_buf_start, real_imag_0); 435 freq_buf_start += 16; 436 vst2q_s16(freq_buf_start, real_imag_1); 437 freq_buf_start += 16; 438 } 439 freq_buf[inst->anaLen] = inst->real[inst->anaLen2]; 440 freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2]; 441 } 442 443 // For the noise supress process, synthesis, read out fully processed segment, 444 // and update synthesis buffer. 445 void WebRtcNsx_SynthesisUpdateNeon(NoiseSuppressionFixedC* inst, 446 int16_t* out_frame, 447 int16_t gain_factor) { 448 assert(inst->anaLen % 16 == 0); 449 assert(inst->blockLen10ms % 16 == 0); 450 451 int16_t* preal_start = inst->real; 452 const int16_t* pwindow = inst->window; 453 int16_t* preal_end = preal_start + inst->anaLen; 454 int16_t* psynthesis_buffer = inst->synthesisBuffer; 455 456 while (preal_start < preal_end) { 457 // Loop unroll. 458 int16x8_t window_0 = vld1q_s16(pwindow); 459 int16x8_t real_0 = vld1q_s16(preal_start); 460 int16x8_t synthesis_buffer_0 = vld1q_s16(psynthesis_buffer); 461 462 int16x8_t window_1 = vld1q_s16(pwindow + 8); 463 int16x8_t real_1 = vld1q_s16(preal_start + 8); 464 int16x8_t synthesis_buffer_1 = vld1q_s16(psynthesis_buffer + 8); 465 466 int32x4_t tmp32a_0_low = vmull_s16(vget_low_s16(real_0), 467 vget_low_s16(window_0)); 468 int32x4_t tmp32a_0_high = vmull_s16(vget_high_s16(real_0), 469 vget_high_s16(window_0)); 470 471 int32x4_t tmp32a_1_low = vmull_s16(vget_low_s16(real_1), 472 vget_low_s16(window_1)); 473 int32x4_t tmp32a_1_high = vmull_s16(vget_high_s16(real_1), 474 vget_high_s16(window_1)); 475 476 int16x4_t tmp16a_0_low = vqrshrn_n_s32(tmp32a_0_low, 14); 477 int16x4_t tmp16a_0_high = vqrshrn_n_s32(tmp32a_0_high, 14); 478 479 int16x4_t tmp16a_1_low = vqrshrn_n_s32(tmp32a_1_low, 14); 480 int16x4_t tmp16a_1_high = vqrshrn_n_s32(tmp32a_1_high, 14); 481 482 int32x4_t tmp32b_0_low = vmull_n_s16(tmp16a_0_low, gain_factor); 483 int32x4_t tmp32b_0_high = vmull_n_s16(tmp16a_0_high, gain_factor); 484 485 int32x4_t tmp32b_1_low = vmull_n_s16(tmp16a_1_low, gain_factor); 486 int32x4_t tmp32b_1_high = vmull_n_s16(tmp16a_1_high, gain_factor); 487 488 int16x4_t tmp16b_0_low = vqrshrn_n_s32(tmp32b_0_low, 13); 489 int16x4_t tmp16b_0_high = vqrshrn_n_s32(tmp32b_0_high, 13); 490 491 int16x4_t tmp16b_1_low = vqrshrn_n_s32(tmp32b_1_low, 13); 492 int16x4_t tmp16b_1_high = vqrshrn_n_s32(tmp32b_1_high, 13); 493 494 synthesis_buffer_0 = vqaddq_s16(vcombine_s16(tmp16b_0_low, tmp16b_0_high), 495 synthesis_buffer_0); 496 synthesis_buffer_1 = vqaddq_s16(vcombine_s16(tmp16b_1_low, tmp16b_1_high), 497 synthesis_buffer_1); 498 vst1q_s16(psynthesis_buffer, synthesis_buffer_0); 499 vst1q_s16(psynthesis_buffer + 8, synthesis_buffer_1); 500 501 pwindow += 16; 502 preal_start += 16; 503 psynthesis_buffer += 16; 504 } 505 506 // Read out fully processed segment. 507 int16_t * p_start = inst->synthesisBuffer; 508 int16_t * p_end = inst->synthesisBuffer + inst->blockLen10ms; 509 int16_t * p_frame = out_frame; 510 while (p_start < p_end) { 511 int16x8_t frame_0 = vld1q_s16(p_start); 512 vst1q_s16(p_frame, frame_0); 513 p_start += 8; 514 p_frame += 8; 515 } 516 517 // Update synthesis buffer. 518 int16_t* p_start_src = inst->synthesisBuffer + inst->blockLen10ms; 519 int16_t* p_end_src = inst->synthesisBuffer + inst->anaLen; 520 int16_t* p_start_dst = inst->synthesisBuffer; 521 while (p_start_src < p_end_src) { 522 int16x8_t frame = vld1q_s16(p_start_src); 523 vst1q_s16(p_start_dst, frame); 524 p_start_src += 8; 525 p_start_dst += 8; 526 } 527 528 p_start = inst->synthesisBuffer + inst->anaLen - inst->blockLen10ms; 529 p_end = p_start + inst->blockLen10ms; 530 int16x8_t zero = vdupq_n_s16(0); 531 for (;p_start < p_end; p_start += 8) { 532 vst1q_s16(p_start, zero); 533 } 534 } 535 536 // Update analysis buffer for lower band, and window data before FFT. 537 void WebRtcNsx_AnalysisUpdateNeon(NoiseSuppressionFixedC* inst, 538 int16_t* out, 539 int16_t* new_speech) { 540 assert(inst->blockLen10ms % 16 == 0); 541 assert(inst->anaLen % 16 == 0); 542 543 // For lower band update analysis buffer. 544 // memcpy(inst->analysisBuffer, inst->analysisBuffer + inst->blockLen10ms, 545 // (inst->anaLen - inst->blockLen10ms) * sizeof(*inst->analysisBuffer)); 546 int16_t* p_start_src = inst->analysisBuffer + inst->blockLen10ms; 547 int16_t* p_end_src = inst->analysisBuffer + inst->anaLen; 548 int16_t* p_start_dst = inst->analysisBuffer; 549 while (p_start_src < p_end_src) { 550 int16x8_t frame = vld1q_s16(p_start_src); 551 vst1q_s16(p_start_dst, frame); 552 553 p_start_src += 8; 554 p_start_dst += 8; 555 } 556 557 // memcpy(inst->analysisBuffer + inst->anaLen - inst->blockLen10ms, 558 // new_speech, inst->blockLen10ms * sizeof(*inst->analysisBuffer)); 559 p_start_src = new_speech; 560 p_end_src = new_speech + inst->blockLen10ms; 561 p_start_dst = inst->analysisBuffer + inst->anaLen - inst->blockLen10ms; 562 while (p_start_src < p_end_src) { 563 int16x8_t frame = vld1q_s16(p_start_src); 564 vst1q_s16(p_start_dst, frame); 565 566 p_start_src += 8; 567 p_start_dst += 8; 568 } 569 570 // Window data before FFT. 571 int16_t* p_start_window = (int16_t*) inst->window; 572 int16_t* p_start_buffer = inst->analysisBuffer; 573 int16_t* p_start_out = out; 574 const int16_t* p_end_out = out + inst->anaLen; 575 576 // Load the first element to reduce pipeline bubble. 577 int16x8_t window = vld1q_s16(p_start_window); 578 int16x8_t buffer = vld1q_s16(p_start_buffer); 579 p_start_window += 8; 580 p_start_buffer += 8; 581 582 while (p_start_out < p_end_out) { 583 // Unroll loop. 584 int32x4_t tmp32_low = vmull_s16(vget_low_s16(window), vget_low_s16(buffer)); 585 int32x4_t tmp32_high = vmull_s16(vget_high_s16(window), 586 vget_high_s16(buffer)); 587 window = vld1q_s16(p_start_window); 588 buffer = vld1q_s16(p_start_buffer); 589 590 int16x4_t result_low = vrshrn_n_s32(tmp32_low, 14); 591 int16x4_t result_high = vrshrn_n_s32(tmp32_high, 14); 592 vst1q_s16(p_start_out, vcombine_s16(result_low, result_high)); 593 594 p_start_buffer += 8; 595 p_start_window += 8; 596 p_start_out += 8; 597 } 598 } 599