Home | History | Annotate | Download | only in ns
      1 /*
      2  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "webrtc/modules/audio_processing/ns/nsx_core.h"
     12 
     13 #include <arm_neon.h>
     14 #include <assert.h>
     15 
     16 // Constants to compensate for shifting signal log(2^shifts).
     17 const int16_t WebRtcNsx_kLogTable[9] = {
     18   0, 177, 355, 532, 710, 887, 1065, 1242, 1420
     19 };
     20 
     21 const int16_t WebRtcNsx_kCounterDiv[201] = {
     22   32767, 16384, 10923, 8192, 6554, 5461, 4681, 4096, 3641, 3277, 2979, 2731,
     23   2521, 2341, 2185, 2048, 1928, 1820, 1725, 1638, 1560, 1489, 1425, 1365, 1311,
     24   1260, 1214, 1170, 1130, 1092, 1057, 1024, 993, 964, 936, 910, 886, 862, 840,
     25   819, 799, 780, 762, 745, 728, 712, 697, 683, 669, 655, 643, 630, 618, 607,
     26   596, 585, 575, 565, 555, 546, 537, 529, 520, 512, 504, 496, 489, 482, 475,
     27   468, 462, 455, 449, 443, 437, 431, 426, 420, 415, 410, 405, 400, 395, 390,
     28   386, 381, 377, 372, 368, 364, 360, 356, 352, 349, 345, 341, 338, 334, 331,
     29   328, 324, 321, 318, 315, 312, 309, 306, 303, 301, 298, 295, 293, 290, 287,
     30   285, 282, 280, 278, 275, 273, 271, 269, 266, 264, 262, 260, 258, 256, 254,
     31   252, 250, 248, 246, 245, 243, 241, 239, 237, 236, 234, 232, 231, 229, 228,
     32   226, 224, 223, 221, 220, 218, 217, 216, 214, 213, 211, 210, 209, 207, 206,
     33   205, 204, 202, 201, 200, 199, 197, 196, 195, 194, 193, 192, 191, 189, 188,
     34   187, 186, 185, 184, 183, 182, 181, 180, 179, 178, 177, 176, 175, 174, 173,
     35   172, 172, 171, 170, 169, 168, 167, 166, 165, 165, 164, 163
     36 };
     37 
     38 const int16_t WebRtcNsx_kLogTableFrac[256] = {
     39   0, 1, 3, 4, 6, 7, 9, 10, 11, 13, 14, 16, 17, 18, 20, 21,
     40   22, 24, 25, 26, 28, 29, 30, 32, 33, 34, 36, 37, 38, 40, 41, 42,
     41   44, 45, 46, 47, 49, 50, 51, 52, 54, 55, 56, 57, 59, 60, 61, 62,
     42   63, 65, 66, 67, 68, 69, 71, 72, 73, 74, 75, 77, 78, 79, 80, 81,
     43   82, 84, 85, 86, 87, 88, 89, 90, 92, 93, 94, 95, 96, 97, 98, 99,
     44   100, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 116,
     45   117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
     46   132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
     47   147, 148, 149, 150, 151, 152, 153, 154, 155, 155, 156, 157, 158, 159, 160,
     48   161, 162, 163, 164, 165, 166, 167, 168, 169, 169, 170, 171, 172, 173, 174,
     49   175, 176, 177, 178, 178, 179, 180, 181, 182, 183, 184, 185, 185, 186, 187,
     50   188, 189, 190, 191, 192, 192, 193, 194, 195, 196, 197, 198, 198, 199, 200,
     51   201, 202, 203, 203, 204, 205, 206, 207, 208, 208, 209, 210, 211, 212, 212,
     52   213, 214, 215, 216, 216, 217, 218, 219, 220, 220, 221, 222, 223, 224, 224,
     53   225, 226, 227, 228, 228, 229, 230, 231, 231, 232, 233, 234, 234, 235, 236,
     54   237, 238, 238, 239, 240, 241, 241, 242, 243, 244, 244, 245, 246, 247, 247,
     55   248, 249, 249, 250, 251, 252, 252, 253, 254, 255, 255
     56 };
     57 
     58 // Update the noise estimation information.
     59 static void UpdateNoiseEstimateNeon(NoiseSuppressionFixedC* inst, int offset) {
     60   const int16_t kExp2Const = 11819; // Q13
     61   int16_t* ptr_noiseEstLogQuantile = NULL;
     62   int16_t* ptr_noiseEstQuantile = NULL;
     63   int16x4_t kExp2Const16x4 = vdup_n_s16(kExp2Const);
     64   int32x4_t twentyOne32x4 = vdupq_n_s32(21);
     65   int32x4_t constA32x4 = vdupq_n_s32(0x1fffff);
     66   int32x4_t constB32x4 = vdupq_n_s32(0x200000);
     67 
     68   int16_t tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
     69                                         inst->magnLen);
     70 
     71   // Guarantee a Q-domain as high as possible and still fit in int16
     72   inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(kExp2Const,
     73                                                                  tmp16,
     74                                                                  21);
     75 
     76   int32x4_t qNoise32x4 = vdupq_n_s32(inst->qNoise);
     77 
     78   for (ptr_noiseEstLogQuantile = &inst->noiseEstLogQuantile[offset],
     79        ptr_noiseEstQuantile = &inst->noiseEstQuantile[0];
     80        ptr_noiseEstQuantile < &inst->noiseEstQuantile[inst->magnLen - 3];
     81        ptr_noiseEstQuantile += 4, ptr_noiseEstLogQuantile += 4) {
     82 
     83     // tmp32no2 = kExp2Const * inst->noiseEstLogQuantile[offset + i];
     84     int16x4_t v16x4 = vld1_s16(ptr_noiseEstLogQuantile);
     85     int32x4_t v32x4B = vmull_s16(v16x4, kExp2Const16x4);
     86 
     87     // tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
     88     int32x4_t v32x4A = vandq_s32(v32x4B, constA32x4);
     89     v32x4A = vorrq_s32(v32x4A, constB32x4);
     90 
     91     // tmp16 = (int16_t)(tmp32no2 >> 21);
     92     v32x4B = vshrq_n_s32(v32x4B, 21);
     93 
     94     // tmp16 -= 21;// shift 21 to get result in Q0
     95     v32x4B = vsubq_s32(v32x4B, twentyOne32x4);
     96 
     97     // tmp16 += (int16_t) inst->qNoise;
     98     // shift to get result in Q(qNoise)
     99     v32x4B = vaddq_s32(v32x4B, qNoise32x4);
    100 
    101     // if (tmp16 < 0) {
    102     //   tmp32no1 >>= -tmp16;
    103     // } else {
    104     //   tmp32no1 <<= tmp16;
    105     // }
    106     v32x4B = vshlq_s32(v32x4A, v32x4B);
    107 
    108     // tmp16 = WebRtcSpl_SatW32ToW16(tmp32no1);
    109     v16x4 = vqmovn_s32(v32x4B);
    110 
    111     //inst->noiseEstQuantile[i] = tmp16;
    112     vst1_s16(ptr_noiseEstQuantile, v16x4);
    113   }
    114 
    115   // Last iteration:
    116 
    117   // inst->quantile[i]=exp(inst->lquantile[offset+i]);
    118   // in Q21
    119   int32_t tmp32no2 = kExp2Const * *ptr_noiseEstLogQuantile;
    120   int32_t tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
    121 
    122   tmp16 = (int16_t)(tmp32no2 >> 21);
    123   tmp16 -= 21;// shift 21 to get result in Q0
    124   tmp16 += (int16_t) inst->qNoise; //shift to get result in Q(qNoise)
    125   if (tmp16 < 0) {
    126     tmp32no1 >>= -tmp16;
    127   } else {
    128     tmp32no1 <<= tmp16;
    129   }
    130   *ptr_noiseEstQuantile = WebRtcSpl_SatW32ToW16(tmp32no1);
    131 }
    132 
    133 // Noise Estimation
    134 void WebRtcNsx_NoiseEstimationNeon(NoiseSuppressionFixedC* inst,
    135                                    uint16_t* magn,
    136                                    uint32_t* noise,
    137                                    int16_t* q_noise) {
    138   int16_t lmagn[HALF_ANAL_BLOCKL], counter, countDiv;
    139   int16_t countProd, delta, zeros, frac;
    140   int16_t log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
    141   const int16_t log2_const = 22713;
    142   const int16_t width_factor = 21845;
    143 
    144   size_t i, s, offset;
    145 
    146   tabind = inst->stages - inst->normData;
    147   assert(tabind < 9);
    148   assert(tabind > -9);
    149   if (tabind < 0) {
    150     logval = -WebRtcNsx_kLogTable[-tabind];
    151   } else {
    152     logval = WebRtcNsx_kLogTable[tabind];
    153   }
    154 
    155   int16x8_t logval_16x8 = vdupq_n_s16(logval);
    156 
    157   // lmagn(i)=log(magn(i))=log(2)*log2(magn(i))
    158   // magn is in Q(-stages), and the real lmagn values are:
    159   // real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages)
    160   // lmagn in Q8
    161   for (i = 0; i < inst->magnLen; i++) {
    162     if (magn[i]) {
    163       zeros = WebRtcSpl_NormU32((uint32_t)magn[i]);
    164       frac = (int16_t)((((uint32_t)magn[i] << zeros)
    165                         & 0x7FFFFFFF) >> 23);
    166       assert(frac < 256);
    167       // log2(magn(i))
    168       log2 = (int16_t)(((31 - zeros) << 8)
    169                        + WebRtcNsx_kLogTableFrac[frac]);
    170       // log2(magn(i))*log(2)
    171       lmagn[i] = (int16_t)((log2 * log2_const) >> 15);
    172       // + log(2^stages)
    173       lmagn[i] += logval;
    174     } else {
    175       lmagn[i] = logval;
    176     }
    177   }
    178 
    179   int16x4_t Q3_16x4  = vdup_n_s16(3);
    180   int16x8_t WIDTHQ8_16x8 = vdupq_n_s16(WIDTH_Q8);
    181   int16x8_t WIDTHFACTOR_16x8 = vdupq_n_s16(width_factor);
    182 
    183   int16_t factor = FACTOR_Q7;
    184   if (inst->blockIndex < END_STARTUP_LONG)
    185     factor = FACTOR_Q7_STARTUP;
    186 
    187   // Loop over simultaneous estimates
    188   for (s = 0; s < SIMULT; s++) {
    189     offset = s * inst->magnLen;
    190 
    191     // Get counter values from state
    192     counter = inst->noiseEstCounter[s];
    193     assert(counter < 201);
    194     countDiv = WebRtcNsx_kCounterDiv[counter];
    195     countProd = (int16_t)(counter * countDiv);
    196 
    197     // quant_est(...)
    198     int16_t deltaBuff[8];
    199     int16x4_t tmp16x4_0;
    200     int16x4_t tmp16x4_1;
    201     int16x4_t countDiv_16x4 = vdup_n_s16(countDiv);
    202     int16x8_t countProd_16x8 = vdupq_n_s16(countProd);
    203     int16x8_t tmp16x8_0 = vdupq_n_s16(countDiv);
    204     int16x8_t prod16x8 = vqrdmulhq_s16(WIDTHFACTOR_16x8, tmp16x8_0);
    205     int16x8_t tmp16x8_1;
    206     int16x8_t tmp16x8_2;
    207     int16x8_t tmp16x8_3;
    208     uint16x8_t tmp16x8_4;
    209     int32x4_t tmp32x4;
    210 
    211     for (i = 0; i + 7 < inst->magnLen; i += 8) {
    212       // Compute delta.
    213       // Smaller step size during startup. This prevents from using
    214       // unrealistic values causing overflow.
    215       tmp16x8_0 = vdupq_n_s16(factor);
    216       vst1q_s16(deltaBuff, tmp16x8_0);
    217 
    218       int j;
    219       for (j = 0; j < 8; j++) {
    220         if (inst->noiseEstDensity[offset + i + j] > 512) {
    221           // Get values for deltaBuff by shifting intead of dividing.
    222           int factor = WebRtcSpl_NormW16(inst->noiseEstDensity[offset + i + j]);
    223           deltaBuff[j] = (int16_t)(FACTOR_Q16 >> (14 - factor));
    224         }
    225       }
    226 
    227       // Update log quantile estimate
    228 
    229       // tmp16 = (int16_t)((delta * countDiv) >> 14);
    230       tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[0]), countDiv_16x4);
    231       tmp16x4_1 = vshrn_n_s32(tmp32x4, 14);
    232       tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[4]), countDiv_16x4);
    233       tmp16x4_0 = vshrn_n_s32(tmp32x4, 14);
    234       tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // Keep for several lines.
    235 
    236       // prepare for the "if" branch
    237       // tmp16 += 2;
    238       // tmp16_1 = (Word16)(tmp16>>2);
    239       tmp16x8_1 = vrshrq_n_s16(tmp16x8_0, 2);
    240 
    241       // inst->noiseEstLogQuantile[offset+i] + tmp16_1;
    242       tmp16x8_2 = vld1q_s16(&inst->noiseEstLogQuantile[offset + i]); // Keep
    243       tmp16x8_1 = vaddq_s16(tmp16x8_2, tmp16x8_1); // Keep for several lines
    244 
    245       // Prepare for the "else" branch
    246       // tmp16 += 1;
    247       // tmp16_1 = (Word16)(tmp16>>1);
    248       tmp16x8_0 = vrshrq_n_s16(tmp16x8_0, 1);
    249 
    250       // tmp16_2 = (int16_t)((tmp16_1 * 3) >> 1);
    251       tmp32x4 = vmull_s16(vget_low_s16(tmp16x8_0), Q3_16x4);
    252       tmp16x4_1 = vshrn_n_s32(tmp32x4, 1);
    253 
    254       // tmp16_2 = (int16_t)((tmp16_1 * 3) >> 1);
    255       tmp32x4 = vmull_s16(vget_high_s16(tmp16x8_0), Q3_16x4);
    256       tmp16x4_0 = vshrn_n_s32(tmp32x4, 1);
    257 
    258       // inst->noiseEstLogQuantile[offset + i] - tmp16_2;
    259       tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // keep
    260       tmp16x8_0 = vsubq_s16(tmp16x8_2, tmp16x8_0);
    261 
    262       // logval is the smallest fixed point representation we can have. Values
    263       // below that will correspond to values in the interval [0, 1], which
    264       // can't possibly occur.
    265       tmp16x8_0 = vmaxq_s16(tmp16x8_0, logval_16x8);
    266 
    267       // Do the if-else branches:
    268       tmp16x8_3 = vld1q_s16(&lmagn[i]); // keep for several lines
    269       tmp16x8_4 = vcgtq_s16(tmp16x8_3, tmp16x8_2);
    270       tmp16x8_2 = vbslq_s16(tmp16x8_4, tmp16x8_1, tmp16x8_0);
    271       vst1q_s16(&inst->noiseEstLogQuantile[offset + i], tmp16x8_2);
    272 
    273       // Update density estimate
    274       // tmp16_1 + tmp16_2
    275       tmp16x8_1 = vld1q_s16(&inst->noiseEstDensity[offset + i]);
    276       tmp16x8_0 = vqrdmulhq_s16(tmp16x8_1, countProd_16x8);
    277       tmp16x8_0 = vaddq_s16(tmp16x8_0, prod16x8);
    278 
    279       // lmagn[i] - inst->noiseEstLogQuantile[offset + i]
    280       tmp16x8_3 = vsubq_s16(tmp16x8_3, tmp16x8_2);
    281       tmp16x8_3 = vabsq_s16(tmp16x8_3);
    282       tmp16x8_4 = vcgtq_s16(WIDTHQ8_16x8, tmp16x8_3);
    283       tmp16x8_1 = vbslq_s16(tmp16x8_4, tmp16x8_0, tmp16x8_1);
    284       vst1q_s16(&inst->noiseEstDensity[offset + i], tmp16x8_1);
    285     }  // End loop over magnitude spectrum
    286 
    287     // Last iteration over magnitude spectrum:
    288     // compute delta
    289     if (inst->noiseEstDensity[offset + i] > 512) {
    290       // Get values for deltaBuff by shifting intead of dividing.
    291       int factor = WebRtcSpl_NormW16(inst->noiseEstDensity[offset + i]);
    292       delta = (int16_t)(FACTOR_Q16 >> (14 - factor));
    293     } else {
    294       delta = FACTOR_Q7;
    295       if (inst->blockIndex < END_STARTUP_LONG) {
    296         // Smaller step size during startup. This prevents from using
    297         // unrealistic values causing overflow.
    298         delta = FACTOR_Q7_STARTUP;
    299       }
    300     }
    301     // update log quantile estimate
    302     tmp16 = (int16_t)((delta * countDiv) >> 14);
    303     if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) {
    304       // +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
    305       // CounterDiv=1/(inst->counter[s]+1) in Q15
    306       tmp16 += 2;
    307       inst->noiseEstLogQuantile[offset + i] += tmp16 / 4;
    308     } else {
    309       tmp16 += 1;
    310       // *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
    311       // TODO(bjornv): investigate why we need to truncate twice.
    312       tmp16no2 = (int16_t)((tmp16 / 2) * 3 / 2);
    313       inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
    314       if (inst->noiseEstLogQuantile[offset + i] < logval) {
    315         // logval is the smallest fixed point representation we can have.
    316         // Values below that will correspond to values in the interval
    317         // [0, 1], which can't possibly occur.
    318         inst->noiseEstLogQuantile[offset + i] = logval;
    319       }
    320     }
    321 
    322     // update density estimate
    323     if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
    324         < WIDTH_Q8) {
    325       tmp16no1 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
    326                    inst->noiseEstDensity[offset + i], countProd, 15);
    327       tmp16no2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
    328                    width_factor, countDiv, 15);
    329       inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
    330     }
    331 
    332 
    333     if (counter >= END_STARTUP_LONG) {
    334       inst->noiseEstCounter[s] = 0;
    335       if (inst->blockIndex >= END_STARTUP_LONG) {
    336         UpdateNoiseEstimateNeon(inst, offset);
    337       }
    338     }
    339     inst->noiseEstCounter[s]++;
    340 
    341   }  // end loop over simultaneous estimates
    342 
    343   // Sequentially update the noise during startup
    344   if (inst->blockIndex < END_STARTUP_LONG) {
    345     UpdateNoiseEstimateNeon(inst, offset);
    346   }
    347 
    348   for (i = 0; i < inst->magnLen; i++) {
    349     noise[i] = (uint32_t)(inst->noiseEstQuantile[i]); // Q(qNoise)
    350   }
    351   (*q_noise) = (int16_t)inst->qNoise;
    352 }
    353 
    354 // Filter the data in the frequency domain, and create spectrum.
    355 void WebRtcNsx_PrepareSpectrumNeon(NoiseSuppressionFixedC* inst,
    356                                    int16_t* freq_buf) {
    357   assert(inst->magnLen % 8 == 1);
    358   assert(inst->anaLen2 % 16 == 0);
    359 
    360   // (1) Filtering.
    361 
    362   // Fixed point C code for the next block is as follows:
    363   // for (i = 0; i < inst->magnLen; i++) {
    364   //   inst->real[i] = (int16_t)((inst->real[i] *
    365   //      (int16_t)(inst->noiseSupFilter[i])) >> 14);  // Q(normData-stages)
    366   //   inst->imag[i] = (int16_t)((inst->imag[i] *
    367   //      (int16_t)(inst->noiseSupFilter[i])) >> 14);  // Q(normData-stages)
    368   // }
    369 
    370   int16_t* preal = &inst->real[0];
    371   int16_t* pimag = &inst->imag[0];
    372   int16_t* pns_filter = (int16_t*)&inst->noiseSupFilter[0];
    373   int16_t* pimag_end = pimag + inst->magnLen - 4;
    374 
    375   while (pimag < pimag_end) {
    376     int16x8_t real = vld1q_s16(preal);
    377     int16x8_t imag = vld1q_s16(pimag);
    378     int16x8_t ns_filter = vld1q_s16(pns_filter);
    379 
    380     int32x4_t tmp_r_0 = vmull_s16(vget_low_s16(real), vget_low_s16(ns_filter));
    381     int32x4_t tmp_i_0 = vmull_s16(vget_low_s16(imag), vget_low_s16(ns_filter));
    382     int32x4_t tmp_r_1 = vmull_s16(vget_high_s16(real),
    383                                   vget_high_s16(ns_filter));
    384     int32x4_t tmp_i_1 = vmull_s16(vget_high_s16(imag),
    385                                   vget_high_s16(ns_filter));
    386 
    387     int16x4_t result_r_0 = vshrn_n_s32(tmp_r_0, 14);
    388     int16x4_t result_i_0 = vshrn_n_s32(tmp_i_0, 14);
    389     int16x4_t result_r_1 = vshrn_n_s32(tmp_r_1, 14);
    390     int16x4_t result_i_1 = vshrn_n_s32(tmp_i_1, 14);
    391 
    392     vst1q_s16(preal, vcombine_s16(result_r_0, result_r_1));
    393     vst1q_s16(pimag, vcombine_s16(result_i_0, result_i_1));
    394     preal += 8;
    395     pimag += 8;
    396     pns_filter += 8;
    397   }
    398 
    399   // Filter the last element
    400   *preal = (int16_t)((*preal * *pns_filter) >> 14);
    401   *pimag = (int16_t)((*pimag * *pns_filter) >> 14);
    402 
    403   // (2) Create spectrum.
    404 
    405   // Fixed point C code for the rest of the function is as follows:
    406   // freq_buf[0] = inst->real[0];
    407   // freq_buf[1] = -inst->imag[0];
    408   // for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
    409   //   freq_buf[j] = inst->real[i];
    410   //   freq_buf[j + 1] = -inst->imag[i];
    411   // }
    412   // freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
    413   // freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
    414 
    415   preal = &inst->real[0];
    416   pimag = &inst->imag[0];
    417   pimag_end = pimag + inst->anaLen2;
    418   int16_t * freq_buf_start = freq_buf;
    419   while (pimag < pimag_end) {
    420     // loop unroll
    421     int16x8x2_t real_imag_0;
    422     int16x8x2_t real_imag_1;
    423     real_imag_0.val[1] = vld1q_s16(pimag);
    424     real_imag_0.val[0] = vld1q_s16(preal);
    425     preal += 8;
    426     pimag += 8;
    427     real_imag_1.val[1] = vld1q_s16(pimag);
    428     real_imag_1.val[0] = vld1q_s16(preal);
    429     preal += 8;
    430     pimag += 8;
    431 
    432     real_imag_0.val[1] = vnegq_s16(real_imag_0.val[1]);
    433     real_imag_1.val[1] = vnegq_s16(real_imag_1.val[1]);
    434     vst2q_s16(freq_buf_start, real_imag_0);
    435     freq_buf_start += 16;
    436     vst2q_s16(freq_buf_start, real_imag_1);
    437     freq_buf_start += 16;
    438   }
    439   freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
    440   freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
    441 }
    442 
    443 // For the noise supress process, synthesis, read out fully processed segment,
    444 // and update synthesis buffer.
    445 void WebRtcNsx_SynthesisUpdateNeon(NoiseSuppressionFixedC* inst,
    446                                    int16_t* out_frame,
    447                                    int16_t gain_factor) {
    448   assert(inst->anaLen % 16 == 0);
    449   assert(inst->blockLen10ms % 16 == 0);
    450 
    451   int16_t* preal_start = inst->real;
    452   const int16_t* pwindow = inst->window;
    453   int16_t* preal_end = preal_start + inst->anaLen;
    454   int16_t* psynthesis_buffer = inst->synthesisBuffer;
    455 
    456   while (preal_start < preal_end) {
    457     // Loop unroll.
    458     int16x8_t window_0 = vld1q_s16(pwindow);
    459     int16x8_t real_0 = vld1q_s16(preal_start);
    460     int16x8_t synthesis_buffer_0 = vld1q_s16(psynthesis_buffer);
    461 
    462     int16x8_t window_1 = vld1q_s16(pwindow + 8);
    463     int16x8_t real_1 = vld1q_s16(preal_start + 8);
    464     int16x8_t synthesis_buffer_1 = vld1q_s16(psynthesis_buffer + 8);
    465 
    466     int32x4_t tmp32a_0_low = vmull_s16(vget_low_s16(real_0),
    467                                        vget_low_s16(window_0));
    468     int32x4_t tmp32a_0_high = vmull_s16(vget_high_s16(real_0),
    469                                         vget_high_s16(window_0));
    470 
    471     int32x4_t tmp32a_1_low = vmull_s16(vget_low_s16(real_1),
    472                                        vget_low_s16(window_1));
    473     int32x4_t tmp32a_1_high = vmull_s16(vget_high_s16(real_1),
    474                                         vget_high_s16(window_1));
    475 
    476     int16x4_t tmp16a_0_low = vqrshrn_n_s32(tmp32a_0_low, 14);
    477     int16x4_t tmp16a_0_high = vqrshrn_n_s32(tmp32a_0_high, 14);
    478 
    479     int16x4_t tmp16a_1_low = vqrshrn_n_s32(tmp32a_1_low, 14);
    480     int16x4_t tmp16a_1_high = vqrshrn_n_s32(tmp32a_1_high, 14);
    481 
    482     int32x4_t tmp32b_0_low = vmull_n_s16(tmp16a_0_low, gain_factor);
    483     int32x4_t tmp32b_0_high = vmull_n_s16(tmp16a_0_high, gain_factor);
    484 
    485     int32x4_t tmp32b_1_low = vmull_n_s16(tmp16a_1_low, gain_factor);
    486     int32x4_t tmp32b_1_high = vmull_n_s16(tmp16a_1_high, gain_factor);
    487 
    488     int16x4_t tmp16b_0_low = vqrshrn_n_s32(tmp32b_0_low, 13);
    489     int16x4_t tmp16b_0_high = vqrshrn_n_s32(tmp32b_0_high, 13);
    490 
    491     int16x4_t tmp16b_1_low = vqrshrn_n_s32(tmp32b_1_low, 13);
    492     int16x4_t tmp16b_1_high = vqrshrn_n_s32(tmp32b_1_high, 13);
    493 
    494     synthesis_buffer_0 = vqaddq_s16(vcombine_s16(tmp16b_0_low, tmp16b_0_high),
    495                                     synthesis_buffer_0);
    496     synthesis_buffer_1 = vqaddq_s16(vcombine_s16(tmp16b_1_low, tmp16b_1_high),
    497                                     synthesis_buffer_1);
    498     vst1q_s16(psynthesis_buffer, synthesis_buffer_0);
    499     vst1q_s16(psynthesis_buffer + 8, synthesis_buffer_1);
    500 
    501     pwindow += 16;
    502     preal_start += 16;
    503     psynthesis_buffer += 16;
    504   }
    505 
    506   // Read out fully processed segment.
    507   int16_t * p_start = inst->synthesisBuffer;
    508   int16_t * p_end = inst->synthesisBuffer + inst->blockLen10ms;
    509   int16_t * p_frame = out_frame;
    510   while (p_start < p_end) {
    511     int16x8_t frame_0 = vld1q_s16(p_start);
    512     vst1q_s16(p_frame, frame_0);
    513     p_start += 8;
    514     p_frame += 8;
    515   }
    516 
    517   // Update synthesis buffer.
    518   int16_t* p_start_src = inst->synthesisBuffer + inst->blockLen10ms;
    519   int16_t* p_end_src = inst->synthesisBuffer + inst->anaLen;
    520   int16_t* p_start_dst = inst->synthesisBuffer;
    521   while (p_start_src < p_end_src) {
    522     int16x8_t frame = vld1q_s16(p_start_src);
    523     vst1q_s16(p_start_dst, frame);
    524     p_start_src += 8;
    525     p_start_dst += 8;
    526   }
    527 
    528   p_start = inst->synthesisBuffer + inst->anaLen - inst->blockLen10ms;
    529   p_end = p_start + inst->blockLen10ms;
    530   int16x8_t zero = vdupq_n_s16(0);
    531   for (;p_start < p_end; p_start += 8) {
    532     vst1q_s16(p_start, zero);
    533   }
    534 }
    535 
    536 // Update analysis buffer for lower band, and window data before FFT.
    537 void WebRtcNsx_AnalysisUpdateNeon(NoiseSuppressionFixedC* inst,
    538                                   int16_t* out,
    539                                   int16_t* new_speech) {
    540   assert(inst->blockLen10ms % 16 == 0);
    541   assert(inst->anaLen % 16 == 0);
    542 
    543   // For lower band update analysis buffer.
    544   // memcpy(inst->analysisBuffer, inst->analysisBuffer + inst->blockLen10ms,
    545   //     (inst->anaLen - inst->blockLen10ms) * sizeof(*inst->analysisBuffer));
    546   int16_t* p_start_src = inst->analysisBuffer + inst->blockLen10ms;
    547   int16_t* p_end_src = inst->analysisBuffer + inst->anaLen;
    548   int16_t* p_start_dst = inst->analysisBuffer;
    549   while (p_start_src < p_end_src) {
    550     int16x8_t frame = vld1q_s16(p_start_src);
    551     vst1q_s16(p_start_dst, frame);
    552 
    553     p_start_src += 8;
    554     p_start_dst += 8;
    555   }
    556 
    557   // memcpy(inst->analysisBuffer + inst->anaLen - inst->blockLen10ms,
    558   //     new_speech, inst->blockLen10ms * sizeof(*inst->analysisBuffer));
    559   p_start_src = new_speech;
    560   p_end_src = new_speech + inst->blockLen10ms;
    561   p_start_dst = inst->analysisBuffer + inst->anaLen - inst->blockLen10ms;
    562   while (p_start_src < p_end_src) {
    563     int16x8_t frame = vld1q_s16(p_start_src);
    564     vst1q_s16(p_start_dst, frame);
    565 
    566     p_start_src += 8;
    567     p_start_dst += 8;
    568   }
    569 
    570   // Window data before FFT.
    571   int16_t* p_start_window = (int16_t*) inst->window;
    572   int16_t* p_start_buffer = inst->analysisBuffer;
    573   int16_t* p_start_out = out;
    574   const int16_t* p_end_out = out + inst->anaLen;
    575 
    576   // Load the first element to reduce pipeline bubble.
    577   int16x8_t window = vld1q_s16(p_start_window);
    578   int16x8_t buffer = vld1q_s16(p_start_buffer);
    579   p_start_window += 8;
    580   p_start_buffer += 8;
    581 
    582   while (p_start_out < p_end_out) {
    583     // Unroll loop.
    584     int32x4_t tmp32_low = vmull_s16(vget_low_s16(window), vget_low_s16(buffer));
    585     int32x4_t tmp32_high = vmull_s16(vget_high_s16(window),
    586                                      vget_high_s16(buffer));
    587     window = vld1q_s16(p_start_window);
    588     buffer = vld1q_s16(p_start_buffer);
    589 
    590     int16x4_t result_low = vrshrn_n_s32(tmp32_low, 14);
    591     int16x4_t result_high = vrshrn_n_s32(tmp32_high, 14);
    592     vst1q_s16(p_start_out, vcombine_s16(result_low, result_high));
    593 
    594     p_start_buffer += 8;
    595     p_start_window += 8;
    596     p_start_out += 8;
    597   }
    598 }
    599