Home | History | Annotate | Download | only in ns
      1 /*
      2  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "webrtc/modules/audio_processing/ns/nsx_core.h"
     12 
     13 #include <arm_neon.h>
     14 #include <assert.h>
     15 
     16 // Constants to compensate for shifting signal log(2^shifts).
     17 const int16_t WebRtcNsx_kLogTable[9] = {
     18   0, 177, 355, 532, 710, 887, 1065, 1242, 1420
     19 };
     20 
     21 const int16_t WebRtcNsx_kCounterDiv[201] = {
     22   32767, 16384, 10923, 8192, 6554, 5461, 4681, 4096, 3641, 3277, 2979, 2731,
     23   2521, 2341, 2185, 2048, 1928, 1820, 1725, 1638, 1560, 1489, 1425, 1365, 1311,
     24   1260, 1214, 1170, 1130, 1092, 1057, 1024, 993, 964, 936, 910, 886, 862, 840,
     25   819, 799, 780, 762, 745, 728, 712, 697, 683, 669, 655, 643, 630, 618, 607,
     26   596, 585, 575, 565, 555, 546, 537, 529, 520, 512, 504, 496, 489, 482, 475,
     27   468, 462, 455, 449, 443, 437, 431, 426, 420, 415, 410, 405, 400, 395, 390,
     28   386, 381, 377, 372, 368, 364, 360, 356, 352, 349, 345, 341, 338, 334, 331,
     29   328, 324, 321, 318, 315, 312, 309, 306, 303, 301, 298, 295, 293, 290, 287,
     30   285, 282, 280, 278, 275, 273, 271, 269, 266, 264, 262, 260, 258, 256, 254,
     31   252, 250, 248, 246, 245, 243, 241, 239, 237, 236, 234, 232, 231, 229, 228,
     32   226, 224, 223, 221, 220, 218, 217, 216, 214, 213, 211, 210, 209, 207, 206,
     33   205, 204, 202, 201, 200, 199, 197, 196, 195, 194, 193, 192, 191, 189, 188,
     34   187, 186, 185, 184, 183, 182, 181, 180, 179, 178, 177, 176, 175, 174, 173,
     35   172, 172, 171, 170, 169, 168, 167, 166, 165, 165, 164, 163
     36 };
     37 
     38 const int16_t WebRtcNsx_kLogTableFrac[256] = {
     39   0, 1, 3, 4, 6, 7, 9, 10, 11, 13, 14, 16, 17, 18, 20, 21,
     40   22, 24, 25, 26, 28, 29, 30, 32, 33, 34, 36, 37, 38, 40, 41, 42,
     41   44, 45, 46, 47, 49, 50, 51, 52, 54, 55, 56, 57, 59, 60, 61, 62,
     42   63, 65, 66, 67, 68, 69, 71, 72, 73, 74, 75, 77, 78, 79, 80, 81,
     43   82, 84, 85, 86, 87, 88, 89, 90, 92, 93, 94, 95, 96, 97, 98, 99,
     44   100, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 116,
     45   117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
     46   132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
     47   147, 148, 149, 150, 151, 152, 153, 154, 155, 155, 156, 157, 158, 159, 160,
     48   161, 162, 163, 164, 165, 166, 167, 168, 169, 169, 170, 171, 172, 173, 174,
     49   175, 176, 177, 178, 178, 179, 180, 181, 182, 183, 184, 185, 185, 186, 187,
     50   188, 189, 190, 191, 192, 192, 193, 194, 195, 196, 197, 198, 198, 199, 200,
     51   201, 202, 203, 203, 204, 205, 206, 207, 208, 208, 209, 210, 211, 212, 212,
     52   213, 214, 215, 216, 216, 217, 218, 219, 220, 220, 221, 222, 223, 224, 224,
     53   225, 226, 227, 228, 228, 229, 230, 231, 231, 232, 233, 234, 234, 235, 236,
     54   237, 238, 238, 239, 240, 241, 241, 242, 243, 244, 244, 245, 246, 247, 247,
     55   248, 249, 249, 250, 251, 252, 252, 253, 254, 255, 255
     56 };
     57 
     58 // Update the noise estimation information.
     59 static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset) {
     60   const int16_t kExp2Const = 11819; // Q13
     61   int16_t* ptr_noiseEstLogQuantile = NULL;
     62   int16_t* ptr_noiseEstQuantile = NULL;
     63   int16x4_t kExp2Const16x4 = vdup_n_s16(kExp2Const);
     64   int32x4_t twentyOne32x4 = vdupq_n_s32(21);
     65   int32x4_t constA32x4 = vdupq_n_s32(0x1fffff);
     66   int32x4_t constB32x4 = vdupq_n_s32(0x200000);
     67 
     68   int16_t tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
     69                                         inst->magnLen);
     70 
     71   // Guarantee a Q-domain as high as possible and still fit in int16
     72   inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(kExp2Const,
     73                                                                  tmp16,
     74                                                                  21);
     75 
     76   int32x4_t qNoise32x4 = vdupq_n_s32(inst->qNoise);
     77 
     78   for (ptr_noiseEstLogQuantile = &inst->noiseEstLogQuantile[offset],
     79        ptr_noiseEstQuantile = &inst->noiseEstQuantile[0];
     80        ptr_noiseEstQuantile < &inst->noiseEstQuantile[inst->magnLen - 3];
     81        ptr_noiseEstQuantile += 4, ptr_noiseEstLogQuantile += 4) {
     82 
     83     // tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
     84     //                                inst->noiseEstLogQuantile[offset + i]);
     85     int16x4_t v16x4 = vld1_s16(ptr_noiseEstLogQuantile);
     86     int32x4_t v32x4B = vmull_s16(v16x4, kExp2Const16x4);
     87 
     88     // tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
     89     int32x4_t v32x4A = vandq_s32(v32x4B, constA32x4);
     90     v32x4A = vorrq_s32(v32x4A, constB32x4);
     91 
     92     // tmp16 = (int16_t) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21);
     93     v32x4B = vshrq_n_s32(v32x4B, 21);
     94 
     95     // tmp16 -= 21;// shift 21 to get result in Q0
     96     v32x4B = vsubq_s32(v32x4B, twentyOne32x4);
     97 
     98     // tmp16 += (int16_t) inst->qNoise;
     99     // shift to get result in Q(qNoise)
    100     v32x4B = vaddq_s32(v32x4B, qNoise32x4);
    101 
    102     // if (tmp16 < 0) {
    103     //   tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16);
    104     // } else {
    105     //   tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16);
    106     // }
    107     v32x4B = vshlq_s32(v32x4A, v32x4B);
    108 
    109     // tmp16 = WebRtcSpl_SatW32ToW16(tmp32no1);
    110     v16x4 = vqmovn_s32(v32x4B);
    111 
    112     //inst->noiseEstQuantile[i] = tmp16;
    113     vst1_s16(ptr_noiseEstQuantile, v16x4);
    114   }
    115 
    116   // Last iteration:
    117 
    118   // inst->quantile[i]=exp(inst->lquantile[offset+i]);
    119   // in Q21
    120   int32_t tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
    121                                           *ptr_noiseEstLogQuantile);
    122   int32_t tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
    123 
    124   tmp16 = (int16_t) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21);
    125   tmp16 -= 21;// shift 21 to get result in Q0
    126   tmp16 += (int16_t) inst->qNoise; //shift to get result in Q(qNoise)
    127   if (tmp16 < 0) {
    128     tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16);
    129   } else {
    130     tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16);
    131   }
    132   *ptr_noiseEstQuantile = WebRtcSpl_SatW32ToW16(tmp32no1);
    133 }
    134 
    135 // Noise Estimation
    136 void WebRtcNsx_NoiseEstimationNeon(NsxInst_t* inst,
    137                                    uint16_t* magn,
    138                                    uint32_t* noise,
    139                                    int16_t* q_noise) {
    140   int16_t lmagn[HALF_ANAL_BLOCKL], counter, countDiv;
    141   int16_t countProd, delta, zeros, frac;
    142   int16_t log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
    143   const int16_t log2_const = 22713;
    144   const int16_t width_factor = 21845;
    145 
    146   int i, s, offset;
    147 
    148   tabind = inst->stages - inst->normData;
    149   assert(tabind < 9);
    150   assert(tabind > -9);
    151   if (tabind < 0) {
    152     logval = -WebRtcNsx_kLogTable[-tabind];
    153   } else {
    154     logval = WebRtcNsx_kLogTable[tabind];
    155   }
    156 
    157   int16x8_t logval_16x8 = vdupq_n_s16(logval);
    158 
    159   // lmagn(i)=log(magn(i))=log(2)*log2(magn(i))
    160   // magn is in Q(-stages), and the real lmagn values are:
    161   // real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages)
    162   // lmagn in Q8
    163   for (i = 0; i < inst->magnLen; i++) {
    164     if (magn[i]) {
    165       zeros = WebRtcSpl_NormU32((uint32_t)magn[i]);
    166       frac = (int16_t)((((uint32_t)magn[i] << zeros)
    167                         & 0x7FFFFFFF) >> 23);
    168       assert(frac < 256);
    169       // log2(magn(i))
    170       log2 = (int16_t)(((31 - zeros) << 8)
    171                        + WebRtcNsx_kLogTableFrac[frac]);
    172       // log2(magn(i))*log(2)
    173       lmagn[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15);
    174       // + log(2^stages)
    175       lmagn[i] += logval;
    176     } else {
    177       lmagn[i] = logval;
    178     }
    179   }
    180 
    181   int16x4_t Q3_16x4  = vdup_n_s16(3);
    182   int16x8_t WIDTHQ8_16x8 = vdupq_n_s16(WIDTH_Q8);
    183   int16x8_t WIDTHFACTOR_16x8 = vdupq_n_s16(width_factor);
    184 
    185   int16_t factor = FACTOR_Q7;
    186   if (inst->blockIndex < END_STARTUP_LONG)
    187     factor = FACTOR_Q7_STARTUP;
    188 
    189   // Loop over simultaneous estimates
    190   for (s = 0; s < SIMULT; s++) {
    191     offset = s * inst->magnLen;
    192 
    193     // Get counter values from state
    194     counter = inst->noiseEstCounter[s];
    195     assert(counter < 201);
    196     countDiv = WebRtcNsx_kCounterDiv[counter];
    197     countProd = (int16_t)WEBRTC_SPL_MUL_16_16(counter, countDiv);
    198 
    199     // quant_est(...)
    200     int16_t deltaBuff[8];
    201     int16x4_t tmp16x4_0;
    202     int16x4_t tmp16x4_1;
    203     int16x4_t countDiv_16x4 = vdup_n_s16(countDiv);
    204     int16x8_t countProd_16x8 = vdupq_n_s16(countProd);
    205     int16x8_t tmp16x8_0 = vdupq_n_s16(countDiv);
    206     int16x8_t prod16x8 = vqrdmulhq_s16(WIDTHFACTOR_16x8, tmp16x8_0);
    207     int16x8_t tmp16x8_1;
    208     int16x8_t tmp16x8_2;
    209     int16x8_t tmp16x8_3;
    210     // Initialize tmp16x8_4 to zero to avoid compilaton error.
    211     int16x8_t tmp16x8_4 = vdupq_n_s16(0);
    212     int16x8_t tmp16x8_5;
    213     int32x4_t tmp32x4;
    214 
    215     for (i = 0; i < inst->magnLen - 7; i += 8) {
    216       // Compute delta.
    217       // Smaller step size during startup. This prevents from using
    218       // unrealistic values causing overflow.
    219       tmp16x8_0 = vdupq_n_s16(factor);
    220       vst1q_s16(deltaBuff, tmp16x8_0);
    221 
    222       int j;
    223       for (j = 0; j < 8; j++) {
    224         if (inst->noiseEstDensity[offset + i + j] > 512) {
    225           // Get values for deltaBuff by shifting intead of dividing.
    226           int factor = WebRtcSpl_NormW16(inst->noiseEstDensity[offset + i + j]);
    227           deltaBuff[j] = (int16_t)(FACTOR_Q16 >> (14 - factor));
    228         }
    229       }
    230 
    231       // Update log quantile estimate
    232 
    233       // tmp16 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
    234       tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[0]), countDiv_16x4);
    235       tmp16x4_1 = vshrn_n_s32(tmp32x4, 14);
    236       tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[4]), countDiv_16x4);
    237       tmp16x4_0 = vshrn_n_s32(tmp32x4, 14);
    238       tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // Keep for several lines.
    239 
    240       // prepare for the "if" branch
    241       // tmp16 += 2;
    242       // tmp16_1 = (Word16)(tmp16>>2);
    243       tmp16x8_1 = vrshrq_n_s16(tmp16x8_0, 2);
    244 
    245       // inst->noiseEstLogQuantile[offset+i] + tmp16_1;
    246       tmp16x8_2 = vld1q_s16(&inst->noiseEstLogQuantile[offset + i]); // Keep
    247       tmp16x8_1 = vaddq_s16(tmp16x8_2, tmp16x8_1); // Keep for several lines
    248 
    249       // Prepare for the "else" branch
    250       // tmp16 += 1;
    251       // tmp16_1 = (Word16)(tmp16>>1);
    252       tmp16x8_0 = vrshrq_n_s16(tmp16x8_0, 1);
    253 
    254       // tmp16_2 = (Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16_1,3,1);
    255       tmp32x4 = vmull_s16(vget_low_s16(tmp16x8_0), Q3_16x4);
    256       tmp16x4_1 = vshrn_n_s32(tmp32x4, 1);
    257 
    258       // tmp16_2 = (Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16_1,3,1);
    259       tmp32x4 = vmull_s16(vget_high_s16(tmp16x8_0), Q3_16x4);
    260       tmp16x4_0 = vshrn_n_s32(tmp32x4, 1);
    261 
    262       // inst->noiseEstLogQuantile[offset + i] - tmp16_2;
    263       tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // keep
    264       tmp16x8_0 = vsubq_s16(tmp16x8_2, tmp16x8_0);
    265 
    266       // logval is the smallest fixed point representation we can have. Values
    267       // below that will correspond to values in the interval [0, 1], which
    268       // can't possibly occur.
    269       tmp16x8_0 = vmaxq_s16(tmp16x8_0, logval_16x8);
    270 
    271       // Do the if-else branches:
    272       tmp16x8_3 = vld1q_s16(&lmagn[i]); // keep for several lines
    273       tmp16x8_5 = vsubq_s16(tmp16x8_3, tmp16x8_2);
    274       __asm__("vcgt.s16 %q0, %q1, #0"::"w"(tmp16x8_4), "w"(tmp16x8_5));
    275       __asm__("vbit %q0, %q1, %q2"::
    276               "w"(tmp16x8_2), "w"(tmp16x8_1), "w"(tmp16x8_4));
    277       __asm__("vbif %q0, %q1, %q2"::
    278               "w"(tmp16x8_2), "w"(tmp16x8_0), "w"(tmp16x8_4));
    279       vst1q_s16(&inst->noiseEstLogQuantile[offset + i], tmp16x8_2);
    280 
    281       // Update density estimate
    282       // tmp16_1 + tmp16_2
    283       tmp16x8_1 = vld1q_s16(&inst->noiseEstDensity[offset + i]);
    284       tmp16x8_0 = vqrdmulhq_s16(tmp16x8_1, countProd_16x8);
    285       tmp16x8_0 = vaddq_s16(tmp16x8_0, prod16x8);
    286 
    287       // lmagn[i] - inst->noiseEstLogQuantile[offset + i]
    288       tmp16x8_3 = vsubq_s16(tmp16x8_3, tmp16x8_2);
    289       tmp16x8_3 = vabsq_s16(tmp16x8_3);
    290       tmp16x8_4 = vcgtq_s16(WIDTHQ8_16x8, tmp16x8_3);
    291       __asm__("vbit %q0, %q1, %q2"::
    292               "w"(tmp16x8_1), "w"(tmp16x8_0), "w"(tmp16x8_4));
    293       vst1q_s16(&inst->noiseEstDensity[offset + i], tmp16x8_1);
    294     }  // End loop over magnitude spectrum
    295 
    296     // Last iteration over magnitude spectrum:
    297     // compute delta
    298     if (inst->noiseEstDensity[offset + i] > 512) {
    299       // Get values for deltaBuff by shifting intead of dividing.
    300       int factor = WebRtcSpl_NormW16(inst->noiseEstDensity[offset + i]);
    301       delta = (int16_t)(FACTOR_Q16 >> (14 - factor));
    302     } else {
    303       delta = FACTOR_Q7;
    304       if (inst->blockIndex < END_STARTUP_LONG) {
    305         // Smaller step size during startup. This prevents from using
    306         // unrealistic values causing overflow.
    307         delta = FACTOR_Q7_STARTUP;
    308       }
    309     }
    310     // update log quantile estimate
    311     tmp16 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
    312     if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) {
    313       // +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
    314       // CounterDiv=1/(inst->counter[s]+1) in Q15
    315       tmp16 += 2;
    316       tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2);
    317       inst->noiseEstLogQuantile[offset + i] += tmp16no1;
    318     } else {
    319       tmp16 += 1;
    320       tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1);
    321       // *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
    322       tmp16no2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1);
    323       inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
    324       if (inst->noiseEstLogQuantile[offset + i] < logval) {
    325         // logval is the smallest fixed point representation we can have.
    326         // Values below that will correspond to values in the interval
    327         // [0, 1], which can't possibly occur.
    328         inst->noiseEstLogQuantile[offset + i] = logval;
    329       }
    330     }
    331 
    332     // update density estimate
    333     if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
    334         < WIDTH_Q8) {
    335       tmp16no1 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
    336                    inst->noiseEstDensity[offset + i], countProd, 15);
    337       tmp16no2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
    338                    width_factor, countDiv, 15);
    339       inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
    340     }
    341 
    342 
    343     if (counter >= END_STARTUP_LONG) {
    344       inst->noiseEstCounter[s] = 0;
    345       if (inst->blockIndex >= END_STARTUP_LONG) {
    346         UpdateNoiseEstimateNeon(inst, offset);
    347       }
    348     }
    349     inst->noiseEstCounter[s]++;
    350 
    351   }  // end loop over simultaneous estimates
    352 
    353   // Sequentially update the noise during startup
    354   if (inst->blockIndex < END_STARTUP_LONG) {
    355     UpdateNoiseEstimateNeon(inst, offset);
    356   }
    357 
    358   for (i = 0; i < inst->magnLen; i++) {
    359     noise[i] = (uint32_t)(inst->noiseEstQuantile[i]); // Q(qNoise)
    360   }
    361   (*q_noise) = (int16_t)inst->qNoise;
    362 }
    363 
    364 // Filter the data in the frequency domain, and create spectrum.
    365 void WebRtcNsx_PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) {
    366 
    367   // (1) Filtering.
    368 
    369   // Fixed point C code for the next block is as follows:
    370   // for (i = 0; i < inst->magnLen; i++) {
    371   //   inst->real[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i],
    372   //      (int16_t)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
    373   //   inst->imag[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i],
    374   //      (int16_t)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
    375   // }
    376 
    377   int16_t* ptr_real = &inst->real[0];
    378   int16_t* ptr_imag = &inst->imag[0];
    379   uint16_t* ptr_noiseSupFilter = &inst->noiseSupFilter[0];
    380 
    381   // Filter the rest in the frequency domain.
    382   for (; ptr_real < &inst->real[inst->magnLen - 1];) {
    383     // Loop unrolled once. Both pointers are incremented by 4 twice.
    384     __asm__ __volatile__(
    385       "vld1.16 d20, [%[ptr_real]]\n\t"
    386       "vld1.16 d22, [%[ptr_imag]]\n\t"
    387       "vld1.16 d23, [%[ptr_noiseSupFilter]]!\n\t"
    388       "vmull.s16 q10, d20, d23\n\t"
    389       "vmull.s16 q11, d22, d23\n\t"
    390       "vshrn.s32 d20, q10, #14\n\t"
    391       "vshrn.s32 d22, q11, #14\n\t"
    392       "vst1.16 d20, [%[ptr_real]]!\n\t"
    393       "vst1.16 d22, [%[ptr_imag]]!\n\t"
    394 
    395       "vld1.16 d18, [%[ptr_real]]\n\t"
    396       "vld1.16 d24, [%[ptr_imag]]\n\t"
    397       "vld1.16 d25, [%[ptr_noiseSupFilter]]!\n\t"
    398       "vmull.s16 q9, d18, d25\n\t"
    399       "vmull.s16 q12, d24, d25\n\t"
    400       "vshrn.s32 d18, q9, #14\n\t"
    401       "vshrn.s32 d24, q12, #14\n\t"
    402       "vst1.16 d18, [%[ptr_real]]!\n\t"
    403       "vst1.16 d24, [%[ptr_imag]]!\n\t"
    404 
    405       // Specify constraints.
    406       :[ptr_imag]"+r"(ptr_imag),
    407        [ptr_real]"+r"(ptr_real),
    408        [ptr_noiseSupFilter]"+r"(ptr_noiseSupFilter)
    409       :
    410       :"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25",
    411        "q9", "q10", "q11", "q12"
    412     );
    413   }
    414 
    415   // Filter the last pair of elements in the frequency domain.
    416   *ptr_real = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*ptr_real,
    417       (int16_t)(*ptr_noiseSupFilter), 14); // Q(normData-stages)
    418   *ptr_imag = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*ptr_imag,
    419       (int16_t)(*ptr_noiseSupFilter), 14); // Q(normData-stages)
    420 
    421   // (2) Create spectrum.
    422 
    423   // Fixed point C code for the rest of the function is as follows:
    424   // freq_buf[0] = inst->real[0];
    425   // freq_buf[1] = -inst->imag[0];
    426   // for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
    427   //   tmp16 = (inst->anaLen << 1) - j;
    428   //   freq_buf[j] = inst->real[i];
    429   //   freq_buf[j + 1] = -inst->imag[i];
    430   //   freq_buf[tmp16] = inst->real[i];
    431   //   freq_buf[tmp16 + 1] = inst->imag[i];
    432   // }
    433   // freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
    434   // freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
    435 
    436   freq_buf[0] = inst->real[0];
    437   freq_buf[1] = -inst->imag[0];
    438 
    439   int offset = -16;
    440   int16_t* ptr_realImag1 = &freq_buf[2];
    441   int16_t* ptr_realImag2 = ptr_realImag2 = &freq_buf[(inst->anaLen << 1) - 8];
    442   ptr_real = &inst->real[1];
    443   ptr_imag = &inst->imag[1];
    444   for (; ptr_real < &inst->real[inst->anaLen2 - 11];) {
    445     // Loop unrolled once. All pointers are incremented twice.
    446     __asm__ __volatile__(
    447       "vld1.16 d22, [%[ptr_real]]!\n\t"
    448       "vld1.16 d23, [%[ptr_imag]]!\n\t"
    449       // Negate and interleave:
    450       "vmov.s16 d20, d22\n\t"
    451       "vneg.s16 d21, d23\n\t"
    452       "vzip.16 d20, d21\n\t"
    453       // Write 8 elements to &freq_buf[j]
    454       "vst1.16 {d20, d21}, [%[ptr_realImag1]]!\n\t"
    455       // Interleave and reverse elements:
    456       "vzip.16 d22, d23\n\t"
    457       "vrev64.32 d18, d23\n\t"
    458       "vrev64.32 d19, d22\n\t"
    459       // Write 8 elements to &freq_buf[tmp16]
    460       "vst1.16 {d18, d19}, [%[ptr_realImag2]], %[offset]\n\t"
    461 
    462       "vld1.16 d22, [%[ptr_real]]!\n\t"
    463       "vld1.16 d23, [%[ptr_imag]]!\n\t"
    464       // Negate and interleave:
    465       "vmov.s16 d20, d22\n\t"
    466       "vneg.s16 d21, d23\n\t"
    467       "vzip.16 d20, d21\n\t"
    468       // Write 8 elements to &freq_buf[j]
    469       "vst1.16 {d20, d21}, [%[ptr_realImag1]]!\n\t"
    470       // Interleave and reverse elements:
    471       "vzip.16 d22, d23\n\t"
    472       "vrev64.32 d18, d23\n\t"
    473       "vrev64.32 d19, d22\n\t"
    474       // Write 8 elements to &freq_buf[tmp16]
    475       "vst1.16 {d18, d19}, [%[ptr_realImag2]], %[offset]\n\t"
    476 
    477       // Specify constraints.
    478       :[ptr_imag]"+r"(ptr_imag),
    479        [ptr_real]"+r"(ptr_real),
    480        [ptr_realImag1]"+r"(ptr_realImag1),
    481        [ptr_realImag2]"+r"(ptr_realImag2)
    482       :[offset]"r"(offset)
    483       :"d18", "d19", "d20", "d21", "d22", "d23"
    484     );
    485   }
    486   for (ptr_realImag2 += 6;
    487        ptr_real <= &inst->real[inst->anaLen2];
    488        ptr_real += 1, ptr_imag += 1, ptr_realImag1 += 2, ptr_realImag2 -= 2) {
    489     *ptr_realImag1 = *ptr_real;
    490     *(ptr_realImag1 + 1) = -(*ptr_imag);
    491     *ptr_realImag2 = *ptr_real;
    492     *(ptr_realImag2 + 1) = *ptr_imag;
    493   }
    494 
    495   freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
    496   freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
    497 }
    498 
    499 // Denormalize the input buffer.
    500 void WebRtcNsx_DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor) {
    501   int16_t* ptr_real = &inst->real[0];
    502   int16_t* ptr_in = &in[0];
    503 
    504   __asm__ __volatile__("vdup.32 q10, %0" ::
    505                        "r"((int32_t)(factor - inst->normData)) : "q10");
    506   for (; ptr_real < &inst->real[inst->anaLen];) {
    507 
    508     // Loop unrolled once. Both pointers are incremented.
    509     __asm__ __volatile__(
    510       // tmp32 = WEBRTC_SPL_SHIFT_W32((int32_t)in[j],
    511       //                             factor - inst->normData);
    512       "vld2.16 {d24, d25}, [%[ptr_in]]!\n\t"
    513       "vmovl.s16 q12, d24\n\t"
    514       "vshl.s32 q12, q10\n\t"
    515       // inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
    516       "vqmovn.s32 d24, q12\n\t"
    517       "vst1.16 d24, [%[ptr_real]]!\n\t"
    518 
    519       // tmp32 = WEBRTC_SPL_SHIFT_W32((int32_t)in[j],
    520       //                             factor - inst->normData);
    521       "vld2.16 {d22, d23}, [%[ptr_in]]!\n\t"
    522       "vmovl.s16 q11, d22\n\t"
    523       "vshl.s32 q11, q10\n\t"
    524       // inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
    525       "vqmovn.s32 d22, q11\n\t"
    526       "vst1.16 d22, [%[ptr_real]]!\n\t"
    527 
    528       // Specify constraints.
    529       :[ptr_in]"+r"(ptr_in),
    530        [ptr_real]"+r"(ptr_real)
    531       :
    532       :"d22", "d23", "d24", "d25"
    533     );
    534   }
    535 }
    536 
    537 // For the noise supress process, synthesis, read out fully processed segment,
    538 // and update synthesis buffer.
    539 void WebRtcNsx_SynthesisUpdateNeon(NsxInst_t* inst,
    540                                    int16_t* out_frame,
    541                                    int16_t gain_factor) {
    542   int16_t* ptr_real = &inst->real[0];
    543   int16_t* ptr_syn = &inst->synthesisBuffer[0];
    544   const int16_t* ptr_window = &inst->window[0];
    545 
    546   // synthesis
    547   __asm__ __volatile__("vdup.16 d24, %0" : : "r"(gain_factor) : "d24");
    548   // Loop unrolled once. All pointers are incremented in the assembly code.
    549   for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) {
    550     __asm__ __volatile__(
    551       // Load variables.
    552       "vld1.16 d22, [%[ptr_real]]!\n\t"
    553       "vld1.16 d23, [%[ptr_window]]!\n\t"
    554       "vld1.16 d25, [%[ptr_syn]]\n\t"
    555       // tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
    556       //           inst->window[i], inst->real[i], 14); // Q0, window in Q14
    557       "vmull.s16 q11, d22, d23\n\t"
    558       "vrshrn.i32 d22, q11, #14\n\t"
    559       // tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13);
    560       "vmull.s16 q11, d24, d22\n\t"
    561       // tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
    562       "vqrshrn.s32 d22, q11, #13\n\t"
    563       // inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(
    564       //     inst->synthesisBuffer[i], tmp16b); // Q0
    565       "vqadd.s16 d25, d22\n\t"
    566       "vst1.16 d25, [%[ptr_syn]]!\n\t"
    567 
    568       // Load variables.
    569       "vld1.16 d26, [%[ptr_real]]!\n\t"
    570       "vld1.16 d27, [%[ptr_window]]!\n\t"
    571       "vld1.16 d28, [%[ptr_syn]]\n\t"
    572       // tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
    573       //           inst->window[i], inst->real[i], 14); // Q0, window in Q14
    574       "vmull.s16 q13, d26, d27\n\t"
    575       "vrshrn.i32 d26, q13, #14\n\t"
    576       // tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13);
    577       "vmull.s16 q13, d24, d26\n\t"
    578       // tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
    579       "vqrshrn.s32 d26, q13, #13\n\t"
    580       // inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(
    581       //     inst->synthesisBuffer[i], tmp16b); // Q0
    582       "vqadd.s16 d28, d26\n\t"
    583       "vst1.16 d28, [%[ptr_syn]]!\n\t"
    584 
    585       // Specify constraints.
    586       :[ptr_real]"+r"(ptr_real),
    587        [ptr_window]"+r"(ptr_window),
    588        [ptr_syn]"+r"(ptr_syn)
    589       :
    590       :"d22", "d23", "d24", "d25", "d26", "d27", "d28", "q11", "q12", "q13"
    591     );
    592   }
    593 
    594   int16_t* ptr_out = &out_frame[0];
    595   ptr_syn = &inst->synthesisBuffer[0];
    596   // read out fully processed segment
    597   for (; ptr_syn < &inst->synthesisBuffer[inst->blockLen10ms];) {
    598     // Loop unrolled once. Both pointers are incremented in the assembly code.
    599     __asm__ __volatile__(
    600       // out_frame[i] = inst->synthesisBuffer[i]; // Q0
    601       "vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t"
    602       "vld1.16 {d24, d25}, [%[ptr_syn]]!\n\t"
    603       "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
    604       "vst1.16 {d24, d25}, [%[ptr_out]]!\n\t"
    605       :[ptr_syn]"+r"(ptr_syn),
    606        [ptr_out]"+r"(ptr_out)
    607       :
    608       :"d22", "d23", "d24", "d25"
    609     );
    610   }
    611 
    612   // Update synthesis buffer.
    613   // C code:
    614   // WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer,
    615   //                      inst->synthesisBuffer + inst->blockLen10ms,
    616   //                      inst->anaLen - inst->blockLen10ms);
    617   ptr_out = &inst->synthesisBuffer[0],
    618   ptr_syn = &inst->synthesisBuffer[inst->blockLen10ms];
    619   for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) {
    620     // Loop unrolled once. Both pointers are incremented in the assembly code.
    621     __asm__ __volatile__(
    622       "vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t"
    623       "vld1.16 {d24, d25}, [%[ptr_syn]]!\n\t"
    624       "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
    625       "vst1.16 {d24, d25}, [%[ptr_out]]!\n\t"
    626       :[ptr_syn]"+r"(ptr_syn),
    627        [ptr_out]"+r"(ptr_out)
    628       :
    629       :"d22", "d23", "d24", "d25"
    630     );
    631   }
    632 
    633   // C code:
    634   // WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
    635   //    + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
    636   __asm__ __volatile__("vdup.16 q10, %0" : : "r"(0) : "q10");
    637   for (; ptr_out < &inst->synthesisBuffer[inst->anaLen];) {
    638     // Loop unrolled once. Pointer is incremented in the assembly code.
    639     __asm__ __volatile__(
    640       "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
    641       "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
    642       :[ptr_out]"+r"(ptr_out)
    643       :
    644       :"d20", "d21"
    645     );
    646   }
    647 }
    648 
    649 // Update analysis buffer for lower band, and window data before FFT.
    650 void WebRtcNsx_AnalysisUpdateNeon(NsxInst_t* inst,
    651                                   int16_t* out,
    652                                   int16_t* new_speech) {
    653 
    654   int16_t* ptr_ana = &inst->analysisBuffer[inst->blockLen10ms];
    655   int16_t* ptr_out = &inst->analysisBuffer[0];
    656 
    657   // For lower band update analysis buffer.
    658   // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
    659   //                      inst->analysisBuffer + inst->blockLen10ms,
    660   //                      inst->anaLen - inst->blockLen10ms);
    661   for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms];) {
    662     // Loop unrolled once, so both pointers are incremented by 8 twice.
    663     __asm__ __volatile__(
    664       "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
    665       "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
    666       "vld1.16 {d22, d23}, [%[ptr_ana]]!\n\t"
    667       "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
    668       :[ptr_ana]"+r"(ptr_ana),
    669        [ptr_out]"+r"(ptr_out)
    670       :
    671       :"d20", "d21", "d22", "d23"
    672     );
    673   }
    674 
    675   // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
    676   //    + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
    677   for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen];) {
    678     // Loop unrolled once, so both pointers are incremented by 8 twice.
    679     __asm__ __volatile__(
    680       "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
    681       "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
    682       "vld1.16 {d22, d23}, [%[ptr_ana]]!\n\t"
    683       "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
    684       :[ptr_ana]"+r"(ptr_ana),
    685        [ptr_out]"+r"(ptr_out)
    686       :
    687       :"d20", "d21", "d22", "d23"
    688     );
    689   }
    690 
    691   // Window data before FFT
    692   const int16_t* ptr_window = &inst->window[0];
    693   ptr_out = &out[0];
    694   ptr_ana = &inst->analysisBuffer[0];
    695   for (; ptr_out < &out[inst->anaLen];) {
    696 
    697     // Loop unrolled once, so all pointers are incremented by 4 twice.
    698     __asm__ __volatile__(
    699       "vld1.16 d20, [%[ptr_ana]]!\n\t"
    700       "vld1.16 d21, [%[ptr_window]]!\n\t"
    701       // out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
    702       //           inst->window[i], inst->analysisBuffer[i], 14); // Q0
    703       "vmull.s16 q10, d20, d21\n\t"
    704       "vrshrn.i32 d20, q10, #14\n\t"
    705       "vst1.16 d20, [%[ptr_out]]!\n\t"
    706 
    707       "vld1.16 d22, [%[ptr_ana]]!\n\t"
    708       "vld1.16 d23, [%[ptr_window]]!\n\t"
    709       // out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
    710       //           inst->window[i], inst->analysisBuffer[i], 14); // Q0
    711       "vmull.s16 q11, d22, d23\n\t"
    712       "vrshrn.i32 d22, q11, #14\n\t"
    713       "vst1.16 d22, [%[ptr_out]]!\n\t"
    714 
    715       // Specify constraints.
    716       :[ptr_ana]"+r"(ptr_ana),
    717        [ptr_window]"+r"(ptr_window),
    718        [ptr_out]"+r"(ptr_out)
    719       :
    720       :"d20", "d21", "d22", "d23", "q10", "q11"
    721     );
    722   }
    723 }
    724 
    725 // Create a complex number buffer (out[]) as the intput (in[]) interleaved with
    726 // zeros, and normalize it.
    727 void WebRtcNsx_CreateComplexBufferNeon(NsxInst_t* inst,
    728                                        int16_t* in,
    729                                        int16_t* out) {
    730   int16_t* ptr_out = &out[0];
    731   int16_t* ptr_in = &in[0];
    732 
    733   __asm__ __volatile__("vdup.16 d25, %0" : : "r"(0) : "d25");
    734   __asm__ __volatile__("vdup.16 q10, %0" : : "r"(inst->normData) : "q10");
    735   for (; ptr_in < &in[inst->anaLen];) {
    736 
    737     // Loop unrolled once, so ptr_in is incremented by 8 twice,
    738     // and ptr_out is incremented by 8 four times.
    739     __asm__ __volatile__(
    740       // out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
    741       "vld1.16 {d22, d23}, [%[ptr_in]]!\n\t"
    742       "vshl.s16 q11, q10\n\t"
    743       "vmov d24, d23\n\t"
    744 
    745       // out[j + 1] = 0; // Insert zeros in imaginary part
    746       "vmov d23, d25\n\t"
    747       "vst2.16 {d22, d23}, [%[ptr_out]]!\n\t"
    748       "vst2.16 {d24, d25}, [%[ptr_out]]!\n\t"
    749 
    750       // out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
    751       "vld1.16 {d22, d23}, [%[ptr_in]]!\n\t"
    752       "vshl.s16 q11, q10\n\t"
    753       "vmov d24, d23\n\t"
    754 
    755       // out[j + 1] = 0; // Insert zeros in imaginary part
    756       "vmov d23, d25\n\t"
    757       "vst2.16 {d22, d23}, [%[ptr_out]]!\n\t"
    758       "vst2.16 {d24, d25}, [%[ptr_out]]!\n\t"
    759 
    760       // Specify constraints.
    761       :[ptr_in]"+r"(ptr_in),
    762        [ptr_out]"+r"(ptr_out)
    763       :
    764       :"d22", "d23", "d24", "d25", "q10", "q11"
    765     );
    766   }
    767 }
    768