Home | History | Annotate | Download | only in ns
      1 /*
      2  *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "nsx_core.h"
     12 
     13 #include <arm_neon.h>
     14 #include <assert.h>
     15 
     16 // Update the noise estimation information.
     17 static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset) {
     18   int i = 0;
     19   const int16_t kExp2Const = 11819; // Q13
     20   int16_t* ptr_noiseEstLogQuantile = NULL;
     21   int16_t* ptr_noiseEstQuantile = NULL;
     22   int16x4_t kExp2Const16x4 = vdup_n_s16(kExp2Const);
     23   int32x4_t twentyOne32x4 = vdupq_n_s32(21);
     24   int32x4_t constA32x4 = vdupq_n_s32(0x1fffff);
     25   int32x4_t constB32x4 = vdupq_n_s32(0x200000);
     26 
     27   int16_t tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
     28                                         inst->magnLen);
     29 
     30   // Guarantee a Q-domain as high as possible and still fit in int16
     31   inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(kExp2Const,
     32                                                                  tmp16,
     33                                                                  21);
     34 
     35   int32x4_t qNoise32x4 = vdupq_n_s32(inst->qNoise);
     36 
     37   for (ptr_noiseEstLogQuantile = &inst->noiseEstLogQuantile[offset],
     38        ptr_noiseEstQuantile = &inst->noiseEstQuantile[0];
     39        ptr_noiseEstQuantile < &inst->noiseEstQuantile[inst->magnLen - 3];
     40        ptr_noiseEstQuantile += 4, ptr_noiseEstLogQuantile += 4) {
     41 
     42     // tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
     43     //                                inst->noiseEstLogQuantile[offset + i]);
     44     int16x4_t v16x4 = vld1_s16(ptr_noiseEstLogQuantile);
     45     int32x4_t v32x4B = vmull_s16(v16x4, kExp2Const16x4);
     46 
     47     // tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
     48     int32x4_t v32x4A = vandq_s32(v32x4B, constA32x4);
     49     v32x4A = vorrq_s32(v32x4A, constB32x4);
     50 
     51     // tmp16 = (int16_t) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21);
     52     v32x4B = vshrq_n_s32(v32x4B, 21);
     53 
     54     // tmp16 -= 21;// shift 21 to get result in Q0
     55     v32x4B = vsubq_s32(v32x4B, twentyOne32x4);
     56 
     57     // tmp16 += (int16_t) inst->qNoise;
     58     // shift to get result in Q(qNoise)
     59     v32x4B = vaddq_s32(v32x4B, qNoise32x4);
     60 
     61     // if (tmp16 < 0) {
     62     //   tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16);
     63     // } else {
     64     //   tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16);
     65     // }
     66     v32x4B = vshlq_s32(v32x4A, v32x4B);
     67 
     68     // tmp16 = WebRtcSpl_SatW32ToW16(tmp32no1);
     69     v16x4 = vqmovn_s32(v32x4B);
     70 
     71     //inst->noiseEstQuantile[i] = tmp16;
     72     vst1_s16(ptr_noiseEstQuantile, v16x4);
     73   }
     74 
     75   // Last iteration:
     76 
     77   // inst->quantile[i]=exp(inst->lquantile[offset+i]);
     78   // in Q21
     79   int32_t tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
     80                                           *ptr_noiseEstLogQuantile);
     81   int32_t tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
     82 
     83   tmp16 = (int16_t) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21);
     84   tmp16 -= 21;// shift 21 to get result in Q0
     85   tmp16 += (int16_t) inst->qNoise; //shift to get result in Q(qNoise)
     86   if (tmp16 < 0) {
     87     tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16);
     88   } else {
     89     tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16);
     90   }
     91   *ptr_noiseEstQuantile = WebRtcSpl_SatW32ToW16(tmp32no1);
     92 }
     93 
     94 // Noise Estimation
     95 static void NoiseEstimationNeon(NsxInst_t* inst,
     96                                 uint16_t* magn,
     97                                 uint32_t* noise,
     98                                 int16_t* q_noise) {
     99   int16_t lmagn[HALF_ANAL_BLOCKL], counter, countDiv;
    100   int16_t countProd, delta, zeros, frac;
    101   int16_t log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
    102   const int16_t log2_const = 22713;
    103   const int16_t width_factor = 21845;
    104 
    105   int i, s, offset;
    106 
    107   tabind = inst->stages - inst->normData;
    108   assert(tabind < 9);
    109   assert(tabind > -9);
    110   if (tabind < 0) {
    111     logval = -WebRtcNsx_kLogTable[-tabind];
    112   } else {
    113     logval = WebRtcNsx_kLogTable[tabind];
    114   }
    115 
    116   int16x8_t logval_16x8 = vdupq_n_s16(logval);
    117 
    118   // lmagn(i)=log(magn(i))=log(2)*log2(magn(i))
    119   // magn is in Q(-stages), and the real lmagn values are:
    120   // real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages)
    121   // lmagn in Q8
    122   for (i = 0; i < inst->magnLen; i++) {
    123     if (magn[i]) {
    124       zeros = WebRtcSpl_NormU32((uint32_t)magn[i]);
    125       frac = (int16_t)((((uint32_t)magn[i] << zeros)
    126                         & 0x7FFFFFFF) >> 23);
    127       assert(frac < 256);
    128       // log2(magn(i))
    129       log2 = (int16_t)(((31 - zeros) << 8)
    130                        + WebRtcNsx_kLogTableFrac[frac]);
    131       // log2(magn(i))*log(2)
    132       lmagn[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15);
    133       // + log(2^stages)
    134       lmagn[i] += logval;
    135     } else {
    136       lmagn[i] = logval;
    137     }
    138   }
    139 
    140   int16x4_t Q3_16x4  = vdup_n_s16(3);
    141   int16x8_t WIDTHQ8_16x8 = vdupq_n_s16(WIDTH_Q8);
    142   int16x8_t WIDTHFACTOR_16x8 = vdupq_n_s16(width_factor);
    143 
    144   int16_t factor = FACTOR_Q7;
    145   if (inst->blockIndex < END_STARTUP_LONG)
    146     factor = FACTOR_Q7_STARTUP;
    147 
    148   // Loop over simultaneous estimates
    149   for (s = 0; s < SIMULT; s++) {
    150     offset = s * inst->magnLen;
    151 
    152     // Get counter values from state
    153     counter = inst->noiseEstCounter[s];
    154     assert(counter < 201);
    155     countDiv = WebRtcNsx_kCounterDiv[counter];
    156     countProd = (int16_t)WEBRTC_SPL_MUL_16_16(counter, countDiv);
    157 
    158     // quant_est(...)
    159     int16_t deltaBuff[8];
    160     int16x4_t tmp16x4_0;
    161     int16x4_t tmp16x4_1;
    162     int16x4_t countDiv_16x4 = vdup_n_s16(countDiv);
    163     int16x8_t countProd_16x8 = vdupq_n_s16(countProd);
    164     int16x8_t tmp16x8_0 = vdupq_n_s16(countDiv);
    165     int16x8_t prod16x8 = vqrdmulhq_s16(WIDTHFACTOR_16x8, tmp16x8_0);
    166     int16x8_t tmp16x8_1;
    167     int16x8_t tmp16x8_2;
    168     int16x8_t tmp16x8_3;
    169     int16x8_t tmp16x8_4;
    170     int16x8_t tmp16x8_5;
    171     int32x4_t tmp32x4;
    172 
    173     for (i = 0; i < inst->magnLen - 7; i += 8) {
    174       // Compute delta.
    175       // Smaller step size during startup. This prevents from using
    176       // unrealistic values causing overflow.
    177       tmp16x8_0 = vdupq_n_s16(factor);
    178       vst1q_s16(deltaBuff, tmp16x8_0);
    179 
    180       int j;
    181       for (j = 0; j < 8; j++) {
    182         if (inst->noiseEstDensity[offset + i + j] > 512) {
    183           // Get values for deltaBuff by shifting intead of dividing.
    184           int factor = WebRtcSpl_NormW16(inst->noiseEstDensity[offset + i + j]);
    185           deltaBuff[j] = (int16_t)(FACTOR_Q16 >> (14 - factor));
    186         }
    187       }
    188 
    189       // Update log quantile estimate
    190 
    191       // tmp16 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
    192       tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[0]), countDiv_16x4);
    193       tmp16x4_1 = vshrn_n_s32(tmp32x4, 14);
    194       tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[4]), countDiv_16x4);
    195       tmp16x4_0 = vshrn_n_s32(tmp32x4, 14);
    196       tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // Keep for several lines.
    197 
    198       // prepare for the "if" branch
    199       // tmp16 += 2;
    200       // tmp16_1 = (Word16)(tmp16>>2);
    201       tmp16x8_1 = vrshrq_n_s16(tmp16x8_0, 2);
    202 
    203       // inst->noiseEstLogQuantile[offset+i] + tmp16_1;
    204       tmp16x8_2 = vld1q_s16(&inst->noiseEstLogQuantile[offset + i]); // Keep
    205       tmp16x8_1 = vaddq_s16(tmp16x8_2, tmp16x8_1); // Keep for several lines
    206 
    207       // Prepare for the "else" branch
    208       // tmp16 += 1;
    209       // tmp16_1 = (Word16)(tmp16>>1);
    210       tmp16x8_0 = vrshrq_n_s16(tmp16x8_0, 1);
    211 
    212       // tmp16_2 = (Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16_1,3,1);
    213       tmp32x4 = vmull_s16(vget_low_s16(tmp16x8_0), Q3_16x4);
    214       tmp16x4_1 = vshrn_n_s32(tmp32x4, 1);
    215 
    216       // tmp16_2 = (Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16_1,3,1);
    217       tmp32x4 = vmull_s16(vget_high_s16(tmp16x8_0), Q3_16x4);
    218       tmp16x4_0 = vshrn_n_s32(tmp32x4, 1);
    219 
    220       // inst->noiseEstLogQuantile[offset + i] - tmp16_2;
    221       tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // keep
    222       tmp16x8_0 = vsubq_s16(tmp16x8_2, tmp16x8_0);
    223 
    224       // logval is the smallest fixed point representation we can have. Values
    225       // below that will correspond to values in the interval [0, 1], which
    226       // can't possibly occur.
    227       tmp16x8_0 = vmaxq_s16(tmp16x8_0, logval_16x8);
    228 
    229       // Do the if-else branches:
    230       tmp16x8_3 = vld1q_s16(&lmagn[i]); // keep for several lines
    231       tmp16x8_5 = vsubq_s16(tmp16x8_3, tmp16x8_2);
    232       __asm__("vcgt.s16 %q0, %q1, #0"::"w"(tmp16x8_4), "w"(tmp16x8_5));
    233       __asm__("vbit %q0, %q1, %q2"::
    234               "w"(tmp16x8_2), "w"(tmp16x8_1), "w"(tmp16x8_4));
    235       __asm__("vbif %q0, %q1, %q2"::
    236               "w"(tmp16x8_2), "w"(tmp16x8_0), "w"(tmp16x8_4));
    237       vst1q_s16(&inst->noiseEstLogQuantile[offset + i], tmp16x8_2);
    238 
    239       // Update density estimate
    240       // tmp16_1 + tmp16_2
    241       tmp16x8_1 = vld1q_s16(&inst->noiseEstDensity[offset + i]);
    242       tmp16x8_0 = vqrdmulhq_s16(tmp16x8_1, countProd_16x8);
    243       tmp16x8_0 = vaddq_s16(tmp16x8_0, prod16x8);
    244 
    245       // lmagn[i] - inst->noiseEstLogQuantile[offset + i]
    246       tmp16x8_3 = vsubq_s16(tmp16x8_3, tmp16x8_2);
    247       tmp16x8_3 = vabsq_s16(tmp16x8_3);
    248       tmp16x8_4 = vcgtq_s16(WIDTHQ8_16x8, tmp16x8_3);
    249       __asm__("vbit %q0, %q1, %q2"::
    250               "w"(tmp16x8_1), "w"(tmp16x8_0), "w"(tmp16x8_4));
    251       vst1q_s16(&inst->noiseEstDensity[offset + i], tmp16x8_1);
    252     } // End loop over magnitude spectrum
    253 
    254     // Last iteration over magnitude spectrum:
    255     // compute delta
    256     if (inst->noiseEstDensity[offset + i] > 512) {
    257       // Get values for deltaBuff by shifting intead of dividing.
    258       int factor = WebRtcSpl_NormW16(inst->noiseEstDensity[offset + i]);
    259       delta = (int16_t)(FACTOR_Q16 >> (14 - factor));
    260     } else {
    261       delta = FACTOR_Q7;
    262       if (inst->blockIndex < END_STARTUP_LONG) {
    263         // Smaller step size during startup. This prevents from using
    264         // unrealistic values causing overflow.
    265         delta = FACTOR_Q7_STARTUP;
    266       }
    267     }
    268     // update log quantile estimate
    269     tmp16 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
    270     if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) {
    271       // +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
    272       // CounterDiv=1/(inst->counter[s]+1) in Q15
    273       tmp16 += 2;
    274       tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2);
    275       inst->noiseEstLogQuantile[offset + i] += tmp16no1;
    276     } else {
    277       tmp16 += 1;
    278       tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1);
    279       // *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
    280       tmp16no2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1);
    281       inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
    282       if (inst->noiseEstLogQuantile[offset + i] < logval) {
    283         // logval is the smallest fixed point representation we can have.
    284         // Values below that will correspond to values in the interval
    285         // [0, 1], which can't possibly occur.
    286         inst->noiseEstLogQuantile[offset + i] = logval;
    287       }
    288     }
    289 
    290     // update density estimate
    291     if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
    292         < WIDTH_Q8) {
    293       tmp16no1 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
    294                    inst->noiseEstDensity[offset + i], countProd, 15);
    295       tmp16no2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
    296                    width_factor, countDiv, 15);
    297       inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
    298     }
    299 
    300 
    301     if (counter >= END_STARTUP_LONG) {
    302       inst->noiseEstCounter[s] = 0;
    303       if (inst->blockIndex >= END_STARTUP_LONG) {
    304         UpdateNoiseEstimateNeon(inst, offset);
    305       }
    306     }
    307     inst->noiseEstCounter[s]++;
    308 
    309   } // end loop over simultaneous estimates
    310 
    311   // Sequentially update the noise during startup
    312   if (inst->blockIndex < END_STARTUP_LONG) {
    313     UpdateNoiseEstimateNeon(inst, offset);
    314   }
    315 
    316   for (i = 0; i < inst->magnLen; i++) {
    317     noise[i] = (uint32_t)(inst->noiseEstQuantile[i]); // Q(qNoise)
    318   }
    319   (*q_noise) = (int16_t)inst->qNoise;
    320 }
    321 
    322 // Filter the data in the frequency domain, and create spectrum.
    323 static void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) {
    324 
    325   // (1) Filtering.
    326 
    327   // Fixed point C code for the next block is as follows:
    328   // for (i = 0; i < inst->magnLen; i++) {
    329   //   inst->real[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i],
    330   //      (int16_t)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
    331   //   inst->imag[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i],
    332   //      (int16_t)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
    333   // }
    334 
    335   int16_t* ptr_real = &inst->real[0];
    336   int16_t* ptr_imag = &inst->imag[0];
    337   uint16_t* ptr_noiseSupFilter = &inst->noiseSupFilter[0];
    338 
    339   // Filter the rest in the frequency domain.
    340   for (; ptr_real < &inst->real[inst->magnLen - 1];) {
    341     // Loop unrolled once. Both pointers are incremented by 4 twice.
    342     __asm__ __volatile__(
    343       "vld1.16 d20, [%[ptr_real]]\n\t"
    344       "vld1.16 d22, [%[ptr_imag]]\n\t"
    345       "vld1.16 d23, [%[ptr_noiseSupFilter]]!\n\t"
    346       "vmull.s16 q10, d20, d23\n\t"
    347       "vmull.s16 q11, d22, d23\n\t"
    348       "vshrn.s32 d20, q10, #14\n\t"
    349       "vshrn.s32 d22, q11, #14\n\t"
    350       "vst1.16 d20, [%[ptr_real]]!\n\t"
    351       "vst1.16 d22, [%[ptr_imag]]!\n\t"
    352 
    353       "vld1.16 d18, [%[ptr_real]]\n\t"
    354       "vld1.16 d24, [%[ptr_imag]]\n\t"
    355       "vld1.16 d25, [%[ptr_noiseSupFilter]]!\n\t"
    356       "vmull.s16 q9, d18, d25\n\t"
    357       "vmull.s16 q12, d24, d25\n\t"
    358       "vshrn.s32 d18, q9, #14\n\t"
    359       "vshrn.s32 d24, q12, #14\n\t"
    360       "vst1.16 d18, [%[ptr_real]]!\n\t"
    361       "vst1.16 d24, [%[ptr_imag]]!\n\t"
    362 
    363       // Specify constraints.
    364       :[ptr_imag]"+r"(ptr_imag),
    365        [ptr_real]"+r"(ptr_real),
    366        [ptr_noiseSupFilter]"+r"(ptr_noiseSupFilter)
    367       :
    368       :"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25",
    369        "q9", "q10", "q11", "q12"
    370     );
    371   }
    372 
    373   // Filter the last pair of elements in the frequency domain.
    374   *ptr_real = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*ptr_real,
    375       (int16_t)(*ptr_noiseSupFilter), 14); // Q(normData-stages)
    376   *ptr_imag = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*ptr_imag,
    377       (int16_t)(*ptr_noiseSupFilter), 14); // Q(normData-stages)
    378 
    379   // (2) Create spectrum.
    380 
    381   // Fixed point C code for the rest of the function is as follows:
    382   // freq_buf[0] = inst->real[0];
    383   // freq_buf[1] = -inst->imag[0];
    384   // for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
    385   //   tmp16 = (inst->anaLen << 1) - j;
    386   //   freq_buf[j] = inst->real[i];
    387   //   freq_buf[j + 1] = -inst->imag[i];
    388   //   freq_buf[tmp16] = inst->real[i];
    389   //   freq_buf[tmp16 + 1] = inst->imag[i];
    390   // }
    391   // freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
    392   // freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
    393 
    394   freq_buf[0] = inst->real[0];
    395   freq_buf[1] = -inst->imag[0];
    396 
    397   int offset = -16;
    398   int16_t* ptr_realImag1 = &freq_buf[2];
    399   int16_t* ptr_realImag2 = ptr_realImag2 = &freq_buf[(inst->anaLen << 1) - 8];
    400   ptr_real = &inst->real[1];
    401   ptr_imag = &inst->imag[1];
    402   for (; ptr_real < &inst->real[inst->anaLen2 - 11];) {
    403     // Loop unrolled once. All pointers are incremented twice.
    404     __asm__ __volatile__(
    405       "vld1.16 d22, [%[ptr_real]]!\n\t"
    406       "vld1.16 d23, [%[ptr_imag]]!\n\t"
    407       // Negate and interleave:
    408       "vmov.s16 d20, d22\n\t"
    409       "vneg.s16 d21, d23\n\t"
    410       "vzip.16 d20, d21\n\t"
    411       // Write 8 elements to &freq_buf[j]
    412       "vst1.16 {d20, d21}, [%[ptr_realImag1]]!\n\t"
    413       // Interleave and reverse elements:
    414       "vzip.16 d22, d23\n\t"
    415       "vrev64.32 d18, d23\n\t"
    416       "vrev64.32 d19, d22\n\t"
    417       // Write 8 elements to &freq_buf[tmp16]
    418       "vst1.16 {d18, d19}, [%[ptr_realImag2]], %[offset]\n\t"
    419 
    420       "vld1.16 d22, [%[ptr_real]]!\n\t"
    421       "vld1.16 d23, [%[ptr_imag]]!\n\t"
    422       // Negate and interleave:
    423       "vmov.s16 d20, d22\n\t"
    424       "vneg.s16 d21, d23\n\t"
    425       "vzip.16 d20, d21\n\t"
    426       // Write 8 elements to &freq_buf[j]
    427       "vst1.16 {d20, d21}, [%[ptr_realImag1]]!\n\t"
    428       // Interleave and reverse elements:
    429       "vzip.16 d22, d23\n\t"
    430       "vrev64.32 d18, d23\n\t"
    431       "vrev64.32 d19, d22\n\t"
    432       // Write 8 elements to &freq_buf[tmp16]
    433       "vst1.16 {d18, d19}, [%[ptr_realImag2]], %[offset]\n\t"
    434 
    435       // Specify constraints.
    436       :[ptr_imag]"+r"(ptr_imag),
    437        [ptr_real]"+r"(ptr_real),
    438        [ptr_realImag1]"+r"(ptr_realImag1),
    439        [ptr_realImag2]"+r"(ptr_realImag2)
    440       :[offset]"r"(offset)
    441       :"d18", "d19", "d20", "d21", "d22", "d23"
    442     );
    443   }
    444   for (ptr_realImag2 += 6;
    445        ptr_real <= &inst->real[inst->anaLen2];
    446        ptr_real += 1, ptr_imag += 1, ptr_realImag1 += 2, ptr_realImag2 -= 2) {
    447     *ptr_realImag1 = *ptr_real;
    448     *(ptr_realImag1 + 1) = -(*ptr_imag);
    449     *ptr_realImag2 = *ptr_real;
    450     *(ptr_realImag2 + 1) = *ptr_imag;
    451   }
    452 
    453   freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
    454   freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
    455 }
    456 
    457 // Denormalize the input buffer.
    458 static __inline void DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor) {
    459   int16_t* ptr_real = &inst->real[0];
    460   int16_t* ptr_in = &in[0];
    461 
    462   __asm__ __volatile__("vdup.32 q10, %0" ::
    463                        "r"((int32_t)(factor - inst->normData)) : "q10");
    464   for (; ptr_real < &inst->real[inst->anaLen];) {
    465 
    466     // Loop unrolled once. Both pointers are incremented.
    467     __asm__ __volatile__(
    468       // tmp32 = WEBRTC_SPL_SHIFT_W32((int32_t)in[j],
    469       //                             factor - inst->normData);
    470       "vld2.16 {d24, d25}, [%[ptr_in]]!\n\t"
    471       "vmovl.s16 q12, d24\n\t"
    472       "vshl.s32 q12, q10\n\t"
    473       // inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
    474       "vqmovn.s32 d24, q12\n\t"
    475       "vst1.16 d24, [%[ptr_real]]!\n\t"
    476 
    477       // tmp32 = WEBRTC_SPL_SHIFT_W32((int32_t)in[j],
    478       //                             factor - inst->normData);
    479       "vld2.16 {d22, d23}, [%[ptr_in]]!\n\t"
    480       "vmovl.s16 q11, d22\n\t"
    481       "vshl.s32 q11, q10\n\t"
    482       // inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
    483       "vqmovn.s32 d22, q11\n\t"
    484       "vst1.16 d22, [%[ptr_real]]!\n\t"
    485 
    486       // Specify constraints.
    487       :[ptr_in]"+r"(ptr_in),
    488        [ptr_real]"+r"(ptr_real)
    489       :
    490       :"d22", "d23", "d24", "d25"
    491     );
    492   }
    493 }
    494 
    495 // For the noise supress process, synthesis, read out fully processed segment,
    496 // and update synthesis buffer.
    497 static void SynthesisUpdateNeon(NsxInst_t* inst,
    498                                 int16_t* out_frame,
    499                                 int16_t gain_factor) {
    500   int16_t* ptr_real = &inst->real[0];
    501   int16_t* ptr_syn = &inst->synthesisBuffer[0];
    502   int16_t* ptr_window = &inst->window[0];
    503 
    504   // synthesis
    505   __asm__ __volatile__("vdup.16 d24, %0" : : "r"(gain_factor) : "d24");
    506   // Loop unrolled once. All pointers are incremented in the assembly code.
    507   for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) {
    508     __asm__ __volatile__(
    509       // Load variables.
    510       "vld1.16 d22, [%[ptr_real]]!\n\t"
    511       "vld1.16 d23, [%[ptr_window]]!\n\t"
    512       "vld1.16 d25, [%[ptr_syn]]\n\t"
    513       // tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
    514       //           inst->window[i], inst->real[i], 14); // Q0, window in Q14
    515       "vmull.s16 q11, d22, d23\n\t"
    516       "vrshrn.i32 d22, q11, #14\n\t"
    517       // tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13);
    518       "vmull.s16 q11, d24, d22\n\t"
    519       // tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
    520       "vqrshrn.s32 d22, q11, #13\n\t"
    521       // inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(
    522       //     inst->synthesisBuffer[i], tmp16b); // Q0
    523       "vqadd.s16 d25, d22\n\t"
    524       "vst1.16 d25, [%[ptr_syn]]!\n\t"
    525 
    526       // Load variables.
    527       "vld1.16 d26, [%[ptr_real]]!\n\t"
    528       "vld1.16 d27, [%[ptr_window]]!\n\t"
    529       "vld1.16 d28, [%[ptr_syn]]\n\t"
    530       // tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
    531       //           inst->window[i], inst->real[i], 14); // Q0, window in Q14
    532       "vmull.s16 q13, d26, d27\n\t"
    533       "vrshrn.i32 d26, q13, #14\n\t"
    534       // tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13);
    535       "vmull.s16 q13, d24, d26\n\t"
    536       // tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
    537       "vqrshrn.s32 d26, q13, #13\n\t"
    538       // inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(
    539       //     inst->synthesisBuffer[i], tmp16b); // Q0
    540       "vqadd.s16 d28, d26\n\t"
    541       "vst1.16 d28, [%[ptr_syn]]!\n\t"
    542 
    543       // Specify constraints.
    544       :[ptr_real]"+r"(ptr_real),
    545        [ptr_window]"+r"(ptr_window),
    546        [ptr_syn]"+r"(ptr_syn)
    547       :
    548       :"d22", "d23", "d24", "d25", "d26", "d27", "d28", "q11", "q12", "q13"
    549     );
    550   }
    551 
    552   int16_t* ptr_out = &out_frame[0];
    553   ptr_syn = &inst->synthesisBuffer[0];
    554   // read out fully processed segment
    555   for (; ptr_syn < &inst->synthesisBuffer[inst->blockLen10ms];) {
    556     // Loop unrolled once. Both pointers are incremented in the assembly code.
    557     __asm__ __volatile__(
    558       // out_frame[i] = inst->synthesisBuffer[i]; // Q0
    559       "vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t"
    560       "vld1.16 {d24, d25}, [%[ptr_syn]]!\n\t"
    561       "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
    562       "vst1.16 {d24, d25}, [%[ptr_out]]!\n\t"
    563       :[ptr_syn]"+r"(ptr_syn),
    564        [ptr_out]"+r"(ptr_out)
    565       :
    566       :"d22", "d23", "d24", "d25"
    567     );
    568   }
    569 
    570   // Update synthesis buffer.
    571   // C code:
    572   // WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer,
    573   //                      inst->synthesisBuffer + inst->blockLen10ms,
    574   //                      inst->anaLen - inst->blockLen10ms);
    575   ptr_out = &inst->synthesisBuffer[0],
    576   ptr_syn = &inst->synthesisBuffer[inst->blockLen10ms];
    577   for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) {
    578     // Loop unrolled once. Both pointers are incremented in the assembly code.
    579     __asm__ __volatile__(
    580       "vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t"
    581       "vld1.16 {d24, d25}, [%[ptr_syn]]!\n\t"
    582       "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
    583       "vst1.16 {d24, d25}, [%[ptr_out]]!\n\t"
    584       :[ptr_syn]"+r"(ptr_syn),
    585        [ptr_out]"+r"(ptr_out)
    586       :
    587       :"d22", "d23", "d24", "d25"
    588     );
    589   }
    590 
    591   // C code:
    592   // WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
    593   //    + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
    594   __asm__ __volatile__("vdup.16 q10, %0" : : "r"(0) : "q10");
    595   for (; ptr_out < &inst->synthesisBuffer[inst->anaLen];) {
    596     // Loop unrolled once. Pointer is incremented in the assembly code.
    597     __asm__ __volatile__(
    598       "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
    599       "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
    600       :[ptr_out]"+r"(ptr_out)
    601       :
    602       :"d20", "d21"
    603     );
    604   }
    605 }
    606 
    607 // Update analysis buffer for lower band, and window data before FFT.
    608 static void AnalysisUpdateNeon(NsxInst_t* inst,
    609                                int16_t* out,
    610                                int16_t* new_speech) {
    611 
    612   int16_t* ptr_ana = &inst->analysisBuffer[inst->blockLen10ms];
    613   int16_t* ptr_out = &inst->analysisBuffer[0];
    614 
    615   // For lower band update analysis buffer.
    616   // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
    617   //                      inst->analysisBuffer + inst->blockLen10ms,
    618   //                      inst->anaLen - inst->blockLen10ms);
    619   for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms];) {
    620     // Loop unrolled once, so both pointers are incremented by 8 twice.
    621     __asm__ __volatile__(
    622       "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
    623       "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
    624       "vld1.16 {d22, d23}, [%[ptr_ana]]!\n\t"
    625       "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
    626       :[ptr_ana]"+r"(ptr_ana),
    627        [ptr_out]"+r"(ptr_out)
    628       :
    629       :"d20", "d21", "d22", "d23"
    630     );
    631   }
    632 
    633   // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
    634   //    + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
    635   for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen];) {
    636     // Loop unrolled once, so both pointers are incremented by 8 twice.
    637     __asm__ __volatile__(
    638       "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
    639       "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
    640       "vld1.16 {d22, d23}, [%[ptr_ana]]!\n\t"
    641       "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
    642       :[ptr_ana]"+r"(ptr_ana),
    643        [ptr_out]"+r"(ptr_out)
    644       :
    645       :"d20", "d21", "d22", "d23"
    646     );
    647   }
    648 
    649   // Window data before FFT
    650   int16_t* ptr_window = &inst->window[0];
    651   ptr_out = &out[0];
    652   ptr_ana = &inst->analysisBuffer[0];
    653   for (; ptr_out < &out[inst->anaLen];) {
    654 
    655     // Loop unrolled once, so all pointers are incremented by 4 twice.
    656     __asm__ __volatile__(
    657       "vld1.16 d20, [%[ptr_ana]]!\n\t"
    658       "vld1.16 d21, [%[ptr_window]]!\n\t"
    659       // out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
    660       //           inst->window[i], inst->analysisBuffer[i], 14); // Q0
    661       "vmull.s16 q10, d20, d21\n\t"
    662       "vrshrn.i32 d20, q10, #14\n\t"
    663       "vst1.16 d20, [%[ptr_out]]!\n\t"
    664 
    665       "vld1.16 d22, [%[ptr_ana]]!\n\t"
    666       "vld1.16 d23, [%[ptr_window]]!\n\t"
    667       // out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
    668       //           inst->window[i], inst->analysisBuffer[i], 14); // Q0
    669       "vmull.s16 q11, d22, d23\n\t"
    670       "vrshrn.i32 d22, q11, #14\n\t"
    671       "vst1.16 d22, [%[ptr_out]]!\n\t"
    672 
    673       // Specify constraints.
    674       :[ptr_ana]"+r"(ptr_ana),
    675        [ptr_window]"+r"(ptr_window),
    676        [ptr_out]"+r"(ptr_out)
    677       :
    678       :"d20", "d21", "d22", "d23", "q10", "q11"
    679     );
    680   }
    681 }
    682 
    683 // Create a complex number buffer (out[]) as the intput (in[]) interleaved with
    684 // zeros, and normalize it.
    685 static __inline void CreateComplexBufferNeon(NsxInst_t* inst,
    686                                              int16_t* in,
    687                                              int16_t* out) {
    688   int16_t* ptr_out = &out[0];
    689   int16_t* ptr_in = &in[0];
    690 
    691   __asm__ __volatile__("vdup.16 d25, %0" : : "r"(0) : "d25");
    692   __asm__ __volatile__("vdup.16 q10, %0" : : "r"(inst->normData) : "q10");
    693   for (; ptr_in < &in[inst->anaLen];) {
    694 
    695     // Loop unrolled once, so ptr_in is incremented by 8 twice,
    696     // and ptr_out is incremented by 8 four times.
    697     __asm__ __volatile__(
    698       // out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
    699       "vld1.16 {d22, d23}, [%[ptr_in]]!\n\t"
    700       "vshl.s16 q11, q10\n\t"
    701       "vmov d24, d23\n\t"
    702 
    703       // out[j + 1] = 0; // Insert zeros in imaginary part
    704       "vmov d23, d25\n\t"
    705       "vst2.16 {d22, d23}, [%[ptr_out]]!\n\t"
    706       "vst2.16 {d24, d25}, [%[ptr_out]]!\n\t"
    707 
    708       // out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
    709       "vld1.16 {d22, d23}, [%[ptr_in]]!\n\t"
    710       "vshl.s16 q11, q10\n\t"
    711       "vmov d24, d23\n\t"
    712 
    713       // out[j + 1] = 0; // Insert zeros in imaginary part
    714       "vmov d23, d25\n\t"
    715       "vst2.16 {d22, d23}, [%[ptr_out]]!\n\t"
    716       "vst2.16 {d24, d25}, [%[ptr_out]]!\n\t"
    717 
    718       // Specify constraints.
    719       :[ptr_in]"+r"(ptr_in),
    720        [ptr_out]"+r"(ptr_out)
    721       :
    722       :"d22", "d23", "d24", "d25", "q10", "q11"
    723     );
    724   }
    725 }
    726 
    727 void WebRtcNsx_InitNeon(void) {
    728   WebRtcNsx_NoiseEstimation = NoiseEstimationNeon;
    729   WebRtcNsx_PrepareSpectrum = PrepareSpectrumNeon;
    730   WebRtcNsx_SynthesisUpdate = SynthesisUpdateNeon;
    731   WebRtcNsx_AnalysisUpdate = AnalysisUpdateNeon;
    732   WebRtcNsx_Denormalize = DenormalizeNeon;
    733   WebRtcNsx_CreateComplexBuffer = CreateComplexBufferNeon;
    734 }
    735