Home | History | Annotate | Download | only in aecm
      1 /*
      2  *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "webrtc/modules/audio_processing/aecm/aecm_core.h"
     12 
     13 #include <assert.h>
     14 
     15 #include "webrtc/modules/audio_processing/aecm/include/echo_control_mobile.h"
     16 #include "webrtc/modules/audio_processing/utility/delay_estimator_wrapper.h"
     17 
     // Window coefficients: 65 int16 values rising from 0 to 16384.
     // Per the name this is the square root of a Hanning window. The users in
     // this file multiply a sample by an entry and shift the product right by
     // 14 bits, i.e. they treat 16384 as 1.0 (Q14).
     // WindowAndFFT() and InverseFFTAndWindow() walk the table forwards from
     // index 0 and backwards from index 64 / PART_LEN to cover both halves of
     // the window.
     // ALIGN8_BEG / ALIGN8_END are defined elsewhere; presumably they request
     // 8-byte alignment -- TODO confirm.
     18 static const ALIGN8_BEG int16_t WebRtcAecm_kSqrtHanning[] ALIGN8_END = {
     19   0, 399, 798, 1196, 1594, 1990, 2386, 2780, 3172,
     20   3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224,
     21   6591, 6954, 7313, 7668, 8019, 8364, 8705, 9040,
     22   9370, 9695, 10013, 10326, 10633, 10933, 11227, 11514,
     23   11795, 12068, 12335, 12594, 12845, 13089, 13325, 13553,
     24   13773, 13985, 14189, 14384, 14571, 14749, 14918, 15079,
     25   15231, 15373, 15506, 15631, 15746, 15851, 15947, 16034,
     26   16111, 16179, 16237, 16286, 16325, 16354, 16373, 16384
     27 };
     28 
     // Neither constant is referenced in the part of the file visible to this
     // review.
     // NOTE(review): judging by the names they are the Q-domain of a noise
     // estimate and a counter threshold for increasing that estimate,
     // presumably used by ComfortNoise() -- confirm against its definition.
     29 static const int16_t kNoiseEstQDomain = 15;
     30 static const int16_t kNoiseEstIncCount = 5;
     31 
     // Store offsets used by WindowAndFFT(): 128 entries, consumed two per loop
     // iteration. Each entry is a BYTE offset that is added to the fft pointer.
     // For iteration n (0..63) the first entry of the pair is where windowed
     // sample n is stored and the second is where windowed sample n + 64 is
     // stored. All entries are multiples of 4, so only every other int16 slot
     // of fft is written.
     // The values appear to be 4 * bit_reverse7(sample index), i.e. the input is
     // written directly in the bit-reversed order of a 128-point complex FFT --
     // spot-checked on a handful of entries only, TODO confirm for the full table.
     // NOTE(review): the table is never written in the visible code and could
     // probably be const.
     32 static int16_t coefTable[] = {
     33    0,   4, 256, 260, 128, 132, 384, 388,
     34   64,  68, 320, 324, 192, 196, 448, 452,
     35   32,  36, 288, 292, 160, 164, 416, 420,
     36   96, 100, 352, 356, 224, 228, 480, 484,
     37   16,  20, 272, 276, 144, 148, 400, 404,
     38   80,  84, 336, 340, 208, 212, 464, 468,
     39   48,  52, 304, 308, 176, 180, 432, 436,
     40  112, 116, 368, 372, 240, 244, 496, 500,
     41    8,  12, 264, 268, 136, 140, 392, 396,
     42   72,  76, 328, 332, 200, 204, 456, 460,
     43   40,  44, 296, 300, 168, 172, 424, 428,
     44  104, 108, 360, 364, 232, 236, 488, 492,
     45   24,  28, 280, 284, 152, 156, 408, 412,
     46   88,  92, 344, 348, 216, 220, 472, 476,
     47   56,  60, 312, 316, 184, 188, 440, 444,
     48  120, 124, 376, 380, 248, 252, 504, 508
     49 };
     50 
     // Store offsets used by InverseFFTAndWindow(): 128 entries, consumed two
     // per frequency bin k (0..63). Each entry is a BYTE offset added to the
     // fft pointer.
     //   first entry of the pair : receives (real, -imag) of efw[k]
     //   second entry of the pair: receives (real,  imag) of efw[k]
     // The values appear to be 4 * bit_reverse7(k) and 4 * bit_reverse7(128 - k),
     // i.e. the conjugate-symmetric spectrum is written directly in
     // bit-reversed order -- spot-checked on a handful of entries only, TODO
     // confirm for the full table.
     // The very first pair is (0, 512): the second store for bin 0 therefore
     // lands at byte offset 512 (int16 indices 256 and 257) from fft, so the
     // buffer passed in must be at least 258 int16 elements long.
     // NOTE(review): the table is never written in the visible code, but it is
     // accessed through a non-const int16_t* in InverseFFTAndWindow().
     51 static int16_t coefTable_ifft[] = {
     52     0, 512, 256, 508, 128, 252, 384, 380,
     53    64, 124, 320, 444, 192, 188, 448, 316,
     54    32,  60, 288, 476, 160, 220, 416, 348,
     55    96,  92, 352, 412, 224, 156, 480, 284,
     56    16,  28, 272, 492, 144, 236, 400, 364,
     57    80, 108, 336, 428, 208, 172, 464, 300,
     58    48,  44, 304, 460, 176, 204, 432, 332,
     59   112,  76, 368, 396, 240, 140, 496, 268,
     60     8,  12, 264, 500, 136, 244, 392, 372,
     61    72, 116, 328, 436, 200, 180, 456, 308,
     62    40,  52, 296, 468, 168, 212, 424, 340,
     63   104,  84, 360, 404, 232, 148, 488, 276,
     64    24,  20, 280, 484, 152, 228, 408, 356,
     65    88, 100, 344, 420, 216, 164, 472, 292,
     66    56,  36, 312, 452, 184, 196, 440, 324,
     67   120,  68, 376, 388, 248, 132, 504, 260
     68 };
     69 
     // Forward declaration only; the definition is not part of the code visible
     // to this review.
     // NOTE(review): from the parameter names, dfa looks like a near-end
     // magnitude spectrum, out the complex spectrum that noise is added to and
     // lambda a per-bin gain -- confirm against the definition.
     70 static void ComfortNoise(AecmCore_t* aecm,
     71                          const uint16_t* dfa,
     72                          complex16_t* out,
     73                          const int16_t* lambda);
     74 
     // Windows one block of time-domain samples, runs a forward complex FFT on
     // it and copies the result to freq_signal (MIPS inline-assembly version).
     //
     //   aecm                 Not referenced by this function.
     //   fft                  Work buffer. Its first PART_LEN4 int16 elements
     //                        are zeroed, filled with the windowed input and
     //                        then transformed in place by
     //                        WebRtcSpl_ComplexFFT().
     //   time_signal          Input samples; indices n and n + 64 are read for
     //                        n = 0..63, i.e. 128 int16 values in total.
     //   freq_signal          Output; 64 complex values are written. Each one
     //                        is a consecutive int16 pair from fft with the
     //                        second element of the pair negated.
     //   time_signal_scaling  Scaling exponent. Every window product is shifted
     //                        by (time_signal_scaling - 14) bits: left when
     //                        that is >= 0, arithmetically right otherwise.
     //
     // Both assembly blocks run under ".set noreorder", so the instruction
     // written directly after a branch (indented by one extra space) sits in
     // the branch delay slot and executes whether or not the branch is taken.
     75 static void WindowAndFFT(AecmCore_t* aecm,
     76                          int16_t* fft,
     77                          const int16_t* time_signal,
     78                          complex16_t* freq_signal,
     79                          int time_signal_scaling) {
     80   int i, j;
     81   int32_t tmp1, tmp2, tmp3, tmp4;
     82   int16_t* pfrfi;
     83   complex16_t* pfreq_signal;
     84   int16_t  f_coef, s_coef;
     85   int32_t load_ptr, store_ptr1, store_ptr2, shift, shift1;
     86   int32_t hann, hann1, coefs;
     87 
          // The stores below only hit offsets that are multiples of 4 bytes (see
          // coefTable), so every other int16 slot keeps the zero written here.
     88   memset(fft, 0, sizeof(int16_t) * PART_LEN4);
     89 
     90   // FFT of signal
          // Windowing loop. Setup: shift = time_signal_scaling - 14, i = 64
          // iterations, hann walks the window forwards from index 0 and hann1
          // walks it backwards from byte offset 128 (index 64). Two copies of
          // the loop exist so the shift direction is chosen once:
          //   label 1: shift >= 0 -> product is shifted left by shift (sllv)
          //   label 2: shift <  0 -> product is shifted right by -shift (srav)
          // Per iteration n:
          //   tmp1 = time_signal[n]      * hanning[n]
          //   tmp3 = time_signal[n + 64] * hanning[64 - n]
          // and the low 16 bits of each shifted product are stored at
          // fft + coefTable[2n] and fft + coefTable[2n + 1] (byte offsets).
     91   __asm __volatile (
     92     ".set        push                                                    \n\t"
     93     ".set        noreorder                                               \n\t"
     94     "addiu       %[shift],          %[time_signal_scaling], -14          \n\t"
     95     "addiu       %[i],              $zero,                  64           \n\t"
     96     "addiu       %[load_ptr],       %[time_signal],         0            \n\t"
     97     "addiu       %[hann],           %[hanning],             0            \n\t"
     98     "addiu       %[hann1],          %[hanning],             128          \n\t"
     99     "addiu       %[coefs],          %[coefTable],           0            \n\t"
            // shift1 = -shift is computed in the delay slot on both paths; it is
            // only used by the loop at label 2.
    100     "bltz        %[shift],          2f                                   \n\t"
    101     " negu       %[shift1],         %[shift]                             \n\t"
    102    "1:                                                                   \n\t"
    103     "lh          %[tmp1],           0(%[load_ptr])                       \n\t"
    104     "lh          %[tmp2],           0(%[hann])                           \n\t"
    105     "lh          %[tmp3],           128(%[load_ptr])                     \n\t"
    106     "lh          %[tmp4],           0(%[hann1])                          \n\t"
    107     "addiu       %[i],              %[i],                   -1           \n\t"
    108     "mul         %[tmp1],           %[tmp1],                %[tmp2]      \n\t"
    109     "mul         %[tmp3],           %[tmp3],                %[tmp4]      \n\t"
    110     "lh          %[f_coef],         0(%[coefs])                          \n\t"
    111     "lh          %[s_coef],         2(%[coefs])                          \n\t"
    112     "addiu       %[load_ptr],       %[load_ptr],            2            \n\t"
    113     "addiu       %[hann],           %[hann],                2            \n\t"
    114     "addiu       %[hann1],          %[hann1],               -2           \n\t"
    115     "addu        %[store_ptr1],     %[fft],                 %[f_coef]    \n\t"
    116     "addu        %[store_ptr2],     %[fft],                 %[s_coef]    \n\t"
    117     "sllv        %[tmp1],           %[tmp1],                %[shift]     \n\t"
    118     "sllv        %[tmp3],           %[tmp3],                %[shift]     \n\t"
    119     "sh          %[tmp1],           0(%[store_ptr1])                     \n\t"
    120     "sh          %[tmp3],           0(%[store_ptr2])                     \n\t"
            // The table pointer advance (2 entries) sits in the delay slot.
    121     "bgtz        %[i],              1b                                   \n\t"
    122     " addiu      %[coefs],          %[coefs],               4            \n\t"
    123     "b           3f                                                      \n\t"
    124     " nop                                                                \n\t"
            // Same loop body as label 1, but with an arithmetic right shift.
    125    "2:                                                                   \n\t"
    126     "lh          %[tmp1],           0(%[load_ptr])                       \n\t"
    127     "lh          %[tmp2],           0(%[hann])                           \n\t"
    128     "lh          %[tmp3],           128(%[load_ptr])                     \n\t"
    129     "lh          %[tmp4],           0(%[hann1])                          \n\t"
    130     "addiu       %[i],              %[i],                   -1           \n\t"
    131     "mul         %[tmp1],           %[tmp1],                %[tmp2]      \n\t"
    132     "mul         %[tmp3],           %[tmp3],                %[tmp4]      \n\t"
    133     "lh          %[f_coef],         0(%[coefs])                          \n\t"
    134     "lh          %[s_coef],         2(%[coefs])                          \n\t"
    135     "addiu       %[load_ptr],       %[load_ptr],            2            \n\t"
    136     "addiu       %[hann],           %[hann],                2            \n\t"
    137     "addiu       %[hann1],          %[hann1],               -2           \n\t"
    138     "addu        %[store_ptr1],     %[fft],                 %[f_coef]    \n\t"
    139     "addu        %[store_ptr2],     %[fft],                 %[s_coef]    \n\t"
    140     "srav        %[tmp1],           %[tmp1],                %[shift1]    \n\t"
    141     "srav        %[tmp3],           %[tmp3],                %[shift1]    \n\t"
    142     "sh          %[tmp1],           0(%[store_ptr1])                     \n\t"
    143     "sh          %[tmp3],           0(%[store_ptr2])                     \n\t"
    144     "bgtz        %[i],              2b                                   \n\t"
    145     " addiu      %[coefs],          %[coefs],               4            \n\t"
    146    "3:                                                                   \n\t"
    147     ".set        pop                                                     \n\t"
            // All outputs are early-clobber ("=&r") so none of them can share a
            // register with the inputs, which are read throughout the loop.
    148     : [load_ptr] "=&r" (load_ptr), [shift] "=&r" (shift), [hann] "=&r" (hann),
    149       [hann1] "=&r" (hann1), [shift1] "=&r" (shift1), [coefs] "=&r" (coefs),
    150       [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3),
    151       [tmp4] "=&r" (tmp4), [i] "=&r" (i), [f_coef] "=&r" (f_coef),
    152       [s_coef] "=&r" (s_coef), [store_ptr1] "=&r" (store_ptr1),
    153       [store_ptr2] "=&r" (store_ptr2)
    154     : [time_signal] "r" (time_signal), [coefTable] "r" (coefTable),
    155       [time_signal_scaling] "r" (time_signal_scaling),
    156       [hanning] "r" (WebRtcAecm_kSqrtHanning), [fft] "r" (fft)
    157     : "memory", "hi", "lo"
    158   );
    159 
          // In-place transform; the return value is not used here.
    160   WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
    161   pfrfi = fft;
    162   pfreq_signal = freq_signal;
    163 
          // Copy loop: j counts int16 elements from 128 down in steps of 8, so
          // 16 iterations move 4 pairs each (64 pairs in total). In every pair
          // the first element is copied unchanged and the second is negated
          // (subu from $zero), i.e. the stored spectrum is the complex
          // conjugate of the FFT output.
          // The 0/2 byte offsets assume complex16_t is two consecutive int16
          // values, real first -- TODO confirm against its declaration.
    164   __asm __volatile (
    165     ".set        push                                                     \n\t"
    166     ".set        noreorder                                                \n\t"
    167     "addiu       %[j],              $zero,                 128            \n\t"
    168    "1:                                                                    \n\t"
    169     "lh          %[tmp1],           0(%[pfrfi])                           \n\t"
    170     "lh          %[tmp2],           2(%[pfrfi])                           \n\t"
    171     "lh          %[tmp3],           4(%[pfrfi])                           \n\t"
    172     "lh          %[tmp4],           6(%[pfrfi])                           \n\t"
    173     "subu        %[tmp2],           $zero,                 %[tmp2]        \n\t"
    174     "sh          %[tmp1],           0(%[pfreq_signal])                    \n\t"
    175     "sh          %[tmp2],           2(%[pfreq_signal])                    \n\t"
    176     "subu        %[tmp4],           $zero,                 %[tmp4]        \n\t"
    177     "sh          %[tmp3],           4(%[pfreq_signal])                    \n\t"
    178     "sh          %[tmp4],           6(%[pfreq_signal])                    \n\t"
    179     "lh          %[tmp1],           8(%[pfrfi])                           \n\t"
    180     "lh          %[tmp2],           10(%[pfrfi])                          \n\t"
    181     "lh          %[tmp3],           12(%[pfrfi])                          \n\t"
    182     "lh          %[tmp4],           14(%[pfrfi])                          \n\t"
    183     "addiu       %[j],              %[j],                  -8             \n\t"
    184     "subu        %[tmp2],           $zero,                 %[tmp2]        \n\t"
    185     "sh          %[tmp1],           8(%[pfreq_signal])                    \n\t"
    186     "sh          %[tmp2],           10(%[pfreq_signal])                   \n\t"
    187     "subu        %[tmp4],           $zero,                 %[tmp4]        \n\t"
    188     "sh          %[tmp3],           12(%[pfreq_signal])                   \n\t"
    189     "sh          %[tmp4],           14(%[pfreq_signal])                   \n\t"
    190     "addiu       %[pfreq_signal],   %[pfreq_signal],       16             \n\t"
            // The source pointer advance sits in the delay slot.
    191     "bgtz        %[j],              1b                                    \n\t"
    192     " addiu      %[pfrfi],          %[pfrfi],              16             \n\t"
    193     ".set        pop                                                      \n\t"
    194     : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3),
    195       [j] "=&r" (j), [pfrfi] "+r" (pfrfi), [pfreq_signal] "+r" (pfreq_signal),
    196       [tmp4] "=&r" (tmp4)
    197     :
    198     : "memory"
    199   );
    200 }
    201 
    // Rebuilds a conjugate-symmetric spectrum from efw, runs the inverse complex
    // FFT, windows the result, overlap-adds it with aecm->outBuf and finally
    // shifts the time-domain history buffers (MIPS inline-assembly version).
    //
    //   aecm          outBuf is read (overlap from the previous call) and
    //                 rewritten (overlap for the next call); dfaCleanQDomain is
    //                 read; xBuf, dBufNoisy and optionally dBufClean have their
    //                 second PART_LEN samples copied to the front.
    //   fft           Work buffer. Besides the regular area the first loop also
    //                 writes two int16 values at byte offset 512 (indices 256
    //                 and 257), see coefTable_ifft, so the buffer must hold at
    //                 least 258 int16 elements.
    //   efw           Input spectrum. Bins 0..63 are read by the assembly and
    //                 efw[PART_LEN] is read in C. The 0/2 byte offsets used by
    //                 the assembly assume complex16_t is two consecutive int16
    //                 values, real first -- TODO confirm.
    //   output        Receives 64 int16 output samples.
    //   nearendClean  Only compared against NULL: when non-NULL, dBufClean is
    //                 shifted as well.
    //
    // All assembly blocks run under ".set noreorder", so the instruction
    // written directly after a branch (indented by one extra space) sits in
    // the branch delay slot and executes whether or not the branch is taken.
    202 static void InverseFFTAndWindow(AecmCore_t* aecm,
    203                                 int16_t* fft,
    204                                 complex16_t* efw,
    205                                 int16_t* output,
    206                                 const int16_t* nearendClean) {
    207   int i, outCFFT;
    208   int32_t tmp1, tmp2, tmp3, tmp4, tmp_re, tmp_im;
    209   int16_t* pcoefTable_ifft = coefTable_ifft;
    210   int16_t* pfft = fft;
    211   int16_t* ppfft = fft;
    212   complex16_t* pefw = efw;
    213   int32_t out_aecm;
    214   int16_t* paecm_buf = aecm->outBuf;
    215   const int16_t* p_kSqrtHanning = WebRtcAecm_kSqrtHanning;
    216   const int16_t* pp_kSqrtHanning = &WebRtcAecm_kSqrtHanning[PART_LEN];
    217   int16_t* output1 = output;
    218 
          // Scatter loop: 16 iterations, 4 bins each (i counts 64 down by 4).
          // For every bin k the table supplies two byte offsets (tmp1, tmp2):
          //   fft + tmp2 <- (real,  imag)
          //   fft + tmp1 <- (real, -imag)
          // NOTE(review): fft is only read by the assembly but is declared
          // "+r". Do not turn it into a plain input without checking that the
          // compiler can then not place it in the same register as pfft, which
          // enters the block holding the same value and is overwritten inside.
    219   __asm __volatile (
    220     ".set      push                                                        \n\t"
    221     ".set      noreorder                                                   \n\t"
    222     "addiu     %[i],                $zero,                   64            \n\t"
    223    "1:                                                                     \n\t"
    224     "lh        %[tmp1],             0(%[pcoefTable_ifft])                  \n\t"
    225     "lh        %[tmp2],             2(%[pcoefTable_ifft])                  \n\t"
    226     "lh        %[tmp_re],           0(%[pefw])                             \n\t"
    227     "lh        %[tmp_im],           2(%[pefw])                             \n\t"
    228     "addu      %[pfft],             %[fft],                  %[tmp2]       \n\t"
    229     "sh        %[tmp_re],           0(%[pfft])                             \n\t"
    230     "sh        %[tmp_im],           2(%[pfft])                             \n\t"
    231     "addu      %[pfft],             %[fft],                  %[tmp1]       \n\t"
    232     "sh        %[tmp_re],           0(%[pfft])                             \n\t"
    233     "subu      %[tmp_im],           $zero,                   %[tmp_im]     \n\t"
    234     "sh        %[tmp_im],           2(%[pfft])                             \n\t"
    235     "lh        %[tmp1],             4(%[pcoefTable_ifft])                  \n\t"
    236     "lh        %[tmp2],             6(%[pcoefTable_ifft])                  \n\t"
    237     "lh        %[tmp_re],           4(%[pefw])                             \n\t"
    238     "lh        %[tmp_im],           6(%[pefw])                             \n\t"
    239     "addu      %[pfft],             %[fft],                  %[tmp2]       \n\t"
    240     "sh        %[tmp_re],           0(%[pfft])                             \n\t"
    241     "sh        %[tmp_im],           2(%[pfft])                             \n\t"
    242     "addu      %[pfft],             %[fft],                  %[tmp1]       \n\t"
    243     "sh        %[tmp_re],           0(%[pfft])                             \n\t"
    244     "subu      %[tmp_im],           $zero,                   %[tmp_im]     \n\t"
    245     "sh        %[tmp_im],           2(%[pfft])                             \n\t"
    246     "lh        %[tmp1],             8(%[pcoefTable_ifft])                  \n\t"
    247     "lh        %[tmp2],             10(%[pcoefTable_ifft])                 \n\t"
    248     "lh        %[tmp_re],           8(%[pefw])                             \n\t"
    249     "lh        %[tmp_im],           10(%[pefw])                            \n\t"
    250     "addu      %[pfft],             %[fft],                  %[tmp2]       \n\t"
    251     "sh        %[tmp_re],           0(%[pfft])                             \n\t"
    252     "sh        %[tmp_im],           2(%[pfft])                             \n\t"
    253     "addu      %[pfft],             %[fft],                  %[tmp1]       \n\t"
    254     "sh        %[tmp_re],           0(%[pfft])                             \n\t"
    255     "subu      %[tmp_im],           $zero,                   %[tmp_im]     \n\t"
    256     "sh        %[tmp_im],           2(%[pfft])                             \n\t"
    257     "lh        %[tmp1],             12(%[pcoefTable_ifft])                 \n\t"
    258     "lh        %[tmp2],             14(%[pcoefTable_ifft])                 \n\t"
    259     "lh        %[tmp_re],           12(%[pefw])                            \n\t"
    260     "lh        %[tmp_im],           14(%[pefw])                            \n\t"
    261     "addu      %[pfft],             %[fft],                  %[tmp2]       \n\t"
    262     "sh        %[tmp_re],           0(%[pfft])                             \n\t"
    263     "sh        %[tmp_im],           2(%[pfft])                             \n\t"
    264     "addu      %[pfft],             %[fft],                  %[tmp1]       \n\t"
    265     "sh        %[tmp_re],           0(%[pfft])                             \n\t"
    266     "subu      %[tmp_im],           $zero,                   %[tmp_im]     \n\t"
    267     "sh        %[tmp_im],           2(%[pfft])                             \n\t"
    268     "addiu     %[pcoefTable_ifft],  %[pcoefTable_ifft],      16            \n\t"
    269     "addiu     %[i],                %[i],                    -4            \n\t"
            // The efw pointer advance (4 bins) sits in the delay slot.
    270     "bgtz      %[i],                1b                                     \n\t"
    271     " addiu    %[pefw],             %[pefw],                 16            \n\t"
    272     ".set      pop                                                         \n\t"
    273     : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [pfft] "+r" (pfft),
    274       [i] "=&r" (i), [tmp_re] "=&r" (tmp_re), [tmp_im] "=&r" (tmp_im),
    275       [pefw] "+r" (pefw), [pcoefTable_ifft] "+r" (pcoefTable_ifft),
    276       [fft] "+r" (fft)
    277     :
    278     : "memory"
    279   );
    280 
          // Bin PART_LEN is not covered by the loop above; it is placed in
          // elements 2 and 3 with its imaginary part negated.
    281   fft[2] = efw[PART_LEN].real;
    282   fft[3] = -efw[PART_LEN].imag;
    283 
          // In-place inverse transform; the return value is combined with
          // dfaCleanQDomain below to form the output shift.
    284   outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
    285   pfft = fft;
    286 
          // Compaction loop: reads every other int16 (byte offsets 0, 4, 8, 12,
          // i.e. the first element of each pair) through ppfft and packs them
          // contiguously through pfft, in place. 32 iterations of 4 values give
          // 128 packed samples at the start of fft.
    287   __asm __volatile (
    288     ".set       push                                               \n\t"
    289     ".set       noreorder                                          \n\t"
    290     "addiu      %[i],            $zero,               128          \n\t"
    291    "1:                                                             \n\t"
    292     "lh         %[tmp1],         0(%[ppfft])                       \n\t"
    293     "lh         %[tmp2],         4(%[ppfft])                       \n\t"
    294     "lh         %[tmp3],         8(%[ppfft])                       \n\t"
    295     "lh         %[tmp4],         12(%[ppfft])                      \n\t"
    296     "addiu      %[i],            %[i],                -4           \n\t"
    297     "sh         %[tmp1],         0(%[pfft])                        \n\t"
    298     "sh         %[tmp2],         2(%[pfft])                        \n\t"
    299     "sh         %[tmp3],         4(%[pfft])                        \n\t"
    300     "sh         %[tmp4],         6(%[pfft])                        \n\t"
    301     "addiu      %[ppfft],        %[ppfft],            16           \n\t"
    302     "bgtz       %[i],            1b                                \n\t"
    303     " addiu     %[pfft],         %[pfft],             8            \n\t"
    304     ".set       pop                                                \n\t"
    305     : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [pfft] "+r" (pfft),
    306       [i] "=&r" (i), [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
    307       [ppfft] "+r" (ppfft)
    308     :
    309     : "memory"
    310   );
    311 
    312   pfft = fft;
          // Shift applied to the windowed samples: left when >= 0, arithmetic
          // right by the negated amount otherwise.
    313   out_aecm = (int32_t)(outCFFT - aecm->dfaCleanQDomain);
    314 
          // Window / overlap-add loop: 32 iterations, 2 samples each (i counts
          // 64 down by 2). For sample index n and its neighbour n + 1:
          //   first half : (fft[n] * hanning[n] + 8192) >> 14, shifted by
          //                out_aecm, plus aecm->outBuf[n], saturated to int16
          //                and stored to both fft[n] and output[n];
          //   second half: (fft[n + 64] * hanning[PART_LEN - n]) >> 14 (no
          //                rounding term), shifted by out_aecm, saturated to
          //                int16 and stored to aecm->outBuf[n].
          // Saturation uses the DSP ASE saturating shift (shll_s.w by 16, then
          // sra by 16) when MIPS_DSP_R1_LE is defined. Otherwise the sign
          // (value >> 31) is compared with value >> 15; if they differ the
          // value is replaced by 0x7fff ^ sign, whose low 16 bits are 0x7fff
          // for positive and 0x8000 for negative overflow.
          // NOTE(review): the [WebRtcAecm_kSqrtHanning] input operand is never
          // referenced inside the assembly text.
    315   __asm __volatile (
    316     ".set       push                                                       \n\t"
    317     ".set       noreorder                                                  \n\t"
    318     "addiu      %[i],                $zero,                  64            \n\t"
    319    "11:                                                                    \n\t"
    320     "lh         %[tmp1],             0(%[pfft])                            \n\t"
    321     "lh         %[tmp2],             0(%[p_kSqrtHanning])                  \n\t"
    322     "addiu      %[i],                %[i],                   -2            \n\t"
    323     "mul        %[tmp1],             %[tmp1],                %[tmp2]       \n\t"
    324     "lh         %[tmp3],             2(%[pfft])                            \n\t"
    325     "lh         %[tmp4],             2(%[p_kSqrtHanning])                  \n\t"
    326     "mul        %[tmp3],             %[tmp3],                %[tmp4]       \n\t"
    327     "addiu      %[tmp1],             %[tmp1],                8192          \n\t"
    328     "sra        %[tmp1],             %[tmp1],                14            \n\t"
    329     "addiu      %[tmp3],             %[tmp3],                8192          \n\t"
    330     "sra        %[tmp3],             %[tmp3],                14            \n\t"
            // tmp2 = -out_aecm is computed in the delay slot on both paths; it
            // is only used by the right-shift path.
    331     "bgez       %[out_aecm],         1f                                    \n\t"
    332     " negu      %[tmp2],             %[out_aecm]                           \n\t"
    333     "srav       %[tmp1],             %[tmp1],                %[tmp2]       \n\t"
    334     "b          2f                                                         \n\t"
    335     " srav      %[tmp3],             %[tmp3],                %[tmp2]       \n\t"
    336    "1:                                                                     \n\t"
    337     "sllv       %[tmp1],             %[tmp1],                %[out_aecm]   \n\t"
    338     "sllv       %[tmp3],             %[tmp3],                %[out_aecm]   \n\t"
    339    "2:                                                                     \n\t"
            // Add the overlap saved by the previous call.
    340     "lh         %[tmp4],             0(%[paecm_buf])                       \n\t"
    341     "lh         %[tmp2],             2(%[paecm_buf])                       \n\t"
    342     "addu       %[tmp3],             %[tmp3],                %[tmp2]       \n\t"
    343     "addu       %[tmp1],             %[tmp1],                %[tmp4]       \n\t"
    344 #if defined(MIPS_DSP_R1_LE)
    345     "shll_s.w   %[tmp1],             %[tmp1],                16            \n\t"
    346     "sra        %[tmp1],             %[tmp1],                16            \n\t"
    347     "shll_s.w   %[tmp3],             %[tmp3],                16            \n\t"
    348     "sra        %[tmp3],             %[tmp3],                16            \n\t"
    349 #else  // #if defined(MIPS_DSP_R1_LE)
    350     "sra        %[tmp4],             %[tmp1],                31            \n\t"
    351     "sra        %[tmp2],             %[tmp1],                15            \n\t"
    352     "beq        %[tmp4],             %[tmp2],                3f            \n\t"
    353     " ori       %[tmp2],             $zero,                  0x7fff        \n\t"
    354     "xor        %[tmp1],             %[tmp2],                %[tmp4]       \n\t"
    355    "3:                                                                     \n\t"
    356     "sra        %[tmp2],             %[tmp3],                31            \n\t"
    357     "sra        %[tmp4],             %[tmp3],                15            \n\t"
    358     "beq        %[tmp2],             %[tmp4],                4f            \n\t"
    359     " ori       %[tmp4],             $zero,                  0x7fff        \n\t"
    360     "xor        %[tmp3],             %[tmp4],                %[tmp2]       \n\t"
    361    "4:                                                                     \n\t"
    362 #endif  // #if defined(MIPS_DSP_R1_LE)
    363     "sh         %[tmp1],             0(%[pfft])                            \n\t"
    364     "sh         %[tmp1],             0(%[output1])                         \n\t"
    365     "sh         %[tmp3],             2(%[pfft])                            \n\t"
    366     "sh         %[tmp3],             2(%[output1])                         \n\t"
            // Second half of the block: window with the mirrored coefficients
            // and keep the result as the overlap for the next call.
    367     "lh         %[tmp1],             128(%[pfft])                          \n\t"
    368     "lh         %[tmp2],             0(%[pp_kSqrtHanning])                 \n\t"
    369     "mul        %[tmp1],             %[tmp1],                %[tmp2]       \n\t"
    370     "lh         %[tmp3],             130(%[pfft])                          \n\t"
    371     "lh         %[tmp4],             -2(%[pp_kSqrtHanning])                \n\t"
    372     "mul        %[tmp3],             %[tmp3],                %[tmp4]       \n\t"
    373     "sra        %[tmp1],             %[tmp1],                14            \n\t"
    374     "sra        %[tmp3],             %[tmp3],                14            \n\t"
    375     "bgez       %[out_aecm],         5f                                    \n\t"
    376     " negu      %[tmp2],             %[out_aecm]                           \n\t"
    377     "srav       %[tmp3],             %[tmp3],                %[tmp2]       \n\t"
    378     "b          6f                                                         \n\t"
    379     " srav      %[tmp1],             %[tmp1],                %[tmp2]       \n\t"
    380    "5:                                                                     \n\t"
    381     "sllv       %[tmp1],             %[tmp1],                %[out_aecm]   \n\t"
    382     "sllv       %[tmp3],             %[tmp3],                %[out_aecm]   \n\t"
    383    "6:                                                                     \n\t"
    384 #if defined(MIPS_DSP_R1_LE)
    385     "shll_s.w   %[tmp1],             %[tmp1],                16            \n\t"
    386     "sra        %[tmp1],             %[tmp1],                16            \n\t"
    387     "shll_s.w   %[tmp3],             %[tmp3],                16            \n\t"
    388     "sra        %[tmp3],             %[tmp3],                16            \n\t"
    389 #else  // #if defined(MIPS_DSP_R1_LE)
    390     "sra        %[tmp4],             %[tmp1],                31            \n\t"
    391     "sra        %[tmp2],             %[tmp1],                15            \n\t"
    392     "beq        %[tmp4],             %[tmp2],                7f            \n\t"
    393     " ori       %[tmp2],             $zero,                  0x7fff        \n\t"
    394     "xor        %[tmp1],             %[tmp2],                %[tmp4]       \n\t"
    395    "7:                                                                     \n\t"
    396     "sra        %[tmp2],             %[tmp3],                31            \n\t"
    397     "sra        %[tmp4],             %[tmp3],                15            \n\t"
    398     "beq        %[tmp2],             %[tmp4],                8f            \n\t"
    399     " ori       %[tmp4],             $zero,                  0x7fff        \n\t"
    400     "xor        %[tmp3],             %[tmp4],                %[tmp2]       \n\t"
    401    "8:                                                                     \n\t"
    402 #endif  // #if defined(MIPS_DSP_R1_LE)
    403     "sh         %[tmp1],             0(%[paecm_buf])                       \n\t"
    404     "sh         %[tmp3],             2(%[paecm_buf])                       \n\t"
    405     "addiu      %[output1],          %[output1],             4             \n\t"
    406     "addiu      %[paecm_buf],        %[paecm_buf],           4             \n\t"
    407     "addiu      %[pfft],             %[pfft],                4             \n\t"
    408     "addiu      %[p_kSqrtHanning],   %[p_kSqrtHanning],      4             \n\t"
            // The backwards window pointer update sits in the delay slot.
    409     "bgtz       %[i],                11b                                   \n\t"
    410     " addiu     %[pp_kSqrtHanning],  %[pp_kSqrtHanning],     -4            \n\t"
    411     ".set       pop                                                        \n\t"
    412     : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [pfft] "+r" (pfft),
    413       [output1] "+r" (output1), [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
    414       [paecm_buf] "+r" (paecm_buf), [i] "=&r" (i),
    415       [pp_kSqrtHanning] "+r" (pp_kSqrtHanning),
    416       [p_kSqrtHanning] "+r" (p_kSqrtHanning)
    417     : [out_aecm] "r" (out_aecm),
    418       [WebRtcAecm_kSqrtHanning] "r" (WebRtcAecm_kSqrtHanning)
    419     : "hi", "lo","memory"
    420   );
    421 
    422   // Copy the current block to the old position
    423   // (aecm->outBuf is shifted elsewhere)
    424   memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(int16_t) * PART_LEN);
    425   memcpy(aecm->dBufNoisy,
    426          aecm->dBufNoisy + PART_LEN,
    427          sizeof(int16_t) * PART_LEN);
          // The clean near-end history is only shifted when the caller supplied
          // a clean near-end signal.
    428   if (nearendClean != NULL) {
    429     memcpy(aecm->dBufClean,
    430            aecm->dBufClean + PART_LEN,
    431            sizeof(int16_t) * PART_LEN);
    432   }
    433 }
    434 
    435 void WebRtcAecm_CalcLinearEnergies_mips(AecmCore_t* aecm,
    436                                         const uint16_t* far_spectrum,
    437                                         int32_t* echo_est,
    438                                         uint32_t* far_energy,
    439                                         uint32_t* echo_energy_adapt,
    440                                         uint32_t* echo_energy_stored) {
    441   int i;
    442   uint32_t par1 = (*far_energy);
    443   uint32_t par2 = (*echo_energy_adapt);
    444   uint32_t par3 = (*echo_energy_stored);
    445   int16_t* ch_stored_p = &(aecm->channelStored[0]);
    446   int16_t* ch_adapt_p = &(aecm->channelAdapt16[0]);
    447   uint16_t* spectrum_p = (uint16_t*)(&(far_spectrum[0]));
    448   int32_t* echo_p = &(echo_est[0]);
    449   int32_t temp0, stored0, echo0, adept0, spectrum0;
    450   int32_t stored1, adept1, spectrum1, echo1, temp1;
    451 
    452   // Get energy for the delayed far end signal and estimated
    453   // echo using both stored and adapted channels.
    454   for (i = 0; i < PART_LEN; i+= 4) {
    455     __asm __volatile (
    456       ".set           push                                            \n\t"
    457       ".set           noreorder                                       \n\t"
    458       "lh             %[stored0],     0(%[ch_stored_p])               \n\t"
    459       "lhu            %[adept0],      0(%[ch_adapt_p])                \n\t"
    460       "lhu            %[spectrum0],   0(%[spectrum_p])                \n\t"
    461       "lh             %[stored1],     2(%[ch_stored_p])               \n\t"
    462       "lhu            %[adept1],      2(%[ch_adapt_p])                \n\t"
    463       "lhu            %[spectrum1],   2(%[spectrum_p])                \n\t"
    464       "mul            %[echo0],       %[stored0],     %[spectrum0]    \n\t"
    465       "mul            %[temp0],       %[adept0],      %[spectrum0]    \n\t"
    466       "mul            %[echo1],       %[stored1],     %[spectrum1]    \n\t"
    467       "mul            %[temp1],       %[adept1],      %[spectrum1]    \n\t"
    468       "addu           %[par1],        %[par1],        %[spectrum0]    \n\t"
    469       "addu           %[par1],        %[par1],        %[spectrum1]    \n\t"
    470       "addiu          %[echo_p],      %[echo_p],      16              \n\t"
    471       "addu           %[par3],        %[par3],        %[echo0]        \n\t"
    472       "addu           %[par2],        %[par2],        %[temp0]        \n\t"
    473       "addu           %[par3],        %[par3],        %[echo1]        \n\t"
    474       "addu           %[par2],        %[par2],        %[temp1]        \n\t"
    475       "usw            %[echo0],       -16(%[echo_p])                  \n\t"
    476       "usw            %[echo1],       -12(%[echo_p])                  \n\t"
    477       "lh             %[stored0],     4(%[ch_stored_p])               \n\t"
    478       "lhu            %[adept0],      4(%[ch_adapt_p])                \n\t"
    479       "lhu            %[spectrum0],   4(%[spectrum_p])                \n\t"
    480       "lh             %[stored1],     6(%[ch_stored_p])               \n\t"
    481       "lhu            %[adept1],      6(%[ch_adapt_p])                \n\t"
    482       "lhu            %[spectrum1],   6(%[spectrum_p])                \n\t"
    483       "mul            %[echo0],       %[stored0],     %[spectrum0]    \n\t"
    484       "mul            %[temp0],       %[adept0],      %[spectrum0]    \n\t"
    485       "mul            %[echo1],       %[stored1],     %[spectrum1]    \n\t"
    486       "mul            %[temp1],       %[adept1],      %[spectrum1]    \n\t"
    487       "addu           %[par1],        %[par1],        %[spectrum0]    \n\t"
    488       "addu           %[par1],        %[par1],        %[spectrum1]    \n\t"
    489       "addiu          %[ch_stored_p], %[ch_stored_p], 8               \n\t"
    490       "addiu          %[ch_adapt_p],  %[ch_adapt_p],  8               \n\t"
    491       "addiu          %[spectrum_p],  %[spectrum_p],  8               \n\t"
    492       "addu           %[par3],        %[par3],        %[echo0]        \n\t"
    493       "addu           %[par2],        %[par2],        %[temp0]        \n\t"
    494       "addu           %[par3],        %[par3],        %[echo1]        \n\t"
    495       "addu           %[par2],        %[par2],        %[temp1]        \n\t"
    496       "usw            %[echo0],       -8(%[echo_p])                   \n\t"
    497       "usw            %[echo1],       -4(%[echo_p])                   \n\t"
    498       ".set           pop                                             \n\t"
    499       : [temp0] "=&r" (temp0), [stored0] "=&r" (stored0),
    500         [adept0] "=&r" (adept0), [spectrum0] "=&r" (spectrum0),
    501         [echo0] "=&r" (echo0), [echo_p] "+r" (echo_p), [par3] "+r" (par3),
    502         [par1] "+r" (par1), [par2] "+r" (par2), [stored1] "=&r" (stored1),
    503         [adept1] "=&r" (adept1), [echo1] "=&r" (echo1),
    504         [spectrum1] "=&r" (spectrum1), [temp1] "=&r" (temp1),
    505         [ch_stored_p] "+r" (ch_stored_p), [ch_adapt_p] "+r" (ch_adapt_p),
    506         [spectrum_p] "+r" (spectrum_p)
    507       :
    508       : "hi", "lo", "memory"
    509     );
    510   }
    511 
    512   echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
    513                                              far_spectrum[PART_LEN]);
    514   par1 += (uint32_t)(far_spectrum[PART_LEN]);
    515   par2 += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN];
    516   par3 += (uint32_t)echo_est[PART_LEN];
    517 
    518   (*far_energy) = par1;
    519   (*echo_energy_adapt) = par2;
    520   (*echo_energy_stored) = par3;
    521 }
    522 
    523 #if defined(MIPS_DSP_R1_LE)
    // Copies the adaptive channel into the stored channel and recomputes the
    // echo estimate from the stored channel (MIPS DSP inline-asm variant).
    //
    // aecm          [in/out]  channelAdapt16 is copied into channelStored
    //                         (PART_LEN1 int16_t entries).
    // far_spectrum  [in]      Far end spectrum, read for bins 0..PART_LEN.
    // echo_est      [out]     Written for bins 0..PART_LEN with the product of
    //                         channelStored[i] and far_spectrum[i].
    524 void WebRtcAecm_StoreAdaptiveChannel_mips(AecmCore_t* aecm,
    525                                           const uint16_t* far_spectrum,
    526                                           int32_t* echo_est) {
    527   int i;
    528   int16_t* temp1;
    529   uint16_t* temp8;
    530   int32_t temp0, temp2, temp3, temp4, temp5, temp6;
          // Running pointers advanced by the asm below: temp7 walks echo_est,
          // temp1 walks channelStored and temp8 walks far_spectrum.
    531   int32_t* temp7 = &(echo_est[0]);
    532   temp1 = &(aecm->channelStored[0]);
    533   temp8 = (uint16_t*)(&far_spectrum[0]);
    534 
    535   // During startup we store the channel every block.
    536   memcpy(aecm->channelStored, aecm->channelAdapt16,
    537          sizeof(int16_t) * PART_LEN1);
    538   // Recalculate echo estimate
          // Four bins per iteration. The asm advances the three pointers itself,
          // so |i| only counts iterations here.
    539   for (i = 0; i < PART_LEN; i += 4) {
    540     __asm __volatile (
              // Load two 32-bit words (two 16-bit entries each) from
              // far_spectrum (temp8) and from channelStored (temp1).
    541       "ulw            %[temp0],   0(%[temp8])               \n\t"
    542       "ulw            %[temp2],   0(%[temp1])               \n\t"
    543       "ulw            %[temp4],   4(%[temp8])               \n\t"
    544       "ulw            %[temp5],   4(%[temp1])               \n\t"
              // Multiply matching halfwords (.phl = left/upper halfword,
              // .phr = right/lower halfword). Per the MIPS DSP ASE this is a
              // signed Q15 fractional multiply whose result is shifted left by
              // one bit.
              // NOTE(review): far_spectrum is declared uint16_t and the C tail
              // below uses WEBRTC_SPL_MUL_16_U16, whereas muleq_s treats both
              // operands as signed; presumably far_spectrum stays below 0x8000
              // - confirm the value range.
    545       "muleq_s.w.phl  %[temp3],   %[temp2],     %[temp0]    \n\t"
    546       "muleq_s.w.phr  %[temp0],   %[temp2],     %[temp0]    \n\t"
    547       "muleq_s.w.phl  %[temp6],   %[temp5],     %[temp4]    \n\t"
    548       "muleq_s.w.phr  %[temp4],   %[temp5],     %[temp4]    \n\t"
              // Advance: 4 int32_t outputs (16 bytes), 4 int16_t inputs (8 bytes).
    549       "addiu          %[temp7],   %[temp7],     16          \n\t"
    550       "addiu          %[temp1],   %[temp1],     8           \n\t"
    551       "addiu          %[temp8],   %[temp8],     8           \n\t"
              // Undo the one-bit left shift applied by the fractional multiply.
    552       "sra            %[temp3],   %[temp3],     1           \n\t"
    553       "sra            %[temp0],   %[temp0],     1           \n\t"
    554       "sra            %[temp6],   %[temp6],     1           \n\t"
    555       "sra            %[temp4],   %[temp4],     1           \n\t"
              // Store the four products relative to the already advanced temp7.
              // The right-halfword products go to the lower address of each
              // pair (presumably because the target is little-endian, as the
              // MIPS_DSP_R1_LE guard suggests).
    556       "usw            %[temp3],   -12(%[temp7])             \n\t"
    557       "usw            %[temp0],   -16(%[temp7])             \n\t"
    558       "usw            %[temp6],   -4(%[temp7])              \n\t"
    559       "usw            %[temp4],   -8(%[temp7])              \n\t"
    560       : [temp0] "=&r" (temp0), [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
    561         [temp4] "=&r" (temp4), [temp5] "=&r" (temp5), [temp6] "=&r" (temp6),
    562         [temp1] "+r" (temp1), [temp8] "+r" (temp8), [temp7] "+r" (temp7)
    563       :
    564       : "hi", "lo", "memory"
    565     );
    566   }
          // Last bin, handled in C. |i| equals PART_LEN here provided PART_LEN is
          // a multiple of 4 (assumed; its definition is not in this part of the
          // file).
    567   echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
    568                                       far_spectrum[i]);
    569 }
    570 
    // Resets the adaptive channel to the stored channel (MIPS DSP inline-asm
    // variant).
    //
    // aecm  [in/out]  channelStored is copied into channelAdapt16 (PART_LEN1
    //                 int16_t entries) and channelAdapt32[i] is rebuilt as
    //                 channelStored[i] shifted left by 16 bits.
    571 void WebRtcAecm_ResetAdaptiveChannel_mips(AecmCore_t* aecm) {
    572   int i;
    573   int32_t* temp3;
    574   int16_t* temp0;
    575   int32_t temp1, temp2, temp4, temp5;
    576 
          // temp0 walks the 16-bit source (channelStored), temp3 walks the 32-bit
          // destination (channelAdapt32); both are advanced inside the asm.
    577   temp0 = &(aecm->channelStored[0]);
    578   temp3 = &(aecm->channelAdapt32[0]);
    579 
    580   // The stored channel has a significantly lower MSE than the adaptive one for
    581   // two consecutive calculations. Reset the adaptive channel.
    582   memcpy(aecm->channelAdapt16,
    583          aecm->channelStored,
    584          sizeof(int16_t) * PART_LEN1);
    585 
    586   // Restore the W32 channel
          // Four entries per iteration; |i| only counts iterations.
    587   for (i = 0; i < PART_LEN; i += 4) {
    588     __asm __volatile (
              // Load four 16-bit entries as two 32-bit words.
    589       "ulw            %[temp1], 0(%[temp0])           \n\t"
    590       "ulw            %[temp4], 4(%[temp0])           \n\t"
              // Per the MIPS DSP ASE, preceq.w.phl/.phr expand the left/right
              // halfword to a 32-bit word with the halfword in the upper 16 bits,
              // i.e. the same "<< 16" that the C tail below applies.
    591       "preceq.w.phl   %[temp2], %[temp1]              \n\t"
    592       "preceq.w.phr   %[temp1], %[temp1]              \n\t"
    593       "preceq.w.phl   %[temp5], %[temp4]              \n\t"
    594       "preceq.w.phr   %[temp4], %[temp4]              \n\t"
    595       "addiu          %[temp0], %[temp0], 8           \n\t"
              // Store the four expanded words; the right-halfword results go to
              // the lower address of each pair.
    596       "usw            %[temp2], 4(%[temp3])           \n\t"
    597       "usw            %[temp1], 0(%[temp3])           \n\t"
    598       "usw            %[temp5], 12(%[temp3])          \n\t"
    599       "usw            %[temp4], 8(%[temp3])           \n\t"
    600       "addiu          %[temp3], %[temp3], 16          \n\t"
    601       : [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
    602         [temp4] "=&r" (temp4), [temp5] "=&r" (temp5),
    603         [temp3] "+r" (temp3), [temp0] "+r" (temp0)
    604       :
    605       : "memory"
    606     );
    607   }
    608 
          // Last entry, handled in C. |i| equals PART_LEN here provided PART_LEN is
          // a multiple of 4 (assumed; its definition is not in this part of the
          // file).
    609   aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
    610                               (int32_t)aecm->channelStored[i], 16);
    611 }
    612 #endif  // #if defined(MIPS_DSP_R1_LE)
    613 
    614 // Transforms a time domain signal into the frequency domain, outputting the
    615 // complex valued signal, absolute value and sum of absolute values.
    616 //
        // aecm                 [in]    AECM instance, forwarded to WindowAndFFT()
    617 // time_signal          [in]    Pointer to time domain signal
    618 // freq_signal          [out]   Pointer to the complex valued frequency domain
    619 //                              array; bins 0..PART_LEN are filled
    620 //                              (complex16_t entries with .real and .imag)
    621 // freq_signal_abs      [out]   Pointer to absolute value of frequency domain
    622 //                              array
    623 // freq_signal_sum_abs  [out]   Pointer to the sum of all absolute values in
    624 //                              the frequency domain array
    625 // return value                 The Q-domain of current frequency values
        //                              (time_signal_scaling; stays 0 unless
        //                              AECM_DYNAMIC_Q is defined)
    626 //
    627 static int TimeToFrequencyDomain(AecmCore_t* aecm,
    628                                  const int16_t* time_signal,
    629                                  complex16_t* freq_signal,
    630                                  uint16_t* freq_signal_abs,
    631                                  uint32_t* freq_signal_sum_abs)
    632 {
    633   int i = 0;
    634   int time_signal_scaling = 0;
    635 
    636   // fft_buf holds 16 extra int16_t (32 bytes) so that |fft| can be rounded up
          // to the next 32-byte boundary inside it.
    637   int16_t fft_buf[PART_LEN4 + 16];
    638   int16_t *fft = (int16_t *) (((uintptr_t) fft_buf + 31) & ~31);
    639 
    640   int16_t tmp16no1;
    641 #if !defined(MIPS_DSP_R2_LE)
    642   int32_t tmp32no1;
    643   int32_t tmp32no2;
    644   int16_t tmp16no2;
    645 #else
    646   int32_t tmp32no10, tmp32no11, tmp32no12, tmp32no13;
    647   int32_t tmp32no20, tmp32no21, tmp32no22, tmp32no23;
    648   int16_t* freqp;
    649   uint16_t* freqabsp;
    650   uint32_t freqt0, freqt1, freqt2, freqt3;
    651   uint32_t freqs;
    652 #endif
    653 
    654 #ifdef AECM_DYNAMIC_Q
          // Scaling is WebRtcSpl_NormW16() of the largest absolute value among the
          // first PART_LEN2 samples of time_signal.
    655   tmp16no1 = WebRtcSpl_MaxAbsValueW16(time_signal, PART_LEN2);
    656   time_signal_scaling = WebRtcSpl_NormW16(tmp16no1);
    657 #endif
    658 
          // Presumably windows the input and fills freq_signal[] and the scratch
          // buffer fft[] (WindowAndFFT is defined outside this part of the file).
    659   WindowAndFFT(aecm, fft, time_signal, freq_signal, time_signal_scaling);
    660 
    661   // Extract imaginary and real part,
    662   // calculate the magnitude for all frequency bins
          // Bins 0 and PART_LEN are forced to be purely real, so their magnitude is
          // just |real|. These two magnitudes seed the running sum.
    663   freq_signal[0].imag = 0;
    664   freq_signal[PART_LEN].imag = 0;
    665   freq_signal[PART_LEN].real = fft[PART_LEN2];
    666   freq_signal_abs[0] = (uint16_t)WEBRTC_SPL_ABS_W16(freq_signal[0].real);
    667   freq_signal_abs[PART_LEN] = (uint16_t)WEBRTC_SPL_ABS_W16(
    668     freq_signal[PART_LEN].real);
    669   (*freq_signal_sum_abs) = (uint32_t)(freq_signal_abs[0]) +
    670     (uint32_t)(freq_signal_abs[PART_LEN]);
    671 
    672 #if !defined(MIPS_DSP_R2_LE)
          // Portable C path for bins 1..PART_LEN-1.
    673   for (i = 1; i < PART_LEN; i++) {
    674     if (freq_signal[i].real == 0)
    675     {
              // Purely imaginary bin: magnitude is |imag|.
    676       freq_signal_abs[i] = (uint16_t)WEBRTC_SPL_ABS_W16(
    677         freq_signal[i].imag);
    678     }
    679     else if (freq_signal[i].imag == 0)
    680     {
              // Purely real bin: magnitude is |real|.
    681       freq_signal_abs[i] = (uint16_t)WEBRTC_SPL_ABS_W16(
    682         freq_signal[i].real);
    683     }
    684     else
    685     {
    686       // Magnitude of the complex fft output
    687       // magn = sqrt(real^2 + imag^2)
    688       // evaluated as WebRtcSpl_SqrtFloor(|real|^2 + |imag|^2), where the
    689       // two squares are combined with a saturating 32-bit add. No
    690       // alpha/beta (max/min) approximation is used in this code.
    691       tmp16no1 = WEBRTC_SPL_ABS_W16(freq_signal[i].real);
    692       tmp16no2 = WEBRTC_SPL_ABS_W16(freq_signal[i].imag);
    693       tmp32no1 = WEBRTC_SPL_MUL_16_16(tmp16no1, tmp16no1);
    694       tmp32no2 = WEBRTC_SPL_MUL_16_16(tmp16no2, tmp16no2);
    695       tmp32no2 = WebRtcSpl_AddSatW32(tmp32no1, tmp32no2);
    696       tmp32no1 = WebRtcSpl_SqrtFloor(tmp32no2);
    697 
    698       freq_signal_abs[i] = (uint16_t)tmp32no1;
    699     }
    700     (*freq_signal_sum_abs) += (uint32_t)freq_signal_abs[i];
    701   }
    702 #else // #if !defined(MIPS_DSP_R2_LE)
          // MIPS DSP R2 path: the sum is accumulated in |freqs| and written back
          // once at the end. freqp reads each bin as one 32-bit word starting at
          // bin 1; this assumes complex16_t is exactly two packed int16_t (real,
          // imag) - TODO confirm against the type definition.
    703   freqs = (uint32_t)(freq_signal_abs[0]) +
    704           (uint32_t)(freq_signal_abs[PART_LEN]);
    705   freqp = &(freq_signal[1].real);
    706 
          // Bins 1..3: three bins are peeled off so that the main loop below can
          // start at bin 4 and handle four bins per iteration.
          // NOTE(review): this block uses aligned "lw" while the loop uses "ulw";
          // WebRtcAecm_ProcessBlock in this file passes a 32-byte aligned buffer
          // as freq_signal - confirm that holds for any other caller.
    707   __asm __volatile (
    708     "lw             %[freqt0],      0(%[freqp])             \n\t"
    709     "lw             %[freqt1],      4(%[freqp])             \n\t"
    710     "lw             %[freqt2],      8(%[freqp])             \n\t"
            // "mult $acN, $zero, $zero" clears accumulator N.
    711     "mult           $ac0,           $zero,      $zero       \n\t"
    712     "mult           $ac1,           $zero,      $zero       \n\t"
    713     "mult           $ac2,           $zero,      $zero       \n\t"
            // Per the MIPS DSP ASE, dpaq_s.w.ph accumulates the fractional dot
            // product of the two halfword pairs; with both operands equal this
            // is 2 * (real^2 + imag^2).
    714     "dpaq_s.w.ph    $ac0,           %[freqt0],  %[freqt0]   \n\t"
    715     "dpaq_s.w.ph    $ac1,           %[freqt1],  %[freqt1]   \n\t"
    716     "dpaq_s.w.ph    $ac2,           %[freqt2],  %[freqt2]   \n\t"
    717     "addiu          %[freqp],       %[freqp],   12          \n\t"
            // Extract with a right shift of 1 to drop the fractional doubling.
    718     "extr.w         %[tmp32no20],   $ac0,       1           \n\t"
    719     "extr.w         %[tmp32no21],   $ac1,       1           \n\t"
    720     "extr.w         %[tmp32no22],   $ac2,       1           \n\t"
    721     : [freqt0] "=&r" (freqt0), [freqt1] "=&r" (freqt1),
    722       [freqt2] "=&r" (freqt2), [freqp] "+r" (freqp),
    723       [tmp32no20] "=r" (tmp32no20), [tmp32no21] "=r" (tmp32no21),
    724       [tmp32no22] "=r" (tmp32no22)
    725     :
    726     : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo"
    727   );
    728 
          // Magnitudes of bins 1..3 and their contribution to the sum.
    729   tmp32no10 = WebRtcSpl_SqrtFloor(tmp32no20);
    730   tmp32no11 = WebRtcSpl_SqrtFloor(tmp32no21);
    731   tmp32no12 = WebRtcSpl_SqrtFloor(tmp32no22);
    732   freq_signal_abs[1] = (uint16_t)tmp32no10;
    733   freq_signal_abs[2] = (uint16_t)tmp32no11;
    734   freq_signal_abs[3] = (uint16_t)tmp32no12;
    735   freqs += (uint32_t)tmp32no10;
    736   freqs += (uint32_t)tmp32no11;
    737   freqs += (uint32_t)tmp32no12;
    738   freqabsp = &(freq_signal_abs[4]);
          // Remaining bins, four per iteration, starting at bin 4.
    739   for (i = 4; i < PART_LEN; i+=4)
    740   {
            // Sum of squares for four bins (same scheme as above). Both freqp and
            // freqabsp are advanced past the four bins inside this block.
    741     __asm __volatile (
    742       "ulw            %[freqt0],      0(%[freqp])                 \n\t"
    743       "ulw            %[freqt1],      4(%[freqp])                 \n\t"
    744       "ulw            %[freqt2],      8(%[freqp])                 \n\t"
    745       "ulw            %[freqt3],      12(%[freqp])                \n\t"
    746       "mult           $ac0,           $zero,          $zero       \n\t"
    747       "mult           $ac1,           $zero,          $zero       \n\t"
    748       "mult           $ac2,           $zero,          $zero       \n\t"
    749       "mult           $ac3,           $zero,          $zero       \n\t"
    750       "dpaq_s.w.ph    $ac0,           %[freqt0],      %[freqt0]   \n\t"
    751       "dpaq_s.w.ph    $ac1,           %[freqt1],      %[freqt1]   \n\t"
    752       "dpaq_s.w.ph    $ac2,           %[freqt2],      %[freqt2]   \n\t"
    753       "dpaq_s.w.ph    $ac3,           %[freqt3],      %[freqt3]   \n\t"
    754       "addiu          %[freqp],       %[freqp],       16          \n\t"
    755       "addiu          %[freqabsp],    %[freqabsp],    8           \n\t"
    756       "extr.w         %[tmp32no20],   $ac0,           1           \n\t"
    757       "extr.w         %[tmp32no21],   $ac1,           1           \n\t"
    758       "extr.w         %[tmp32no22],   $ac2,           1           \n\t"
    759       "extr.w         %[tmp32no23],   $ac3,           1           \n\t"
    760       : [freqt0] "=&r" (freqt0), [freqt1] "=&r" (freqt1),
    761         [freqt2] "=&r" (freqt2), [freqt3] "=&r" (freqt3),
    762         [tmp32no20] "=r" (tmp32no20), [tmp32no21] "=r" (tmp32no21),
    763         [tmp32no22] "=r" (tmp32no22), [tmp32no23] "=r" (tmp32no23),
    764         [freqabsp] "+r" (freqabsp), [freqp] "+r" (freqp)
    765       :
    766       : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
    767         "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
    768     );
    769 
    770     tmp32no10 = WebRtcSpl_SqrtFloor(tmp32no20);
    771     tmp32no11 = WebRtcSpl_SqrtFloor(tmp32no21);
    772     tmp32no12 = WebRtcSpl_SqrtFloor(tmp32no22);
    773     tmp32no13 = WebRtcSpl_SqrtFloor(tmp32no23);
    774 
            // Store the four magnitudes as halfwords at negative offsets (freqabsp
            // already points past them) and add them to the running sum.
    775     __asm __volatile (
    776       "sh             %[tmp32no10],   -8(%[freqabsp])                 \n\t"
    777       "sh             %[tmp32no11],   -6(%[freqabsp])                 \n\t"
    778       "sh             %[tmp32no12],   -4(%[freqabsp])                 \n\t"
    779       "sh             %[tmp32no13],   -2(%[freqabsp])                 \n\t"
    780       "addu           %[freqs],       %[freqs],       %[tmp32no10]    \n\t"
    781       "addu           %[freqs],       %[freqs],       %[tmp32no11]    \n\t"
    782       "addu           %[freqs],       %[freqs],       %[tmp32no12]    \n\t"
    783       "addu           %[freqs],       %[freqs],       %[tmp32no13]    \n\t"
    784       : [freqs] "+r" (freqs)
    785       : [tmp32no10] "r" (tmp32no10), [tmp32no11] "r" (tmp32no11),
    786         [tmp32no12] "r" (tmp32no12), [tmp32no13] "r" (tmp32no13),
    787         [freqabsp] "r" (freqabsp)
    788       : "memory"
    789     );
    790   }
    791 
          // Replaces the value written before the #if with the full sum; freqs
          // started from the same two edge-bin magnitudes.
    792   (*freq_signal_sum_abs) = freqs;
    793 #endif
    794 
    795   return time_signal_scaling;
    796 }
    797 
    798 int WebRtcAecm_ProcessBlock(AecmCore_t* aecm,
    799                             const int16_t* farend,
    800                             const int16_t* nearendNoisy,
    801                             const int16_t* nearendClean,
    802                             int16_t* output) {
    803   int i;
    804   uint32_t xfaSum;
    805   uint32_t dfaNoisySum;
    806   uint32_t dfaCleanSum;
    807   uint32_t echoEst32Gained;
    808   uint32_t tmpU32;
    809   int32_t tmp32no1;
    810 
    811   uint16_t xfa[PART_LEN1];
    812   uint16_t dfaNoisy[PART_LEN1];
    813   uint16_t dfaClean[PART_LEN1];
    814   uint16_t* ptrDfaClean = dfaClean;
    815   const uint16_t* far_spectrum_ptr = NULL;
    816 
    817   // 32 byte aligned buffers (with +8 or +16).
    818   int16_t fft_buf[PART_LEN4 + 2 + 16]; // +2 to make a loop safe.
    819   int32_t echoEst32_buf[PART_LEN1 + 8];
    820   int32_t dfw_buf[PART_LEN2 + 8];
    821   int32_t efw_buf[PART_LEN2 + 8];
    822 
    823   int16_t* fft = (int16_t*)(((uint32_t)fft_buf + 31) & ~ 31);
    824   int32_t* echoEst32 = (int32_t*)(((uint32_t)echoEst32_buf + 31) & ~ 31);
    825   complex16_t* dfw = (complex16_t*)(((uint32_t)dfw_buf + 31) & ~ 31);
    826   complex16_t* efw = (complex16_t*)(((uint32_t)efw_buf + 31) & ~ 31);
    827 
    828   int16_t hnl[PART_LEN1];
    829   int16_t numPosCoef = 0;
    830   int delay;
    831   int16_t tmp16no1;
    832   int16_t tmp16no2;
    833   int16_t mu;
    834   int16_t supGain;
    835   int16_t zeros32, zeros16;
    836   int16_t zerosDBufNoisy, zerosDBufClean, zerosXBuf;
    837   int far_q;
    838   int16_t resolutionDiff, qDomainDiff, dfa_clean_q_domain_diff;
    839 
    840   const int kMinPrefBand = 4;
    841   const int kMaxPrefBand = 24;
    842   int32_t avgHnl32 = 0;
    843 
    844   int32_t temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
    845   int16_t* ptr;
    846   int16_t* ptr1;
    847   int16_t* er_ptr;
    848   int16_t* dr_ptr;
    849 
    850   ptr = &hnl[0];
    851   ptr1 = &hnl[0];
    852   er_ptr = &efw[0].real;
    853   dr_ptr = &dfw[0].real;
    854 
    855   // Determine startup state. There are three states:
    856   // (0) the first CONV_LEN blocks
    857   // (1) another CONV_LEN blocks
    858   // (2) the rest
    859 
    860   if (aecm->startupState < 2) {
    861     aecm->startupState = (aecm->totCount >= CONV_LEN) +
    862                          (aecm->totCount >= CONV_LEN2);
    863   }
    864   // END: Determine startup state
    865 
    866   // Buffer near and far end signals
    867   memcpy(aecm->xBuf + PART_LEN, farend, sizeof(int16_t) * PART_LEN);
    868   memcpy(aecm->dBufNoisy + PART_LEN,
    869          nearendNoisy,
    870          sizeof(int16_t) * PART_LEN);
    871   if (nearendClean != NULL) {
    872     memcpy(aecm->dBufClean + PART_LEN,
    873            nearendClean,
    874            sizeof(int16_t) * PART_LEN);
    875   }
    876 
    877   // Transform far end signal from time domain to frequency domain.
    878   far_q = TimeToFrequencyDomain(aecm,
    879                                 aecm->xBuf,
    880                                 dfw,
    881                                 xfa,
    882                                 &xfaSum);
    883 
    884   // Transform noisy near end signal from time domain to frequency domain.
    885   zerosDBufNoisy = TimeToFrequencyDomain(aecm,
    886                                          aecm->dBufNoisy,
    887                                          dfw,
    888                                          dfaNoisy,
    889                                          &dfaNoisySum);
    890   aecm->dfaNoisyQDomainOld = aecm->dfaNoisyQDomain;
    891   aecm->dfaNoisyQDomain = (int16_t)zerosDBufNoisy;
    892 
    893   if (nearendClean == NULL) {
    894     ptrDfaClean = dfaNoisy;
    895     aecm->dfaCleanQDomainOld = aecm->dfaNoisyQDomainOld;
    896     aecm->dfaCleanQDomain = aecm->dfaNoisyQDomain;
    897     dfaCleanSum = dfaNoisySum;
    898   } else {
    899     // Transform clean near end signal from time domain to frequency domain.
    900     zerosDBufClean = TimeToFrequencyDomain(aecm,
    901                                            aecm->dBufClean,
    902                                            dfw,
    903                                            dfaClean,
    904                                            &dfaCleanSum);
    905     aecm->dfaCleanQDomainOld = aecm->dfaCleanQDomain;
    906     aecm->dfaCleanQDomain = (int16_t)zerosDBufClean;
    907   }
    908 
    909   // Get the delay
    910   // Save far-end history and estimate delay
    911   WebRtcAecm_UpdateFarHistory(aecm, xfa, far_q);
    912 
    913   if (WebRtc_AddFarSpectrumFix(aecm->delay_estimator_farend, xfa, PART_LEN1,
    914                                far_q) == -1) {
    915     return -1;
    916   }
    917   delay = WebRtc_DelayEstimatorProcessFix(aecm->delay_estimator,
    918                                           dfaNoisy,
    919                                           PART_LEN1,
    920                                           zerosDBufNoisy);
    921   if (delay == -1) {
    922     return -1;
    923   }
    924   else if (delay == -2) {
    925     // If the delay is unknown, we assume zero.
    926     // NOTE: this will have to be adjusted if we ever add lookahead.
    927     delay = 0;
    928   }
    929 
    930   if (aecm->fixedDelay >= 0) {
    931     // Use fixed delay
    932     delay = aecm->fixedDelay;
    933   }
    934 
    935   // Get aligned far end spectrum
    936   far_spectrum_ptr = WebRtcAecm_AlignedFarend(aecm, &far_q, delay);
    937   zerosXBuf = (int16_t) far_q;
    938 
    939   if (far_spectrum_ptr == NULL) {
    940     return -1;
    941   }
    942 
    943   // Calculate log(energy) and update energy threshold levels
    944   WebRtcAecm_CalcEnergies(aecm,
    945                           far_spectrum_ptr,
    946                           zerosXBuf,
    947                           dfaNoisySum,
    948                           echoEst32);
    949   // Calculate stepsize
    950   mu = WebRtcAecm_CalcStepSize(aecm);
    951 
    952   // Update counters
    953   aecm->totCount++;
    954 
    955   // This is the channel estimation algorithm.
    956   // It is based on NLMS but has a variable step length,
    957   // which was calculated above.
    958   WebRtcAecm_UpdateChannel(aecm,
    959                            far_spectrum_ptr,
    960                            zerosXBuf,
    961                            dfaNoisy,
    962                            mu,
    963                            echoEst32);
    964 
    965   supGain = WebRtcAecm_CalcSuppressionGain(aecm);
    966 
    967   // Calculate Wiener filter hnl[]
    968   for (i = 0; i < PART_LEN1; i++) {
    969     // Far end signal through channel estimate in Q8
    970     // How much can we shift right to preserve resolution
    971     tmp32no1 = echoEst32[i] - aecm->echoFilt[i];
    972     aecm->echoFilt[i] += (tmp32no1 * 50) >> 8;
    973 
    974     zeros32 = WebRtcSpl_NormW32(aecm->echoFilt[i]) + 1;
    975     zeros16 = WebRtcSpl_NormW16(supGain) + 1;
    976     if (zeros32 + zeros16 > 16) {
    977       // Multiplication is safe
    978       // Result in
    979       // Q(RESOLUTION_CHANNEL+RESOLUTION_SUPGAIN+aecm->xfaQDomainBuf[diff])
    980       echoEst32Gained = WEBRTC_SPL_UMUL_32_16((uint32_t)aecm->echoFilt[i],
    981                                               (uint16_t)supGain);
    982       resolutionDiff = 14 - RESOLUTION_CHANNEL16 - RESOLUTION_SUPGAIN;
    983       resolutionDiff += (aecm->dfaCleanQDomain - zerosXBuf);
    984     } else {
    985       tmp16no1 = 17 - zeros32 - zeros16;
    986       resolutionDiff = 14 + tmp16no1 - RESOLUTION_CHANNEL16 -
    987                        RESOLUTION_SUPGAIN;
    988       resolutionDiff += (aecm->dfaCleanQDomain - zerosXBuf);
    989       if (zeros32 > tmp16no1) {
    990         echoEst32Gained = WEBRTC_SPL_UMUL_32_16(
    991                             (uint32_t)aecm->echoFilt[i],
    992                             (uint16_t)WEBRTC_SPL_RSHIFT_W16(supGain, tmp16no1));
    993       } else {
    994         // Result in Q-(RESOLUTION_CHANNEL+RESOLUTION_SUPGAIN-16)
    995         echoEst32Gained = WEBRTC_SPL_UMUL_32_16(
    996                             (uint32_t)WEBRTC_SPL_RSHIFT_W32(aecm->echoFilt[i],
    997                                                             tmp16no1),
    998                             (uint16_t)supGain);
    999       }
   1000     }
   1001 
   1002     zeros16 = WebRtcSpl_NormW16(aecm->nearFilt[i]);
   1003     assert(zeros16 >= 0);  // |zeros16| is a norm, hence non-negative.
   1004     dfa_clean_q_domain_diff = aecm->dfaCleanQDomain - aecm->dfaCleanQDomainOld;
   1005     if (zeros16 < dfa_clean_q_domain_diff && aecm->nearFilt[i]) {
   1006       tmp16no1 = aecm->nearFilt[i] << zeros16;
   1007       qDomainDiff = zeros16 - dfa_clean_q_domain_diff;
   1008       tmp16no2 = ptrDfaClean[i] >> -qDomainDiff;
   1009     } else {
   1010       tmp16no1 = dfa_clean_q_domain_diff < 0
   1011           ? aecm->nearFilt[i] >> -dfa_clean_q_domain_diff
   1012           : aecm->nearFilt[i] << dfa_clean_q_domain_diff;
   1013       qDomainDiff = 0;
   1014       tmp16no2 = ptrDfaClean[i];
   1015     }
   1016 
   1017     tmp32no1 = (int32_t)(tmp16no2 - tmp16no1);
   1018     tmp16no2 = (int16_t)WEBRTC_SPL_RSHIFT_W32(tmp32no1, 4);
   1019     tmp16no2 += tmp16no1;
   1020     zeros16 = WebRtcSpl_NormW16(tmp16no2);
   1021     if ((tmp16no2) & (-qDomainDiff > zeros16)) {
   1022       aecm->nearFilt[i] = WEBRTC_SPL_WORD16_MAX;
   1023     } else {
   1024       aecm->nearFilt[i] = qDomainDiff < 0 ? tmp16no2 << -qDomainDiff
   1025                                           : tmp16no2 >> qDomainDiff;
   1026     }
   1027 
   1028     // Wiener filter coefficients, resulting hnl in Q14
   1029     if (echoEst32Gained == 0) {
   1030       hnl[i] = ONE_Q14;
   1031       numPosCoef++;
   1032     } else if (aecm->nearFilt[i] == 0) {
   1033       hnl[i] = 0;
   1034     } else {
   1035       // Multiply the suppression gain
   1036       // Rounding
   1037       echoEst32Gained += (uint32_t)(aecm->nearFilt[i] >> 1);
   1038       tmpU32 = WebRtcSpl_DivU32U16(echoEst32Gained,
   1039                                    (uint16_t)aecm->nearFilt[i]);
   1040 
   1041       // Current resolution is
   1042       // Q-(RESOLUTION_CHANNEL + RESOLUTION_SUPGAIN
   1043       //    - max(0, 17 - zeros16 - zeros32))
   1044       // Make sure we are in Q14
   1045       tmp32no1 = (int32_t)WEBRTC_SPL_SHIFT_W32(tmpU32, resolutionDiff);
   1046       if (tmp32no1 > ONE_Q14) {
   1047         hnl[i] = 0;
   1048       } else if (tmp32no1 < 0) {
   1049         hnl[i] = ONE_Q14;
   1050         numPosCoef++;
   1051       } else {
   1052         // 1-echoEst/dfa
   1053         hnl[i] = ONE_Q14 - (int16_t)tmp32no1;
   1054         if (hnl[i] <= 0) {
   1055           hnl[i] = 0;
   1056         } else {
   1057           numPosCoef++;
   1058         }
   1059       }
   1060     }
   1061   }
   1062 
   1063   // Only in wideband. Prevent the gain in upper band from being larger than
   1064   // in lower band.
   1065   if (aecm->mult == 2) {
   1066     // TODO(bjornv): Investigate if the scaling of hnl[i] below can cause
   1067     //               speech distortion in double-talk.
   1068     for (i = 0; i < (PART_LEN1 >> 3); i++) {
   1069       __asm __volatile (
   1070         "lh         %[temp1],       0(%[ptr1])                  \n\t"
   1071         "lh         %[temp2],       2(%[ptr1])                  \n\t"
   1072         "lh         %[temp3],       4(%[ptr1])                  \n\t"
   1073         "lh         %[temp4],       6(%[ptr1])                  \n\t"
   1074         "lh         %[temp5],       8(%[ptr1])                  \n\t"
   1075         "lh         %[temp6],       10(%[ptr1])                 \n\t"
   1076         "lh         %[temp7],       12(%[ptr1])                 \n\t"
   1077         "lh         %[temp8],       14(%[ptr1])                 \n\t"
   1078         "mul        %[temp1],       %[temp1],       %[temp1]    \n\t"
   1079         "mul        %[temp2],       %[temp2],       %[temp2]    \n\t"
   1080         "mul        %[temp3],       %[temp3],       %[temp3]    \n\t"
   1081         "mul        %[temp4],       %[temp4],       %[temp4]    \n\t"
   1082         "mul        %[temp5],       %[temp5],       %[temp5]    \n\t"
   1083         "mul        %[temp6],       %[temp6],       %[temp6]    \n\t"
   1084         "mul        %[temp7],       %[temp7],       %[temp7]    \n\t"
   1085         "mul        %[temp8],       %[temp8],       %[temp8]    \n\t"
   1086         "sra        %[temp1],       %[temp1],       14          \n\t"
   1087         "sra        %[temp2],       %[temp2],       14          \n\t"
   1088         "sra        %[temp3],       %[temp3],       14          \n\t"
   1089         "sra        %[temp4],       %[temp4],       14          \n\t"
   1090         "sra        %[temp5],       %[temp5],       14          \n\t"
   1091         "sra        %[temp6],       %[temp6],       14          \n\t"
   1092         "sra        %[temp7],       %[temp7],       14          \n\t"
   1093         "sra        %[temp8],       %[temp8],       14          \n\t"
   1094         "sh         %[temp1],       0(%[ptr1])                  \n\t"
   1095         "sh         %[temp2],       2(%[ptr1])                  \n\t"
   1096         "sh         %[temp3],       4(%[ptr1])                  \n\t"
   1097         "sh         %[temp4],       6(%[ptr1])                  \n\t"
   1098         "sh         %[temp5],       8(%[ptr1])                  \n\t"
   1099         "sh         %[temp6],       10(%[ptr1])                 \n\t"
   1100         "sh         %[temp7],       12(%[ptr1])                 \n\t"
   1101         "sh         %[temp8],       14(%[ptr1])                 \n\t"
   1102         "addiu      %[ptr1],        %[ptr1],        16          \n\t"
   1103         : [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
   1104           [temp4] "=&r" (temp4), [temp5] "=&r" (temp5), [temp6] "=&r" (temp6),
   1105           [temp7] "=&r" (temp7), [temp8] "=&r" (temp8), [ptr1] "+r" (ptr1)
   1106         :
   1107         : "memory", "hi", "lo"
   1108       );
   1109     }
   1110     for(i = 0; i < (PART_LEN1 & 7); i++) {
   1111       __asm __volatile (
   1112         "lh         %[temp1],       0(%[ptr1])                  \n\t"
   1113         "mul        %[temp1],       %[temp1],       %[temp1]    \n\t"
   1114         "sra        %[temp1],       %[temp1],       14          \n\t"
   1115         "sh         %[temp1],       0(%[ptr1])                  \n\t"
   1116         "addiu      %[ptr1],        %[ptr1],        2           \n\t"
   1117         : [temp1] "=&r" (temp1), [ptr1] "+r" (ptr1)
   1118         :
   1119         : "memory", "hi", "lo"
   1120       );
   1121     }
   1122 
   1123     for (i = kMinPrefBand; i <= kMaxPrefBand; i++) {
   1124       avgHnl32 += (int32_t)hnl[i];
   1125     }
   1126 
   1127     assert(kMaxPrefBand - kMinPrefBand + 1 > 0);
   1128     avgHnl32 /= (kMaxPrefBand - kMinPrefBand + 1);
   1129 
   1130     for (i = kMaxPrefBand; i < PART_LEN1; i++) {
   1131       if (hnl[i] > (int16_t)avgHnl32) {
   1132         hnl[i] = (int16_t)avgHnl32;
   1133       }
   1134     }
   1135   }
   1136 
   1137   // Calculate NLP gain, result is in Q14
   1138   if (aecm->nlpFlag) {
   1139     if (numPosCoef < 3) {
   1140       for (i = 0; i < PART_LEN1; i++) {
   1141         efw[i].real = 0;
   1142         efw[i].imag = 0;
   1143         hnl[i] = 0;
   1144       }
   1145     } else {
   1146       for (i = 0; i < PART_LEN1; i++) {
   1147 #if defined(MIPS_DSP_R1_LE)
   1148         __asm __volatile (
   1149           ".set       push                                        \n\t"
   1150           ".set       noreorder                                   \n\t"
   1151           "lh         %[temp1],       0(%[ptr])                   \n\t"
   1152           "lh         %[temp2],       0(%[dr_ptr])                \n\t"
   1153           "slti       %[temp4],       %[temp1],       0x4001      \n\t"
   1154           "beqz       %[temp4],       3f                          \n\t"
   1155           " lh        %[temp3],       2(%[dr_ptr])                \n\t"
   1156           "slti       %[temp5],       %[temp1],       3277        \n\t"
   1157           "bnez       %[temp5],       2f                          \n\t"
   1158           " addiu     %[dr_ptr],      %[dr_ptr],      4           \n\t"
   1159           "mul        %[temp2],       %[temp2],       %[temp1]    \n\t"
   1160           "mul        %[temp3],       %[temp3],       %[temp1]    \n\t"
   1161           "shra_r.w   %[temp2],       %[temp2],       14          \n\t"
   1162           "shra_r.w   %[temp3],       %[temp3],       14          \n\t"
   1163           "b          4f                                          \n\t"
   1164           " nop                                                   \n\t"
   1165          "2:                                                      \n\t"
   1166           "addu       %[temp1],       $zero,          $zero       \n\t"
   1167           "addu       %[temp2],       $zero,          $zero       \n\t"
   1168           "addu       %[temp3],       $zero,          $zero       \n\t"
   1169           "b          1f                                          \n\t"
   1170           " nop                                                   \n\t"
   1171          "3:                                                      \n\t"
   1172           "addiu      %[temp1],       $0,             0x4000      \n\t"
   1173          "1:                                                      \n\t"
   1174           "sh         %[temp1],       0(%[ptr])                   \n\t"
   1175          "4:                                                      \n\t"
   1176           "sh         %[temp2],       0(%[er_ptr])                \n\t"
   1177           "sh         %[temp3],       2(%[er_ptr])                \n\t"
   1178           "addiu      %[ptr],         %[ptr],         2           \n\t"
   1179           "addiu      %[er_ptr],      %[er_ptr],      4           \n\t"
   1180           ".set       pop                                         \n\t"
   1181           : [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
   1182             [temp4] "=&r" (temp4), [temp5] "=&r" (temp5), [ptr] "+r" (ptr),
   1183             [er_ptr] "+r" (er_ptr), [dr_ptr] "+r" (dr_ptr)
   1184           :
   1185           : "memory", "hi", "lo"
   1186         );
   1187 #else
   1188         __asm __volatile (
   1189           ".set       push                                        \n\t"
   1190           ".set       noreorder                                   \n\t"
   1191           "lh         %[temp1],       0(%[ptr])                   \n\t"
   1192           "lh         %[temp2],       0(%[dr_ptr])                \n\t"
   1193           "slti       %[temp4],       %[temp1],       0x4001      \n\t"
   1194           "beqz       %[temp4],       3f                          \n\t"
   1195           " lh        %[temp3],       2(%[dr_ptr])                \n\t"
   1196           "slti       %[temp5],       %[temp1],       3277        \n\t"
   1197           "bnez       %[temp5],       2f                          \n\t"
   1198           " addiu     %[dr_ptr],      %[dr_ptr],      4           \n\t"
   1199           "mul        %[temp2],       %[temp2],       %[temp1]    \n\t"
   1200           "mul        %[temp3],       %[temp3],       %[temp1]    \n\t"
   1201           "addiu      %[temp2],       %[temp2],       0x2000      \n\t"
   1202           "addiu      %[temp3],       %[temp3],       0x2000      \n\t"
   1203           "sra        %[temp2],       %[temp2],       14          \n\t"
   1204           "sra        %[temp3],       %[temp3],       14          \n\t"
   1205           "b          4f                                          \n\t"
   1206           " nop                                                   \n\t"
   1207          "2:                                                      \n\t"
   1208           "addu       %[temp1],       $zero,          $zero       \n\t"
   1209           "addu       %[temp2],       $zero,          $zero       \n\t"
   1210           "addu       %[temp3],       $zero,          $zero       \n\t"
   1211           "b          1f                                          \n\t"
   1212           " nop                                                   \n\t"
   1213          "3:                                                      \n\t"
   1214           "addiu      %[temp1],       $0,             0x4000      \n\t"
   1215          "1:                                                      \n\t"
   1216           "sh         %[temp1],       0(%[ptr])                   \n\t"
   1217          "4:                                                      \n\t"
   1218           "sh         %[temp2],       0(%[er_ptr])                \n\t"
   1219           "sh         %[temp3],       2(%[er_ptr])                \n\t"
   1220           "addiu      %[ptr],         %[ptr],         2           \n\t"
   1221           "addiu      %[er_ptr],      %[er_ptr],      4           \n\t"
   1222           ".set       pop                                         \n\t"
   1223           : [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
   1224             [temp4] "=&r" (temp4), [temp5] "=&r" (temp5), [ptr] "+r" (ptr),
   1225             [er_ptr] "+r" (er_ptr), [dr_ptr] "+r" (dr_ptr)
   1226           :
   1227           : "memory", "hi", "lo"
   1228         );
   1229 #endif
   1230       }
   1231     }
   1232   }
   1233   else {
   1234     // multiply with Wiener coefficients
   1235     for (i = 0; i < PART_LEN1; i++) {
   1236       efw[i].real = (int16_t)
   1237                       (WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].real,
   1238                                                             hnl[i],
   1239                                                             14));
   1240       efw[i].imag = (int16_t)
   1241                       (WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].imag,
   1242                                                             hnl[i],
   1243                                                             14));
   1244     }
   1245   }
   1246 
   1247   if (aecm->cngMode == AecmTrue) {
   1248     ComfortNoise(aecm, ptrDfaClean, efw, hnl);
   1249   }
   1250 
   1251   InverseFFTAndWindow(aecm, fft, efw, output, nearendClean);
   1252 
   1253   return 0;
   1254 }
   1255 
   1256 // Generate comfort noise and add to output signal.
   1257 static void ComfortNoise(AecmCore_t* aecm,
   1258                          const uint16_t* dfa,
   1259                          complex16_t* out,
   1260                          const int16_t* lambda) {
   1261   int16_t i;
   1262   int16_t tmp16, tmp161, tmp162, tmp163, nrsh1, nrsh2;
   1263   int32_t tmp32, tmp321, tnoise, tnoise1;
   1264   int32_t tmp322, tmp323, *tmp1;
   1265   int16_t* dfap;
   1266   int16_t* lambdap;
   1267   const int32_t c2049 = 2049;
   1268   const int32_t c359 = 359;
   1269   const int32_t c114 = ONE_Q14;
   1270 
   1271   int16_t randW16[PART_LEN];
   1272   int16_t uReal[PART_LEN1];
   1273   int16_t uImag[PART_LEN1];
   1274   int32_t outLShift32;
   1275 
   1276   int16_t shiftFromNearToNoise = kNoiseEstQDomain - aecm->dfaCleanQDomain;
   1277   int16_t minTrackShift = 9;
   1278 
   1279   assert(shiftFromNearToNoise >= 0);
   1280   assert(shiftFromNearToNoise < 16);
   1281 
   1282   if (aecm->noiseEstCtr < 100) {
   1283     // Track the minimum more quickly initially.
   1284     aecm->noiseEstCtr++;
   1285     minTrackShift = 6;
   1286   }
   1287 
   1288   // Generate a uniform random array on [0 2^15-1].
   1289   WebRtcSpl_RandUArray(randW16, PART_LEN, &aecm->seed);
   1290   int16_t* randW16p = (int16_t*)randW16;
   1291 #if defined (MIPS_DSP_R1_LE)
   1292   int16_t* kCosTablep = (int16_t*)WebRtcAecm_kCosTable;
   1293   int16_t* kSinTablep = (int16_t*)WebRtcAecm_kSinTable;
   1294 #endif   // #if defined(MIPS_DSP_R1_LE)
   1295   tmp1 = (int32_t*)aecm->noiseEst + 1;
   1296   dfap = (int16_t*)dfa + 1;
   1297   lambdap = (int16_t*)lambda + 1;
   1298   // Estimate noise power.
   1299   for (i = 1; i < PART_LEN1; i+=2) {
   1300   // Shift to the noise domain.
   1301     __asm __volatile (
   1302       "lh     %[tmp32],       0(%[dfap])                              \n\t"
   1303       "lw     %[tnoise],      0(%[tmp1])                              \n\t"
   1304       "sllv   %[outLShift32], %[tmp32],   %[shiftFromNearToNoise]     \n\t"
   1305       : [tmp32] "=&r" (tmp32), [outLShift32] "=r" (outLShift32),
   1306         [tnoise] "=&r" (tnoise)
   1307       : [tmp1] "r" (tmp1), [dfap] "r" (dfap),
   1308         [shiftFromNearToNoise] "r" (shiftFromNearToNoise)
   1309       : "memory"
   1310     );
   1311 
   1312     if (outLShift32 < tnoise) {
   1313       // Reset "too low" counter
   1314       aecm->noiseEstTooLowCtr[i] = 0;
   1315       // Track the minimum.
   1316       if (tnoise < (1 << minTrackShift)) {
   1317         // For small values, decrease noiseEst[i] every
   1318         // |kNoiseEstIncCount| block. The regular approach below can not
   1319         // go further down due to truncation.
   1320         aecm->noiseEstTooHighCtr[i]++;
   1321         if (aecm->noiseEstTooHighCtr[i] >= kNoiseEstIncCount) {
   1322           tnoise--;
   1323           aecm->noiseEstTooHighCtr[i] = 0;  // Reset the counter
   1324         }
   1325       } else {
   1326         __asm __volatile (
   1327           "subu   %[tmp32],       %[tnoise],      %[outLShift32]      \n\t"
   1328           "srav   %[tmp32],       %[tmp32],       %[minTrackShift]    \n\t"
   1329           "subu   %[tnoise],      %[tnoise],      %[tmp32]            \n\t"
   1330           : [tmp32] "=&r" (tmp32), [tnoise] "+r" (tnoise)
   1331           : [outLShift32] "r" (outLShift32), [minTrackShift] "r" (minTrackShift)
   1332         );
   1333       }
   1334     } else {
   1335       // Reset "too high" counter
   1336       aecm->noiseEstTooHighCtr[i] = 0;
   1337       // Ramp slowly upwards until we hit the minimum again.
   1338       if ((tnoise >> 19) <= 0) {
   1339         if ((tnoise >> 11) > 0) {
   1340           // Large enough for relative increase
   1341           __asm __volatile (
   1342             "mul    %[tnoise],  %[tnoise],  %[c2049]    \n\t"
   1343             "sra    %[tnoise],  %[tnoise],  11          \n\t"
   1344             : [tnoise] "+r" (tnoise)
   1345             : [c2049] "r" (c2049)
   1346             : "hi", "lo"
   1347           );
   1348         } else {
   1349           // Make incremental increases based on size every
   1350           // |kNoiseEstIncCount| block
   1351           aecm->noiseEstTooLowCtr[i]++;
   1352           if (aecm->noiseEstTooLowCtr[i] >= kNoiseEstIncCount) {
   1353             __asm __volatile (
   1354               "sra    %[tmp32],   %[tnoise],  9           \n\t"
   1355               "addi   %[tnoise],  %[tnoise],  1           \n\t"
   1356               "addu   %[tnoise],  %[tnoise],  %[tmp32]    \n\t"
   1357               : [tnoise] "+r" (tnoise), [tmp32] "=&r" (tmp32)
   1358               :
   1359             );
   1360             aecm->noiseEstTooLowCtr[i] = 0; // Reset counter
   1361           }
   1362         }
   1363       } else {
   1364         // Avoid overflow.
   1365         // Multiplication with 2049 will cause wrap around. Scale
   1366         // down first and then multiply
   1367         __asm __volatile (
   1368           "sra    %[tnoise],  %[tnoise],  11          \n\t"
   1369           "mul    %[tnoise],  %[tnoise],  %[c2049]    \n\t"
   1370           : [tnoise] "+r" (tnoise)
   1371           : [c2049] "r" (c2049)
   1372           : "hi", "lo"
   1373         );
   1374       }
   1375     }
   1376 
   1377     // Shift to the noise domain.
   1378     __asm __volatile (
   1379       "lh     %[tmp32],       2(%[dfap])                              \n\t"
   1380       "lw     %[tnoise1],     4(%[tmp1])                              \n\t"
   1381       "addiu  %[dfap],        %[dfap],    4                           \n\t"
   1382       "sllv   %[outLShift32], %[tmp32],   %[shiftFromNearToNoise]     \n\t"
   1383       : [tmp32] "=&r" (tmp32), [dfap] "+r" (dfap),
   1384         [outLShift32] "=r" (outLShift32), [tnoise1] "=&r" (tnoise1)
   1385       : [tmp1] "r" (tmp1), [shiftFromNearToNoise] "r" (shiftFromNearToNoise)
   1386       : "memory"
   1387     );
   1388 
   1389     if (outLShift32 < tnoise1) {
   1390       // Reset "too low" counter
   1391       aecm->noiseEstTooLowCtr[i + 1] = 0;
   1392       // Track the minimum.
   1393       if (tnoise1 < (1 << minTrackShift)) {
   1394         // For small values, decrease noiseEst[i] every
   1395         // |kNoiseEstIncCount| block. The regular approach below can not
   1396         // go further down due to truncation.
   1397         aecm->noiseEstTooHighCtr[i + 1]++;
   1398         if (aecm->noiseEstTooHighCtr[i + 1] >= kNoiseEstIncCount) {
   1399           tnoise1--;
   1400           aecm->noiseEstTooHighCtr[i + 1] = 0; // Reset the counter
   1401         }
   1402       } else {
   1403         __asm __volatile (
   1404           "subu   %[tmp32],       %[tnoise1],     %[outLShift32]      \n\t"
   1405           "srav   %[tmp32],       %[tmp32],       %[minTrackShift]    \n\t"
   1406           "subu   %[tnoise1],     %[tnoise1],     %[tmp32]            \n\t"
   1407           : [tmp32] "=&r" (tmp32), [tnoise1] "+r" (tnoise1)
   1408           : [outLShift32] "r" (outLShift32), [minTrackShift] "r" (minTrackShift)
   1409         );
   1410       }
   1411     } else {
   1412       // Reset "too high" counter
   1413       aecm->noiseEstTooHighCtr[i + 1] = 0;
   1414       // Ramp slowly upwards until we hit the minimum again.
   1415       if ((tnoise1 >> 19) <= 0) {
   1416         if ((tnoise1 >> 11) > 0) {
   1417           // Large enough for relative increase
   1418           __asm __volatile (
   1419             "mul    %[tnoise1], %[tnoise1], %[c2049]   \n\t"
   1420             "sra    %[tnoise1], %[tnoise1], 11         \n\t"
   1421             : [tnoise1] "+r" (tnoise1)
   1422             : [c2049] "r" (c2049)
   1423             : "hi", "lo"
   1424           );
   1425         } else {
   1426           // Make incremental increases based on size every
   1427           // |kNoiseEstIncCount| block
   1428           aecm->noiseEstTooLowCtr[i + 1]++;
   1429           if (aecm->noiseEstTooLowCtr[i + 1] >= kNoiseEstIncCount) {
   1430             __asm __volatile (
   1431               "sra    %[tmp32],   %[tnoise1], 9           \n\t"
   1432               "addi   %[tnoise1], %[tnoise1], 1           \n\t"
   1433               "addu   %[tnoise1], %[tnoise1], %[tmp32]    \n\t"
   1434               : [tnoise1] "+r" (tnoise1), [tmp32] "=&r" (tmp32)
   1435               :
   1436             );
   1437             aecm->noiseEstTooLowCtr[i + 1] = 0; // Reset counter
   1438           }
   1439         }
   1440       } else {
   1441         // Avoid overflow.
   1442         // Multiplication with 2049 will cause wrap around. Scale
   1443         // down first and then multiply
   1444         __asm __volatile (
   1445           "sra    %[tnoise1], %[tnoise1], 11          \n\t"
   1446           "mul    %[tnoise1], %[tnoise1], %[c2049]    \n\t"
   1447           : [tnoise1] "+r" (tnoise1)
   1448           : [c2049] "r" (c2049)
   1449           : "hi", "lo"
   1450         );
   1451       }
   1452     }
   1453 
   1454     __asm __volatile (
   1455       "lh     %[tmp16],   0(%[lambdap])                           \n\t"
   1456       "lh     %[tmp161],  2(%[lambdap])                           \n\t"
   1457       "sw     %[tnoise],  0(%[tmp1])                              \n\t"
   1458       "sw     %[tnoise1], 4(%[tmp1])                              \n\t"
   1459       "subu   %[tmp16],   %[c114],        %[tmp16]                \n\t"
   1460       "subu   %[tmp161],  %[c114],        %[tmp161]               \n\t"
   1461       "srav   %[tmp32],   %[tnoise],      %[shiftFromNearToNoise] \n\t"
   1462       "srav   %[tmp321],  %[tnoise1],     %[shiftFromNearToNoise] \n\t"
   1463       "addiu  %[lambdap], %[lambdap],     4                       \n\t"
   1464       "addiu  %[tmp1],    %[tmp1],        8                       \n\t"
   1465       : [tmp16] "=&r" (tmp16), [tmp161] "=&r" (tmp161), [tmp1] "+r" (tmp1),
   1466         [tmp32] "=&r" (tmp32), [tmp321] "=&r" (tmp321), [lambdap] "+r" (lambdap)
   1467       : [tnoise] "r" (tnoise), [tnoise1] "r" (tnoise1), [c114] "r" (c114),
   1468         [shiftFromNearToNoise] "r" (shiftFromNearToNoise)
   1469       : "memory"
   1470     );
   1471 
   1472     if (tmp32 > 32767) {
   1473       tmp32 = 32767;
   1474       aecm->noiseEst[i] = WEBRTC_SPL_LSHIFT_W32(tmp32, shiftFromNearToNoise);
   1475     }
   1476     if (tmp321 > 32767) {
   1477       tmp321 = 32767;
   1478       aecm->noiseEst[i+1] = WEBRTC_SPL_LSHIFT_W32(tmp321, shiftFromNearToNoise);
   1479     }
   1480 
   1481     __asm __volatile (
   1482       "mul    %[tmp32],   %[tmp32],       %[tmp16]                \n\t"
   1483       "mul    %[tmp321],  %[tmp321],      %[tmp161]               \n\t"
   1484       "sra    %[nrsh1],   %[tmp32],       14                      \n\t"
   1485       "sra    %[nrsh2],   %[tmp321],      14                      \n\t"
   1486       : [nrsh1] "=&r" (nrsh1), [nrsh2] "=r" (nrsh2)
   1487       : [tmp16] "r" (tmp16), [tmp161] "r" (tmp161), [tmp32] "r" (tmp32),
   1488         [tmp321] "r" (tmp321)
   1489       : "memory", "hi", "lo"
   1490     );
   1491 
   1492     __asm __volatile (
   1493       "lh     %[tmp32],       0(%[randW16p])              \n\t"
   1494       "lh     %[tmp321],      2(%[randW16p])              \n\t"
   1495       "addiu  %[randW16p],    %[randW16p],    4           \n\t"
   1496       "mul    %[tmp32],       %[tmp32],       %[c359]     \n\t"
   1497       "mul    %[tmp321],      %[tmp321],      %[c359]     \n\t"
   1498       "sra    %[tmp16],       %[tmp32],       15          \n\t"
   1499       "sra    %[tmp161],      %[tmp321],      15          \n\t"
   1500       : [randW16p] "+r" (randW16p), [tmp32] "=&r" (tmp32),
   1501         [tmp16] "=r" (tmp16), [tmp161] "=r" (tmp161), [tmp321] "=&r" (tmp321)
   1502       : [c359] "r" (c359)
   1503       : "memory", "hi", "lo"
   1504     );
   1505 
   1506 #if !defined(MIPS_DSP_R1_LE)
   1507     tmp32 = WebRtcAecm_kCosTable[tmp16];
   1508     tmp321 = WebRtcAecm_kSinTable[tmp16];
   1509     tmp322 = WebRtcAecm_kCosTable[tmp161];
   1510     tmp323 = WebRtcAecm_kSinTable[tmp161];
   1511 #else
   1512     __asm __volatile (
   1513       "sll    %[tmp16],       %[tmp16],                   1           \n\t"
   1514       "sll    %[tmp161],      %[tmp161],                  1           \n\t"
   1515       "lhx    %[tmp32],       %[tmp16](%[kCosTablep])                 \n\t"
   1516       "lhx    %[tmp321],      %[tmp16](%[kSinTablep])                 \n\t"
   1517       "lhx    %[tmp322],      %[tmp161](%[kCosTablep])                \n\t"
   1518       "lhx    %[tmp323],      %[tmp161](%[kSinTablep])                \n\t"
   1519       : [tmp32] "=&r" (tmp32), [tmp321] "=&r" (tmp321),
   1520         [tmp322] "=&r" (tmp322), [tmp323] "=&r" (tmp323)
   1521       : [kCosTablep] "r" (kCosTablep), [tmp16] "r" (tmp16),
   1522         [tmp161] "r" (tmp161), [kSinTablep] "r" (kSinTablep)
   1523       : "memory"
   1524     );
   1525 #endif
   1526     __asm __volatile (
   1527       "mul    %[tmp32],       %[tmp32],                   %[nrsh1]    \n\t"
   1528       "negu   %[tmp162],      %[nrsh1]                                \n\t"
   1529       "mul    %[tmp322],      %[tmp322],                  %[nrsh2]    \n\t"
   1530       "negu   %[tmp163],      %[nrsh2]                                \n\t"
   1531       "sra    %[tmp32],       %[tmp32],                   13          \n\t"
   1532       "mul    %[tmp321],      %[tmp321],                  %[tmp162]   \n\t"
   1533       "sra    %[tmp322],      %[tmp322],                  13          \n\t"
   1534       "mul    %[tmp323],      %[tmp323],                  %[tmp163]   \n\t"
   1535       "sra    %[tmp321],      %[tmp321],                  13          \n\t"
   1536       "sra    %[tmp323],      %[tmp323],                  13          \n\t"
   1537       : [tmp32] "+r" (tmp32), [tmp321] "+r" (tmp321), [tmp162] "=&r" (tmp162),
   1538         [tmp322] "+r" (tmp322), [tmp323] "+r" (tmp323), [tmp163] "=&r" (tmp163)
   1539       : [nrsh1] "r" (nrsh1), [nrsh2] "r" (nrsh2)
   1540       : "hi", "lo"
   1541     );
   1542     // Tables are in Q13.
   1543     uReal[i] = (int16_t)tmp32;
   1544     uImag[i] = (int16_t)tmp321;
   1545     uReal[i + 1] = (int16_t)tmp322;
   1546     uImag[i + 1] = (int16_t)tmp323;
   1547   }
   1548 
   1549   int32_t tt, sgn;
   1550   tt = out[0].real;
   1551   sgn = ((int)tt) >> 31;
   1552   out[0].real = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
   1553   tt = out[0].imag;
   1554   sgn = ((int)tt) >> 31;
   1555   out[0].imag = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
   1556   for (i = 1; i < PART_LEN; i++) {
   1557     tt = out[i].real + uReal[i];
   1558     sgn = ((int)tt) >> 31;
   1559     out[i].real = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
   1560     tt = out[i].imag + uImag[i];
   1561     sgn = ((int)tt) >> 31;
   1562     out[i].imag = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
   1563   }
   1564   tt = out[PART_LEN].real + uReal[PART_LEN];
   1565   sgn = ((int)tt) >> 31;
   1566   out[PART_LEN].real = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
   1567   tt = out[PART_LEN].imag;
   1568   sgn = ((int)tt) >> 31;
   1569   out[PART_LEN].imag = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
   1570 }
   1571 
   1572