Home | History | Annotate | Download | only in aecm
      1 /*
      2  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "webrtc/modules/audio_processing/aecm/aecm_core.h"
     12 
     13 #include <arm_neon.h>
     14 #include <assert.h>
     15 
     16 #include "webrtc/common_audio/signal_processing/include/real_fft.h"
     17 
     18 // TODO(kma): Re-write the corresponding assembly file, the offset
     19 // generating script and makefile, to replace these C functions.
     20 
     21 // Square root of Hanning window in Q14.
     22 const ALIGN8_BEG int16_t WebRtcAecm_kSqrtHanning[] ALIGN8_END = {
     23   0,
     24   399, 798, 1196, 1594, 1990, 2386, 2780, 3172,
     25   3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224,
     26   6591, 6954, 7313, 7668, 8019, 8364, 8705, 9040,
     27   9370, 9695, 10013, 10326, 10633, 10933, 11227, 11514,
     28   11795, 12068, 12335, 12594, 12845, 13089, 13325, 13553,
     29   13773, 13985, 14189, 14384, 14571, 14749, 14918, 15079,
     30   15231, 15373, 15506, 15631, 15746, 15851, 15947, 16034,
     31   16111, 16179, 16237, 16286, 16325, 16354, 16373, 16384
     32 };
     33 
     34 // Square root of Hanning window in Q14, in reversed order.
     35 static const ALIGN8_BEG int16_t kSqrtHanningReversed[] ALIGN8_END = {
     36   16384, 16373, 16354, 16325, 16286, 16237, 16179, 16111,
     37   16034, 15947, 15851, 15746, 15631, 15506, 15373, 15231,
     38   15079, 14918, 14749, 14571, 14384, 14189, 13985, 13773,
     39   13553, 13325, 13089, 12845, 12594, 12335, 12068, 11795,
     40   11514, 11227, 10933, 10633, 10326, 10013, 9695,  9370,
     41   9040,  8705,  8364,  8019, 7668,  7313,  6954,  6591,
     42   6224,  5853,  5478,  5101, 4720,  4337,  3951,  3562,
     43   3172,  2780,  2386,  1990, 1594,  1196,  798,   399
     44 };
     45 
     46 void WebRtcAecm_WindowAndFFTNeon(AecmCore_t* aecm,
     47                                  int16_t* fft,
     48                                  const int16_t* time_signal,
     49                                  complex16_t* freq_signal,
     50                                  int time_signal_scaling) {
     51   int i = 0;
     52   const int16_t* p_time_signal = time_signal;
     53   const int16_t* p_time_signal_offset = &time_signal[PART_LEN];
     54   const int16_t* p_hanning = WebRtcAecm_kSqrtHanning;
     55   const int16_t* p_hanning_reversed = kSqrtHanningReversed;
     56   int16_t* p_fft = fft;
     57   int16_t* p_fft_offset = &fft[PART_LEN2];
     58 
     59   assert((uintptr_t)p_time_signal % 8 == 0);
     60   assert((uintptr_t)freq_signal % 32 == 0);
     61   assert((uintptr_t)p_hanning % 8 == 0);
     62   assert((uintptr_t)p_fft % 16 == 0);
     63 
     64   __asm __volatile(
     65     "vdup.16 d16, %0\n\t"
     66     "vmov.i16 d21, #0\n\t"
     67     "vmov.i16 d27, #0\n\t"
     68     :
     69     :"r"(time_signal_scaling)
     70     : "d16", "d21", "d27"
     71   );
     72 
     73   for (i = 0; i < PART_LEN; i += 4) {
     74     __asm __volatile(
     75       "vld1.16 d0, [%[p_time_signal], :64]!\n\t"
     76       "vld1.16 d22, [%[p_time_signal_offset], :64]!\n\t"
     77       "vld1.16 d17, [%[p_hanning], :64]!\n\t"
     78       "vld1.16 d23, [%[p_hanning_reversed], :64]!\n\t"
     79       "vshl.s16  d18, d0, d16\n\t"
     80       "vshl.s16  d22, d22, d16\n\t"
     81       "vmull.s16 q9, d18, d17\n\t"
     82       "vmull.s16 q12, d22, d23\n\t"
     83       "vshrn.i32 d20, q9, #14\n\t"
     84       "vshrn.i32 d26, q12, #14\n\t"
     85       "vst2.16 {d20, d21}, [%[p_fft], :128]!\n\t"
     86       "vst2.16 {d26, d27}, [%[p_fft_offset], :128]!\n\t"
     87       :[p_time_signal]"+r"(p_time_signal),
     88        [p_time_signal_offset]"+r"(p_time_signal_offset),
     89        [p_hanning]"+r"(p_hanning),
     90        [p_hanning_reversed]"+r"(p_hanning_reversed),
     91        [p_fft]"+r"(p_fft),
     92        [p_fft_offset]"+r"(p_fft_offset)
     93       :
     94       :"d0", "d16", "d17", "d18", "d19", "d20", "d21",
     95        "d22", "d23", "d24", "d25", "d26", "d27"
     96     );
     97   }
     98 
     99   // Do forward FFT, then take only the first PART_LEN complex samples,
    100   // and change signs of the imaginary parts.
    101   WebRtcSpl_RealForwardFFT(aecm->real_fft, (int16_t*)fft,
    102                            (int16_t*)freq_signal);
    103 
    104   for (i = 0; i < PART_LEN; i += 8) {
    105     __asm __volatile(
    106       "vld2.16 {d20, d21, d22, d23}, [%[freq_signal], :256]\n\t"
    107       "vneg.s16 d22, d22\n\t"
    108       "vneg.s16 d23, d23\n\t"
    109       "vst2.16 {d20, d21, d22, d23}, [%[freq_signal], :256]!\n\t"
    110       :[freq_signal]"+r"(freq_signal)
    111       :
    112       : "d20", "d21", "d22", "d23"
    113     );
    114   }
    115 }
    116 
    117 void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
    118                                         int16_t* fft,
    119                                         complex16_t* efw,
    120                                         int16_t* output,
    121                                         const int16_t* nearendClean) {
    122   int i, j, outCFFT;
    123 
    124   assert((uintptr_t)efw % 32 == 0);
    125   assert((uintptr_t)fft % 16 == 0);
    126   assert((uintptr_t)output% 8 == 0);
    127   assert((uintptr_t)WebRtcAecm_kSqrtHanning % 8 == 0);
    128   assert((uintptr_t)kSqrtHanningReversed % 8 == 0);
    129   assert((uintptr_t)(aecm->outBuf) % 8 == 0);
    130   assert((uintptr_t)(aecm->xBuf) % 32 == 0);
    131   assert((uintptr_t)(aecm->dBufNoisy) % 32 == 0);
    132   assert((uintptr_t)(aecm->dBufClean) % 32 == 0);
    133 
    134   // Synthesis
    135   complex16_t* p_efw = efw;
    136   int16_t* p_fft = fft;
    137   int16_t* p_fft_offset = &fft[PART_LEN4 - 6];
    138 
    139   for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) {
    140     // We overwrite two more elements in fft[], but it's ok.
    141     __asm __volatile(
    142       "vld2.16 {q10}, [%[p_efw], :128]!\n\t"
    143       "vmov q11, q10\n\t"
    144       "vneg.s16 d23, d23\n\t"
    145       "vst2.16 {d22, d23}, [%[p_fft], :128]!\n\t"
    146       "vrev64.16 q10, q10\n\t"
    147       "vst2.16 {q10}, [%[p_fft_offset]], %[offset]\n\t"
    148       :[p_efw]"+r"(p_efw),
    149        [p_fft]"+r"(p_fft),
    150        [p_fft_offset]"+r"(p_fft_offset)
    151       :[offset]"r"(-16)
    152       :"d20", "d21", "d22", "d23"
    153     );
    154   }
    155 
    156   fft[PART_LEN2] = efw[PART_LEN].real;
    157   fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
    158 
    159   // Inverse FFT. Then take only the real values, and keep outCFFT
    160   // to scale the samples.
    161   outCFFT = WebRtcSpl_RealInverseFFT(aecm->real_fft, fft, (int16_t*)efw);
    162 
    163   int32x4_t tmp32x4_2;
    164   __asm __volatile("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((int32_t)
    165       (outCFFT - aecm->dfaCleanQDomain)));
    166   for (i = 0; i < PART_LEN; i += 4) {
    167     int16x4_t tmp16x4_0;
    168     int16x4_t tmp16x4_1;
    169     int32x4_t tmp32x4_0;
    170     int32x4_t tmp32x4_1;
    171 
    172     //efw[i].real = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
    173     //              efw[i].real, WebRtcAecm_kSqrtHanning[i], 14);
    174     __asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&efw[i].real));
    175     __asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
    176     __asm __volatile("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
    177     __asm __volatile("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
    178 
    179     //tmp32no1 = WEBRTC_SPL_SHIFT_W32((int32_t)efw[i].real,
    180     //        outCFFT - aecm->dfaCleanQDomain);
    181     __asm __volatile("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
    182 
    183     //efw[i].real = (int16_t)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
    184     //        tmp32no1 + aecm->outBuf[i], WEBRTC_SPL_WORD16_MIN);
    185     // output[i] = efw[i].real;
    186     __asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i]));
    187     __asm __volatile("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0));
    188     __asm __volatile("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1));
    189     __asm __volatile("vqmovn.s32 %P0, %q1" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
    190     __asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&efw[i].real));
    191     __asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i]));
    192 
    193     // tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
    194     //        efw[PART_LEN + i].real, WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
    195     __asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&efw[PART_LEN + i].real));
    196     __asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
    197     __asm __volatile("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
    198     __asm __volatile("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
    199 
    200     // tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, outCFFT - aecm->dfaCleanQDomain);
    201     __asm __volatile("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
    202     // aecm->outBuf[i] = (int16_t)WEBRTC_SPL_SAT(
    203     //    WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN);
    204     __asm __volatile("vqmovn.s32 %P0, %q1" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
    205     __asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i]));
    206   }
    207 
    208   // Copy the current block to the old position (outBuf is shifted elsewhere).
    209   for (i = 0; i < PART_LEN; i += 16) {
    210     __asm __volatile("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
    211             "r"(&aecm->xBuf[i + PART_LEN]) : "q10");
    212     __asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10");
    213   }
    214   for (i = 0; i < PART_LEN; i += 16) {
    215     __asm __volatile("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
    216             "r"(&aecm->dBufNoisy[i + PART_LEN]) : "q10");
    217     __asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
    218             "r"(&aecm->dBufNoisy[i]): "q10");
    219   }
    220   if (nearendClean != NULL) {
    221     for (i = 0; i < PART_LEN; i += 16) {
    222       __asm __volatile("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
    223               "r"(&aecm->dBufClean[i + PART_LEN]) : "q10");
    224       __asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
    225               "r"(&aecm->dBufClean[i]): "q10");
    226     }
    227   }
    228 }
    229 
    230 void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
    231                                        const uint16_t* far_spectrum,
    232                                        int32_t* echo_est,
    233                                        uint32_t* far_energy,
    234                                        uint32_t* echo_energy_adapt,
    235                                        uint32_t* echo_energy_stored) {
    236   int i;
    237 
    238   register uint32_t far_energy_r;
    239   register uint32_t echo_energy_stored_r;
    240   register uint32_t echo_energy_adapt_r;
    241 
    242   assert((uintptr_t)echo_est % 32 == 0);
    243   assert((uintptr_t)(aecm->channelStored) % 16 == 0);
    244   assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0);
    245   assert((uintptr_t)(aecm->channelStored) % 16 == 0);
    246   assert((uintptr_t)(aecm->channelStored) % 16 == 0);
    247 
    248   __asm __volatile("vmov.i32 q14, #0" : : : "q14"); // far_energy
    249   __asm __volatile("vmov.i32 q8,  #0" : : : "q8"); // echo_energy_stored
    250   __asm __volatile("vmov.i32 q9,  #0" : : : "q9"); // echo_energy_adapt
    251 
    252   for (i = 0; i < PART_LEN - 7; i += 8) {
    253     // far_energy += (uint32_t)(far_spectrum[i]);
    254     __asm __volatile("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
    255     __asm __volatile("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
    256     __asm __volatile("vaddw.u16 q14, q14, d27" : : : "q14", "q13");
    257 
    258     // Get estimated echo energies for adaptive channel and stored channel.
    259     // echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
    260     __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
    261     __asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
    262     __asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
    263     __asm __volatile("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]):
    264             "q10", "q11");
    265 
    266     // echo_energy_stored += (uint32_t)echoEst[i];
    267     __asm __volatile("vadd.u32 q8, q10" : : : "q10", "q8");
    268     __asm __volatile("vadd.u32 q8, q11" : : : "q11", "q8");
    269 
    270     // echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i];
    271     __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
    272     __asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
    273     __asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
    274     __asm __volatile("vadd.u32 q9, q10" : : : "q9", "q15");
    275     __asm __volatile("vadd.u32 q9, q11" : : : "q9", "q11");
    276   }
    277 
    278   __asm __volatile("vadd.u32 d28, d29" : : : "q14");
    279   __asm __volatile("vpadd.u32 d28, d28" : : : "q14");
    280   __asm __volatile("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14");
    281 
    282   __asm __volatile("vadd.u32 d18, d19" : : : "q9");
    283   __asm __volatile("vpadd.u32 d18, d18" : : : "q9");
    284   __asm __volatile("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9");
    285 
    286   __asm __volatile("vadd.u32 d16, d17" : : : "q8");
    287   __asm __volatile("vpadd.u32 d16, d16" : : : "q8");
    288   __asm __volatile("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8");
    289 
    290   // Get estimated echo energies for adaptive channel and stored channel.
    291   echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
    292   *echo_energy_stored = echo_energy_stored_r + (uint32_t)echo_est[i];
    293   *far_energy = far_energy_r + (uint32_t)(far_spectrum[i]);
    294   *echo_energy_adapt = echo_energy_adapt_r +
    295       aecm->channelAdapt16[i] * far_spectrum[i];
    296 }
    297 
    298 void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
    299                                          const uint16_t* far_spectrum,
    300                                          int32_t* echo_est) {
    301   int i;
    302 
    303   assert((uintptr_t)echo_est % 32 == 0);
    304   assert((uintptr_t)(aecm->channelStored) % 16 == 0);
    305   assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0);
    306 
    307   // During startup we store the channel every block.
    308   // Recalculate echo estimate.
    309   for (i = 0; i < PART_LEN - 7; i += 8) {
    310     // aecm->channelStored[i] = acem->channelAdapt16[i];
    311     // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
    312     __asm __volatile("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
    313     __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
    314     __asm __volatile("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
    315     __asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
    316     __asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
    317     __asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
    318             "r"(&echo_est[i]) : "q10", "q11");
    319   }
    320   aecm->channelStored[i] = aecm->channelAdapt16[i];
    321   echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
    322 }
    323 
    324 void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm) {
    325   int i;
    326 
    327   assert((uintptr_t)(aecm->channelStored) % 16 == 0);
    328   assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0);
    329   assert((uintptr_t)(aecm->channelAdapt32) % 32 == 0);
    330 
    331   for (i = 0; i < PART_LEN - 7; i += 8) {
    332     // aecm->channelAdapt16[i] = aecm->channelStored[i];
    333     // aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((int32_t)
    334     //                           aecm->channelStored[i], 16);
    335     __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : :
    336             "r"(&aecm->channelStored[i]) : "q12");
    337     __asm __volatile("vst1.16 {d24, d25}, [%0, :128]" : :
    338             "r"(&aecm->channelAdapt16[i]) : "q12");
    339     __asm __volatile("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10");
    340     __asm __volatile("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11");
    341     __asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
    342             "r"(&aecm->channelAdapt32[i]): "q10", "q11");
    343   }
    344   aecm->channelAdapt16[i] = aecm->channelStored[i];
    345   aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
    346       (int32_t)aecm->channelStored[i], 16);
    347 }
    348