Home | History | Annotate | Download | only in aecm
      1 /*
      2  *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "aecm_core.h"
     12 
     13 #include <arm_neon.h>
     14 #include <assert.h>
     15 
     16 
     17 // Square root of Hanning window in Q14.
     18 static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__((aligned(8))) = {
     19   16384, 16373, 16354, 16325,
     20   16286, 16237, 16179, 16111,
     21   16034, 15947, 15851, 15746,
     22   15631, 15506, 15373, 15231,
     23   15079, 14918, 14749, 14571,
     24   14384, 14189, 13985, 13773,
     25   13553, 13325, 13089, 12845,
     26   12594, 12335, 12068, 11795,
     27   11514, 11227, 10933, 10633,
     28   10326, 10013, 9695,  9370,
     29   9040,  8705,  8364,  8019,
     30   7668,  7313,  6954,  6591,
     31   6224,  5853,  5478,  5101,
     32   4720,  4337,  3951,  3562,
     33   3172,  2780,  2386,  1990,
     34   1594,  1196,  798,   399
     35 };
     36 
     37 static void WindowAndFFTNeon(WebRtc_Word16* fft,
     38                              const WebRtc_Word16* time_signal,
     39                              complex16_t* freq_signal,
     40                              int time_signal_scaling) {
     41   int i, j;
     42 
     43   int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);
     44   __asm__("vmov.i16 d21, #0" ::: "d21");
     45 
     46   for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) {
     47     int16x4_t tmp16x4_0;
     48     int16x4_t tmp16x4_1;
     49     int32x4_t tmp32x4_0;
     50 
     51     /* Window near end */
     52     // fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i]
     53     //       << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14);
     54     __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i]));
     55     tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
     56 
     57     __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
     58     tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
     59 
     60     __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
     61     __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10");
     62 
     63     // fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
     64     //      (time_signal[PART_LEN + i] << time_signal_scaling),
     65     //       WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
     66     __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i + PART_LEN]));
     67     tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
     68 
     69     __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
     70     tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
     71 
     72     __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
     73     __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10");
     74   }
     75 
     76   WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
     77   WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
     78 
     79   // Take only the first PART_LEN2 samples, and switch the sign of the imaginary part.
     80   for (i = 0, j = 0; j < PART_LEN2; i += 8, j += 16) {
     81     __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
     82     __asm__("vneg.s16 d22, d22" : : : "q10");
     83     __asm__("vneg.s16 d23, d23" : : : "q11");
     84     __asm__("vst2.16 {d20, d21, d22, d23}, [%0, :256]" : :
     85             "r"(&freq_signal[i].real): "q10", "q11");
     86   }
     87 }
     88 
     89 static void InverseFFTAndWindowNeon(AecmCore_t* aecm,
     90                                     WebRtc_Word16* fft,
     91                                     complex16_t* efw,
     92                                     WebRtc_Word16* output,
     93                                     const WebRtc_Word16* nearendClean) {
     94   int i, j, outCFFT;
     95   WebRtc_Word32 tmp32no1;
     96 
     97   // Synthesis
     98   for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) {
     99     // We overwrite two more elements in fft[], but it's ok.
    100     __asm__("vld2.16 {d20, d21}, [%0, :128]" : : "r"(&(efw[i].real)) : "q10");
    101     __asm__("vmov q11, q10" : : : "q10", "q11");
    102 
    103     __asm__("vneg.s16 d23, d23" : : : "q11");
    104     __asm__("vst2.16 {d22, d23}, [%0, :128]" : : "r"(&fft[j]): "q11");
    105 
    106     __asm__("vrev64.16 q10, q10" : : : "q10");
    107     __asm__("vst2.16 {d20, d21}, [%0]" : : "r"(&fft[PART_LEN4 - j - 6]): "q10");
    108   }
    109 
    110   fft[PART_LEN2] = efw[PART_LEN].real;
    111   fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
    112 
    113   // Inverse FFT, result should be scaled with outCFFT.
    114   WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
    115   outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
    116 
    117   // Take only the real values and scale with outCFFT.
    118   for (i = 0, j = 0; i < PART_LEN2; i += 8, j += 16) {
    119     __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
    120     __asm__("vst1.16 {d20, d21}, [%0, :128]" : : "r"(&fft[i]): "q10");
    121   }
    122 
    123   int32x4_t tmp32x4_2;
    124   __asm__("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32)
    125       (outCFFT - aecm->dfaCleanQDomain)));
    126   for (i = 0; i < PART_LEN; i += 4) {
    127     int16x4_t tmp16x4_0;
    128     int16x4_t tmp16x4_1;
    129     int32x4_t tmp32x4_0;
    130     int32x4_t tmp32x4_1;
    131 
    132     // fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
    133     //        fft[i], WebRtcAecm_kSqrtHanning[i], 14);
    134     __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[i]));
    135     __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
    136     __asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
    137     __asm__("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
    138 
    139     // tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
    140     //        outCFFT - aecm->dfaCleanQDomain);
    141     __asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
    142 
    143     // fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
    144     //        tmp32no1 + outBuf[i], WEBRTC_SPL_WORD16_MIN);
    145     // output[i] = fft[i];
    146     __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i]));
    147     __asm__("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0));
    148     __asm__("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1));
    149     __asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
    150     __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&fft[i]));
    151     __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i]));
    152 
    153     // tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
    154     //        fft[PART_LEN + i], WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
    155     __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[PART_LEN + i]));
    156     __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
    157     __asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
    158     __asm__("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
    159 
    160     // tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, outCFFT - aecm->dfaCleanQDomain);
    161     __asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
    162     // outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
    163     //        WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN);
    164     __asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
    165     __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i]));
    166   }
    167 
    168   // Copy the current block to the old position (outBuf is shifted elsewhere).
    169   for (i = 0; i < PART_LEN; i += 16) {
    170     __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
    171             "r"(&aecm->xBuf[i + PART_LEN]) : "q10");
    172     __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10");
    173   }
    174   for (i = 0; i < PART_LEN; i += 16) {
    175     __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
    176             "r"(&aecm->dBufNoisy[i + PART_LEN]) : "q10");
    177     __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
    178             "r"(&aecm->dBufNoisy[i]): "q10");
    179   }
    180   if (nearendClean != NULL) {
    181     for (i = 0; i < PART_LEN; i += 16) {
    182       __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
    183               "r"(&aecm->dBufClean[i + PART_LEN]) : "q10");
    184       __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
    185               "r"(&aecm->dBufClean[i]): "q10");
    186     }
    187   }
    188 }
    189 
    190 static void CalcLinearEnergiesNeon(AecmCore_t* aecm,
    191                                    const WebRtc_UWord16* far_spectrum,
    192                                    WebRtc_Word32* echo_est,
    193                                    WebRtc_UWord32* far_energy,
    194                                    WebRtc_UWord32* echo_energy_adapt,
    195                                    WebRtc_UWord32* echo_energy_stored) {
    196   int i;
    197 
    198   register WebRtc_UWord32 far_energy_r;
    199   register WebRtc_UWord32 echo_energy_stored_r;
    200   register WebRtc_UWord32 echo_energy_adapt_r;
    201   uint32x4_t tmp32x4_0;
    202 
    203   __asm__("vmov.i32 q14, #0" : : : "q14"); // far_energy
    204   __asm__("vmov.i32 q8,  #0" : : : "q8"); // echo_energy_stored
    205   __asm__("vmov.i32 q9,  #0" : : : "q9"); // echo_energy_adapt
    206 
    207   for (i = 0; i < PART_LEN - 7; i += 8) {
    208     // far_energy += (WebRtc_UWord32)(far_spectrum[i]);
    209     __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
    210     __asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
    211     __asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13");
    212 
    213     // Get estimated echo energies for adaptive channel and stored channel.
    214     // echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
    215     __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
    216     __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
    217     __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
    218     __asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]):
    219             "q10", "q11");
    220 
    221     // echo_energy_stored += (WebRtc_UWord32)echoEst[i];
    222     __asm__("vadd.u32 q8, q10" : : : "q10", "q8");
    223     __asm__("vadd.u32 q8, q11" : : : "q11", "q8");
    224 
    225     // echo_energy_adapt += WEBRTC_SPL_UMUL_16_16(
    226     //     aecm->channelAdapt16[i], far_spectrum[i]);
    227     __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
    228     __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
    229     __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
    230     __asm__("vadd.u32 q9, q10" : : : "q9", "q15");
    231     __asm__("vadd.u32 q9, q11" : : : "q9", "q11");
    232   }
    233 
    234   __asm__("vadd.u32 d28, d29" : : : "q14");
    235   __asm__("vpadd.u32 d28, d28" : : : "q14");
    236   __asm__("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14");
    237 
    238   __asm__("vadd.u32 d18, d19" : : : "q9");
    239   __asm__("vpadd.u32 d18, d18" : : : "q9");
    240   __asm__("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9");
    241 
    242   __asm__("vadd.u32 d16, d17" : : : "q8");
    243   __asm__("vpadd.u32 d16, d16" : : : "q8");
    244   __asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8");
    245 
    246   // Get estimated echo energies for adaptive channel and stored channel.
    247   echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
    248   *echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echo_est[i];
    249   *far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]);
    250   *echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16(
    251       aecm->channelAdapt16[i], far_spectrum[i]);
    252 }
    253 
    254 static void StoreAdaptiveChannelNeon(AecmCore_t* aecm,
    255                                      const WebRtc_UWord16* far_spectrum,
    256                                      WebRtc_Word32* echo_est) {
    257   int i;
    258 
    259   // During startup we store the channel every block.
    260   // Recalculate echo estimate.
    261   for (i = 0; i < PART_LEN - 7; i += 8) {
    262     // aecm->channelStored[i] = acem->channelAdapt16[i];
    263     // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
    264     __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
    265     __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
    266     __asm__("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
    267     __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
    268     __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
    269     __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
    270             "r"(&echo_est[i]) : "q10", "q11");
    271   }
    272   aecm->channelStored[i] = aecm->channelAdapt16[i];
    273   echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
    274 }
    275 
    276 static void ResetAdaptiveChannelNeon(AecmCore_t* aecm) {
    277   int i;
    278 
    279   for (i = 0; i < PART_LEN - 7; i += 8) {
    280     // aecm->channelAdapt16[i] = aecm->channelStored[i];
    281     // aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)
    282     //                           aecm->channelStored[i], 16);
    283     __asm__("vld1.16 {d24, d25}, [%0, :128]" : :
    284             "r"(&aecm->channelStored[i]) : "q12");
    285     __asm__("vst1.16 {d24, d25}, [%0, :128]" : :
    286             "r"(&aecm->channelAdapt16[i]) : "q12");
    287     __asm__("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10");
    288     __asm__("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11");
    289     __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
    290             "r"(&aecm->channelAdapt32[i]): "q10", "q11");
    291   }
    292   aecm->channelAdapt16[i] = aecm->channelStored[i];
    293   aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
    294       (WebRtc_Word32)aecm->channelStored[i], 16);
    295 }
    296 
    297 void WebRtcAecm_InitNeon(void) {
    298   WebRtcAecm_WindowAndFFT = WindowAndFFTNeon;
    299   WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowNeon;
    300   WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesNeon;
    301   WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelNeon;
    302   WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelNeon;
    303 }
    304