1 /* 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "aecm_core.h" 12 13 #include <arm_neon.h> 14 #include <assert.h> 15 16 17 // Square root of Hanning window in Q14. 18 static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__((aligned(8))) = { 19 16384, 16373, 16354, 16325, 20 16286, 16237, 16179, 16111, 21 16034, 15947, 15851, 15746, 22 15631, 15506, 15373, 15231, 23 15079, 14918, 14749, 14571, 24 14384, 14189, 13985, 13773, 25 13553, 13325, 13089, 12845, 26 12594, 12335, 12068, 11795, 27 11514, 11227, 10933, 10633, 28 10326, 10013, 9695, 9370, 29 9040, 8705, 8364, 8019, 30 7668, 7313, 6954, 6591, 31 6224, 5853, 5478, 5101, 32 4720, 4337, 3951, 3562, 33 3172, 2780, 2386, 1990, 34 1594, 1196, 798, 399 35 }; 36 37 static void WindowAndFFTNeon(WebRtc_Word16* fft, 38 const WebRtc_Word16* time_signal, 39 complex16_t* freq_signal, 40 int time_signal_scaling) { 41 int i, j; 42 43 int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling); 44 __asm__("vmov.i16 d21, #0" ::: "d21"); 45 46 for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) { 47 int16x4_t tmp16x4_0; 48 int16x4_t tmp16x4_1; 49 int32x4_t tmp32x4_0; 50 51 /* Window near end */ 52 // fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i] 53 // << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14); 54 __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i])); 55 tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); 56 57 __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); 58 tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); 59 60 __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20"); 61 __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10"); 62 63 // fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( 64 // (time_signal[PART_LEN + i] << time_signal_scaling), 65 // WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); 66 __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i + PART_LEN])); 67 tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); 68 69 __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); 70 tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); 71 72 __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20"); 73 __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10"); 74 } 75 76 WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); 77 WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1); 78 79 // Take only the first PART_LEN2 samples, and switch the sign of the imaginary part. 80 for (i = 0, j = 0; j < PART_LEN2; i += 8, j += 16) { 81 __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11"); 82 __asm__("vneg.s16 d22, d22" : : : "q10"); 83 __asm__("vneg.s16 d23, d23" : : : "q11"); 84 __asm__("vst2.16 {d20, d21, d22, d23}, [%0, :256]" : : 85 "r"(&freq_signal[i].real): "q10", "q11"); 86 } 87 } 88 89 static void InverseFFTAndWindowNeon(AecmCore_t* aecm, 90 WebRtc_Word16* fft, 91 complex16_t* efw, 92 WebRtc_Word16* output, 93 const WebRtc_Word16* nearendClean) { 94 int i, j, outCFFT; 95 WebRtc_Word32 tmp32no1; 96 97 // Synthesis 98 for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) { 99 // We overwrite two more elements in fft[], but it's ok. 100 __asm__("vld2.16 {d20, d21}, [%0, :128]" : : "r"(&(efw[i].real)) : "q10"); 101 __asm__("vmov q11, q10" : : : "q10", "q11"); 102 103 __asm__("vneg.s16 d23, d23" : : : "q11"); 104 __asm__("vst2.16 {d22, d23}, [%0, :128]" : : "r"(&fft[j]): "q11"); 105 106 __asm__("vrev64.16 q10, q10" : : : "q10"); 107 __asm__("vst2.16 {d20, d21}, [%0]" : : "r"(&fft[PART_LEN4 - j - 6]): "q10"); 108 } 109 110 fft[PART_LEN2] = efw[PART_LEN].real; 111 fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; 112 113 // Inverse FFT, result should be scaled with outCFFT. 114 WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); 115 outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1); 116 117 // Take only the real values and scale with outCFFT. 118 for (i = 0, j = 0; i < PART_LEN2; i += 8, j += 16) { 119 __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11"); 120 __asm__("vst1.16 {d20, d21}, [%0, :128]" : : "r"(&fft[i]): "q10"); 121 } 122 123 int32x4_t tmp32x4_2; 124 __asm__("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32) 125 (outCFFT - aecm->dfaCleanQDomain))); 126 for (i = 0; i < PART_LEN; i += 4) { 127 int16x4_t tmp16x4_0; 128 int16x4_t tmp16x4_1; 129 int32x4_t tmp32x4_0; 130 int32x4_t tmp32x4_1; 131 132 // fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( 133 // fft[i], WebRtcAecm_kSqrtHanning[i], 14); 134 __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[i])); 135 __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); 136 __asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1)); 137 __asm__("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0)); 138 139 // tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i], 140 // outCFFT - aecm->dfaCleanQDomain); 141 __asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); 142 143 // fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, 144 // tmp32no1 + outBuf[i], WEBRTC_SPL_WORD16_MIN); 145 // output[i] = fft[i]; 146 __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i])); 147 __asm__("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0)); 148 __asm__("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1)); 149 __asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); 150 __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&fft[i])); 151 __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i])); 152 153 // tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT( 154 // fft[PART_LEN + i], WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); 155 __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[PART_LEN + i])); 156 __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); 157 __asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1)); 158 __asm__("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0)); 159 160 // tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, outCFFT - aecm->dfaCleanQDomain); 161 __asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); 162 // outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT( 163 // WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN); 164 __asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); 165 __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i])); 166 } 167 168 // Copy the current block to the old position (outBuf is shifted elsewhere). 169 for (i = 0; i < PART_LEN; i += 16) { 170 __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : 171 "r"(&aecm->xBuf[i + PART_LEN]) : "q10"); 172 __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10"); 173 } 174 for (i = 0; i < PART_LEN; i += 16) { 175 __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : 176 "r"(&aecm->dBufNoisy[i + PART_LEN]) : "q10"); 177 __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : 178 "r"(&aecm->dBufNoisy[i]): "q10"); 179 } 180 if (nearendClean != NULL) { 181 for (i = 0; i < PART_LEN; i += 16) { 182 __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : 183 "r"(&aecm->dBufClean[i + PART_LEN]) : "q10"); 184 __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : 185 "r"(&aecm->dBufClean[i]): "q10"); 186 } 187 } 188 } 189 190 static void CalcLinearEnergiesNeon(AecmCore_t* aecm, 191 const WebRtc_UWord16* far_spectrum, 192 WebRtc_Word32* echo_est, 193 WebRtc_UWord32* far_energy, 194 WebRtc_UWord32* echo_energy_adapt, 195 WebRtc_UWord32* echo_energy_stored) { 196 int i; 197 198 register WebRtc_UWord32 far_energy_r; 199 register WebRtc_UWord32 echo_energy_stored_r; 200 register WebRtc_UWord32 echo_energy_adapt_r; 201 uint32x4_t tmp32x4_0; 202 203 __asm__("vmov.i32 q14, #0" : : : "q14"); // far_energy 204 __asm__("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored 205 __asm__("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt 206 207 for (i = 0; i < PART_LEN - 7; i += 8) { 208 // far_energy += (WebRtc_UWord32)(far_spectrum[i]); 209 __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); 210 __asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13"); 211 __asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13"); 212 213 // Get estimated echo energies for adaptive channel and stored channel. 214 // echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); 215 __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); 216 __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); 217 __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); 218 __asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]): 219 "q10", "q11"); 220 221 // echo_energy_stored += (WebRtc_UWord32)echoEst[i]; 222 __asm__("vadd.u32 q8, q10" : : : "q10", "q8"); 223 __asm__("vadd.u32 q8, q11" : : : "q11", "q8"); 224 225 // echo_energy_adapt += WEBRTC_SPL_UMUL_16_16( 226 // aecm->channelAdapt16[i], far_spectrum[i]); 227 __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); 228 __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); 229 __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); 230 __asm__("vadd.u32 q9, q10" : : : "q9", "q15"); 231 __asm__("vadd.u32 q9, q11" : : : "q9", "q11"); 232 } 233 234 __asm__("vadd.u32 d28, d29" : : : "q14"); 235 __asm__("vpadd.u32 d28, d28" : : : "q14"); 236 __asm__("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14"); 237 238 __asm__("vadd.u32 d18, d19" : : : "q9"); 239 __asm__("vpadd.u32 d18, d18" : : : "q9"); 240 __asm__("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9"); 241 242 __asm__("vadd.u32 d16, d17" : : : "q8"); 243 __asm__("vpadd.u32 d16, d16" : : : "q8"); 244 __asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8"); 245 246 // Get estimated echo energies for adaptive channel and stored channel. 247 echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); 248 *echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echo_est[i]; 249 *far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]); 250 *echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16( 251 aecm->channelAdapt16[i], far_spectrum[i]); 252 } 253 254 static void StoreAdaptiveChannelNeon(AecmCore_t* aecm, 255 const WebRtc_UWord16* far_spectrum, 256 WebRtc_Word32* echo_est) { 257 int i; 258 259 // During startup we store the channel every block. 260 // Recalculate echo estimate. 261 for (i = 0; i < PART_LEN - 7; i += 8) { 262 // aecm->channelStored[i] = acem->channelAdapt16[i]; 263 // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); 264 __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); 265 __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); 266 __asm__("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); 267 __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); 268 __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); 269 __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : 270 "r"(&echo_est[i]) : "q10", "q11"); 271 } 272 aecm->channelStored[i] = aecm->channelAdapt16[i]; 273 echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); 274 } 275 276 static void ResetAdaptiveChannelNeon(AecmCore_t* aecm) { 277 int i; 278 279 for (i = 0; i < PART_LEN - 7; i += 8) { 280 // aecm->channelAdapt16[i] = aecm->channelStored[i]; 281 // aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32) 282 // aecm->channelStored[i], 16); 283 __asm__("vld1.16 {d24, d25}, [%0, :128]" : : 284 "r"(&aecm->channelStored[i]) : "q12"); 285 __asm__("vst1.16 {d24, d25}, [%0, :128]" : : 286 "r"(&aecm->channelAdapt16[i]) : "q12"); 287 __asm__("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10"); 288 __asm__("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11"); 289 __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : 290 "r"(&aecm->channelAdapt32[i]): "q10", "q11"); 291 } 292 aecm->channelAdapt16[i] = aecm->channelStored[i]; 293 aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( 294 (WebRtc_Word32)aecm->channelStored[i], 16); 295 } 296 297 void WebRtcAecm_InitNeon(void) { 298 WebRtcAecm_WindowAndFFT = WindowAndFFTNeon; 299 WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowNeon; 300 WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesNeon; 301 WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelNeon; 302 WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelNeon; 303 } 304