1 /* 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "webrtc/modules/audio_processing/aecm/aecm_core.h" 12 13 #include <arm_neon.h> 14 #include <assert.h> 15 16 #include "webrtc/common_audio/signal_processing/include/real_fft.h" 17 18 // TODO(kma): Re-write the corresponding assembly file, the offset 19 // generating script and makefile, to replace these C functions. 20 21 // Square root of Hanning window in Q14. 22 const ALIGN8_BEG int16_t WebRtcAecm_kSqrtHanning[] ALIGN8_END = { 23 0, 24 399, 798, 1196, 1594, 1990, 2386, 2780, 3172, 25 3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224, 26 6591, 6954, 7313, 7668, 8019, 8364, 8705, 9040, 27 9370, 9695, 10013, 10326, 10633, 10933, 11227, 11514, 28 11795, 12068, 12335, 12594, 12845, 13089, 13325, 13553, 29 13773, 13985, 14189, 14384, 14571, 14749, 14918, 15079, 30 15231, 15373, 15506, 15631, 15746, 15851, 15947, 16034, 31 16111, 16179, 16237, 16286, 16325, 16354, 16373, 16384 32 }; 33 34 // Square root of Hanning window in Q14, in reversed order. 35 static const ALIGN8_BEG int16_t kSqrtHanningReversed[] ALIGN8_END = { 36 16384, 16373, 16354, 16325, 16286, 16237, 16179, 16111, 37 16034, 15947, 15851, 15746, 15631, 15506, 15373, 15231, 38 15079, 14918, 14749, 14571, 14384, 14189, 13985, 13773, 39 13553, 13325, 13089, 12845, 12594, 12335, 12068, 11795, 40 11514, 11227, 10933, 10633, 10326, 10013, 9695, 9370, 41 9040, 8705, 8364, 8019, 7668, 7313, 6954, 6591, 42 6224, 5853, 5478, 5101, 4720, 4337, 3951, 3562, 43 3172, 2780, 2386, 1990, 1594, 1196, 798, 399 44 }; 45 46 void WebRtcAecm_WindowAndFFTNeon(AecmCore_t* aecm, 47 int16_t* fft, 48 const int16_t* time_signal, 49 complex16_t* freq_signal, 50 int time_signal_scaling) { 51 int i = 0; 52 const int16_t* p_time_signal = time_signal; 53 const int16_t* p_time_signal_offset = &time_signal[PART_LEN]; 54 const int16_t* p_hanning = WebRtcAecm_kSqrtHanning; 55 const int16_t* p_hanning_reversed = kSqrtHanningReversed; 56 int16_t* p_fft = fft; 57 int16_t* p_fft_offset = &fft[PART_LEN2]; 58 59 assert((uintptr_t)p_time_signal % 8 == 0); 60 assert((uintptr_t)freq_signal % 32 == 0); 61 assert((uintptr_t)p_hanning % 8 == 0); 62 assert((uintptr_t)p_fft % 16 == 0); 63 64 __asm __volatile( 65 "vdup.16 d16, %0\n\t" 66 "vmov.i16 d21, #0\n\t" 67 "vmov.i16 d27, #0\n\t" 68 : 69 :"r"(time_signal_scaling) 70 : "d16", "d21", "d27" 71 ); 72 73 for (i = 0; i < PART_LEN; i += 4) { 74 __asm __volatile( 75 "vld1.16 d0, [%[p_time_signal], :64]!\n\t" 76 "vld1.16 d22, [%[p_time_signal_offset], :64]!\n\t" 77 "vld1.16 d17, [%[p_hanning], :64]!\n\t" 78 "vld1.16 d23, [%[p_hanning_reversed], :64]!\n\t" 79 "vshl.s16 d18, d0, d16\n\t" 80 "vshl.s16 d22, d22, d16\n\t" 81 "vmull.s16 q9, d18, d17\n\t" 82 "vmull.s16 q12, d22, d23\n\t" 83 "vshrn.i32 d20, q9, #14\n\t" 84 "vshrn.i32 d26, q12, #14\n\t" 85 "vst2.16 {d20, d21}, [%[p_fft], :128]!\n\t" 86 "vst2.16 {d26, d27}, [%[p_fft_offset], :128]!\n\t" 87 :[p_time_signal]"+r"(p_time_signal), 88 [p_time_signal_offset]"+r"(p_time_signal_offset), 89 [p_hanning]"+r"(p_hanning), 90 [p_hanning_reversed]"+r"(p_hanning_reversed), 91 [p_fft]"+r"(p_fft), 92 [p_fft_offset]"+r"(p_fft_offset) 93 : 94 :"d0", "d16", "d17", "d18", "d19", "d20", "d21", 95 "d22", "d23", "d24", "d25", "d26", "d27" 96 ); 97 } 98 99 // Do forward FFT, then take only the first PART_LEN complex samples, 100 // and change signs of the imaginary parts. 101 WebRtcSpl_RealForwardFFT(aecm->real_fft, (int16_t*)fft, 102 (int16_t*)freq_signal); 103 104 for (i = 0; i < PART_LEN; i += 8) { 105 __asm __volatile( 106 "vld2.16 {d20, d21, d22, d23}, [%[freq_signal], :256]\n\t" 107 "vneg.s16 d22, d22\n\t" 108 "vneg.s16 d23, d23\n\t" 109 "vst2.16 {d20, d21, d22, d23}, [%[freq_signal], :256]!\n\t" 110 :[freq_signal]"+r"(freq_signal) 111 : 112 : "d20", "d21", "d22", "d23" 113 ); 114 } 115 } 116 117 void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm, 118 int16_t* fft, 119 complex16_t* efw, 120 int16_t* output, 121 const int16_t* nearendClean) { 122 int i, j, outCFFT; 123 124 assert((uintptr_t)efw % 32 == 0); 125 assert((uintptr_t)fft % 16 == 0); 126 assert((uintptr_t)output% 8 == 0); 127 assert((uintptr_t)WebRtcAecm_kSqrtHanning % 8 == 0); 128 assert((uintptr_t)kSqrtHanningReversed % 8 == 0); 129 assert((uintptr_t)(aecm->outBuf) % 8 == 0); 130 assert((uintptr_t)(aecm->xBuf) % 32 == 0); 131 assert((uintptr_t)(aecm->dBufNoisy) % 32 == 0); 132 assert((uintptr_t)(aecm->dBufClean) % 32 == 0); 133 134 // Synthesis 135 complex16_t* p_efw = efw; 136 int16_t* p_fft = fft; 137 int16_t* p_fft_offset = &fft[PART_LEN4 - 6]; 138 139 for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) { 140 // We overwrite two more elements in fft[], but it's ok. 141 __asm __volatile( 142 "vld2.16 {q10}, [%[p_efw], :128]!\n\t" 143 "vmov q11, q10\n\t" 144 "vneg.s16 d23, d23\n\t" 145 "vst2.16 {d22, d23}, [%[p_fft], :128]!\n\t" 146 "vrev64.16 q10, q10\n\t" 147 "vst2.16 {q10}, [%[p_fft_offset]], %[offset]\n\t" 148 :[p_efw]"+r"(p_efw), 149 [p_fft]"+r"(p_fft), 150 [p_fft_offset]"+r"(p_fft_offset) 151 :[offset]"r"(-16) 152 :"d20", "d21", "d22", "d23" 153 ); 154 } 155 156 fft[PART_LEN2] = efw[PART_LEN].real; 157 fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; 158 159 // Inverse FFT. Then take only the real values, and keep outCFFT 160 // to scale the samples. 161 outCFFT = WebRtcSpl_RealInverseFFT(aecm->real_fft, fft, (int16_t*)efw); 162 163 int32x4_t tmp32x4_2; 164 __asm __volatile("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((int32_t) 165 (outCFFT - aecm->dfaCleanQDomain))); 166 for (i = 0; i < PART_LEN; i += 4) { 167 int16x4_t tmp16x4_0; 168 int16x4_t tmp16x4_1; 169 int32x4_t tmp32x4_0; 170 int32x4_t tmp32x4_1; 171 172 //efw[i].real = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( 173 // efw[i].real, WebRtcAecm_kSqrtHanning[i], 14); 174 __asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&efw[i].real)); 175 __asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); 176 __asm __volatile("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1)); 177 __asm __volatile("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0)); 178 179 //tmp32no1 = WEBRTC_SPL_SHIFT_W32((int32_t)efw[i].real, 180 // outCFFT - aecm->dfaCleanQDomain); 181 __asm __volatile("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); 182 183 //efw[i].real = (int16_t)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, 184 // tmp32no1 + aecm->outBuf[i], WEBRTC_SPL_WORD16_MIN); 185 // output[i] = efw[i].real; 186 __asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i])); 187 __asm __volatile("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0)); 188 __asm __volatile("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1)); 189 __asm __volatile("vqmovn.s32 %P0, %q1" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); 190 __asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&efw[i].real)); 191 __asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i])); 192 193 // tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT( 194 // efw[PART_LEN + i].real, WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); 195 __asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&efw[PART_LEN + i].real)); 196 __asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); 197 __asm __volatile("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1)); 198 __asm __volatile("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0)); 199 200 // tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, outCFFT - aecm->dfaCleanQDomain); 201 __asm __volatile("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); 202 // aecm->outBuf[i] = (int16_t)WEBRTC_SPL_SAT( 203 // WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN); 204 __asm __volatile("vqmovn.s32 %P0, %q1" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); 205 __asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i])); 206 } 207 208 // Copy the current block to the old position (outBuf is shifted elsewhere). 209 for (i = 0; i < PART_LEN; i += 16) { 210 __asm __volatile("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : 211 "r"(&aecm->xBuf[i + PART_LEN]) : "q10"); 212 __asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10"); 213 } 214 for (i = 0; i < PART_LEN; i += 16) { 215 __asm __volatile("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : 216 "r"(&aecm->dBufNoisy[i + PART_LEN]) : "q10"); 217 __asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : 218 "r"(&aecm->dBufNoisy[i]): "q10"); 219 } 220 if (nearendClean != NULL) { 221 for (i = 0; i < PART_LEN; i += 16) { 222 __asm __volatile("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : 223 "r"(&aecm->dBufClean[i + PART_LEN]) : "q10"); 224 __asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : 225 "r"(&aecm->dBufClean[i]): "q10"); 226 } 227 } 228 } 229 230 void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm, 231 const uint16_t* far_spectrum, 232 int32_t* echo_est, 233 uint32_t* far_energy, 234 uint32_t* echo_energy_adapt, 235 uint32_t* echo_energy_stored) { 236 int i; 237 238 register uint32_t far_energy_r; 239 register uint32_t echo_energy_stored_r; 240 register uint32_t echo_energy_adapt_r; 241 242 assert((uintptr_t)echo_est % 32 == 0); 243 assert((uintptr_t)(aecm->channelStored) % 16 == 0); 244 assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0); 245 assert((uintptr_t)(aecm->channelStored) % 16 == 0); 246 assert((uintptr_t)(aecm->channelStored) % 16 == 0); 247 248 __asm __volatile("vmov.i32 q14, #0" : : : "q14"); // far_energy 249 __asm __volatile("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored 250 __asm __volatile("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt 251 252 for (i = 0; i < PART_LEN - 7; i += 8) { 253 // far_energy += (uint32_t)(far_spectrum[i]); 254 __asm __volatile("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); 255 __asm __volatile("vaddw.u16 q14, q14, d26" : : : "q14", "q13"); 256 __asm __volatile("vaddw.u16 q14, q14, d27" : : : "q14", "q13"); 257 258 // Get estimated echo energies for adaptive channel and stored channel. 259 // echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); 260 __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); 261 __asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); 262 __asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); 263 __asm __volatile("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]): 264 "q10", "q11"); 265 266 // echo_energy_stored += (uint32_t)echoEst[i]; 267 __asm __volatile("vadd.u32 q8, q10" : : : "q10", "q8"); 268 __asm __volatile("vadd.u32 q8, q11" : : : "q11", "q8"); 269 270 // echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i]; 271 __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); 272 __asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); 273 __asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); 274 __asm __volatile("vadd.u32 q9, q10" : : : "q9", "q15"); 275 __asm __volatile("vadd.u32 q9, q11" : : : "q9", "q11"); 276 } 277 278 __asm __volatile("vadd.u32 d28, d29" : : : "q14"); 279 __asm __volatile("vpadd.u32 d28, d28" : : : "q14"); 280 __asm __volatile("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14"); 281 282 __asm __volatile("vadd.u32 d18, d19" : : : "q9"); 283 __asm __volatile("vpadd.u32 d18, d18" : : : "q9"); 284 __asm __volatile("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9"); 285 286 __asm __volatile("vadd.u32 d16, d17" : : : "q8"); 287 __asm __volatile("vpadd.u32 d16, d16" : : : "q8"); 288 __asm __volatile("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8"); 289 290 // Get estimated echo energies for adaptive channel and stored channel. 291 echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); 292 *echo_energy_stored = echo_energy_stored_r + (uint32_t)echo_est[i]; 293 *far_energy = far_energy_r + (uint32_t)(far_spectrum[i]); 294 *echo_energy_adapt = echo_energy_adapt_r + 295 aecm->channelAdapt16[i] * far_spectrum[i]; 296 } 297 298 void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm, 299 const uint16_t* far_spectrum, 300 int32_t* echo_est) { 301 int i; 302 303 assert((uintptr_t)echo_est % 32 == 0); 304 assert((uintptr_t)(aecm->channelStored) % 16 == 0); 305 assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0); 306 307 // During startup we store the channel every block. 308 // Recalculate echo estimate. 309 for (i = 0; i < PART_LEN - 7; i += 8) { 310 // aecm->channelStored[i] = acem->channelAdapt16[i]; 311 // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); 312 __asm __volatile("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); 313 __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); 314 __asm __volatile("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); 315 __asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); 316 __asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); 317 __asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : 318 "r"(&echo_est[i]) : "q10", "q11"); 319 } 320 aecm->channelStored[i] = aecm->channelAdapt16[i]; 321 echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); 322 } 323 324 void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm) { 325 int i; 326 327 assert((uintptr_t)(aecm->channelStored) % 16 == 0); 328 assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0); 329 assert((uintptr_t)(aecm->channelAdapt32) % 32 == 0); 330 331 for (i = 0; i < PART_LEN - 7; i += 8) { 332 // aecm->channelAdapt16[i] = aecm->channelStored[i]; 333 // aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((int32_t) 334 // aecm->channelStored[i], 16); 335 __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : 336 "r"(&aecm->channelStored[i]) : "q12"); 337 __asm __volatile("vst1.16 {d24, d25}, [%0, :128]" : : 338 "r"(&aecm->channelAdapt16[i]) : "q12"); 339 __asm __volatile("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10"); 340 __asm __volatile("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11"); 341 __asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : 342 "r"(&aecm->channelAdapt32[i]): "q10", "q11"); 343 } 344 aecm->channelAdapt16[i] = aecm->channelStored[i]; 345 aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( 346 (int32_t)aecm->channelStored[i], 16); 347 } 348