@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ aecm_core_neon.s
@ This file contains some functions in AECM, optimized for ARM Neon
@ platforms. Reference C code is in file aecm_core.c. Bit-exact.

#include "aecm_core_neon_offsets.h"
#include "webrtc/modules/audio_processing/aecm/aecm_defines.h"
#include "webrtc/system_wrappers/interface/asm_defines.h"

GLOBAL_LABEL WebRtcAecm_kSqrtHanning
GLOBAL_FUNCTION WebRtcAecm_CalcLinearEnergiesNeon
GLOBAL_FUNCTION WebRtcAecm_StoreAdaptiveChannelNeon
GLOBAL_FUNCTION WebRtcAecm_ResetAdaptiveChannelNeon

@ -----------------------------------------------------------------------------
@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
@                                        const uint16_t* far_spectrum,
@                                        int32_t* echo_est,
@                                        uint32_t* far_energy,
@                                        uint32_t* echo_energy_adapt,
@                                        uint32_t* echo_energy_stored);
@
@ In a single pass over far_spectrum this routine
@   - writes echo_est[i] = channelStored[i] * far_spectrum[i],
@   - stores the sum of far_spectrum[i] through far_energy,
@   - stores the sum of echo_est[i] through echo_energy_stored,
@   - stores the sum of channelAdapt16[i] * far_spectrum[i] through
@     echo_energy_adapt.
@ The vector loop runs (PART_LEN / 8) times on eight elements each; one more
@ element is then processed with scalar instructions.
@
@ Register roles:
@   r0   aecm on entry; scalar echo_est product in the tail
@   r1   far_spectrum cursor; then the final far_spectrum value and the
@        echo_energy_adapt total
@   r2   echo_est on entry; then the echo_energy_stored total
@   r3   far_energy pointer; then the echo_energy_adapt pointer
@   r4   echo_est write cursor; then the echo_energy_stored pointer
@   r5   struct offset; then the vector part of echo_energy_adapt
@   r6   channelStored cursor; then the final channelAdapt16 value
@   r7   struct offset; then the channelAdapt16 cursor
@   r12  loop counter; then scalar scratch
@   q8   running sum of echo_est
@   q9   running sum of the channelAdapt16 products
@   q14  running sum of far_spectrum
@ r4-r7 are pushed first (16 bytes), which is why the two stack arguments are
@ read from sp+16 and sp+20.
@ The alignment qualifiers used below require the two channel arrays to be
@ 16-byte aligned and echo_est to be 32-byte aligned.
@ -----------------------------------------------------------------------------
.align 2
DEFINE_FUNCTION WebRtcAecm_CalcLinearEnergiesNeon
  push      {r4-r7}

  vmov.i32  q14, #0                          @ Clear the three accumulators.
  vmov.i32  q8, #0
  vmov.i32  q9, #0

  movw      r7, #offset_aecm_channelStored
  movw      r5, #offset_aecm_channelAdapt16

  mov       r4, r2                           @ r4 walks echo_est.
  mov       r12, #(PART_LEN / 8)             @ Eight elements per iteration.
  ldr       r6, [r0, r7]                     @ r6 = aecm->channelStored
  ldr       r7, [r0, r5]                     @ r7 = aecm->channelAdapt16

1:
  vld1.16   {d26, d27}, [r1]!                @ q13 = far_spectrum[i..i+7]
  vld1.16   {d24, d25}, [r6, :128]!          @ q12 = channelStored[i..i+7]
  vld1.16   {d0, d1}, [r7, :128]!            @ q0  = channelAdapt16[i..i+7]
  vaddw.u16 q14, q14, d26                    @ far sum += low four
  vmull.u16 q10, d26, d24                    @ stored products, low four
  vmull.u16 q11, d27, d25                    @ stored products, high four
  vaddw.u16 q14, q14, d27                    @ far sum += high four
  vmull.u16 q1, d26, d0                      @ adapt products, low four
  vst1.32   {q10, q11}, [r4, :256]!          @ echo_est[i..i+7]
  vadd.u32  q8, q10
  vmull.u16 q2, d27, d1                      @ adapt products, high four
  vadd.u32  q8, q11
  vadd.u32  q9, q1
  subs      r12, #1
  vadd.u32  q9, q2
  bgt       1b

  @ Fold each four-lane accumulator down to lane 0.
  vadd.u32  d28, d29
  vpadd.u32 d28, d28
  vmov.32   r12, d28[0]                      @ r12 = far_spectrum sum
  vadd.u32  d18, d19
  vpadd.u32 d18, d18
  vmov.32   r5, d18[0]                       @ r5 = adapt product sum
  vadd.u32  d16, d17
  vpadd.u32 d16, d16

  @ Scalar tail: the one element that follows the vectorized range.
  ldrh      r1, [r1]                         @ r1 = far_spectrum[i]
  add       r12, r12, r1
  str       r12, [r3]                        @ far_energy result
  vmov.32   r2, d16[0]                       @ r2 = stored product sum

  ldrsh     r12, [r6]                        @ channelStored[i], sign-extended
  ldrh      r6, [r7]                         @ channelAdapt16[i], zero-extended
  mul       r0, r12, r1                      @ r0 = echo_est[i]
  mla       r1, r6, r1, r5                   @ r1 = adapt total
  add       r2, r2, r0                       @ r2 = stored total
  str       r0, [r4]                         @ echo_est[i]
  ldr       r4, [sp, #20]                    @ sixth argument
  str       r2, [r4]                         @ echo_energy_stored result
  ldr       r3, [sp, #16]                    @ fifth argument
  str       r1, [r3]                         @ echo_energy_adapt result

  pop       {r4-r7}
  bx        lr

@ -----------------------------------------------------------------------------
@ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
@                                          const uint16_t* far_spectrum,
@                                          int32_t* echo_est);
@
@ Copies channelAdapt16 into channelStored and rewrites
@ echo_est[i] = far_spectrum[i] * channelAdapt16[i].
@ The vector loop runs (PART_LEN / 8) times on eight elements each, followed
@ by one scalar element.
@
@ Register roles:
@   r0   aecm on entry; then the channelStored write cursor
@   r1   far_spectrum cursor; then the final far_spectrum value
@   r2   echo_est write cursor
@   r3   struct offset; channelAdapt16 read cursor; final product
@   r12  struct offset; loop counter; final channel value
@ The alignment qualifiers require both channel arrays to be 16-byte aligned
@ and echo_est to be 32-byte aligned.
@ -----------------------------------------------------------------------------
.align 2
DEFINE_FUNCTION WebRtcAecm_StoreAdaptiveChannelNeon
  movw      r3, #offset_aecm_channelAdapt16
  movw      r12, #offset_aecm_channelStored
  ldr       r3, [r0, r3]                     @ r3 = aecm->channelAdapt16
  ldr       r0, [r0, r12]                    @ r0 = aecm->channelStored
  mov       r12, #(PART_LEN / 8)             @ Eight elements per iteration.

1:
  vld1.16   {d24, d25}, [r3, :128]!          @ q12 = channelAdapt16[i..i+7]
  vld1.16   {d26, d27}, [r1]!                @ q13 = far_spectrum[i..i+7]
  vst1.16   {d24, d25}, [r0, :128]!          @ channelStored[i..i+7] = q12
  vmull.u16 q10, d26, d24
  vmull.u16 q11, d27, d25
  vst1.16   {q10, q11}, [r2, :256]!          @ echo_est[i..i+7]
  subs      r12, #1
  bgt       1b

  @ Scalar tail for the remaining element.
  ldrsh     r12, [r3]                        @ channelAdapt16[i]
  strh      r12, [r0]                        @ channelStored[i]
  ldrh      r1, [r1]                         @ far_spectrum[i]
  mul       r3, r1, r12
  str       r3, [r2]                         @ echo_est[i]

  bx        lr

@ -----------------------------------------------------------------------------
@ void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm);
@
@ Restores the adaptive channel from the stored one:
@   channelAdapt16[i] = channelStored[i]
@   channelAdapt32[i] = channelStored[i] << 16
@ The vector loop runs (PART_LEN / 8) times on eight elements each, followed
@ by one scalar element.
@
@ Register roles:
@   r0   aecm on entry; channelStored read cursor; final value
@   r1   struct offset; then the channelAdapt16 write cursor
@   r2   struct offset; then the channelAdapt32 write cursor
@   r3   struct offset; then the loop counter
@ The alignment qualifiers require channelStored and channelAdapt16 to be
@ 16-byte aligned and channelAdapt32 to be 32-byte aligned.
@ -----------------------------------------------------------------------------
.align 2
DEFINE_FUNCTION WebRtcAecm_ResetAdaptiveChannelNeon
  movw      r1, #offset_aecm_channelAdapt16
  movw      r2, #offset_aecm_channelAdapt32
  movw      r3, #offset_aecm_channelStored
  ldr       r1, [r0, r1]                     @ r1 = aecm->channelAdapt16
  ldr       r2, [r0, r2]                     @ r2 = aecm->channelAdapt32
  ldr       r0, [r0, r3]                     @ r0 = aecm->channelStored
  mov       r3, #(PART_LEN / 8)              @ Eight elements per iteration.

1:
  vld1.16   {d24, d25}, [r0, :128]!          @ q12 = channelStored[i..i+7]
  subs      r3, #1                           @ Flags survive the NEON ops below.
  vst1.16   {d24, d25}, [r1, :128]!          @ channelAdapt16[i..i+7] = q12
  vshll.s16 q10, d24, #16                    @ Widen to 32 bits, shifted by 16.
  vshll.s16 q11, d25, #16
  vst1.16   {q10, q11}, [r2, :256]!          @ channelAdapt32[i..i+7]
  bgt       1b

  @ Scalar tail for the remaining element.
  ldrh      r0, [r0]                         @ channelStored[i]
  strh      r0, [r1]                         @ channelAdapt16[i]
  mov       r0, r0, asl #16
  str       r0, [r2]                         @ channelAdapt32[i]

  bx        lr

@ Square root of Hanning window in Q14.
@ One leading zero followed by eight rows of eight values (65 entries).
.align 4
WebRtcAecm_kSqrtHanning:
_WebRtcAecm_kSqrtHanning:
  .short 0
  .short 399, 798, 1196, 1594, 1990, 2386, 2780, 3172
  .short 3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224
  .short 6591, 6954, 7313, 7668, 8019, 8364, 8705, 9040
  .short 9370, 9695, 10013, 10326, 10633, 10933, 11227, 11514
  .short 11795, 12068, 12335, 12594, 12845, 13089, 13325, 13553
  .short 13773, 13985, 14189, 14384, 14571, 14749, 14918, 15079
  .short 15231, 15373, 15506, 15631, 15746, 15851, 15947, 16034
  .short 16111, 16179, 16237, 16286, 16325, 16354, 16373, 16384

@ Square root of Hanning window in Q14. Compared to WebRtcAecm_kSqrtHanning,
@ the order was reversed and one element (0) was removed.
.align 4
kSqrtHanningReversed:
  @ 64 values in descending order, eight per row.
  .short 16384, 16373, 16354, 16325, 16286, 16237, 16179, 16111
  .short 16034, 15947, 15851, 15746, 15631, 15506, 15373, 15231
  .short 15079, 14918, 14749, 14571, 14384, 14189, 13985, 13773
  .short 13553, 13325, 13089, 12845, 12594, 12335, 12068, 11795
  .short 11514, 11227, 10933, 10633, 10326, 10013, 9695, 9370
  .short 9040, 8705, 8364, 8019, 7668, 7313, 6954, 6591
  .short 6224, 5853, 5478, 5101, 4720, 4337, 3951, 3562
  .short 3172, 2780, 2386, 1990, 1594, 1196, 798, 399