1 /* 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <arm_neon.h> 12 13 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h" 14 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h" 15 16 // Contains a function for the core loop in the normalized lattice MA 17 // filter routine for iSAC codec, optimized for ARM Neon platform. 18 // It does: 19 // for 0 <= n < HALF_SUBFRAMELEN - 1: 20 // *ptr2 = input2 * ((*ptr2) + input0 * (*ptr0)); 21 // *ptr1 = input1 * (*ptr0) + input0 * (*ptr2); 22 // Output is not bit-exact with the reference C code, due to the replacement 23 // of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon 24 // instructions. The difference should not be bigger than 1. 25 void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0, // Filter coefficient 26 int16_t input1, // Filter coefficient 27 int32_t input2, // Inverse coefficient 28 int32_t* ptr0, // Sample buffer 29 int32_t* ptr1, // Sample buffer 30 int32_t* ptr2) // Sample buffer 31 { 32 int n = 0; 33 int loop = (HALF_SUBFRAMELEN - 1) >> 3; 34 int loop_tail = (HALF_SUBFRAMELEN - 1) & 0x7; 35 36 int32x4_t input0_v = vdupq_n_s32((int32_t)input0 << 16); 37 int32x4_t input1_v = vdupq_n_s32((int32_t)input1 << 16); 38 int32x4_t input2_v = vdupq_n_s32(input2); 39 int32x4_t tmp0a, tmp1a, tmp2a, tmp3a; 40 int32x4_t tmp0b, tmp1b, tmp2b, tmp3b; 41 int32x4_t ptr0va, ptr1va, ptr2va; 42 int32x4_t ptr0vb, ptr1vb, ptr2vb; 43 44 int64x2_t tmp2al_low, tmp2al_high, tmp2bl_low, tmp2bl_high; 45 // Unroll to process 8 samples at once. 46 for (n = 0; n < loop; n++) { 47 ptr0va = vld1q_s32(ptr0); 48 ptr0vb = vld1q_s32(ptr0 + 4); 49 ptr0 += 8; 50 51 ptr2va = vld1q_s32(ptr2); 52 ptr2vb = vld1q_s32(ptr2 + 4); 53 54 // Calculate tmp0 = (*ptr0) * input0. 55 tmp0a = vqrdmulhq_s32(ptr0va, input0_v); 56 tmp0b = vqrdmulhq_s32(ptr0vb, input0_v); 57 58 // Calculate tmp1 = (*ptr0) * input1. 59 tmp1a = vqrdmulhq_s32(ptr0va, input1_v); 60 tmp1b = vqrdmulhq_s32(ptr0vb, input1_v); 61 62 // Calculate tmp2 = tmp0 + *(ptr2). 63 tmp2a = vaddq_s32(tmp0a, ptr2va); 64 tmp2b = vaddq_s32(tmp0b, ptr2vb); 65 66 // Calculate *ptr2 = input2 * tmp2. 67 tmp2al_low = vmull_s32(vget_low_s32(tmp2a), vget_low_s32(input2_v)); 68 #if defined(WEBRTC_ARCH_ARM64) 69 tmp2al_high = vmull_high_s32(tmp2a, input2_v); 70 #else 71 tmp2al_high = vmull_s32(vget_high_s32(tmp2a), vget_high_s32(input2_v)); 72 #endif 73 ptr2va = vcombine_s32(vrshrn_n_s64(tmp2al_low, 16), 74 vrshrn_n_s64(tmp2al_high, 16)); 75 76 tmp2bl_low = vmull_s32(vget_low_s32(tmp2b), vget_low_s32(input2_v)); 77 #if defined(WEBRTC_ARCH_ARM64) 78 tmp2bl_high = vmull_high_s32(tmp2b, input2_v); 79 #else 80 tmp2bl_high = vmull_s32(vget_high_s32(tmp2b), vget_high_s32(input2_v)); 81 #endif 82 ptr2vb = vcombine_s32(vrshrn_n_s64(tmp2bl_low, 16), 83 vrshrn_n_s64(tmp2bl_high, 16)); 84 85 vst1q_s32(ptr2, ptr2va); 86 vst1q_s32(ptr2 + 4, ptr2vb); 87 ptr2 += 8; 88 89 // Calculate tmp3 = ptr2v * input0. 90 tmp3a = vqrdmulhq_s32(ptr2va, input0_v); 91 tmp3b = vqrdmulhq_s32(ptr2vb, input0_v); 92 93 // Calculate *ptr1 = tmp1 + tmp3. 94 ptr1va = vaddq_s32(tmp1a, tmp3a); 95 ptr1vb = vaddq_s32(tmp1b, tmp3b); 96 97 vst1q_s32(ptr1, ptr1va); 98 vst1q_s32(ptr1 + 4, ptr1vb); 99 ptr1 += 8; 100 } 101 102 // Process four more samples. 103 if (loop_tail & 0x4) { 104 ptr0va = vld1q_s32(ptr0); 105 ptr2va = vld1q_s32(ptr2); 106 ptr0 += 4; 107 108 // Calculate tmp0 = (*ptr0) * input0. 109 tmp0a = vqrdmulhq_s32(ptr0va, input0_v); 110 111 // Calculate tmp1 = (*ptr0) * input1. 112 tmp1a = vqrdmulhq_s32(ptr0va, input1_v); 113 114 // Calculate tmp2 = tmp0 + *(ptr2). 115 tmp2a = vaddq_s32(tmp0a, ptr2va); 116 117 // Calculate *ptr2 = input2 * tmp2. 118 tmp2al_low = vmull_s32(vget_low_s32(tmp2a), vget_low_s32(input2_v)); 119 120 #if defined(WEBRTC_ARCH_ARM64) 121 tmp2al_high = vmull_high_s32(tmp2a, input2_v); 122 #else 123 tmp2al_high = vmull_s32(vget_high_s32(tmp2a), vget_high_s32(input2_v)); 124 #endif 125 ptr2va = vcombine_s32(vrshrn_n_s64(tmp2al_low, 16), 126 vrshrn_n_s64(tmp2al_high, 16)); 127 128 vst1q_s32(ptr2, ptr2va); 129 ptr2 += 4; 130 131 // Calculate tmp3 = *(ptr2) * input0. 132 tmp3a = vqrdmulhq_s32(ptr2va, input0_v); 133 134 // Calculate *ptr1 = tmp1 + tmp3. 135 ptr1va = vaddq_s32(tmp1a, tmp3a); 136 137 vst1q_s32(ptr1, ptr1va); 138 ptr1 += 4; 139 } 140 141 // Process two more samples. 142 if (loop_tail & 0x2) { 143 int32x2_t ptr0v_tail, ptr2v_tail, ptr1v_tail; 144 int32x2_t tmp0_tail, tmp1_tail, tmp2_tail, tmp3_tail; 145 int64x2_t tmp2l_tail; 146 ptr0v_tail = vld1_s32(ptr0); 147 ptr2v_tail = vld1_s32(ptr2); 148 ptr0 += 2; 149 150 // Calculate tmp0 = (*ptr0) * input0. 151 tmp0_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input0_v)); 152 153 // Calculate tmp1 = (*ptr0) * input1. 154 tmp1_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input1_v)); 155 156 // Calculate tmp2 = tmp0 + *(ptr2). 157 tmp2_tail = vadd_s32(tmp0_tail, ptr2v_tail); 158 159 // Calculate *ptr2 = input2 * tmp2. 160 tmp2l_tail = vmull_s32(tmp2_tail, vget_low_s32(input2_v)); 161 ptr2v_tail = vrshrn_n_s64(tmp2l_tail, 16); 162 163 vst1_s32(ptr2, ptr2v_tail); 164 ptr2 += 2; 165 166 // Calculate tmp3 = *(ptr2) * input0. 167 tmp3_tail = vqrdmulh_s32(ptr2v_tail, vget_low_s32(input0_v)); 168 169 // Calculate *ptr1 = tmp1 + tmp3. 170 ptr1v_tail = vadd_s32(tmp1_tail, tmp3_tail); 171 172 vst1_s32(ptr1, ptr1v_tail); 173 ptr1 += 2; 174 } 175 176 // Process one more sample. 177 if (loop_tail & 0x1) { 178 int16_t t16a = (int16_t)(input2 >> 16); 179 int16_t t16b = (int16_t)input2; 180 if (t16b < 0) t16a++; 181 int32_t tmp32a; 182 int32_t tmp32b; 183 184 // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)). 185 tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0); 186 tmp32b = *ptr2 + tmp32a; 187 *ptr2 = (int32_t)(WEBRTC_SPL_MUL(t16a, tmp32b) + 188 (WEBRTC_SPL_MUL_16_32_RSFT16(t16b, tmp32b))); 189 190 // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2). 191 tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0); 192 tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2); 193 *ptr1 = tmp32a + tmp32b; 194 } 195 } 196