Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 
     13 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
     14 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h"
     15 
     16 // Contains a function for the core loop in the normalized lattice MA
     17 // filter routine for iSAC codec, optimized for ARM Neon platform.
     18 // It does:
     19 //  for 0 <= n < HALF_SUBFRAMELEN - 1:
     20 //    *ptr2 = input2 * ((*ptr2) + input0 * (*ptr0));
     21 //    *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
     22 // Output is not bit-exact with the reference C code, due to the replacement
     23 // of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
     24 // instructions. The difference should not be bigger than 1.
     25 void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,  // Filter coefficient
     26                                     int16_t input1,  // Filter coefficient
     27                                     int32_t input2,  // Inverse coefficient
     28                                     int32_t* ptr0,   // Sample buffer
     29                                     int32_t* ptr1,   // Sample buffer
     30                                     int32_t* ptr2)   // Sample buffer
     31 {
     32   int n = 0;
     33   int loop = (HALF_SUBFRAMELEN - 1) >> 3;
     34   int loop_tail = (HALF_SUBFRAMELEN - 1) & 0x7;
     35 
     36   int32x4_t input0_v = vdupq_n_s32((int32_t)input0 << 16);
     37   int32x4_t input1_v = vdupq_n_s32((int32_t)input1 << 16);
     38   int32x4_t input2_v = vdupq_n_s32(input2);
     39   int32x4_t tmp0a, tmp1a, tmp2a, tmp3a;
     40   int32x4_t tmp0b, tmp1b, tmp2b, tmp3b;
     41   int32x4_t ptr0va, ptr1va, ptr2va;
     42   int32x4_t ptr0vb, ptr1vb, ptr2vb;
     43 
     44   int64x2_t tmp2al_low, tmp2al_high, tmp2bl_low, tmp2bl_high;
     45   // Unroll to process 8 samples at once.
     46   for (n = 0; n < loop; n++) {
     47     ptr0va = vld1q_s32(ptr0);
     48     ptr0vb = vld1q_s32(ptr0 + 4);
     49     ptr0 += 8;
     50 
     51     ptr2va = vld1q_s32(ptr2);
     52     ptr2vb = vld1q_s32(ptr2 + 4);
     53 
     54     // Calculate tmp0 = (*ptr0) * input0.
     55     tmp0a = vqrdmulhq_s32(ptr0va, input0_v);
     56     tmp0b = vqrdmulhq_s32(ptr0vb, input0_v);
     57 
     58     // Calculate tmp1 = (*ptr0) * input1.
     59     tmp1a = vqrdmulhq_s32(ptr0va, input1_v);
     60     tmp1b = vqrdmulhq_s32(ptr0vb, input1_v);
     61 
     62     // Calculate tmp2 = tmp0 + *(ptr2).
     63     tmp2a = vaddq_s32(tmp0a, ptr2va);
     64     tmp2b = vaddq_s32(tmp0b, ptr2vb);
     65 
     66     // Calculate *ptr2 = input2 * tmp2.
     67     tmp2al_low = vmull_s32(vget_low_s32(tmp2a), vget_low_s32(input2_v));
     68 #if defined(WEBRTC_ARCH_ARM64)
     69     tmp2al_high = vmull_high_s32(tmp2a, input2_v);
     70 #else
     71     tmp2al_high = vmull_s32(vget_high_s32(tmp2a), vget_high_s32(input2_v));
     72 #endif
     73     ptr2va = vcombine_s32(vrshrn_n_s64(tmp2al_low, 16),
     74                           vrshrn_n_s64(tmp2al_high, 16));
     75 
     76     tmp2bl_low = vmull_s32(vget_low_s32(tmp2b), vget_low_s32(input2_v));
     77 #if defined(WEBRTC_ARCH_ARM64)
     78     tmp2bl_high = vmull_high_s32(tmp2b, input2_v);
     79 #else
     80     tmp2bl_high = vmull_s32(vget_high_s32(tmp2b), vget_high_s32(input2_v));
     81 #endif
     82     ptr2vb = vcombine_s32(vrshrn_n_s64(tmp2bl_low, 16),
     83                           vrshrn_n_s64(tmp2bl_high, 16));
     84 
     85     vst1q_s32(ptr2, ptr2va);
     86     vst1q_s32(ptr2 + 4, ptr2vb);
     87     ptr2 += 8;
     88 
     89     // Calculate tmp3 = ptr2v * input0.
     90     tmp3a = vqrdmulhq_s32(ptr2va, input0_v);
     91     tmp3b = vqrdmulhq_s32(ptr2vb, input0_v);
     92 
     93     // Calculate *ptr1 = tmp1 + tmp3.
     94     ptr1va = vaddq_s32(tmp1a, tmp3a);
     95     ptr1vb = vaddq_s32(tmp1b, tmp3b);
     96 
     97     vst1q_s32(ptr1, ptr1va);
     98     vst1q_s32(ptr1 + 4, ptr1vb);
     99     ptr1 += 8;
    100   }
    101 
    102   // Process four more samples.
    103   if (loop_tail & 0x4) {
    104     ptr0va = vld1q_s32(ptr0);
    105     ptr2va = vld1q_s32(ptr2);
    106     ptr0 += 4;
    107 
    108     // Calculate tmp0 = (*ptr0) * input0.
    109     tmp0a = vqrdmulhq_s32(ptr0va, input0_v);
    110 
    111     // Calculate tmp1 = (*ptr0) * input1.
    112     tmp1a = vqrdmulhq_s32(ptr0va, input1_v);
    113 
    114     // Calculate tmp2 = tmp0 + *(ptr2).
    115     tmp2a = vaddq_s32(tmp0a, ptr2va);
    116 
    117     // Calculate *ptr2 = input2 * tmp2.
    118     tmp2al_low = vmull_s32(vget_low_s32(tmp2a), vget_low_s32(input2_v));
    119 
    120 #if defined(WEBRTC_ARCH_ARM64)
    121     tmp2al_high = vmull_high_s32(tmp2a, input2_v);
    122 #else
    123     tmp2al_high = vmull_s32(vget_high_s32(tmp2a), vget_high_s32(input2_v));
    124 #endif
    125     ptr2va = vcombine_s32(vrshrn_n_s64(tmp2al_low, 16),
    126                           vrshrn_n_s64(tmp2al_high, 16));
    127 
    128     vst1q_s32(ptr2, ptr2va);
    129     ptr2 += 4;
    130 
    131     // Calculate tmp3 = *(ptr2) * input0.
    132     tmp3a = vqrdmulhq_s32(ptr2va, input0_v);
    133 
    134     // Calculate *ptr1 = tmp1 + tmp3.
    135     ptr1va = vaddq_s32(tmp1a, tmp3a);
    136 
    137     vst1q_s32(ptr1, ptr1va);
    138     ptr1 += 4;
    139   }
    140 
    141   // Process two more samples.
    142   if (loop_tail & 0x2) {
    143     int32x2_t ptr0v_tail, ptr2v_tail, ptr1v_tail;
    144     int32x2_t tmp0_tail, tmp1_tail, tmp2_tail, tmp3_tail;
    145     int64x2_t tmp2l_tail;
    146     ptr0v_tail = vld1_s32(ptr0);
    147     ptr2v_tail = vld1_s32(ptr2);
    148     ptr0 += 2;
    149 
    150     // Calculate tmp0 = (*ptr0) * input0.
    151     tmp0_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input0_v));
    152 
    153     // Calculate tmp1 = (*ptr0) * input1.
    154     tmp1_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input1_v));
    155 
    156     // Calculate tmp2 = tmp0 + *(ptr2).
    157     tmp2_tail = vadd_s32(tmp0_tail, ptr2v_tail);
    158 
    159     // Calculate *ptr2 = input2 * tmp2.
    160     tmp2l_tail = vmull_s32(tmp2_tail, vget_low_s32(input2_v));
    161     ptr2v_tail = vrshrn_n_s64(tmp2l_tail, 16);
    162 
    163     vst1_s32(ptr2, ptr2v_tail);
    164     ptr2 += 2;
    165 
    166     // Calculate tmp3 = *(ptr2) * input0.
    167     tmp3_tail = vqrdmulh_s32(ptr2v_tail, vget_low_s32(input0_v));
    168 
    169     // Calculate *ptr1 = tmp1 + tmp3.
    170     ptr1v_tail = vadd_s32(tmp1_tail, tmp3_tail);
    171 
    172     vst1_s32(ptr1, ptr1v_tail);
    173     ptr1 += 2;
    174   }
    175 
    176   // Process one more sample.
    177   if (loop_tail & 0x1) {
    178     int16_t t16a = (int16_t)(input2 >> 16);
    179     int16_t t16b = (int16_t)input2;
    180     if (t16b < 0) t16a++;
    181     int32_t tmp32a;
    182     int32_t tmp32b;
    183 
    184     // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)).
    185     tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0);
    186     tmp32b = *ptr2 + tmp32a;
    187     *ptr2 = (int32_t)(WEBRTC_SPL_MUL(t16a, tmp32b) +
    188                        (WEBRTC_SPL_MUL_16_32_RSFT16(t16b, tmp32b)));
    189 
    190     // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2).
    191     tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0);
    192     tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2);
    193     *ptr1 = tmp32a + tmp32b;
    194   }
    195 }
    196