Home | History | Annotate | Download | only in signal_processing
      1 /*
      2  *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
     12 
     13 #include <arm_neon.h>
     14 
     15 static inline void DotProductWithScaleNeon(int32_t* cross_correlation,
     16                                            const int16_t* vector1,
     17                                            const int16_t* vector2,
     18                                            size_t length,
     19                                            int scaling) {
     20   size_t i = 0;
     21   size_t len1 = length >> 3;
     22   size_t len2 = length & 7;
     23   int64x2_t sum0 = vdupq_n_s64(0);
     24   int64x2_t sum1 = vdupq_n_s64(0);
     25 
     26   for (i = len1; i > 0; i -= 1) {
     27     int16x8_t seq1_16x8 = vld1q_s16(vector1);
     28     int16x8_t seq2_16x8 = vld1q_s16(vector2);
     29 #if defined(WEBRTC_ARCH_ARM64)
     30     int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
     31                                vget_low_s16(seq2_16x8));
     32     int32x4_t tmp1 = vmull_high_s16(seq1_16x8, seq2_16x8);
     33 #else
     34     int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
     35                                vget_low_s16(seq2_16x8));
     36     int32x4_t tmp1 = vmull_s16(vget_high_s16(seq1_16x8),
     37                                vget_high_s16(seq2_16x8));
     38 #endif
     39     sum0 = vpadalq_s32(sum0, tmp0);
     40     sum1 = vpadalq_s32(sum1, tmp1);
     41     vector1 += 8;
     42     vector2 += 8;
     43   }
     44 
     45   // Calculate the rest of the samples.
     46   int64_t sum_res = 0;
     47   for (i = len2; i > 0; i -= 1) {
     48     sum_res += WEBRTC_SPL_MUL_16_16(*vector1, *vector2);
     49     vector1++;
     50     vector2++;
     51   }
     52 
     53   sum0 = vaddq_s64(sum0, sum1);
     54 #if defined(WEBRTC_ARCH_ARM64)
     55   int64_t sum2 = vaddvq_s64(sum0);
     56   *cross_correlation = (int32_t)((sum2 + sum_res) >> scaling);
     57 #else
     58   int64x1_t shift = vdup_n_s64(-scaling);
     59   int64x1_t sum2 = vadd_s64(vget_low_s64(sum0), vget_high_s64(sum0));
     60   sum2 = vadd_s64(sum2, vdup_n_s64(sum_res));
     61   sum2 = vshl_s64(sum2, shift);
     62   vst1_lane_s32(cross_correlation, vreinterpret_s32_s64(sum2), 0);
     63 #endif
     64 }
     65 
     66 /* NEON version of WebRtcSpl_CrossCorrelation() for ARM32/64 platforms. */
     67 void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
     68                                     const int16_t* seq1,
     69                                     const int16_t* seq2,
     70                                     size_t dim_seq,
     71                                     size_t dim_cross_correlation,
     72                                     int right_shifts,
     73                                     int step_seq2) {
     74   size_t i = 0;
     75 
     76   for (i = 0; i < dim_cross_correlation; i++) {
     77     const int16_t* seq1_ptr = seq1;
     78     const int16_t* seq2_ptr = seq2 + (step_seq2 * i);
     79 
     80     DotProductWithScaleNeon(cross_correlation,
     81                             seq1_ptr,
     82                             seq2_ptr,
     83                             dim_seq,
     84                             right_shifts);
     85     cross_correlation++;
     86   }
     87 }
     88