Home | History | Annotate | Download | only in signal_processing
      1 /*
      2  *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
     12 
     13 #include <arm_neon.h>
     14 
     15 // NEON intrinsics version of WebRtcSpl_DownsampleFast()
     16 // for ARM 32-bit/64-bit platforms.
     17 int WebRtcSpl_DownsampleFastNeon(const int16_t* data_in,
     18                                  size_t data_in_length,
     19                                  int16_t* data_out,
     20                                  size_t data_out_length,
     21                                  const int16_t* __restrict coefficients,
     22                                  size_t coefficients_length,
     23                                  int factor,
     24                                  size_t delay) {
     25   size_t i = 0;
     26   size_t j = 0;
     27   int32_t out_s32 = 0;
     28   size_t endpos = delay + factor * (data_out_length - 1) + 1;
     29   size_t res = data_out_length & 0x7;
     30   size_t endpos1 = endpos - factor * res;
     31 
     32   // Return error if any of the running conditions doesn't meet.
     33   if (data_out_length == 0 || coefficients_length == 0
     34                            || data_in_length < endpos) {
     35     return -1;
     36   }
     37 
     38   // First part, unroll the loop 8 times, with 3 subcases
     39   // (factor == 2, 4, others).
     40   switch (factor) {
     41     case 2: {
     42       for (i = delay; i < endpos1; i += 16) {
     43         // Round value, 0.5 in Q12.
     44         int32x4_t out32x4_0 = vdupq_n_s32(2048);
     45         int32x4_t out32x4_1 = vdupq_n_s32(2048);
     46 
     47 #if defined(WEBRTC_ARCH_ARM64)
     48         // Unroll the loop 2 times.
     49         for (j = 0; j < coefficients_length - 1; j += 2) {
     50           int32x2_t coeff32 = vld1_dup_s32((int32_t*)&coefficients[j]);
     51           int16x4_t coeff16x4 = vreinterpret_s16_s32(coeff32);
     52           int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j - 1]);
     53 
     54           // Mul and accumulate low 64-bit data.
     55           int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
     56           int16x4_t in16x4_1 = vget_low_s16(in16x8x2.val[1]);
     57           out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 1);
     58           out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_1, coeff16x4, 0);
     59 
     60           // Mul and accumulate high 64-bit data.
     61           // TODO: vget_high_s16 need extra cost on ARM64. This could be
     62           // replaced by vmlal_high_lane_s16. But for the interface of
     63           // vmlal_high_lane_s16, there is a bug in gcc 4.9.
     64           // This issue need to be tracked in the future.
     65           int16x4_t in16x4_2 = vget_high_s16(in16x8x2.val[0]);
     66           int16x4_t in16x4_3 = vget_high_s16(in16x8x2.val[1]);
     67           out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_2, coeff16x4, 1);
     68           out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 0);
     69         }
     70 
     71         for (; j < coefficients_length; j++) {
     72           int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
     73           int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]);
     74 
     75           // Mul and accumulate low 64-bit data.
     76           int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
     77           out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
     78 
     79           // Mul and accumulate high 64-bit data.
     80           // TODO: vget_high_s16 need extra cost on ARM64. This could be
     81           // replaced by vmlal_high_lane_s16. But for the interface of
     82           // vmlal_high_lane_s16, there is a bug in gcc 4.9.
     83           // This issue need to be tracked in the future.
     84           int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]);
     85           out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
     86         }
     87 #else
     88         // On ARMv7, the loop unrolling 2 times results in performance
     89         // regression.
     90         for (j = 0; j < coefficients_length; j++) {
     91           int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
     92           int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]);
     93 
     94           // Mul and accumulate.
     95           int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
     96           int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]);
     97           out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
     98           out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
     99         }
    100 #endif
    101 
    102         // Saturate and store the output.
    103         int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
    104         int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
    105         vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
    106         data_out += 8;
    107       }
    108       break;
    109     }
    110     case 4: {
    111       for (i = delay; i < endpos1; i += 32) {
    112         // Round value, 0.5 in Q12.
    113         int32x4_t out32x4_0 = vdupq_n_s32(2048);
    114         int32x4_t out32x4_1 = vdupq_n_s32(2048);
    115 
    116         // Unroll the loop 4 times.
    117         for (j = 0; j < coefficients_length - 3; j += 4) {
    118           int16x4_t coeff16x4 = vld1_s16(&coefficients[j]);
    119           int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j - 3]);
    120 
    121           // Mul and accumulate low 64-bit data.
    122           int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]);
    123           int16x4_t in16x4_2 = vget_low_s16(in16x8x4.val[1]);
    124           int16x4_t in16x4_4 = vget_low_s16(in16x8x4.val[2]);
    125           int16x4_t in16x4_6 = vget_low_s16(in16x8x4.val[3]);
    126           out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 3);
    127           out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_2, coeff16x4, 2);
    128           out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_4, coeff16x4, 1);
    129           out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_6, coeff16x4, 0);
    130 
    131           // Mul and accumulate high 64-bit data.
    132           // TODO: vget_high_s16 need extra cost on ARM64. This could be
    133           // replaced by vmlal_high_lane_s16. But for the interface of
    134           // vmlal_high_lane_s16, there is a bug in gcc 4.9.
    135           // This issue need to be tracked in the future.
    136           int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]);
    137           int16x4_t in16x4_3 = vget_high_s16(in16x8x4.val[1]);
    138           int16x4_t in16x4_5 = vget_high_s16(in16x8x4.val[2]);
    139           int16x4_t in16x4_7 = vget_high_s16(in16x8x4.val[3]);
    140           out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 3);
    141           out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 2);
    142           out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_5, coeff16x4, 1);
    143           out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_7, coeff16x4, 0);
    144         }
    145 
    146         for (; j < coefficients_length; j++) {
    147           int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
    148           int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j]);
    149 
    150           // Mul and accumulate low 64-bit data.
    151           int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]);
    152           out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
    153 
    154           // Mul and accumulate high 64-bit data.
    155           // TODO: vget_high_s16 need extra cost on ARM64. This could be
    156           // replaced by vmlal_high_lane_s16. But for the interface of
    157           // vmlal_high_lane_s16, there is a bug in gcc 4.9.
    158           // This issue need to be tracked in the future.
    159           int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]);
    160           out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
    161         }
    162 
    163         // Saturate and store the output.
    164         int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
    165         int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
    166         vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
    167         data_out += 8;
    168       }
    169       break;
    170     }
    171     default: {
    172       for (i = delay; i < endpos1; i += factor * 8) {
    173         // Round value, 0.5 in Q12.
    174         int32x4_t out32x4_0 = vdupq_n_s32(2048);
    175         int32x4_t out32x4_1 = vdupq_n_s32(2048);
    176 
    177         for (j = 0; j < coefficients_length; j++) {
    178           int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
    179           int16x4_t in16x4_0 = vld1_dup_s16(&data_in[i - j]);
    180           in16x4_0 = vld1_lane_s16(&data_in[i + factor - j], in16x4_0, 1);
    181           in16x4_0 = vld1_lane_s16(&data_in[i + factor * 2 - j], in16x4_0, 2);
    182           in16x4_0 = vld1_lane_s16(&data_in[i + factor * 3 - j], in16x4_0, 3);
    183           int16x4_t in16x4_1 = vld1_dup_s16(&data_in[i + factor * 4 - j]);
    184           in16x4_1 = vld1_lane_s16(&data_in[i + factor * 5 - j], in16x4_1, 1);
    185           in16x4_1 = vld1_lane_s16(&data_in[i + factor * 6 - j], in16x4_1, 2);
    186           in16x4_1 = vld1_lane_s16(&data_in[i + factor * 7 - j], in16x4_1, 3);
    187 
    188           // Mul and accumulate.
    189           out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
    190           out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
    191         }
    192 
    193         // Saturate and store the output.
    194         int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
    195         int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
    196         vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
    197         data_out += 8;
    198       }
    199       break;
    200     }
    201   }
    202 
    203   // Second part, do the rest iterations (if any).
    204   for (; i < endpos; i += factor) {
    205     out_s32 = 2048;  // Round value, 0.5 in Q12.
    206 
    207     for (j = 0; j < coefficients_length; j++) {
    208       out_s32 = WebRtc_MulAccumW16(coefficients[j], data_in[i - j], out_s32);
    209     }
    210 
    211     // Saturate and store the output.
    212     out_s32 >>= 12;
    213     *data_out++ = WebRtcSpl_SatW32ToW16(out_s32);
    214   }
    215 
    216   return 0;
    217 }
    218