Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 /*
     12  * filters_neon.c
     13  *
     14  * This file contains function WebRtcIsacfix_AutocorrNeon, optimized for
     15  * ARM Neon platform.
     16  *
     17  */
     18 
     19 #include <arm_neon.h>
     20 #include <assert.h>
     21 
     22 #include "codec.h"
     23 
     24 // Autocorrelation function in fixed point.
     25 // NOTE! Different from SPLIB-version in how it scales the signal.
     26 int WebRtcIsacfix_AutocorrNeon(
     27     WebRtc_Word32* __restrict r,
     28     const WebRtc_Word16* __restrict x,
     29     WebRtc_Word16 N,
     30     WebRtc_Word16 order,
     31     WebRtc_Word16* __restrict scale) {
     32 
     33   // The 1st for loop assumed N % 4 == 0.
     34   assert(N % 4 == 0);
     35 
     36   int i = 0;
     37   int zeros_low = 0;
     38   int zeros_high = 0;
     39   int16_t scaling = 0;
     40   int32_t sum = 0;
     41 
     42   // Step 1, calculate r[0] and how much scaling is needed.
     43 
     44   int16x4_t reg16x4;
     45   int64x1_t reg64x1a;
     46   int64x1_t reg64x1b;
     47   int32x4_t reg32x4;
     48   int64x2_t reg64x2 = vdupq_n_s64(0); // zeros
     49 
     50   // Loop over the samples and do:
     51   // sum += WEBRTC_SPL_MUL_16_16(x[i], x[i]);
     52   for (i = 0; i < N; i += 4) {
     53     reg16x4 = vld1_s16(&x[i]);
     54     reg32x4 = vmull_s16(reg16x4, reg16x4);
     55     reg64x2 = vpadalq_s32(reg64x2, reg32x4);
     56   }
     57   reg64x1a = vget_low_s64(reg64x2);
     58   reg64x1b = vget_high_s64(reg64x2);
     59   reg64x1a = vadd_s64(reg64x1a, reg64x1b);
     60 
     61   // Calculate the value of shifting (scaling).
     62   __asm__ __volatile__(
     63     "vmov %[z_l], %[z_h], %P[reg]\n\t"
     64     "clz %[z_l], %[z_l]\n\t"
     65     "clz %[z_h], %[z_h]\n\t"
     66     :[z_l]"+r"(zeros_low),
     67      [z_h]"+r"(zeros_high)
     68     :[reg]"w"(reg64x1a)
     69   );
     70   if (zeros_high != 32) {
     71     scaling = (32 - zeros_high + 1);
     72   } else if (zeros_low == 0) {
     73     scaling = 1;
     74   }
     75   reg64x1b = -scaling;
     76   reg64x1a = vshl_s64(reg64x1a, reg64x1b);
     77 
     78   // Record the result.
     79   r[0] = (int32_t)vget_lane_s64(reg64x1a, 0);
     80 
     81 
     82   // Step 2, perform the actual correlation calculation.
     83 
     84   /* Original C code (for the rest of the function):
     85   for (i = 1; i < order + 1; i++)  {
     86     prod = 0;
     87     for (j = 0; j < N - i; j++) {
     88       prod += WEBRTC_SPL_MUL_16_16(x[j], x[i + j]);
     89     }
     90     sum = (int32_t)(prod >> scaling);
     91     r[i] = sum;
     92   }
     93   */
     94 
     95   for (i = 1; i < order + 1; i++) {
     96     int32_t prod_lower = 0;
     97     int32_t prod_upper = 0;
     98     const int16_t* ptr0 = &x[0];
     99     const int16_t* ptr1 = &x[i];
    100     int32_t tmp = 0;
    101 
    102     // Initialize the sum (q9) to zero.
    103     __asm__ __volatile__("vmov.i32 q9, #0\n\t":::"q9");
    104 
    105     // Calculate the major block of the samples (a multiple of 8).
    106     for (; ptr0 < &x[N - i - 7];) {
    107       __asm__ __volatile__(
    108         "vld1.16 {d20, d21}, [%[ptr0]]!\n\t"
    109         "vld1.16 {d22, d23}, [%[ptr1]]!\n\t"
    110         "vmull.s16 q12, d20, d22\n\t"
    111         "vmull.s16 q13, d21, d23\n\t"
    112         "vpadal.s32 q9, q12\n\t"
    113         "vpadal.s32 q9, q13\n\t"
    114 
    115         // Specify constraints.
    116         :[ptr0]"+r"(ptr0),
    117         [ptr1]"+r"(ptr1)
    118         :
    119         :"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27"
    120       );
    121     }
    122 
    123     // Calculate the rest of the samples.
    124     for (; ptr0 < &x[N - i]; ptr0++, ptr1++) {
    125       __asm__ __volatile__(
    126         "smulbb %[tmp], %[ptr0], %[ptr1]\n\t"
    127         "adds %[prod_lower], %[prod_lower], %[tmp]\n\t"
    128         "adc %[prod_upper], %[prod_upper], %[tmp], asr #31\n\t"
    129 
    130         // Specify constraints.
    131         :[prod_lower]"+r"(prod_lower),
    132         [prod_upper]"+r"(prod_upper),
    133         [tmp]"+r"(tmp)
    134         :[ptr0]"r"(*ptr0),
    135         [ptr1]"r"(*ptr1)
    136       );
    137     }
    138 
    139     // Sum the results up, and do shift.
    140     __asm__ __volatile__(
    141       "vadd.i64 d18, d19\n\t"
    142       "vmov.32 d17[0], %[prod_lower]\n\t"
    143       "vmov.32 d17[1], %[prod_upper]\n\t"
    144       "vadd.i64 d17, d18\n\t"
    145       "mov %[tmp], %[scaling], asr #31\n\t"
    146       "vmov.32 d16, %[scaling], %[tmp]\n\t"
    147       "vshl.s64 d17, d16\n\t"
    148       "vmov.32 %[sum], d17[0]\n\t"
    149 
    150       // Specify constraints.
    151       :[sum]"=r"(sum),
    152       [tmp]"+r"(tmp)
    153       :[prod_upper]"r"(prod_upper),
    154       [prod_lower]"r"(prod_lower),
    155       [scaling]"r"(-scaling)
    156       :"d16", "d17", "d18", "d19"
    157     );
    158 
    159     // Record the result.
    160     r[i] = sum;
    161   }
    162 
    163   // Record the result.
    164   *scale = scaling;
    165 
    166   return(order + 1);
    167 }
    168