/*
 *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

/*
 * The rdft AEC algorithm, NEON version of the speed-critical functions.
 *
 * Based on the SSE2 version.
 */
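
/*
 * The scalar counterparts of these routines live in aec_rdft.c; at init
 * time the function pointers declared in aec_rdft.h are redirected to the
 * NEON versions below (see aec_rdft_init_neon() at the end of this file).
 */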

#include "webrtc/modules/audio_processing/aec/aec_rdft.h"

#include <arm_neon.h>

static const ALIGN16_BEG float ALIGN16_END
    k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};

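// First stage of the 128-point complex FFT: radix-4 butterflies over
// groups of four consecutive complex values, twiddled by the precomputed
// rdft_wk1/wk2/wk3 tables. Two butterflies are processed per iteration.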
static void cft1st_128_neon(float* a) {
  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
  int j, k2;

  for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
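    // De-interleave two radix-4 butterflies (a[j + 0..7] and a[j + 8..15])
    // so that each 128-bit register holds one complex value from each,
    // letting both butterflies be computed in parallel.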
    float32x4_t a00v = vld1q_f32(&a[j + 0]);
    float32x4_t a04v = vld1q_f32(&a[j + 4]);
    float32x4_t a08v = vld1q_f32(&a[j + 8]);
    float32x4_t a12v = vld1q_f32(&a[j + 12]);
    float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));
    float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));
    float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v));
    float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v));
    const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]);
    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]);
    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]);
    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]);
    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]);
    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]);
    float32x4_t x0v = vaddq_f32(a01v, a23v);
    const float32x4_t x1v = vsubq_f32(a01v, a23v);
    const float32x4_t x2v = vaddq_f32(a45v, a67v);
    const float32x4_t x3v = vsubq_f32(a45v, a67v);
    const float32x4_t x3w = vrev64q_f32(x3v);
    float32x4_t x0w;
    a01v = vaddq_f32(x0v, x2v);
    x0v = vsubq_f32(x0v, x2v);
    x0w = vrev64q_f32(x0v);
    a45v = vmulq_f32(wk2rv, x0v);
    a45v = vmlaq_f32(a45v, wk2iv, x0w);
    x0v = vmlaq_f32(x1v, x3w, vec_swap_sign);
    x0w = vrev64q_f32(x0v);
    a23v = vmulq_f32(wk1rv, x0v);
    a23v = vmlaq_f32(a23v, wk1iv, x0w);
    x0v = vmlsq_f32(x1v, x3w, vec_swap_sign);
    x0w = vrev64q_f32(x0v);
    a67v = vmulq_f32(wk3rv, x0v);
    a67v = vmlaq_f32(a67v, wk3iv, x0w);
    a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v));
    a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));
    a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));
    a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));
    vst1q_f32(&a[j + 0], a00v);
    vst1q_f32(&a[j + 4], a04v);
    vst1q_f32(&a[j + 8], a08v);
    vst1q_f32(&a[j + 12], a12v);
  }
}
     73 
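// Middle stage of the 128-point complex FFT: the same radix-4 butterfly
// applied with a stride of 8 floats. The loop below covers the
// sub-transforms whose twiddle factors are trivial (only cftmdl_wk1r is
// needed); the block that follows handles the ones requiring the full
// rdft_wk1/wk2/wk3 tables.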
static void cftmdl_128_neon(float* a) {
  int j;
  const int l = 8;
  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
  float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);

  for (j = 0; j < l; j += 2) {
    const float32x2_t a_00 = vld1_f32(&a[j + 0]);
    const float32x2_t a_08 = vld1_f32(&a[j + 8]);
    const float32x2_t a_32 = vld1_f32(&a[j + 32]);
    const float32x2_t a_40 = vld1_f32(&a[j + 40]);
    const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
    const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
    const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
    const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
    const float32x2_t a_16 = vld1_f32(&a[j + 16]);
    const float32x2_t a_24 = vld1_f32(&a[j + 24]);
    const float32x2_t a_48 = vld1_f32(&a[j + 48]);
    const float32x2_t a_56 = vld1_f32(&a[j + 56]);
    const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
    const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
    const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
    const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
    const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
    const float32x4_t x1_x3_add =
        vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
    const float32x4_t x1_x3_sub =
        vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
    const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0);
    const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0);
    const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s);
    const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1);
    const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1);
    const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s);
    const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as);
    const float32x4_t yy4 = vmulq_f32(wk1rv, yy0);
    const float32x4_t xx1_rev = vrev64q_f32(xx1);
    const float32x4_t yy4_rev = vrev64q_f32(yy4);

    vst1_f32(&a[j + 0], vget_low_f32(xx0));
    vst1_f32(&a[j + 32], vget_high_f32(xx0));
    vst1_f32(&a[j + 16], vget_low_f32(xx1));
    vst1_f32(&a[j + 48], vget_high_f32(xx1_rev));

    a[j + 48] = -a[j + 48];

    vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add));
    vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub));
    vst1_f32(&a[j + 40], vget_low_f32(yy4));
    vst1_f32(&a[j + 56], vget_high_f32(yy4_rev));
  }

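  // Sub-transforms starting at a[64] and a[96], processed in parallel with
  // the non-trivial twiddle factors at table offset k2 = 4.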
  {
    const int k = 64;
    const int k1 = 2;
    const int k2 = 2 * k1;
    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]);
    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]);
    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]);
    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]);
    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]);
    wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]);
    for (j = k; j < l + k; j += 2) {
      const float32x2_t a_00 = vld1_f32(&a[j + 0]);
      const float32x2_t a_08 = vld1_f32(&a[j + 8]);
      const float32x2_t a_32 = vld1_f32(&a[j + 32]);
      const float32x2_t a_40 = vld1_f32(&a[j + 40]);
      const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
      const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
      const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
      const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
      const float32x2_t a_16 = vld1_f32(&a[j + 16]);
      const float32x2_t a_24 = vld1_f32(&a[j + 24]);
      const float32x2_t a_48 = vld1_f32(&a[j + 48]);
      const float32x2_t a_56 = vld1_f32(&a[j + 56]);
      const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
      const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
      const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
      const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
      const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
      const float32x4_t x1_x3_add =
          vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
      const float32x4_t x1_x3_sub =
          vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
      float32x4_t xx4 = vmulq_f32(wk2rv, xx1);
      float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add);
      float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub);
      xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1));
      xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add));
      xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub));

      vst1_f32(&a[j + 0], vget_low_f32(xx));
      vst1_f32(&a[j + 32], vget_high_f32(xx));
      vst1_f32(&a[j + 16], vget_low_f32(xx4));
      vst1_f32(&a[j + 48], vget_high_f32(xx4));
      vst1_f32(&a[j + 8], vget_low_f32(xx12));
      vst1_f32(&a[j + 40], vget_high_f32(xx12));
      vst1_f32(&a[j + 24], vget_low_f32(xx22));
      vst1_f32(&a[j + 56], vget_high_f32(xx22));
    }
  }
}
    180 
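// Reverses the order of the four float lanes: A B C D -> D C B A.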
__inline static float32x4_t reverse_order_f32x4(float32x4_t in) {
  // A B C D -> C D A B
  const float32x4_t rev = vcombine_f32(vget_high_f32(in), vget_low_f32(in));
  // C D A B -> D C B A
  return vrev64q_f32(rev);
}
    187 
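// Post-processing of the forward 128-point real FFT: combines mirrored
// element pairs (j2, 128 - j2) of the half-length complex FFT output so
// that a[] holds the spectrum of the real input, using the trigonometric
// table rdft_w. Four pairs are handled per vector iteration; a scalar
// loop finishes the remainder.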
static void rftfsub_128_neon(float* a) {
  const float* c = rdft_w + 32;
  int j1, j2;
  const float32x4_t mm_half = vdupq_n_f32(0.5f);

  // Vectorized code (four at once).
  // Note: commented numbers are indexes for the first iteration of the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const float32x4_t c_j1 = vld1q_f32(&c[j1]);          //  1,  2,  3,  4,
    const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]);     // 28, 29, 30, 31,
    const float32x4_t wkrt = vsubq_f32(mm_half, c_k1);   // 28, 29, 30, 31,
    const float32x4_t wkr_ = reverse_order_f32x4(wkrt);  // 31, 30, 29, 28,
    const float32x4_t wki_ = c_j1;                       //  1,  2,  3,  4,
    // Load and shuffle 'a'.
    //   2,   4,   6,   8,   3,   5,   7,   9
    float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
    // 120, 122, 124, 126, 121, 123, 125, 127,
    const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
    // 126, 124, 122, 120
    const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
    // 127, 125, 123, 121
    const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
    // Calculate 'x'.
    const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
    // 2-126, 4-124, 6-122, 8-120,
    const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
    // 3-127, 5-125, 7-123, 9-121,
    // Calculate product into 'y'.
    //    yr = wkr * xr - wki * xi;
    //    yi = wkr * xi + wki * xr;
    const float32x4_t a_ = vmulq_f32(wkr_, xr_);
    const float32x4_t b_ = vmulq_f32(wki_, xi_);
    const float32x4_t c_ = vmulq_f32(wkr_, xi_);
    const float32x4_t d_ = vmulq_f32(wki_, xr_);
    const float32x4_t yr_ = vsubq_f32(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
    const float32x4_t yi_ = vaddq_f32(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
    // Update 'a'.
    //    a[j2 + 0] -= yr;
    //    a[j2 + 1] -= yi;
    //    a[k2 + 0] += yr;
    //    a[k2 + 1] -= yi;
    // 126, 124, 122, 120,
    const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
    // 127, 125, 123, 121,
    const float32x4_t a_k2_p1n = vsubq_f32(a_k2_p1, yi_);
    // Shuffle in right order and store.
    const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
    const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
    // 124, 125, 126, 127, 120, 121, 122, 123
    const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
    //   2,   4,   6,   8,
    a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
    //   3,   5,   7,   9,
    a_j2_p.val[1] = vsubq_f32(a_j2_p.val[1], yi_);
    //   2,   3,   4,   5,   6,   7,   8,   9,
    vst2q_f32(&a[0 + j2], a_j2_p);

    vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
    vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
  }

  // Scalar code for the remaining items.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    const int k2 = 128 - j2;
    const int k1 = 32 - j1;
    const float wkr = 0.5f - c[k1];
    const float wki = c[j1];
    const float xr = a[j2 + 0] - a[k2 + 0];
    const float xi = a[j2 + 1] + a[k2 + 1];
    const float yr = wkr * xr - wki * xi;
    const float yi = wkr * xi + wki * xr;
    a[j2 + 0] -= yr;
    a[j2 + 1] -= yi;
    a[k2 + 0] += yr;
    a[k2 + 1] -= yi;
  }
}
    266 
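// Counterpart of rftfsub_128_neon for the inverse transform: identical
// structure, but with the signs in the y products flipped and the
// imaginary parts negated on update (note a[1] and a[65]).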
static void rftbsub_128_neon(float* a) {
  const float* c = rdft_w + 32;
  int j1, j2;
  const float32x4_t mm_half = vdupq_n_f32(0.5f);

  a[1] = -a[1];
  // Vectorized code (four at once).
  // Note: commented numbers are indexes for the first iteration of the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const float32x4_t c_j1 = vld1q_f32(&c[j1]);          //  1,  2,  3,  4,
    const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]);     // 28, 29, 30, 31,
    const float32x4_t wkrt = vsubq_f32(mm_half, c_k1);   // 28, 29, 30, 31,
    const float32x4_t wkr_ = reverse_order_f32x4(wkrt);  // 31, 30, 29, 28,
    const float32x4_t wki_ = c_j1;                       //  1,  2,  3,  4,
    // Load and shuffle 'a'.
    //   2,   4,   6,   8,   3,   5,   7,   9
    float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
    // 120, 122, 124, 126, 121, 123, 125, 127,
    const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
    // 126, 124, 122, 120
    const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
    // 127, 125, 123, 121
    const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
    // Calculate 'x'.
    const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
    // 2-126, 4-124, 6-122, 8-120,
    const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
    // 3-127, 5-125, 7-123, 9-121,
    // Calculate product into 'y'.
    //    yr = wkr * xr + wki * xi;
    //    yi = wkr * xi - wki * xr;
    const float32x4_t a_ = vmulq_f32(wkr_, xr_);
    const float32x4_t b_ = vmulq_f32(wki_, xi_);
    const float32x4_t c_ = vmulq_f32(wkr_, xi_);
    const float32x4_t d_ = vmulq_f32(wki_, xr_);
    const float32x4_t yr_ = vaddq_f32(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
    const float32x4_t yi_ = vsubq_f32(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
    // Update 'a'.
    //    a[j2 + 0] -= yr;
    //    a[j2 + 1] = yi - a[j2 + 1];
    //    a[k2 + 0] += yr;
    //    a[k2 + 1] = yi - a[k2 + 1];
    // 126, 124, 122, 120,
    const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
    // 127, 125, 123, 121,
    const float32x4_t a_k2_p1n = vsubq_f32(yi_, a_k2_p1);
    // Shuffle in right order and store.
    const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
    const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
    // 124, 125, 126, 127, 120, 121, 122, 123
    const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
    //   2,   4,   6,   8,
    a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
    //   3,   5,   7,   9,
    a_j2_p.val[1] = vsubq_f32(yi_, a_j2_p.val[1]);
    //   2,   3,   4,   5,   6,   7,   8,   9,
    vst2q_f32(&a[0 + j2], a_j2_p);

    vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
    vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
  }

  // Scalar code for the remaining items.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    const int k2 = 128 - j2;
    const int k1 = 32 - j1;
    const float wkr = 0.5f - c[k1];
    const float wki = c[j1];
    const float xr = a[j2 + 0] - a[k2 + 0];
    const float xi = a[j2 + 1] + a[k2 + 1];
    const float yr = wkr * xr + wki * xi;
    const float yi = wkr * xi - wki * xr;
    a[j2 + 0] -= yr;
    a[j2 + 1] = yi - a[j2 + 1];
    a[k2 + 0] += yr;
    a[k2 + 1] = yi - a[k2 + 1];
  }
  a[65] = -a[65];
}
    348 
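// Installs the NEON implementations into the function pointers declared in
// aec_rdft.h; expected to be called once from aec_rdft_init() when run-time
// CPU detection reports NEON support.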
void aec_rdft_init_neon(void) {
  cft1st_128 = cft1st_128_neon;
  cftmdl_128 = cftmdl_128_neon;
  rftfsub_128 = rftfsub_128_neon;
  rftbsub_128 = rftbsub_128_neon;
}