/*
 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

/*
 * The rdft AEC algorithm, neon version of speed-critical functions.
 *
 * Based on the sse2 version.
 */

#include "webrtc/modules/audio_processing/aec/aec_rdft.h"

#include <arm_neon.h>

// Sign mask (-1, 1, -1, 1): multiplying a re/im-swapped vector by this mask
// inside a fused multiply-accumulate computes x -+ i*y in one instruction.
static const ALIGN16_BEG float ALIGN16_END
    k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};

static void cft1st_128_neon(float* a) {
  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
  int j, k2;

  for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
    // Each iteration loads eight complex values and processes two
    // interleaved radix-4 butterflies: one in the low 64-bit halves
    // (a[j]..a[j+7]) and one in the high halves (a[j+8]..a[j+15]).
    float32x4_t a00v = vld1q_f32(&a[j + 0]);
    float32x4_t a04v = vld1q_f32(&a[j + 4]);
    float32x4_t a08v = vld1q_f32(&a[j + 8]);
    float32x4_t a12v = vld1q_f32(&a[j + 12]);
    float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));
    float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));
    float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v));
    float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v));
    const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]);
    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]);
    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]);
    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]);
    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]);
    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]);
    float32x4_t x0v = vaddq_f32(a01v, a23v);
    const float32x4_t x1v = vsubq_f32(a01v, a23v);
    const float32x4_t x2v = vaddq_f32(a45v, a67v);
    const float32x4_t x3v = vsubq_f32(a45v, a67v);
    const float32x4_t x3w = vrev64q_f32(x3v);
    float32x4_t x0w;
    a01v = vaddq_f32(x0v, x2v);
    x0v = vsubq_f32(x0v, x2v);
    x0w = vrev64q_f32(x0v);
    a45v = vmulq_f32(wk2rv, x0v);
    a45v = vmlaq_f32(a45v, wk2iv, x0w);
    x0v = vmlaq_f32(x1v, x3w, vec_swap_sign);
    x0w = vrev64q_f32(x0v);
    a23v = vmulq_f32(wk1rv, x0v);
    a23v = vmlaq_f32(a23v, wk1iv, x0w);
    x0v = vmlsq_f32(x1v, x3w, vec_swap_sign);
    x0w = vrev64q_f32(x0v);
    a67v = vmulq_f32(wk3rv, x0v);
    a67v = vmlaq_f32(a67v, wk3iv, x0w);
    a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v));
    a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));
    a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));
    a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));
    vst1q_f32(&a[j + 0], a00v);
    vst1q_f32(&a[j + 4], a04v);
    vst1q_f32(&a[j + 8], a08v);
    vst1q_f32(&a[j + 12], a12v);
  }
}
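// Reading the vector ops above back as scalar math: with the four complex
// inputs of one butterfly written c0..c3 and
//
//   x0 = c0 + c1;  x1 = c0 - c1;
//   x2 = c2 + c3;  x3 = c2 - c3;
//
// each butterfly computes the standard radix-4 decimation step
//
//   out0 = x0 + x2;
//   out1 = wk1 * (x1 + i * x3);
//   out2 = wk2 * (x0 - x2);
//   out3 = wk3 * (x1 - i * x3);
//
// The complex multiplies are expressed as wkNr * x + wkNi * rev(x), which
// assumes the rdft_wkNi tables store (-wki, wki) pairs, as in the sse2
// version of this file.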
static void cftmdl_128_neon(float* a) {
  int j;
  const int l = 8;
  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
  float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);

  for (j = 0; j < l; j += 2) {
    const float32x2_t a_00 = vld1_f32(&a[j + 0]);
    const float32x2_t a_08 = vld1_f32(&a[j + 8]);
    const float32x2_t a_32 = vld1_f32(&a[j + 32]);
    const float32x2_t a_40 = vld1_f32(&a[j + 40]);
    const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
    const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
    const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
    const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
    const float32x2_t a_16 = vld1_f32(&a[j + 16]);
    const float32x2_t a_24 = vld1_f32(&a[j + 24]);
    const float32x2_t a_48 = vld1_f32(&a[j + 48]);
    const float32x2_t a_56 = vld1_f32(&a[j + 56]);
    const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
    const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
    const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
    const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
    const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
    const float32x4_t x1_x3_add =
        vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
    const float32x4_t x1_x3_sub =
        vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
    const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0);
    const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0);
    const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s);
    const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1);
    const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1);
    const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s);
    const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as);
    const float32x4_t yy4 = vmulq_f32(wk1rv, yy0);
    const float32x4_t xx1_rev = vrev64q_f32(xx1);
    const float32x4_t yy4_rev = vrev64q_f32(yy4);

    vst1_f32(&a[j + 0], vget_low_f32(xx0));
    vst1_f32(&a[j + 32], vget_high_f32(xx0));
    vst1_f32(&a[j + 16], vget_low_f32(xx1));
    vst1_f32(&a[j + 48], vget_high_f32(xx1_rev));

    // Scalar fix-up: the reversed vector store above cannot apply the sign
    // flip that the first element of the pair needs.
    a[j + 48] = -a[j + 48];

    vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add));
    vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub));
    vst1_f32(&a[j + 40], vget_low_f32(yy4));
    vst1_f32(&a[j + 56], vget_high_f32(yy4_rev));
  }

  {
    const int k = 64;
    const int k1 = 2;
    const int k2 = 2 * k1;
    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]);
    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]);
    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]);
    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]);
    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]);
    wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]);
    for (j = k; j < l + k; j += 2) {
      const float32x2_t a_00 = vld1_f32(&a[j + 0]);
      const float32x2_t a_08 = vld1_f32(&a[j + 8]);
      const float32x2_t a_32 = vld1_f32(&a[j + 32]);
      const float32x2_t a_40 = vld1_f32(&a[j + 40]);
      const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
      const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
      const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
      const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
      const float32x2_t a_16 = vld1_f32(&a[j + 16]);
      const float32x2_t a_24 = vld1_f32(&a[j + 24]);
      const float32x2_t a_48 = vld1_f32(&a[j + 48]);
      const float32x2_t a_56 = vld1_f32(&a[j + 56]);
      const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
      const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
      const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
      const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
      const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
      const float32x4_t x1_x3_add =
          vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
      const float32x4_t x1_x3_sub =
          vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
      float32x4_t xx4 = vmulq_f32(wk2rv, xx1);
      float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add);
      float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub);
      xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1));
      xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add));
      xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub));

      vst1_f32(&a[j + 0], vget_low_f32(xx));
      vst1_f32(&a[j + 32], vget_high_f32(xx));
      vst1_f32(&a[j + 16], vget_low_f32(xx4));
      vst1_f32(&a[j + 48], vget_high_f32(xx4));
      vst1_f32(&a[j + 8], vget_low_f32(xx12));
      vst1_f32(&a[j + 40], vget_high_f32(xx12));
      vst1_f32(&a[j + 24], vget_low_f32(xx22));
      vst1_f32(&a[j + 56], vget_high_f32(xx22));
    }
  }
}
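// cftmdl_128_neon performs the same radix-4 butterfly as cft1st_128_neon,
// but across elements 8 floats apart: each butterfly combines a[j],
// a[j + 8], a[j + 16] and a[j + 24]. The low vector lanes process the
// butterfly at offset 0 and the high lanes the one at offset 32. In the
// first loop (the j < 8 block) the offset-0 butterfly needs no twiddles,
// while the offset-32 butterfly's wk1 twiddle has equal real and imaginary
// parts (cos(pi/4), per the dedicated cftmdl_wk1r table), so a single real
// vmulq_f32 replaces a full complex multiply. The second loop (the j >= 64
// block) uses the general rdft_wk* twiddle tables.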
__inline static float32x4_t reverse_order_f32x4(float32x4_t in) {
  // A B C D -> C D A B
  const float32x4_t rev = vcombine_f32(vget_high_f32(in), vget_low_f32(in));
  // C D A B -> D C B A
  return vrev64q_f32(rev);
}

static void rftfsub_128_neon(float* a) {
  const float* c = rdft_w + 32;
  int j1, j2;
  const float32x4_t mm_half = vdupq_n_f32(0.5f);

  // Vectorized code (four at once).
  // Note: commented numbers are indexes for the first iteration of the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const float32x4_t c_j1 = vld1q_f32(&c[j1]);          //  1,  2,  3,  4,
    const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]);     // 28, 29, 30, 31,
    const float32x4_t wkrt = vsubq_f32(mm_half, c_k1);   // 28, 29, 30, 31,
    const float32x4_t wkr_ = reverse_order_f32x4(wkrt);  // 31, 30, 29, 28,
    const float32x4_t wki_ = c_j1;                       //  1,  2,  3,  4,
    // Load and shuffle 'a'.
    //   2,   4,   6,   8,   3,   5,   7,   9
    float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
    // 120, 122, 124, 126, 121, 123, 125, 127,
    const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
    // 126, 124, 122, 120
    const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
    // 127, 125, 123, 121
    const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
    // Calculate 'x'.
    const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
    // 2-126, 4-124, 6-122, 8-120,
    const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
    // 3-127, 5-125, 7-123, 9-121,
    // Calculate product into 'y'.
    //    yr = wkr * xr - wki * xi;
    //    yi = wkr * xi + wki * xr;
    const float32x4_t a_ = vmulq_f32(wkr_, xr_);
    const float32x4_t b_ = vmulq_f32(wki_, xi_);
    const float32x4_t c_ = vmulq_f32(wkr_, xi_);
    const float32x4_t d_ = vmulq_f32(wki_, xr_);
    const float32x4_t yr_ = vsubq_f32(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
    const float32x4_t yi_ = vaddq_f32(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
    // Update 'a'.
    //    a[j2 + 0] -= yr;
    //    a[j2 + 1] -= yi;
    //    a[k2 + 0] += yr;
    //    a[k2 + 1] -= yi;
    // 126, 124, 122, 120,
    const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
    // 127, 125, 123, 121,
    const float32x4_t a_k2_p1n = vsubq_f32(a_k2_p1, yi_);
    // Shuffle in right order and store.
    const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
    const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
    // 124, 125, 126, 127, 120, 121, 122, 123
    const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
    // 2, 4, 6, 8,
    a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
    // 3, 5, 7, 9,
    a_j2_p.val[1] = vsubq_f32(a_j2_p.val[1], yi_);
    // 2, 3, 4, 5, 6, 7, 8, 9,
    vst2q_f32(&a[0 + j2], a_j2_p);

    vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
    vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
  }

  // Scalar code for the remaining items.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    const int k2 = 128 - j2;
    const int k1 = 32 - j1;
    const float wkr = 0.5f - c[k1];
    const float wki = c[j1];
    const float xr = a[j2 + 0] - a[k2 + 0];
    const float xi = a[j2 + 1] + a[k2 + 1];
    const float yr = wkr * xr - wki * xi;
    const float yi = wkr * xi + wki * xr;
    a[j2 + 0] -= yr;
    a[j2 + 1] -= yi;
    a[k2 + 0] += yr;
    a[k2 + 1] -= yi;
  }
}
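// The vector loop and its scalar tail above implement the same update; the
// vector body handles four mirrored (j2, k2 = 128 - j2) pairs per
// iteration. For the first iteration, j2 walks {2, 4, 6, 8} while k2 walks
// {126, 124, 122, 120}, which is why the k2 side is loaded with vld2q_f32
// from a[122 - j2] and lane-reversed before use, then re-reversed and
// re-interleaved with vzipq_f32 before being stored back.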
static void rftbsub_128_neon(float* a) {
  const float* c = rdft_w + 32;
  int j1, j2;
  const float32x4_t mm_half = vdupq_n_f32(0.5f);

  a[1] = -a[1];
  // Vectorized code (four at once).
  // Note: commented numbers are indexes for the first iteration of the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const float32x4_t c_j1 = vld1q_f32(&c[j1]);          //  1,  2,  3,  4,
    const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]);     // 28, 29, 30, 31,
    const float32x4_t wkrt = vsubq_f32(mm_half, c_k1);   // 28, 29, 30, 31,
    const float32x4_t wkr_ = reverse_order_f32x4(wkrt);  // 31, 30, 29, 28,
    const float32x4_t wki_ = c_j1;                       //  1,  2,  3,  4,
    // Load and shuffle 'a'.
    //   2,   4,   6,   8,   3,   5,   7,   9
    float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
    // 120, 122, 124, 126, 121, 123, 125, 127,
    const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
    // 126, 124, 122, 120
    const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
    // 127, 125, 123, 121
    const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
    // Calculate 'x'.
    const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
    // 2-126, 4-124, 6-122, 8-120,
    const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
    // 3-127, 5-125, 7-123, 9-121,
    // Calculate product into 'y'.
    //    yr = wkr * xr + wki * xi;
    //    yi = wkr * xi - wki * xr;
    const float32x4_t a_ = vmulq_f32(wkr_, xr_);
    const float32x4_t b_ = vmulq_f32(wki_, xi_);
    const float32x4_t c_ = vmulq_f32(wkr_, xi_);
    const float32x4_t d_ = vmulq_f32(wki_, xr_);
    const float32x4_t yr_ = vaddq_f32(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
    const float32x4_t yi_ = vsubq_f32(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
    // Update 'a'.
    //    a[j2 + 0] -= yr;
    //    a[j2 + 1] = yi - a[j2 + 1];
    //    a[k2 + 0] += yr;
    //    a[k2 + 1] = yi - a[k2 + 1];
    // 126, 124, 122, 120,
    const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
    // 127, 125, 123, 121,
    const float32x4_t a_k2_p1n = vsubq_f32(yi_, a_k2_p1);
    // Shuffle in right order and store.
    const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
    const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
    // 124, 125, 126, 127, 120, 121, 122, 123
    const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
    // 2, 4, 6, 8,
    a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
    // 3, 5, 7, 9,
    a_j2_p.val[1] = vsubq_f32(yi_, a_j2_p.val[1]);
    // 2, 3, 4, 5, 6, 7, 8, 9,
    vst2q_f32(&a[0 + j2], a_j2_p);

    vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
    vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
  }

  // Scalar code for the remaining items.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    const int k2 = 128 - j2;
    const int k1 = 32 - j1;
    const float wkr = 0.5f - c[k1];
    const float wki = c[j1];
    const float xr = a[j2 + 0] - a[k2 + 0];
    const float xi = a[j2 + 1] + a[k2 + 1];
    const float yr = wkr * xr + wki * xi;
    const float yi = wkr * xi - wki * xr;
    a[j2 + 0] = a[j2 + 0] - yr;
    a[j2 + 1] = yi - a[j2 + 1];
    a[k2 + 0] = yr + a[k2 + 0];
    a[k2 + 1] = yi - a[k2 + 1];
  }
  a[65] = -a[65];
}
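// rftbsub_128_neon is the backward-transform counterpart of
// rftfsub_128_neon: the weights wkr/wki and the load/store shuffles are
// identical, but 'y' is formed with the conjugate signs
// (yr = wkr * xr + wki * xi, yi = wkr * xi - wki * xr), the imaginary
// parts are written back negated (yi - a[...] instead of a[...] - yi), and
// a[1] and a[65] get explicit sign flips at entry and exit.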
void aec_rdft_init_neon(void) {
  cft1st_128 = cft1st_128_neon;
  cftmdl_128 = cftmdl_128_neon;
  rftfsub_128 = rftfsub_128_neon;
  rftbsub_128 = rftbsub_128_neon;
}
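// A minimal sketch of how this initializer is typically invoked; the exact
// guard macro and dispatcher live in aec_rdft.c and may differ (the
// WEBRTC_ARCH_ARM_NEON name below is an assumption, mirroring how the sse2
// override is selected):
//
//   void aec_rdft_init(void) {
//     // ... assign the generic C implementations first ...
//   #if defined(WEBRTC_ARCH_ARM_NEON)
//     aec_rdft_init_neon();  // Override with the NEON versions.
//   #endif
//   }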