/*
 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "typedefs.h"

#if defined(WEBRTC_USE_SSE2)
#include <emmintrin.h>

#include "aec_rdft.h"

static const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] =
    {-1.f, 1.f, -1.f, 1.f};

static void cft1st_128_SSE2(float *a) {
  const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
  int j, k2;

  for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
    __m128 a00v = _mm_loadu_ps(&a[j + 0]);
    __m128 a04v = _mm_loadu_ps(&a[j + 4]);
    __m128 a08v = _mm_loadu_ps(&a[j + 8]);
    __m128 a12v = _mm_loadu_ps(&a[j + 12]);
    __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2));
    __m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3, 2));

    const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]);
    const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]);
    const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]);
    const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]);
    const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]);
    const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]);
    __m128 x0v = _mm_add_ps(a01v, a23v);
    const __m128 x1v = _mm_sub_ps(a01v, a23v);
    const __m128 x2v = _mm_add_ps(a45v, a67v);
    const __m128 x3v = _mm_sub_ps(a45v, a67v);
    __m128 x0w;
    a01v = _mm_add_ps(x0v, x2v);
    x0v = _mm_sub_ps(x0v, x2v);
    x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
    {
      const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
      const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
      a45v = _mm_add_ps(a45_0v, a45_1v);
    }
    {
      __m128 a23_0v, a23_1v;
      const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0, 1));
      const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
      x0v = _mm_add_ps(x1v, x3s);
      x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
      a23_0v = _mm_mul_ps(wk1rv, x0v);
      a23_1v = _mm_mul_ps(wk1iv, x0w);
      a23v = _mm_add_ps(a23_0v, a23_1v);

      x0v = _mm_sub_ps(x1v, x3s);
      x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
    }
    {
      const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
      const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
      a67v = _mm_add_ps(a67_0v, a67_1v);
    }

    a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1, 0));
    a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0));
    a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2));
    a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2));
    _mm_storeu_ps(&a[j + 0], a00v);
    _mm_storeu_ps(&a[j + 4], a04v);
    _mm_storeu_ps(&a[j + 8], a08v);
    _mm_storeu_ps(&a[j + 12], a12v);
  }
}
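
// Added summary (inferred from the code below): cftmdl_128_SSE2 performs a
// middle radix-4 butterfly stage of the 128-point FFT. The first loop handles
// the sub-blocks whose twiddle factors are trivial apart from cftmdl_wk1r;
// the second pass applies twiddles loaded from the rdft_wk* tables.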
static void cftmdl_128_SSE2(float *a) {
  const int l = 8;
  const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
  int j0;

  __m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
  for (j0 = 0; j0 < l; j0 += 2) {
    const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
    const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
    const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
    const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
    const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
                                          _mm_castsi128_ps(a_32),
                                          _MM_SHUFFLE(1, 0, 1, 0));
    const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
                                          _mm_castsi128_ps(a_40),
                                          _MM_SHUFFLE(1, 0, 1, 0));
    __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
    const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);

    const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
    const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
    const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
    const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
    const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
                                          _mm_castsi128_ps(a_48),
                                          _MM_SHUFFLE(1, 0, 1, 0));
    const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
                                          _mm_castsi128_ps(a_56),
                                          _MM_SHUFFLE(1, 0, 1, 0));
    const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
    const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);

    const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);

    const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(
        _mm_shuffle_epi32(_mm_castps_si128(x3r0_3i0_3r1_x3i1),
                          _MM_SHUFFLE(2, 3, 0, 1)));
    const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
    const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
    const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);

    const __m128 yy0 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub,
                                      _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 yy1 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub,
                                      _MM_SHUFFLE(3, 3, 3, 3));
    const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
    const __m128 yy3 = _mm_add_ps(yy0, yy2);
    const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);

    _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
    _mm_storel_epi64((__m128i*)&a[j0 + 32],
                     _mm_shuffle_epi32(_mm_castps_si128(xx0),
                                       _MM_SHUFFLE(3, 2, 3, 2)));

    _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
    _mm_storel_epi64((__m128i*)&a[j0 + 48],
                     _mm_shuffle_epi32(_mm_castps_si128(xx1),
                                       _MM_SHUFFLE(2, 3, 2, 3)));
    a[j0 + 48] = -a[j0 + 48];

    _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
    _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));

    _mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
    _mm_storel_epi64((__m128i*)&a[j0 + 56],
                     _mm_shuffle_epi32(_mm_castps_si128(yy4),
                                       _MM_SHUFFLE(2, 3, 2, 3)));
  }
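
  // Added note: this second pass covers j0 in [64, 72); its twiddle factors
  // are non-trivial, so they are loaded from the rdft_wk* tables (k2 == 4).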
  {
    int k = 64;
    int k1 = 2;
    int k2 = 2 * k1;
    const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]);
    const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]);
    const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]);
    const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]);
    const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]);
    wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]);
    for (j0 = k; j0 < l + k; j0 += 2) {
      const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
      const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
      const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
      const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
      const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
                                            _mm_castsi128_ps(a_32),
                                            _MM_SHUFFLE(1, 0, 1, 0));
      const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
                                            _mm_castsi128_ps(a_40),
                                            _MM_SHUFFLE(1, 0, 1, 0));
      __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
      const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);

      const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
      const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
      const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
      const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
      const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
                                            _mm_castsi128_ps(a_48),
                                            _MM_SHUFFLE(1, 0, 1, 0));
      const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
                                            _mm_castsi128_ps(a_56),
                                            _MM_SHUFFLE(1, 0, 1, 0));
      const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
      const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);

      const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const __m128 xx2 = _mm_mul_ps(xx1, wk2rv);
      const __m128 xx3 = _mm_mul_ps(wk2iv,
          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1),
                                             _MM_SHUFFLE(2, 3, 0, 1))));
      const __m128 xx4 = _mm_add_ps(xx2, xx3);

      const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(
          _mm_shuffle_epi32(_mm_castps_si128(x3r0_3i0_3r1_x3i1),
                            _MM_SHUFFLE(2, 3, 0, 1)));
      const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
      const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
      const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);

      const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
      const __m128 xx11 = _mm_mul_ps(wk1iv,
          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add),
                                             _MM_SHUFFLE(2, 3, 0, 1))));
      const __m128 xx12 = _mm_add_ps(xx10, xx11);

      const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
      const __m128 xx21 = _mm_mul_ps(wk3iv,
          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub),
                                             _MM_SHUFFLE(2, 3, 0, 1))));
      const __m128 xx22 = _mm_add_ps(xx20, xx21);

      _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
      _mm_storel_epi64((__m128i*)&a[j0 + 32],
                       _mm_shuffle_epi32(_mm_castps_si128(xx),
                                         _MM_SHUFFLE(3, 2, 3, 2)));

      _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
      _mm_storel_epi64((__m128i*)&a[j0 + 48],
                       _mm_shuffle_epi32(_mm_castps_si128(xx4),
                                         _MM_SHUFFLE(3, 2, 3, 2)));

      _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
      _mm_storel_epi64((__m128i*)&a[j0 + 40],
                       _mm_shuffle_epi32(_mm_castps_si128(xx12),
                                         _MM_SHUFFLE(3, 2, 3, 2)));

      _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
      _mm_storel_epi64((__m128i*)&a[j0 + 56],
                       _mm_shuffle_epi32(_mm_castps_si128(xx22),
                                         _MM_SHUFFLE(3, 2, 3, 2)));
    }
  }
}
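
// Added summary (inferred from the code below): rftfsub_128_SSE2 is the
// post-processing pass of the forward real FFT. It combines the mirrored
// entries a[j2], a[j2 + 1] and a[128 - j2], a[129 - j2] with weights derived
// from rdft_w, four (j1, j2) pairs per vector iteration; the scalar loop at
// the end spells out the reference arithmetic for the remaining pairs.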
static void rftfsub_128_SSE2(float *a) {
  const float *c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;

  static const ALIGN16_BEG float ALIGN16_END k_half[4] =
      {0.5f, 0.5f, 0.5f, 0.5f};
  const __m128 mm_half = _mm_load_ps(k_half);

  // Vectorized code (four at once).
  // Note: commented numbers are indexes for the first iteration of the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4,
    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31,
    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31,
    const __m128 wkr_ =
        _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3));  // 31, 30, 29, 28,
    const __m128 wki_ = c_j1;  //  1,  2,  3,  4,
    // Load and shuffle 'a'.
    const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]);    //   2,   3,   4,   5,
    const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]);    //   6,   7,   8,   9,
    const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
    const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
    const __m128 a_j2_p0 = _mm_shuffle_ps(a_j2_0, a_j2_4,
                                          _MM_SHUFFLE(2, 0, 2, 0));  // 2, 4, 6, 8,
    const __m128 a_j2_p1 = _mm_shuffle_ps(a_j2_0, a_j2_4,
                                          _MM_SHUFFLE(3, 1, 3, 1));  // 3, 5, 7, 9,
    const __m128 a_k2_p0 = _mm_shuffle_ps(a_k2_4, a_k2_0,
                                          _MM_SHUFFLE(0, 2, 0, 2));  // 126, 124, 122, 120,
    const __m128 a_k2_p1 = _mm_shuffle_ps(a_k2_4, a_k2_0,
                                          _MM_SHUFFLE(1, 3, 1, 3));  // 127, 125, 123, 121,
    // Calculate 'x'.
    const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
                       // 2-126, 4-124, 6-122, 8-120,
    const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
                       // 3-127, 5-125, 7-123, 9-121,
    // Calculate product into 'y'.
    //    yr = wkr * xr - wki * xi;
    //    yi = wkr * xi + wki * xr;
    const __m128 a_ = _mm_mul_ps(wkr_, xr_);
    const __m128 b_ = _mm_mul_ps(wki_, xi_);
    const __m128 c_ = _mm_mul_ps(wkr_, xi_);
    const __m128 d_ = _mm_mul_ps(wki_, xr_);
    const __m128 yr_ = _mm_sub_ps(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
    const __m128 yi_ = _mm_add_ps(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
    // Update 'a'.
    //    a[j2 + 0] -= yr;
    //    a[j2 + 1] -= yi;
    //    a[k2 + 0] += yr;
    //    a[k2 + 1] -= yi;
    const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
    const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_);  //   3,   5,   7,   9,
    const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
    const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_);  // 127, 125, 123, 121,
    // Shuffle in right order and store.
    const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
                           // 2, 3, 4, 5,
    const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
                           // 6, 7, 8, 9,
    const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
                            // 122, 123, 120, 121,
    const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
                            // 126, 127, 124, 125,
    const __m128 a_k2_0n = _mm_shuffle_ps(a_k2_0nt, a_k2_0nt,
                                          _MM_SHUFFLE(1, 0, 3, 2));  // 120, 121, 122, 123,
    const __m128 a_k2_4n = _mm_shuffle_ps(a_k2_4nt, a_k2_4nt,
                                          _MM_SHUFFLE(1, 0, 3, 2));  // 124, 125, 126, 127,
    _mm_storeu_ps(&a[0 + j2], a_j2_0n);
    _mm_storeu_ps(&a[4 + j2], a_j2_4n);
    _mm_storeu_ps(&a[122 - j2], a_k2_0n);
    _mm_storeu_ps(&a[126 - j2], a_k2_4n);
  }
  // Scalar code for the remaining items.
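  // Added note: the vector loop exits once j2 + 7 >= 64, leaving the last
  // three pairs, j2 = 58, 60, 62 (j1 = 29, 30, 31), to the loop below.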
  for (; j2 < 64; j1 += 1, j2 += 2) {
    k2 = 128 - j2;
    k1 = 32 - j1;
    wkr = 0.5f - c[k1];
    wki = c[j1];
    xr = a[j2 + 0] - a[k2 + 0];
    xi = a[j2 + 1] + a[k2 + 1];
    yr = wkr * xr - wki * xi;
    yi = wkr * xi + wki * xr;
    a[j2 + 0] -= yr;
    a[j2 + 1] -= yi;
    a[k2 + 0] += yr;
    a[k2 + 1] -= yi;
  }
}
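
// Added summary (inferred from the code below): rftbsub_128_SSE2 mirrors
// rftfsub_128_SSE2 for the inverse direction: the signs on the wki products
// are flipped (a conjugated twiddle) and a[1]/a[65] are negated on entry and
// exit respectively.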
static void rftbsub_128_SSE2(float *a) {
  const float *c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;

  static const ALIGN16_BEG float ALIGN16_END k_half[4] =
      {0.5f, 0.5f, 0.5f, 0.5f};
  const __m128 mm_half = _mm_load_ps(k_half);

  a[1] = -a[1];
  // Vectorized code (four at once).
  // Note: commented numbers are indexes for the first iteration of the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4,
    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31,
    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31,
    const __m128 wkr_ =
        _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3));  // 31, 30, 29, 28,
    const __m128 wki_ = c_j1;  //  1,  2,  3,  4,
    // Load and shuffle 'a'.
    const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]);    //   2,   3,   4,   5,
    const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]);    //   6,   7,   8,   9,
    const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
    const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
    const __m128 a_j2_p0 = _mm_shuffle_ps(a_j2_0, a_j2_4,
                                          _MM_SHUFFLE(2, 0, 2, 0));  // 2, 4, 6, 8,
    const __m128 a_j2_p1 = _mm_shuffle_ps(a_j2_0, a_j2_4,
                                          _MM_SHUFFLE(3, 1, 3, 1));  // 3, 5, 7, 9,
    const __m128 a_k2_p0 = _mm_shuffle_ps(a_k2_4, a_k2_0,
                                          _MM_SHUFFLE(0, 2, 0, 2));  // 126, 124, 122, 120,
    const __m128 a_k2_p1 = _mm_shuffle_ps(a_k2_4, a_k2_0,
                                          _MM_SHUFFLE(1, 3, 1, 3));  // 127, 125, 123, 121,
    // Calculate 'x'.
    const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
                       // 2-126, 4-124, 6-122, 8-120,
    const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
                       // 3-127, 5-125, 7-123, 9-121,
    // Calculate product into 'y'.
    //    yr = wkr * xr + wki * xi;
    //    yi = wkr * xi - wki * xr;
    const __m128 a_ = _mm_mul_ps(wkr_, xr_);
    const __m128 b_ = _mm_mul_ps(wki_, xi_);
    const __m128 c_ = _mm_mul_ps(wkr_, xi_);
    const __m128 d_ = _mm_mul_ps(wki_, xr_);
    const __m128 yr_ = _mm_add_ps(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
    const __m128 yi_ = _mm_sub_ps(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
    // Update 'a'.
    //    a[j2 + 0] = a[j2 + 0] - yr;
    //    a[j2 + 1] = yi - a[j2 + 1];
    //    a[k2 + 0] = yr + a[k2 + 0];
    //    a[k2 + 1] = yi - a[k2 + 1];
    const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
    const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1);  //   3,   5,   7,   9,
    const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
    const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1);  // 127, 125, 123, 121,
    // Shuffle in right order and store.
    const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
                           // 2, 3, 4, 5,
    const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
                           // 6, 7, 8, 9,
    const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
                            // 122, 123, 120, 121,
    const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
                            // 126, 127, 124, 125,
    const __m128 a_k2_0n = _mm_shuffle_ps(a_k2_0nt, a_k2_0nt,
                                          _MM_SHUFFLE(1, 0, 3, 2));  // 120, 121, 122, 123,
    const __m128 a_k2_4n = _mm_shuffle_ps(a_k2_4nt, a_k2_4nt,
                                          _MM_SHUFFLE(1, 0, 3, 2));  // 124, 125, 126, 127,
    _mm_storeu_ps(&a[0 + j2], a_j2_0n);
    _mm_storeu_ps(&a[4 + j2], a_j2_4n);
    _mm_storeu_ps(&a[122 - j2], a_k2_0n);
    _mm_storeu_ps(&a[126 - j2], a_k2_4n);
  }
  // Scalar code for the remaining items.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    k2 = 128 - j2;
    k1 = 32 - j1;
    wkr = 0.5f - c[k1];
    wki = c[j1];
    xr = a[j2 + 0] - a[k2 + 0];
    xi = a[j2 + 1] + a[k2 + 1];
    yr = wkr * xr + wki * xi;
    yi = wkr * xi - wki * xr;
    a[j2 + 0] = a[j2 + 0] - yr;
    a[j2 + 1] = yi - a[j2 + 1];
    a[k2 + 0] = yr + a[k2 + 0];
    a[k2 + 1] = yi - a[k2 + 1];
  }
  a[65] = -a[65];
}

// Installs the SSE2 variants into the function pointers used by the AEC's
// 128-point real FFT.
void aec_rdft_init_sse2(void) {
  cft1st_128 = cft1st_128_SSE2;
  cftmdl_128 = cftmdl_128_SSE2;
  rftfsub_128 = rftfsub_128_SSE2;
  rftbsub_128 = rftbsub_128_SSE2;
}

#endif  // WEBRTC_USE_SSE2