/*
 * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 *
 */

#include <emmintrin.h>
#include <assert.h>

/* OMX_F32 and OMX_INT are the OpenMAX DL scalar types. */
#include "dl/api/omxtypes.h"

/**
 * Two data formats are used internally by the FFT routines. The
 * interface to the main external FFT routines uses interleaved complex
 * values, where each real part is followed by its imaginary part.
 *
 * The other is the split format, where a complex vector is split such
 * that all of the real values are placed in the first half of the
 * vector and the corresponding imaginary values are placed in the
 * second half, in the same order. The conversion from interleaved
 * complex values to split format and back is transparent to the
 * external FFT interface.
 *
 * VComplex uses the split format.
 */

/** VComplex holds 4 complex float elements, with the real parts stored
 * in real and the corresponding imaginary parts in imag.
 */
typedef struct VComplex {
  __m128 real;
  __m128 imag;
} VC;

/* out = a * b */
static __inline void VC_MUL(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(_mm_mul_ps(a->real, b->real),
                         _mm_mul_ps(a->imag, b->imag));
  out->imag = _mm_add_ps(_mm_mul_ps(a->real, b->imag),
                         _mm_mul_ps(a->imag, b->real));
}

/* out = conj(a) * b */
static __inline void VC_CONJ_MUL(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(_mm_mul_ps(a->real, b->real),
                         _mm_mul_ps(a->imag, b->imag));
  out->imag = _mm_sub_ps(_mm_mul_ps(a->real, b->imag),
                         _mm_mul_ps(a->imag, b->real));
}

/* Scale a complex vector by a real factor. */
static __inline void VC_MUL_F(VC *out, VC *a, __m128 factor) {
  out->real = _mm_mul_ps(factor, a->real);
  out->imag = _mm_mul_ps(factor, a->imag);
}

/* out = a + b */
static __inline void VC_ADD(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(a->real, b->real);
  out->imag = _mm_add_ps(a->imag, b->imag);
}

/**
 * out.real = a.real + b.imag
 * out.imag = a.imag + b.real
 */
static __inline void VC_ADD_X(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(a->real, b->imag);
  out->imag = _mm_add_ps(b->real, a->imag);
}

/* VC_ADD and store the result in split format. */
static __inline void VC_ADD_STORE_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_add_ps(a->real, b->real));
  _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->imag));
}

/* out = a - b */
static __inline void VC_SUB(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(a->real, b->real);
  out->imag = _mm_sub_ps(a->imag, b->imag);
}

/**
 * out.real = a.real - b.imag
 * out.imag = b.real - a.imag
 */
static __inline void VC_SUB_X(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(a->real, b->imag);
  out->imag = _mm_sub_ps(b->real, a->imag);
}
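
/*
 * A minimal scalar sketch (illustrative only; this helper is not part
 * of the original API) of the per-lane arithmetic VC_MUL performs:
 *   (ar + i*ai) * (br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br),
 * applied independently to each of the four SSE lanes.
 */
static __inline void VC_MUL_scalar_sketch(
    OMX_F32 *out_re, OMX_F32 *out_im,          /* 4 floats each */
    const OMX_F32 *a_re, const OMX_F32 *a_im,  /* 4 floats each */
    const OMX_F32 *b_re, const OMX_F32 *b_im) {
  int i;
  for (i = 0; i < 4; ++i) {
    out_re[i] = a_re[i] * b_re[i] - a_im[i] * b_im[i];
    out_im[i] = a_re[i] * b_im[i] + a_im[i] * b_re[i];
  }
}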

/* VC_SUB and store the result in split format. */
static __inline void VC_SUB_STORE_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_sub_ps(a->real, b->real));
  _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->imag));
}

/**
 * out.real = a.real + b.real
 * out.imag = a.imag - b.imag
 */
static __inline void VC_ADD_SUB(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(a->real, b->real);
  out->imag = _mm_sub_ps(a->imag, b->imag);
}

/**
 * out.real = a.real + b.imag
 * out.imag = a.imag - b.real
 */
static __inline void VC_ADD_SUB_X(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(a->real, b->imag);
  out->imag = _mm_sub_ps(a->imag, b->real);
}

/* VC_ADD_SUB_X and store the result in split format. */
static __inline void VC_ADD_SUB_X_STORE_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_add_ps(a->real, b->imag));
  _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->real));
}

/**
 * out.real = a.real - b.real
 * out.imag = a.imag + b.imag
 */
static __inline void VC_SUB_ADD(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(a->real, b->real);
  out->imag = _mm_add_ps(a->imag, b->imag);
}

/**
 * out.real = a.real - b.imag
 * out.imag = a.imag + b.real
 */
static __inline void VC_SUB_ADD_X(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(a->real, b->imag);
  out->imag = _mm_add_ps(a->imag, b->real);
}

/* VC_SUB_ADD_X and store the result in split format. */
static __inline void VC_SUB_ADD_X_STORE_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_sub_ps(a->real, b->imag));
  _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->real));
}

/**
 * out[0] = in.real
 * out[offset] = in.imag
 */
static __inline void VC_STORE_SPLIT(
    OMX_F32 *out,
    VC *in,
    OMX_INT offset) {
  _mm_store_ps(out, in->real);
  _mm_store_ps(out + offset, in->imag);
}

/**
 * out.real = in[0];
 * out.imag = in[offset];
 */
static __inline void VC_LOAD_SPLIT(
    VC *out,
    const OMX_F32 *in,
    OMX_INT offset) {
  out->real = _mm_load_ps(in);
  out->imag = _mm_load_ps(in + offset);
}

/**
 * Vector complex unpack from split format to interleaved format.
 * out.real = [in.real[0], in.imag[0], in.real[1], in.imag[1]]
 * out.imag = [in.real[2], in.imag[2], in.real[3], in.imag[3]]
 */
static __inline void VC_UNPACK(VC *out, VC *in) {
  out->real = _mm_unpacklo_ps(in->real, in->imag);
  out->imag = _mm_unpackhi_ps(in->real, in->imag);
}

/**
 * Vector complex load from an interleaved complex array.
 * out.real = [in[0].real, in[1].real, in[2].real, in[3].real]
 * out.imag = [in[0].imag, in[1].imag, in[2].imag, in[3].imag]
 */
static __inline void VC_LOAD_INTERLEAVE(VC *out, const OMX_F32 *in) {
  __m128 temp0 = _mm_load_ps(in);
  __m128 temp1 = _mm_load_ps(in + 4);
  out->real = _mm_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0));
  out->imag = _mm_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1));
}

/**
 * Vector complex load in split format.
 * The input address need not be 16-byte aligned.
 */
static __inline void VC_LOADU_SPLIT(
    VC *out,
    const OMX_F32 *in,
    OMX_INT offset) {
  out->real = _mm_loadu_ps(in);
  out->imag = _mm_loadu_ps(in + offset);
}
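
/*
 * Illustrative round trip (a hypothetical helper, not part of the
 * original API): convert one group of four interleaved complex values
 * into split format using the loads/stores above. `in` and `out` are
 * assumed 16-byte aligned; `offset` is the distance, in floats, between
 * the real and imaginary halves of the split output.
 */
static __inline void VC_INTERLEAVE_TO_SPLIT_sketch(
    OMX_F32 *out,
    const OMX_F32 *in,
    OMX_INT offset) {
  VC v;
  VC_LOAD_INTERLEAVE(&v, in);      /* gather reals and imags */
  VC_STORE_SPLIT(out, &v, offset); /* write the two halves   */
}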

/* Reverse the order of the complex vector. */
static __inline void VC_REVERSE(VC *v) {
  v->real = _mm_shuffle_ps(v->real, v->real, _MM_SHUFFLE(0, 1, 2, 3));
  v->imag = _mm_shuffle_ps(v->imag, v->imag, _MM_SHUFFLE(0, 1, 2, 3));
}

/*
 * Vector complex store to an interleaved complex array.
 * out[0] = in.real[0]
 * out[1] = in.imag[0]
 * out[2] = in.real[1]
 * out[3] = in.imag[1]
 * out[4] = in.real[2]
 * out[5] = in.imag[2]
 * out[6] = in.real[3]
 * out[7] = in.imag[3]
 */
static __inline void VC_STORE_INTERLEAVE(OMX_F32 *out, VC *in) {
  _mm_store_ps(out, _mm_unpacklo_ps(in->real, in->imag));
  _mm_store_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag));
}

/**
 * Vector complex store in interleaved format.
 * The output address need not be 16-byte aligned.
 */
static __inline void VC_STOREU_INTERLEAVE(OMX_F32 *out, VC *in) {
  _mm_storeu_ps(out, _mm_unpacklo_ps(in->real, in->imag));
  _mm_storeu_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag));
}

/* VC_ADD_X and store the result in split format. */
static __inline void VC_ADD_X_STORE_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_add_ps(a->real, b->imag));
  _mm_store_ps(out + offset, _mm_add_ps(b->real, a->imag));
}

/**
 * VC_SUB_X and store the result in reverse order.
 * The output address need not be 16-byte aligned.
 */
static __inline void VC_SUB_X_INVERSE_STOREU_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  __m128 t;
  t = _mm_sub_ps(a->real, b->imag);
  _mm_storeu_ps(out, _mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 1, 2, 3)));
  t = _mm_sub_ps(b->real, a->imag);
  _mm_storeu_ps(out + offset, _mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 1, 2, 3)));
}

/**
 * Vector complex load from interleaved format to split format,
 * storing the result into two __m128 registers.
 */
static __inline void VC_LOAD_SHUFFLE(
    __m128 *out0,
    __m128 *out1,
    const OMX_F32 *in) {
  VC temp;
  VC_LOAD_INTERLEAVE(&temp, in);
  *out0 = temp.real;
  *out1 = temp.imag;
}

/* Finish the butterfly calculation of the forward radix-4 FFT and store
 * the outputs. */
static __inline void RADIX4_FWD_BUTTERFLY_STORE(
    OMX_F32 *out0,
    OMX_F32 *out1,
    OMX_F32 *out2,
    OMX_F32 *out3,
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    OMX_INT n) {
  /* CADD out0, t0, t2 */
  VC_ADD_STORE_SPLIT(out0, t0, t2, n);

  /* CSUB out2, t0, t2 */
  VC_SUB_STORE_SPLIT(out2, t0, t2, n);

  /* CADD_SUB_X out1, t1, t3 */
  VC_ADD_SUB_X_STORE_SPLIT(out1, t1, t3, n);

  /* CSUB_ADD_X out3, t1, t3 */
  VC_SUB_ADD_X_STORE_SPLIT(out3, t1, t3, n);
}
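
/*
 * Worked form of the radix-4 combine performed by the store helper above
 * (its inverse counterpart below swaps the -i and +i terms). Writing
 * t3 = x + i*y, the outputs are, lane by lane:
 *   out0 = t0 + t2
 *   out2 = t0 - t2
 *   out1 = t1 - i*t3 = (t1.re + y) + i*(t1.im - x)
 *   out3 = t1 + i*t3 = (t1.re - y) + i*(t1.im + x)
 * The scalar sketch below (a hypothetical helper, one complex lane only)
 * restates this arithmetic.
 */
static __inline void RADIX4_COMBINE_scalar_sketch(
    OMX_F32 out_re[4], OMX_F32 out_im[4],
    OMX_F32 t0_re, OMX_F32 t0_im,
    OMX_F32 t1_re, OMX_F32 t1_im,
    OMX_F32 t2_re, OMX_F32 t2_im,
    OMX_F32 t3_re, OMX_F32 t3_im) {
  out_re[0] = t0_re + t2_re;  out_im[0] = t0_im + t2_im;  /* t0 + t2   */
  out_re[2] = t0_re - t2_re;  out_im[2] = t0_im - t2_im;  /* t0 - t2   */
  out_re[1] = t1_re + t3_im;  out_im[1] = t1_im - t3_re;  /* t1 - i*t3 */
  out_re[3] = t1_re - t3_im;  out_im[3] = t1_im + t3_re;  /* t1 + i*t3 */
}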

/* Finish the butterfly calculation of the inverse radix-4 FFT and store
 * the outputs. */
static __inline void RADIX4_INV_BUTTERFLY_STORE(
    OMX_F32 *out0,
    OMX_F32 *out1,
    OMX_F32 *out2,
    OMX_F32 *out3,
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    OMX_INT n) {
  /* CADD out0, t0, t2 */
  VC_ADD_STORE_SPLIT(out0, t0, t2, n);

  /* CSUB out2, t0, t2 */
  VC_SUB_STORE_SPLIT(out2, t0, t2, n);

  /* CSUB_ADD_X out1, t1, t3 */
  VC_SUB_ADD_X_STORE_SPLIT(out1, t1, t3, n);

  /* CADD_SUB_X out3, t1, t3 */
  VC_ADD_SUB_X_STORE_SPLIT(out3, t1, t3, n);
}

/* Radix-4 forward butterfly */
static __inline void RADIX4_FWD_BUTTERFLY(
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    VC *Tw1,
    VC *Tw2,
    VC *Tw3,
    VC *T0,
    VC *T1,
    VC *T2,
    VC *T3) {
  VC tt1, tt2, tt3;

  /* CMUL tt1, Tw1, T1 */
  VC_MUL(&tt1, Tw1, T1);

  /* CMUL tt2, Tw2, T2 */
  VC_MUL(&tt2, Tw2, T2);

  /* CMUL tt3, Tw3, T3 */
  VC_MUL(&tt3, Tw3, T3);

  /* CADD t0, T0, tt2 */
  VC_ADD(t0, T0, &tt2);

  /* CSUB t1, T0, tt2 */
  VC_SUB(t1, T0, &tt2);

  /* CADD t2, tt1, tt3 */
  VC_ADD(t2, &tt1, &tt3);

  /* CSUB t3, tt1, tt3 */
  VC_SUB(t3, &tt1, &tt3);
}

/* Radix-4 inverse butterfly (uses conjugated twiddle factors) */
static __inline void RADIX4_INV_BUTTERFLY(
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    VC *Tw1,
    VC *Tw2,
    VC *Tw3,
    VC *T0,
    VC *T1,
    VC *T2,
    VC *T3) {
  VC tt1, tt2, tt3;

  /* CMUL tt1, conj(Tw1), T1 */
  VC_CONJ_MUL(&tt1, Tw1, T1);

  /* CMUL tt2, conj(Tw2), T2 */
  VC_CONJ_MUL(&tt2, Tw2, T2);

  /* CMUL tt3, conj(Tw3), T3 */
  VC_CONJ_MUL(&tt3, Tw3, T3);

  /* CADD t0, T0, tt2 */
  VC_ADD(t0, T0, &tt2);

  /* CSUB t1, T0, tt2 */
  VC_SUB(t1, T0, &tt2);

  /* CADD t2, tt1, tt3 */
  VC_ADD(t2, &tt1, &tt3);

  /* CSUB t3, tt1, tt3 */
  VC_SUB(t3, &tt1, &tt3);
}

/* Radix-4 butterfly in the first stage, for both forward and inverse
 * transforms (the first stage needs no twiddle multiplies). */
static __inline void RADIX4_BUTTERFLY_FS(
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    VC *T0,
    VC *T1,
    VC *T2,
    VC *T3) {
  /* CADD t0, T0, T2 */
  VC_ADD(t0, T0, T2);

  /* CSUB t1, T0, T2 */
  VC_SUB(t1, T0, T2);

  /* CADD t2, T1, T3 */
  VC_ADD(t2, T1, T3);

  /* CSUB t3, T1, T3 */
  VC_SUB(t3, T1, T3);
}
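
/*
 * Minimal usage sketch (an assumption about how the helpers compose,
 * not code from the FFT kernels themselves): one first-stage radix-4
 * pass over four aligned groups of four complex values held in split
 * format, where `n` is the distance, in floats, between the real and
 * imaginary halves of each buffer.
 */
static __inline void RADIX4_FS_PASS_sketch(
    OMX_F32 *out0, OMX_F32 *out1, OMX_F32 *out2, OMX_F32 *out3,
    const OMX_F32 *in0, const OMX_F32 *in1,
    const OMX_F32 *in2, const OMX_F32 *in3,
    OMX_INT n) {
  VC T0, T1, T2, T3;
  VC t0, t1, t2, t3;

  VC_LOAD_SPLIT(&T0, in0, n);
  VC_LOAD_SPLIT(&T1, in1, n);
  VC_LOAD_SPLIT(&T2, in2, n);
  VC_LOAD_SPLIT(&T3, in3, n);

  /* First stage: no twiddle multiplies, just combine and store. */
  RADIX4_BUTTERFLY_FS(&t0, &t1, &t2, &t3, &T0, &T1, &T2, &T3);
  RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3, &t0, &t1, &t2, &t3, n);
}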

/**
 * Load 16 float elements (four SSE registers) forming a 4 x 4 matrix,
 * then transpose the matrix. This is done twice: once for the real
 * parts (at pT0..pT3) and once for the imaginary parts (at pT0 + n ..
 * pT3 + n).
 *
 *    3,  2,  1,  0           12,  8, 4, 0
 *    7,  6,  5,  4   =====>  13,  9, 5, 1
 *   11, 10,  9,  8           14, 10, 6, 2
 *   15, 14, 13, 12           15, 11, 7, 3
 */
static __inline void VC_LOAD_MATRIX_TRANSPOSE(
    VC *T0,
    VC *T1,
    VC *T2,
    VC *T3,
    const OMX_F32 *pT0,
    const OMX_F32 *pT1,
    const OMX_F32 *pT2,
    const OMX_F32 *pT3,
    OMX_INT n) {
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;
  __m128 xmm6;
  __m128 xmm7;

  xmm0 = _mm_load_ps(pT0);
  xmm1 = _mm_load_ps(pT1);
  xmm2 = _mm_load_ps(pT2);
  xmm3 = _mm_load_ps(pT3);

  /* Matrix transpose of the real parts */
  xmm4 = _mm_unpacklo_ps(xmm0, xmm1);
  xmm5 = _mm_unpackhi_ps(xmm0, xmm1);
  xmm6 = _mm_unpacklo_ps(xmm2, xmm3);
  xmm7 = _mm_unpackhi_ps(xmm2, xmm3);
  T0->real = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
  T1->real = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
  T2->real = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
  T3->real = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));

  xmm0 = _mm_load_ps(pT0 + n);
  xmm1 = _mm_load_ps(pT1 + n);
  xmm2 = _mm_load_ps(pT2 + n);
  xmm3 = _mm_load_ps(pT3 + n);

  /* Matrix transpose of the imaginary parts */
  xmm4 = _mm_unpacklo_ps(xmm0, xmm1);
  xmm5 = _mm_unpackhi_ps(xmm0, xmm1);
  xmm6 = _mm_unpacklo_ps(xmm2, xmm3);
  xmm7 = _mm_unpackhi_ps(xmm2, xmm3);
  T0->imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
  T1->imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
  T2->imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
  T3->imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));
}
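
/*
 * Hypothetical usage sketch (not part of the original header): gather a
 * transposed 4 x 4 block of split-format complex values from four
 * consecutive rows of a larger buffer. `row_stride` is the distance, in
 * floats, between rows and `n` is the distance between the real and
 * imaginary halves; all addresses are assumed 16-byte aligned.
 */
static __inline void LOAD_TRANSPOSED_BLOCK_sketch(
    VC *T0, VC *T1, VC *T2, VC *T3,
    const OMX_F32 *base,
    OMX_INT row_stride,
    OMX_INT n) {
  VC_LOAD_MATRIX_TRANSPOSE(T0, T1, T2, T3,
                           base,
                           base + row_stride,
                           base + 2 * row_stride,
                           base + 3 * row_stride,
                           n);
}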