/*
 *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 *
 */

#include <emmintrin.h>
#include <assert.h>

/*
 * OMX_F32 and OMX_INT are not defined in this file; they are assumed to
 * come from the OpenMAX DL type definitions (omxtypes.h).
 */
#include "dl/api/omxtypes.h"

/**
 * Two data formats are used internally by the FFT routines.
 *
 * The interface to the main external FFT routines uses interleaved
 * complex values, where each real part is followed by its imaginary
 * part.
 *
 * The other format is the split format, where a complex vector of real
 * and imaginary values is split such that all of the real values are
 * placed in the first half of the vector and the corresponding
 * imaginary values are placed in the second half, in the same order.
 * The conversion between interleaved complex values and split format is
 * transparent to the external FFT interface.
 *
 * VComplex uses the split format.
 */
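
/**
 * For example, the four complex values z[k] = re[k] + j*im[k], k = 0..3,
 * are laid out as
 *   interleaved: re0, im0, re1, im1, re2, im2, re3, im3
 *   split:       re0, re1, re2, re3, ..., im0, im1, im2, im3
 * where the imaginary half starts at a fixed offset from the start of
 * the real half.
 */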

/** VComplex holds 4 complex float elements, with the real parts stored
 * in real and the corresponding imaginary parts in imag.
 */
typedef struct VComplex {
  __m128 real;
  __m128 imag;
} VC;

/* out = a * b */
static __inline void VC_MUL(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(_mm_mul_ps(a->real, b->real),
      _mm_mul_ps(a->imag, b->imag));
  out->imag = _mm_add_ps(_mm_mul_ps(a->real, b->imag),
      _mm_mul_ps(a->imag, b->real));
}

/* out = conj(a) * b */
static __inline void VC_CONJ_MUL(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(_mm_mul_ps(a->real, b->real),
      _mm_mul_ps(a->imag, b->imag));
  out->imag = _mm_sub_ps(_mm_mul_ps(a->real, b->imag),
      _mm_mul_ps(a->imag, b->real));
}
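
/*
 * Per SIMD lane, with a = ar + j*ai and b = br + j*bi, the two products
 * above expand to
 *   a * b       = (ar*br - ai*bi) + j*(ar*bi + ai*br)
 *   conj(a) * b = (ar*br + ai*bi) + j*(ar*bi - ai*br)
 * which is exactly what the add/sub/mul combinations compute.
 */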

/* Scale a complex vector by a real factor. */
static __inline void VC_MUL_F(VC *out, VC *a, __m128 factor) {
  out->real = _mm_mul_ps(factor, a->real);
  out->imag = _mm_mul_ps(factor, a->imag);
}

/* out = a + b */
static __inline void VC_ADD(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(a->real, b->real);
  out->imag = _mm_add_ps(a->imag, b->imag);
}

/**
 * out.real = a.real + b.imag
 * out.imag = a.imag + b.real
 */
static __inline void VC_ADD_X(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(a->real, b->imag);
  out->imag = _mm_add_ps(b->real, a->imag);
}

/* VC_ADD and store the result in split format. */
static __inline void VC_ADD_STORE_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_add_ps(a->real, b->real));
  _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->imag));
}

/* out = a - b */
static __inline void VC_SUB(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(a->real, b->real);
  out->imag = _mm_sub_ps(a->imag, b->imag);
}

/**
 * out.real = a.real - b.imag
 * out.imag = b.real - a.imag
 */
static __inline void VC_SUB_X(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(a->real, b->imag);
  out->imag = _mm_sub_ps(b->real, a->imag);
}

/* VC_SUB and store the result in split format. */
static __inline void VC_SUB_STORE_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_sub_ps(a->real, b->real));
  _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->imag));
}

/**
 * out.real = a.real + b.real
 * out.imag = a.imag - b.imag
 */
static __inline void VC_ADD_SUB(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(a->real, b->real);
  out->imag = _mm_sub_ps(a->imag, b->imag);
}

/**
 * out.real = a.real + b.imag
 * out.imag = a.imag - b.real
 */
static __inline void VC_ADD_SUB_X(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(a->real, b->imag);
  out->imag = _mm_sub_ps(a->imag, b->real);
}

/* VC_ADD_SUB_X and store the result in split format. */
static __inline void VC_ADD_SUB_X_STORE_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_add_ps(a->real, b->imag));
  _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->real));
}

/**
 * out.real = a.real - b.real
 * out.imag = a.imag + b.imag
 */
static __inline void VC_SUB_ADD(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(a->real, b->real);
  out->imag = _mm_add_ps(a->imag, b->imag);
}

/**
 * out.real = a.real - b.imag
 * out.imag = a.imag + b.real
 */
static __inline void VC_SUB_ADD_X(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(a->real, b->imag);
  out->imag = _mm_add_ps(a->imag, b->real);
}

/* VC_SUB_ADD_X and store the result in split format. */
static __inline void VC_SUB_ADD_X_STORE_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_sub_ps(a->real, b->imag));
  _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->real));
}

/**
 * out[0]      = in.real
 * out[offset] = in.imag
 */
static __inline void VC_STORE_SPLIT(
    OMX_F32 *out,
    VC *in,
    OMX_INT offset) {
  _mm_store_ps(out, in->real);
  _mm_store_ps(out + offset, in->imag);
}

/**
 * out.real = in[0];
 * out.imag = in[offset];
 */
static __inline void VC_LOAD_SPLIT(
    VC *out,
    const OMX_F32 *in,
    OMX_INT offset) {
  out->real = _mm_load_ps(in);
  out->imag = _mm_load_ps(in + offset);
}

/* Vector Complex Unpack from Split format to Interleaved format. */
static __inline void VC_UNPACK(VC *out, VC *in) {
  out->real = _mm_unpacklo_ps(in->real, in->imag);
  out->imag = _mm_unpackhi_ps(in->real, in->imag);
}

/**
 * Vector Complex load from interleaved complex array.
 * out.real = [in[0].real, in[1].real, in[2].real, in[3].real]
 * out.imag = [in[0].imag, in[1].imag, in[2].imag, in[3].imag]
 */
static __inline void VC_LOAD_INTERLEAVE(VC *out, const OMX_F32 *in) {
  __m128 temp0 = _mm_load_ps(in);
  __m128 temp1 = _mm_load_ps(in + 4);
  out->real = _mm_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0));
  out->imag = _mm_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1));
}
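
/*
 * Illustrative sketch (not one of the original primitives): convert four
 * interleaved complex values to split format using the helpers above.
 * Both pointers are assumed to be 16-byte aligned, and out must have
 * room for offset + 4 floats.
 */
static __inline void VC_EXAMPLE_DEINTERLEAVE(
    OMX_F32 *out,
    const OMX_F32 *in,
    OMX_INT offset) {
  VC v;
  VC_LOAD_INTERLEAVE(&v, in);       /* v.real = reals, v.imag = imags */
  VC_STORE_SPLIT(out, &v, offset);  /* out[0..3] = reals, out[offset..] = imags */
}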

/**
 * Vector Complex Load with Split format.
 * The input address need not be 16-byte aligned.
 */
static __inline void VC_LOADU_SPLIT(
    VC *out,
    const OMX_F32 *in,
    OMX_INT offset) {
  out->real = _mm_loadu_ps(in);
  out->imag = _mm_loadu_ps(in + offset);
}

/* Reverse the order of the Complex Vector. */
static __inline void VC_REVERSE(VC *v) {
  v->real = _mm_shuffle_ps(v->real, v->real, _MM_SHUFFLE(0, 1, 2, 3));
  v->imag = _mm_shuffle_ps(v->imag, v->imag, _MM_SHUFFLE(0, 1, 2, 3));
}

/*
 * Vector Complex store to interleaved complex array
 * out[0] = in.real[0]
 * out[1] = in.imag[0]
 * out[2] = in.real[1]
 * out[3] = in.imag[1]
 * out[4] = in.real[2]
 * out[5] = in.imag[2]
 * out[6] = in.real[3]
 * out[7] = in.imag[3]
 */
static __inline void VC_STORE_INTERLEAVE(OMX_F32 *out, VC *in) {
  _mm_store_ps(out, _mm_unpacklo_ps(in->real, in->imag));
  _mm_store_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag));
}

/**
 * Vector Complex Store with Interleaved format.
 * The output address need not be 16-byte aligned.
 */
static __inline void VC_STOREU_INTERLEAVE(OMX_F32 *out, VC *in) {
  _mm_storeu_ps(out, _mm_unpacklo_ps(in->real, in->imag));
  _mm_storeu_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag));
}

/* VC_ADD_X and store the result in split format. */
static __inline void VC_ADD_X_STORE_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_add_ps(a->real, b->imag));
  _mm_store_ps(out + offset, _mm_add_ps(b->real, a->imag));
}

/**
 * VC_SUB_X and store the result in reverse order.
 * The output address need not be 16-byte aligned.
 */
static __inline void VC_SUB_X_INVERSE_STOREU_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  __m128 t;
  t = _mm_sub_ps(a->real, b->imag);
  _mm_storeu_ps(out, _mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 1, 2, 3)));
  t = _mm_sub_ps(b->real, a->imag);
  _mm_storeu_ps(out + offset, _mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 1, 2, 3)));
}
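
/*
 * Element-wise, the reversed store above yields
 *   out[k]          = a.real[3 - k] - b.imag[3 - k]
 *   out[offset + k] = b.real[3 - k] - a.imag[3 - k]
 * for k = 0..3.
 */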

/**
 * Vector Complex Load from Interleaved format to Split format.
 * Store the result into two __m128 registers.
 */
static __inline void VC_LOAD_SHUFFLE(
    __m128 *out0,
    __m128 *out1,
    const OMX_F32 *in) {
  VC temp;
  VC_LOAD_INTERLEAVE(&temp, in);
  *out0 = temp.real;
  *out1 = temp.imag;
}

/* Finish the forward radix-4 butterfly calculation and store the outputs. */
static __inline void RADIX4_FWD_BUTTERFLY_STORE(
    OMX_F32 *out0,
    OMX_F32 *out1,
    OMX_F32 *out2,
    OMX_F32 *out3,
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    OMX_INT n) {
  /* CADD out0, t0, t2 */
  VC_ADD_STORE_SPLIT(out0, t0, t2, n);

  /* CSUB out2, t0, t2 */
  VC_SUB_STORE_SPLIT(out2, t0, t2, n);

  /* CADD_SUB_X out1, t1, t3 */
  VC_ADD_SUB_X_STORE_SPLIT(out1, t1, t3, n);

  /* CSUB_ADD_X out3, t1, t3 */
  VC_SUB_ADD_X_STORE_SPLIT(out3, t1, t3, n);
}

/* Finish the inverse radix-4 butterfly calculation and store the outputs. */
static __inline void RADIX4_INV_BUTTERFLY_STORE(
    OMX_F32 *out0,
    OMX_F32 *out1,
    OMX_F32 *out2,
    OMX_F32 *out3,
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    OMX_INT n) {
  /* CADD out0, t0, t2 */
  VC_ADD_STORE_SPLIT(out0, t0, t2, n);

  /* CSUB out2, t0, t2 */
  VC_SUB_STORE_SPLIT(out2, t0, t2, n);

  /* CSUB_ADD_X out1, t1, t3 */
  VC_SUB_ADD_X_STORE_SPLIT(out1, t1, t3, n);

  /* CADD_SUB_X out3, t1, t3 */
  VC_ADD_SUB_X_STORE_SPLIT(out3, t1, t3, n);
}
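
/*
 * The forward and inverse store helpers above differ only in the cross
 * terms: the forward version writes VC_ADD_SUB_X(t1, t3) to out1 and
 * VC_SUB_ADD_X(t1, t3) to out3, while the inverse version swaps the two.
 * In complex terms this is combining t1 with -j*t3 versus +j*t3.
 */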

/* Radix4 forward butterfly */
static __inline void RADIX4_FWD_BUTTERFLY(
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    VC *Tw1,
    VC *Tw2,
    VC *Tw3,
    VC *T0,
    VC *T1,
    VC *T2,
    VC *T3) {
  VC tt1, tt2, tt3;

  /* CMUL tt1, Tw1, T1 */
  VC_MUL(&tt1, Tw1, T1);

  /* CMUL tt2, Tw2, T2 */
  VC_MUL(&tt2, Tw2, T2);

  /* CMUL tt3, Tw3, T3 */
  VC_MUL(&tt3, Tw3, T3);

  /* CADD t0, T0, tt2 */
  VC_ADD(t0, T0, &tt2);

  /* CSUB t1, T0, tt2 */
  VC_SUB(t1, T0, &tt2);

  /* CADD t2, tt1, tt3 */
  VC_ADD(t2, &tt1, &tt3);

  /* CSUB t3, tt1, tt3 */
  VC_SUB(t3, &tt1, &tt3);
}

/* Radix4 inverse butterfly */
static __inline void RADIX4_INV_BUTTERFLY(
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    VC *Tw1,
    VC *Tw2,
    VC *Tw3,
    VC *T0,
    VC *T1,
    VC *T2,
    VC *T3) {
  VC tt1, tt2, tt3;

  /* CMUL tt1, conj(Tw1), T1 */
  VC_CONJ_MUL(&tt1, Tw1, T1);

  /* CMUL tt2, conj(Tw2), T2 */
  VC_CONJ_MUL(&tt2, Tw2, T2);

  /* CMUL tt3, conj(Tw3), T3 */
  VC_CONJ_MUL(&tt3, Tw3, T3);

  /* CADD t0, T0, tt2 */
  VC_ADD(t0, T0, &tt2);

  /* CSUB t1, T0, tt2 */
  VC_SUB(t1, T0, &tt2);

  /* CADD t2, tt1, tt3 */
  VC_ADD(t2, &tt1, &tt3);

  /* CSUB t3, tt1, tt3 */
  VC_SUB(t3, &tt1, &tt3);
}

/* Radix4 butterfly in first stage for both forward and inverse */
static __inline void RADIX4_BUTTERFLY_FS(
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    VC *T0,
    VC *T1,
    VC *T2,
    VC *T3) {
  /* CADD t0, T0, T2 */
  VC_ADD(t0, T0, T2);

  /* CSUB t1, T0, T2 */
  VC_SUB(t1, T0, T2);

  /* CADD t2, T1, T3 */
  VC_ADD(t2, T1, T3);

  /* CSUB t3, T1, T3 */
  VC_SUB(t3, T1, T3);
}
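
/*
 * Illustrative sketch (assumed usage, not taken from this file): one
 * forward first-stage radix-4 butterfly over four split-format inputs,
 * writing four split-format outputs. All pointers are assumed to be
 * 16-byte aligned, and n is the offset between the real and imaginary
 * halves of each split-format buffer.
 */
static __inline void RADIX4_FWD_FS_EXAMPLE(
    OMX_F32 *out0, OMX_F32 *out1, OMX_F32 *out2, OMX_F32 *out3,
    const OMX_F32 *in0, const OMX_F32 *in1,
    const OMX_F32 *in2, const OMX_F32 *in3,
    OMX_INT n) {
  VC T0, T1, T2, T3;
  VC t0, t1, t2, t3;

  VC_LOAD_SPLIT(&T0, in0, n);
  VC_LOAD_SPLIT(&T1, in1, n);
  VC_LOAD_SPLIT(&T2, in2, n);
  VC_LOAD_SPLIT(&T3, in3, n);

  /* t0 = T0 + T2, t1 = T0 - T2, t2 = T1 + T3, t3 = T1 - T3 */
  RADIX4_BUTTERFLY_FS(&t0, &t1, &t2, &t3, &T0, &T1, &T2, &T3);

  /* Combine the intermediate terms and store each output in split format. */
  RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3, &t0, &t1, &t2, &t3, n);
}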

/**
 * Load 16 float elements (four SSE registers), forming a 4 x 4 matrix,
 * and transpose the matrix:
 * 3,  2,  1,  0                  12, 8,  4,  0
 * 7,  6,  5,  4        =====>    13, 9,  5,  1
 * 11, 10, 9,  8                  14, 10, 6,  2
 * 15, 14, 13, 12                 15, 11, 7,  3
 * This is done twice: once for the real parts loaded from pT0..pT3 and
 * once for the imaginary parts loaded from pT0 + n..pT3 + n.
 */
static __inline void VC_LOAD_MATRIX_TRANSPOSE(
    VC *T0,
    VC *T1,
    VC *T2,
    VC *T3,
    const OMX_F32 *pT0,
    const OMX_F32 *pT1,
    const OMX_F32 *pT2,
    const OMX_F32 *pT3,
    OMX_INT n) {
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;
  __m128 xmm6;
  __m128 xmm7;

  xmm0 = _mm_load_ps(pT0);
  xmm1 = _mm_load_ps(pT1);
  xmm2 = _mm_load_ps(pT2);
  xmm3 = _mm_load_ps(pT3);

  /* Matrix transpose of the real parts. */
  xmm4 = _mm_unpacklo_ps(xmm0, xmm1);
  xmm5 = _mm_unpackhi_ps(xmm0, xmm1);
  xmm6 = _mm_unpacklo_ps(xmm2, xmm3);
  xmm7 = _mm_unpackhi_ps(xmm2, xmm3);
  T0->real = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
  T1->real = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
  T2->real = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
  T3->real = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));

  xmm0 = _mm_load_ps(pT0 + n);
  xmm1 = _mm_load_ps(pT1 + n);
  xmm2 = _mm_load_ps(pT2 + n);
  xmm3 = _mm_load_ps(pT3 + n);

  /* Matrix transpose of the imaginary parts. */
  xmm4 = _mm_unpacklo_ps(xmm0, xmm1);
  xmm5 = _mm_unpackhi_ps(xmm0, xmm1);
  xmm6 = _mm_unpacklo_ps(xmm2, xmm3);
  xmm7 = _mm_unpackhi_ps(xmm2, xmm3);
  T0->imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
  T1->imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
  T2->imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
  T3->imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));
}
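
/*
 * In element terms, after the transpose:
 *   T0.real = {pT0[0], pT1[0], pT2[0], pT3[0]}
 *   T1.real = {pT0[1], pT1[1], pT2[1], pT3[1]}
 *   T2.real = {pT0[2], pT1[2], pT2[2], pT3[2]}
 *   T3.real = {pT0[3], pT1[3], pT2[3], pT3[3]}
 * and the imag fields follow the same pattern with an extra offset of n,
 * e.g. T0.imag = {pT0[n], pT1[n], pT2[n], pT3[n]}.
 */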
    489