Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  *
     10  */
     11 
     12 #include <stdbool.h>
     13 #include <stdint.h>
     14 
     15 #include "dl/api/omxtypes.h"
     16 #include "dl/sp/api/omxSP.h"
     17 #include "dl/sp/api/x86SP.h"
     18 #include "dl/sp/src/x86/x86SP_SSE_Math.h"
     19 
     20 extern OMX_F32* x86SP_F32_radix2_kernel_OutOfPlace(
     21     const OMX_F32 *src,
     22     OMX_F32 *buf1,
     23     OMX_F32 *buf2,
     24     const OMX_F32 *twiddle,
     25     OMX_INT n,
     26     bool forward_fft);
     27 
     28 extern OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace_sse(
     29     const OMX_F32 *src,
     30     OMX_F32 *buf1,
     31     OMX_F32 *buf2,
     32     const OMX_F32 *twiddle,
     33     OMX_INT n,
     34     bool forward_fft);
     35 
     36 /**
     37  * A two-for-one algorithm is used here to do the real ifft:
     38  *
     39  * Input X[k], (k = 0, ..., N - 1)
     40  * Output x[n] = IDFT(N, k){X}
     41  * X' is complex conjugate of X
     42  * A[k] = (X[k] + X'[N/2 - k]) / 2
     43  * B[k] = (X[k] - X'[N/2 - k]) / 2 * W[k], (W = exp(j*2*PI*k/N);
     44  *                                          k = 0, ..., N/2 - 1)
     45  * Z[k] = A[k] + j * B[k], (k = 0, ..., N/2 - 1)
     46  * z[n] = IDFT(N/2, k){Z}
     47  * x[2n] = Re(z[n]), (n = 0, ..., N/2 - 1)
     48  * x[2n + 1] = Im(z[n]), (n = 0, ..., N/2 - 1)
     49  */
     50 
     51 /**
     52  * This function is the first permutation of two-for-one IFFT algorithm.
     53  * We move the division by 2 to the last step in the implementation, so:
     54  * A[k] = (X[k] + X'[N/2 - k])
     55  * B[k] = (X[k] - X'[N/2 - k]) * W[k], (k = 0, ..., N/2 - 1)
     56  * Z[k] = (A[k] + j * B[k]) / 2, (k = 0, ..., N/2 - 1)
     57  */
     58 static void RevbinPermuteInv(const OMX_F32 *in,
     59                              OMX_F32 *out,
     60                              const OMX_F32 *twiddle,
     61                              OMX_INT n) {
     62   OMX_INT i;
     63   OMX_INT j;
     64   OMX_INT i_by_2;
     65   OMX_INT j_by_2;
     66   OMX_INT n_by_2 = n >> 1;
     67   OMX_INT n_by_4 = n >> 2;
     68 
     69   OMX_FC32 big_a;
     70   OMX_FC32 big_b;
     71   OMX_FC32 temp;
     72   const OMX_F32 *tw;
     73 
     74   for (i = 2, j = n - 2; i < n_by_2; i += 2, j -= 2) {
     75     // A[k] = (X[k] + X'[N/2 - k])
     76     big_a.Re = in[i] + in[j];
     77     big_a.Im = in[i + 1] - in[j + 1];
     78 
     79     // temp = (X[k] - X'[N/2 - k])
     80     temp.Re = in[i] - in[j];
     81     temp.Im = in[i + 1] + in[j + 1];
     82 
     83     i_by_2 = i >> 1;
     84     j_by_2 = j >> 1;
     85 
     86     // W[k]
     87     tw = twiddle + i_by_2;
     88 
     89     // B[k] = (X[k] - X'[N/2 - k]) * W[k]
     90     big_b.Re =  temp.Re * tw[0] + temp.Im * tw[n];
     91     big_b.Im =  temp.Re * tw[n] - temp.Im * tw[0];
     92 
     93     // Convert split format to interleaved format.
     94     // Z[k] = (A[k] + j * B[k]) (k = 0, ..., N/2 - 1)
     95     // The scaling of 1/2 will be merged into to the scaling in
     96     // the last step before the output in omxSP_FFTInv_CCSToR_F32_Sfs.
     97     out[i_by_2] = big_a.Re + big_b.Im;
     98     out[i_by_2 + n_by_2] = big_b.Re + big_a.Im;
     99     out[j_by_2] = big_a.Re - big_b.Im;
    100     out[j_by_2 + n_by_2] = big_b.Re - big_a.Im;
    101   }
    102 
    103   // The n_by_2 complex point
    104   out[n_by_4] = 2.0f * in[n_by_2];
    105   out[n_by_4 + n_by_2] = -2.0f * in[n_by_2 + 1];
    106 
    107   // The first complex point
    108   out[0] = in[0] + in[n];
    109   out[n_by_2] = in[0] - in[n];
    110 }
    111 
    112 // Sse version of RevbinPermuteInv function.
    113 static void RevbinPermuteInvSse(const OMX_F32 *in,
    114                                 OMX_F32 *out,
    115                                 const OMX_F32 *twiddle,
    116                                 OMX_INT n) {
    117   OMX_INT i;
    118   OMX_INT j;
    119   OMX_INT n_by_2 = n >> 1;
    120   OMX_INT n_by_4 = n >> 2;
    121   const OMX_F32 *tw;
    122   const OMX_F32 *pi;
    123   const OMX_F32 *pj;
    124 
    125   VC v_i;
    126   VC v_j;
    127   VC v_big_a;
    128   VC v_big_b;
    129   VC v_temp;
    130   VC v_tw;
    131 
    132   for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) {
    133     pi = in + (i << 1);
    134     pj = in + (j << 1);
    135     VC_LOAD_INTERLEAVE(&v_i, pi);
    136 
    137     v_j.real = _mm_set_ps(pj[0], pj[2], pj[4], pj[6]);
    138     v_j.imag = _mm_set_ps(pj[1], pj[3], pj[5], pj[7]);
    139 
    140     // A[k] = (X[k] + X'[N/2 - k])
    141     VC_ADD_SUB(&v_big_a, &v_i, &v_j);
    142 
    143     // temp = (X[k] - X'[N/2 - k])
    144     VC_SUB_ADD(&v_temp, &v_i, &v_j);
    145 
    146     // W[k]
    147     tw = twiddle + i;
    148     VC_LOAD_SPLIT(&v_tw, tw, n);
    149 
    150     // B[k] = (X[k] - X'[N/2 - k]) * W[k]
    151     VC_CONJ_MUL(&v_big_b, &v_temp, &v_tw);
    152 
    153     // Convert split format to interleaved format.
    154     // Z[k] = (A[k] + j * B[k]) (k = 0, ..., N/2 - 1)
    155     // The scaling of 1/2 will be merged into to the scaling in
    156     // the last step before the output in omxSP_FFTInv_CCSToR_F32_Sfs.
    157     VC_ADD_X_STORE_SPLIT((out + i), &v_big_a, &v_big_b, n_by_2);
    158 
    159     VC_SUB_X_INVERSE_STOREU_SPLIT((out + j), &v_big_a, &v_big_b, n_by_2);
    160   }
    161 
    162   // The n_by_2 complex point
    163   out[n_by_4] = 2.0f * in[n_by_2];
    164   out[n_by_4 + n_by_2] = -2.0f * in[n_by_2 + 1];
    165 
    166   // The first complex point
    167   out[0] = in[0] + in[n];
    168   out[n_by_2] = in[0] - in[n];
    169 }
    170 
    171 OMXResult omxSP_FFTInv_CCSToR_F32_Sfs(const OMX_F32 *pSrc, OMX_F32 *pDst,
    172                                       const OMXFFTSpec_R_F32 *pFFTSpec) {
    173   OMX_INT n;
    174   OMX_INT n_by_2;
    175   OMX_INT n_by_4;
    176   OMX_INT i;
    177   const OMX_F32 *twiddle;
    178   OMX_F32 *buf;
    179   OMX_F32 *in = (OMX_F32*) pSrc;
    180 
    181   const X86FFTSpec_R_FC32 *pFFTStruct = (const X86FFTSpec_R_FC32*) pFFTSpec;
    182 
    183   // Input must be 32 byte aligned
    184   if (!pSrc || !pDst || (const uintptr_t)pSrc & 31 || (uintptr_t)pDst & 31)
    185     return OMX_Sts_BadArgErr;
    186 
    187   n = pFFTStruct->N;
    188 
    189   // This is to handle the case of order == 1.
    190   if (n == 2) {
    191     pDst[0] = (pSrc[0] + pSrc[2]) / 2;
    192     pDst[1] = (pSrc[0] - pSrc[2]) / 2;
    193     return OMX_Sts_NoErr;
    194   }
    195 
    196   n_by_2 = n >> 1;
    197   n_by_4 = n >> 2;
    198   buf = pFFTStruct->pBuf1;
    199 
    200   twiddle = pFFTStruct->pTwiddle;
    201 
    202   if (n < 8)
    203     RevbinPermuteInv(in, buf, twiddle, n);
    204   else
    205     RevbinPermuteInvSse(in, buf, twiddle, n);
    206 
    207   if (n_by_2 < 16) {
    208     buf = x86SP_F32_radix2_kernel_OutOfPlace(
    209         buf,
    210         pFFTStruct->pBuf2,
    211         buf,
    212         twiddle,
    213         n_by_2,
    214         0);
    215   } else {
    216     buf = x86SP_F32_radix4_kernel_OutOfPlace_sse(
    217         buf,
    218         pFFTStruct->pBuf2,
    219         buf,
    220         twiddle,
    221         n_by_2,
    222         0);
    223   }
    224 
    225   // Scale the result by 1/n.
    226   // It contains a scaling factor of 1/2 in
    227   // RevbinPermuteInv/RevbinPermuteInvSse.
    228   OMX_F32 factor = 1.0f / n;
    229 
    230   if (n < 8) {
    231     for (i = 0; i < n_by_2; i++) {
    232       pDst[i << 1] = buf[i] * factor;
    233       pDst[(i << 1) + 1] = buf[i + n_by_2] * factor;
    234     }
    235   } else {
    236     OMX_F32 *base;
    237     OMX_F32 *dst;
    238     VC temp0;
    239     VC temp1;
    240     __m128 mFactor = _mm_load1_ps(&factor);
    241 
    242     // Two things are done in this loop:
    243     // 1 Get the result scaled; 2 Change the format from split to interleaved.
    244     for (i = 0; i < n_by_2; i += 4) {
    245       base = buf + i;
    246       dst = pDst + (i << 1);
    247       VC_LOAD_SPLIT(&temp0, base, n_by_2);
    248       VC_MUL_F(&temp1, &temp0, mFactor);
    249       VC_STORE_INTERLEAVE(dst, &temp1);
    250     }
    251   }
    252 
    253   return OMX_Sts_NoErr;
    254 }
    255