Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  *
     10  */
     11 
     12 #include <stdbool.h>
     13 #include <stdint.h>
     14 
     15 #include "dl/api/omxtypes.h"
     16 #include "dl/sp/api/omxSP.h"
     17 #include "dl/sp/api/x86SP.h"
     18 #include "dl/sp/src/x86/x86SP_SSE_Math.h"
     19 
     20 extern OMX_F32* x86SP_F32_radix2_kernel_OutOfPlace(
     21     const OMX_F32 *src,
     22     OMX_F32 *buf1,
     23     OMX_F32 *buf2,
     24     const OMX_F32 *twiddle,
     25     OMX_INT n,
     26     bool forward_fft);
     27 
     28 extern OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace_sse(
     29     const OMX_F32 *src,
     30     OMX_F32 *buf1,
     31     OMX_F32 *buf2,
     32     const OMX_F32 *twiddle,
     33     OMX_INT n,
     34     bool forward_fft);
     35 
     36 /**
     37  * A two-for-one algorithm is used here to do the real fft:
     38  *
     39  * Input x[n], (n = 0, ..., N - 1)
     40  * Output X[k] = DFT(N, k){x}
     41  * a[n] = x[2n], (n = 0, ..., N/2 - 1)
     42  * b[n] = x[2n + 1], (n = 0, ..., N/2 - 1)
     43  * z[n] = a[n] + j * b[n]
     44  * Z[k] = DFT(N/2, k){z}
     45  * Z' is the complex conjugate of Z
     46  * A[k] = (Z[k] + Z'[N/2 - k]) / 2
     47  * B[k] = -j * (Z[k] - Z'[N/2 - k]) / 2
     48  * X[k] = A[k] + B[k] * W[k], (W = exp(-j*2*PI*k/N); k = 0, ..., N/2 - 1)
     49  * X[k] = A[k] - B[k], (k = N/2)
     50  * X' is complex conjugate of X
     51  * X[k] = X'[N - k], (k = N/2 + 1, ..., N - 1)
     52  */
     53 
     54 /**
     55  * This function is the last permutation of two-for-one FFT algorithm.
     56  * We move the division by 2 to the last step in the implementation, so:
     57  * A[k] = (Z[k] + Z'[N/2 - k])
     58  * B[k] = -j * (Z[k] - Z'[N/2 - k])
     59  * X[k] = (A[k] + B[k] * W[k]) / 2, (k = 0, ..., N/2 - 1)
     60  * X[k] = (A[k] - B[k]), (k = N/2)
     61  * X[k] = X'[N - k], (k = N/2 + 1, ..., N - 1)
     62  */
     63 static void RevbinPermuteFwd(
     64     const OMX_F32 *in,
     65     OMX_F32 *out,
     66     const OMX_F32 *twiddle,
     67     OMX_INT n) {
     68   OMX_INT i;
     69   OMX_INT j;
     70   OMX_INT n_by_2 = n >> 1;
     71   OMX_INT n_by_4 = n >> 2;
     72 
     73   OMX_FC32 big_a;
     74   OMX_FC32 big_b;
     75   OMX_FC32 temp;
     76   const OMX_F32 *tw;
     77 
     78   for (i = 1, j = n_by_2 - 1; i < n_by_4; i++, j--) {
     79     // A[k] = (Z[k] + Z'[N/2 - k])
     80     big_a.Re = in[i] + in[j];
     81     big_a.Im = in[j + n_by_2] - in[i + n_by_2];
     82 
     83     // B[k] = -j * (Z[k] - Z'[N/2 - k])
     84     big_b.Re = in[j] - in[i];
     85     big_b.Im = in[j + n_by_2] + in[i + n_by_2];
     86 
     87     // W[k]
     88     tw = twiddle + i;
     89 
     90     // temp = B[k] * W[k]
     91     temp.Re =  big_b.Re * tw[0] + big_b.Im * tw[n];
     92     temp.Im =  big_b.Re * tw[n] - big_b.Im * tw[0];
     93 
     94     // Convert split format to interleaved format.
     95     // X[k] = (A[k] + B[k] * W[k]) / 2, (k = 0, ..., N/2 - 1)
     96     out[i << 1] = 0.5f * (big_a.Re - temp.Im);
     97     out[(i << 1) + 1] = 0.5f * (temp.Re - big_a.Im);
     98     // X[k] = X'[N - k] (k = N/2 + 1, ..., N - 1)
     99     out[j << 1] = 0.5f * (big_a.Re + temp.Im);
    100     out[(j << 1) + 1] = 0.5f * (temp.Re + big_a.Im);
    101   }
    102 
    103   // X[k] = A[k] - B[k] (k = N/2)
    104   out[n_by_2] = in[n_by_4];
    105   out[n_by_2 + 1] = -in[n_by_4 + n_by_2];
    106 
    107   out[0] = in[0] + in[n_by_2];
    108   out[1] = 0;
    109   out[n] = in[0] - in[n_by_2];
    110   out[n + 1] = 0;
    111 }
    112 
    113 // Sse version of RevbinPermuteFwd function.
    114 static void RevbinPermuteFwdSse(
    115     const OMX_F32 *in,
    116     OMX_F32 *out,
    117     const OMX_F32 *twiddle,
    118     OMX_INT n) {
    119   OMX_INT i;
    120   OMX_INT j;
    121   OMX_INT n_by_2 = n >> 1;
    122   OMX_INT n_by_4 = n >> 2;
    123 
    124   VC v_i;
    125   VC v_j;
    126   VC v_big_a;
    127   VC v_big_b;
    128   VC v_temp;
    129   VC v_x0;
    130   VC v_x1;
    131   VC v_tw;
    132 
    133   __m128 factor = _mm_set1_ps(0.5f);
    134 
    135   for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) {
    136     VC_LOAD_SPLIT(&v_i, (in + i), n_by_2);
    137 
    138     VC_LOADU_SPLIT(&v_j, (in + j), n_by_2);
    139     VC_REVERSE(&v_j);
    140 
    141     // A[k] = (Z[k] + Z'[N/2 - k])
    142     VC_ADD_SUB(&v_big_a, &v_j, &v_i);
    143 
    144     // B[k] = -j * (Z[k] - Z'[N/2 - k])
    145     VC_SUB_ADD(&v_big_b, &v_j, &v_i);
    146 
    147     // W[k]
    148     VC_LOAD_SPLIT(&v_tw, (twiddle + i), n);
    149 
    150     // temp = B[k] * W[k]
    151     VC_CONJ_MUL(&v_temp, &v_big_b, &v_tw);
    152 
    153     VC_SUB_X(&v_x0, &v_big_a, &v_temp);
    154     VC_ADD_X(&v_x1, &v_big_a, &v_temp);
    155 
    156     VC_MUL_F(&v_x0, &v_x0, factor);
    157     VC_MUL_F(&v_x1, &v_x1, factor);
    158 
    159     // X[k] = A[k] + B[k] * W[k] (k = 0, ..., N/2 - 1)
    160     VC_STORE_INTERLEAVE((out + (i << 1)), &v_x0);
    161 
    162     // X[k] = X'[N - k] (k = N/2 + 1, ..., N - 1)
    163     VC_REVERSE(&v_x1);
    164     VC_STOREU_INTERLEAVE((out + (j << 1)), &v_x1);
    165   }
    166 
    167   out[n_by_2] = in[n_by_4];
    168   out[n_by_2 + 1] = -in[n_by_4 + n_by_2];
    169 
    170   out[0] = in[0] + in[n_by_2];
    171   out[1] = 0;
    172   out[n] = in[0] - in[n_by_2];
    173   out[n + 1] = 0;
    174 }
    175 
    176 OMXResult omxSP_FFTFwd_RToCCS_F32_Sfs(const OMX_F32 *pSrc, OMX_F32 *pDst,
    177                                       const OMXFFTSpec_R_F32 *pFFTSpec) {
    178   OMX_INT n;
    179   OMX_INT n_by_2;
    180   OMX_INT n_by_4;
    181   const OMX_F32 *twiddle;
    182   OMX_F32 *buf;
    183 
    184   const X86FFTSpec_R_FC32 *pFFTStruct = (const X86FFTSpec_R_FC32*) pFFTSpec;
    185 
    186   // Input must be 32 byte aligned
    187   if (!pSrc || !pDst || (const uintptr_t)pSrc & 31 || (uintptr_t)pDst & 31)
    188     return OMX_Sts_BadArgErr;
    189 
    190   n = pFFTStruct->N;
    191 
    192   // This is to handle the case of order == 1.
    193   if (n == 2) {
    194     pDst[0] = (pSrc[0] + pSrc[1]);
    195     pDst[1] = 0.0f;
    196     pDst[2] = (pSrc[0] - pSrc[1]);
    197     pDst[3] = 0.0f;
    198     return OMX_Sts_NoErr;
    199   }
    200 
    201   n_by_2 = n >> 1;
    202   n_by_4 = n >> 2;
    203   buf = pFFTStruct->pBuf1;
    204   twiddle = pFFTStruct->pTwiddle;
    205 
    206   if(n_by_2 >= 16) {
    207     buf = x86SP_F32_radix4_kernel_OutOfPlace_sse(
    208         pSrc,
    209         pFFTStruct->pBuf2,
    210         buf,
    211         twiddle,
    212         n_by_2,
    213         1);
    214   } else {
    215     buf = x86SP_F32_radix2_kernel_OutOfPlace(
    216         pSrc,
    217         pFFTStruct->pBuf2,
    218         buf,
    219         twiddle,
    220         n_by_2,
    221         1);
    222   }
    223 
    224   if(n >= 8)
    225     RevbinPermuteFwdSse(buf, pDst, twiddle, n);
    226   else
    227     RevbinPermuteFwd(buf, pDst, twiddle, n);
    228 
    229   return OMX_Sts_NoErr;
    230 }
    231