Home | History | Annotate | Download | only in mips
      1 /*
      2  *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  *
     10  */
     11 
     12 #include <stdint.h>
     13 
     14 #include "dl/api/omxtypes.h"
     15 #include "dl/sp/api/mipsSP.h"
     16 
     17 /*
     18  * Forward real FFT for FFT sizes larger than 16. Computed using the complex
     19  * FFT for half the size.
     20  */
     21 OMXResult mips_FFTFwd_RToCCS_F32_complex(const OMX_F32* pSrc,
     22                                          OMX_F32* pDst,
     23                                          const MIPSFFTSpec_R_FC32* pFFTSpec) {
     24   OMX_U32 n1_4, num_transforms, step;
     25   OMX_F32* w_re_ptr;
     26   OMX_F32* w_im_ptr;
     27   OMX_U32 fft_size = 1 << pFFTSpec->order;
     28   OMX_FC32* p_dst = (OMX_FC32*)pDst;
     29   OMX_FC32* p_buf = (OMX_FC32*)pFFTSpec->pBuf;
     30   OMX_F32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
     31   OMX_F32 w_re, w_im;
     32 
     33   /*
     34    * Loop performing sub-transforms of size 4,
     35    * which contain 2 butterfly operations.
     36    */
     37   num_transforms = (SUBTRANSFORM_CONST >> (17 - pFFTSpec->order)) | 1;
     38   for (uint32_t n = 0; n < num_transforms; ++n) {
     39     /*
     40      * n is in the range (0 .. num_transforms - 1).
     41      * The size of the pFFTSpec->pOffset is (((SUBTRANSFORM_CONST >>
     42      * (16 - TWIDDLE_TABLE_ORDER)) | 1)).
     43      */
     44     OMX_U32 offset = pFFTSpec->pOffset[n] << 2;
     45     /*
     46      * Offset takes it's value from pFFTSpec->pOffset table which is initialized
     47      * in the omxSP_FFTInit_R_F32 function, and is constant afterwards.
     48      */
     49     OMX_U16* p_bitrev = pFFTSpec->pBitRev + offset;
     50     OMX_FC32* p_tmp = p_buf + offset;
     51     /* Treating the input as a complex vector. */
     52     const OMX_FC32* p_src = (const OMX_FC32*)pSrc;
     53 
     54     tmp1 = p_src[p_bitrev[0]].Re + p_src[p_bitrev[1]].Re;
     55     tmp2 = p_src[p_bitrev[2]].Re + p_src[p_bitrev[3]].Re;
     56     tmp3 = p_src[p_bitrev[0]].Im + p_src[p_bitrev[1]].Im;
     57     tmp4 = p_src[p_bitrev[2]].Im + p_src[p_bitrev[3]].Im;
     58 
     59     p_tmp[0].Re = tmp1 + tmp2;
     60     p_tmp[2].Re = tmp1 - tmp2;
     61     p_tmp[0].Im = tmp3 + tmp4;
     62     p_tmp[2].Im = tmp3 - tmp4;
     63 
     64     tmp1 = p_src[p_bitrev[0]].Re - p_src[p_bitrev[1]].Re;
     65     tmp2 = p_src[p_bitrev[2]].Re - p_src[p_bitrev[3]].Re;
     66     tmp3 = p_src[p_bitrev[0]].Im - p_src[p_bitrev[1]].Im;
     67     tmp4 = p_src[p_bitrev[2]].Im - p_src[p_bitrev[3]].Im;
     68 
     69     p_tmp[1].Re = tmp1 + tmp4;
     70     p_tmp[3].Re = tmp1 - tmp4;
     71     p_tmp[1].Im = tmp3 - tmp2;
     72     p_tmp[3].Im = tmp3 + tmp2;
     73   }
     74 
     75   /*
     76    * Loop performing sub-transforms of size 8,
     77    * which contain four butterfly operations.
     78    */
     79   num_transforms = (num_transforms >> 1) | 1;
     80   for (uint32_t n = 0; n < num_transforms; ++n) {
     81     OMX_U32 offset = pFFTSpec->pOffset[n] << 3;
     82     OMX_U16* p_bitrev = pFFTSpec->pBitRev + offset;
     83     OMX_FC32* p_tmp = p_buf + offset;
     84     const OMX_FC32* p_src = (const OMX_FC32*)pSrc;
     85 
     86     tmp1 = p_src[p_bitrev[4]].Re + p_src[p_bitrev[5]].Re;
     87     tmp2 = p_src[p_bitrev[6]].Re + p_src[p_bitrev[7]].Re;
     88     tmp3 = p_src[p_bitrev[4]].Im + p_src[p_bitrev[5]].Im;
     89     tmp4 = p_src[p_bitrev[6]].Im + p_src[p_bitrev[7]].Im;
     90 
     91     tmp5 = tmp1 + tmp2;
     92     tmp1 = tmp1 - tmp2;
     93     tmp2 = tmp3 + tmp4;
     94     tmp3 = tmp3 - tmp4;
     95 
     96     p_tmp[4].Re = p_tmp[0].Re - tmp5;
     97     p_tmp[0].Re = p_tmp[0].Re + tmp5;
     98     p_tmp[4].Im = p_tmp[0].Im - tmp2;
     99     p_tmp[0].Im = p_tmp[0].Im + tmp2;
    100     p_tmp[6].Re = p_tmp[2].Re - tmp3;
    101     p_tmp[2].Re = p_tmp[2].Re + tmp3;
    102     p_tmp[6].Im = p_tmp[2].Im + tmp1;
    103     p_tmp[2].Im = p_tmp[2].Im - tmp1;
    104 
    105     tmp1 = p_src[p_bitrev[4]].Re - p_src[p_bitrev[5]].Re;
    106     tmp2 = p_src[p_bitrev[6]].Re - p_src[p_bitrev[7]].Re;
    107     tmp3 = p_src[p_bitrev[4]].Im - p_src[p_bitrev[5]].Im;
    108     tmp4 = p_src[p_bitrev[6]].Im - p_src[p_bitrev[7]].Im;
    109 
    110     tmp5 = SQRT1_2 * (tmp1 + tmp3);
    111     tmp1 = SQRT1_2 * (tmp3 - tmp1);
    112     tmp3 = SQRT1_2 * (tmp2 - tmp4);
    113     tmp2 = SQRT1_2 * (tmp2 + tmp4);
    114 
    115     tmp4 = tmp5 + tmp3;
    116     tmp5 = tmp5 - tmp3;
    117     tmp3 = tmp1 + tmp2;
    118     tmp1 = tmp1 - tmp2;
    119 
    120     p_tmp[5].Re = p_tmp[1].Re - tmp4;
    121     p_tmp[1].Re = p_tmp[1].Re + tmp4;
    122     p_tmp[5].Im = p_tmp[1].Im - tmp3;
    123     p_tmp[1].Im = p_tmp[1].Im + tmp3;
    124     p_tmp[7].Re = p_tmp[3].Re - tmp1;
    125     p_tmp[3].Re = p_tmp[3].Re + tmp1;
    126     p_tmp[7].Im = p_tmp[3].Im + tmp5;
    127     p_tmp[3].Im = p_tmp[3].Im - tmp5;
    128   }
    129 
    130   step = 1 << (TWIDDLE_TABLE_ORDER - 4);
    131   n1_4 = 4; /* Quarter of the sub-transform size. */
    132   /* Outer loop that loops over FFT stages. */
    133   for (uint32_t fft_stage = 4; fft_stage <= pFFTSpec->order - 1; ++fft_stage) {
    134     OMX_U32 n1_2 = 2 * n1_4;
    135     OMX_U32 n3_4 = 3 * n1_4;
    136     num_transforms = (num_transforms >> 1) | 1;
    137     /*
    138      * Loop performing sub-transforms of size 16 and higher.
    139      * The actual size depends on the stage.
    140      */
    141     for (uint32_t n = 0; n < num_transforms; ++n) {
    142       OMX_U32 offset = pFFTSpec->pOffset[n] << fft_stage;
    143       OMX_FC32* p_tmp = p_buf + offset;
    144 
    145       tmp1 = p_tmp[n1_2].Re + p_tmp[n3_4].Re;
    146       tmp2 = p_tmp[n1_2].Re - p_tmp[n3_4].Re;
    147       tmp3 = p_tmp[n1_2].Im + p_tmp[n3_4].Im;
    148       tmp4 = p_tmp[n1_2].Im - p_tmp[n3_4].Im;
    149 
    150       p_tmp[n1_2].Re = p_tmp[0].Re - tmp1;
    151       p_tmp[n1_2].Im = p_tmp[0].Im - tmp3;
    152       p_tmp[0].Re = p_tmp[0].Re + tmp1;
    153       p_tmp[0].Im = p_tmp[0].Im + tmp3;
    154       p_tmp[n3_4].Re = p_tmp[n1_4].Re - tmp4;
    155       p_tmp[n3_4].Im = p_tmp[n1_4].Im + tmp2;
    156       p_tmp[n1_4].Re = p_tmp[n1_4].Re + tmp4;
    157       p_tmp[n1_4].Im = p_tmp[n1_4].Im - tmp2;
    158 
    159       /* Twiddle table is initialized for the maximal FFT size. */
    160       w_re_ptr = pFFTSpec->pTwiddle + step;
    161       w_im_ptr =
    162           pFFTSpec->pTwiddle + (OMX_U32)(1 << TWIDDLE_TABLE_ORDER - 2) - step;
    163 
    164       /*
    165        * Loop performing split-radix butterfly operations for
    166        * one sub-transform.
    167        */
    168       for (uint32_t i = 1; i < n1_4; ++i) {
    169         w_re = *w_re_ptr;
    170         w_im = *w_im_ptr;
    171 
    172         tmp1 = w_re * p_tmp[n1_2 + i].Re + w_im * p_tmp[n1_2 + i].Im;
    173         tmp2 = w_re * p_tmp[n1_2 + i].Im - w_im * p_tmp[n1_2 + i].Re;
    174         tmp3 = w_re * p_tmp[n3_4 + i].Re - w_im * p_tmp[n3_4 + i].Im;
    175         tmp4 = w_re * p_tmp[n3_4 + i].Im + w_im * p_tmp[n3_4 + i].Re;
    176 
    177         tmp5 = tmp1 + tmp3;
    178         tmp1 = tmp1 - tmp3;
    179         tmp6 = tmp2 + tmp4;
    180         tmp2 = tmp2 - tmp4;
    181 
    182         p_tmp[n1_2 + i].Re = p_tmp[i].Re - tmp5;
    183         p_tmp[n1_2 + i].Im = p_tmp[i].Im - tmp6;
    184         p_tmp[i].Re = p_tmp[i].Re + tmp5;
    185         p_tmp[i].Im = p_tmp[i].Im + tmp6;
    186         p_tmp[n3_4 + i].Re = p_tmp[n1_4 + i].Re - tmp2;
    187         p_tmp[n3_4 + i].Im = p_tmp[n1_4 + i].Im + tmp1;
    188         p_tmp[n1_4 + i].Re = p_tmp[n1_4 + i].Re + tmp2;
    189         p_tmp[n1_4 + i].Im = p_tmp[n1_4 + i].Im - tmp1;
    190 
    191         w_re_ptr += step;
    192         w_im_ptr -= step;
    193       }
    194     }
    195     step >>= 1;
    196     n1_4 <<= 1;
    197   }
    198 
    199   /* Additional computation to get the output for full FFT size. */
    200   w_re_ptr = pFFTSpec->pTwiddle + step;
    201   w_im_ptr =
    202       pFFTSpec->pTwiddle + (OMX_U32)(1 << TWIDDLE_TABLE_ORDER - 2) - step;
    203 
    204   for (uint32_t i = 1; i < fft_size / 8; ++i) {
    205     tmp1 = p_buf[i].Re;
    206     tmp2 = p_buf[i].Im;
    207     tmp3 = p_buf[fft_size / 2 - i].Re;
    208     tmp4 = p_buf[fft_size / 2 - i].Im;
    209 
    210     tmp5 = tmp1 + tmp3;
    211     tmp6 = tmp1 - tmp3;
    212     tmp7 = tmp2 + tmp4;
    213     tmp8 = tmp2 - tmp4;
    214 
    215     tmp1 = p_buf[i + fft_size / 4].Re;
    216     tmp2 = p_buf[i + fft_size / 4].Im;
    217     tmp3 = p_buf[fft_size / 4 - i].Re;
    218     tmp4 = p_buf[fft_size / 4 - i].Im;
    219 
    220     w_re = *w_re_ptr;
    221     w_im = *w_im_ptr;
    222 
    223     p_dst[i].Re = 0.5f * (tmp5 + w_re * tmp7 - w_im * tmp6);
    224     p_dst[i].Im = 0.5f * (tmp8 - w_re * tmp6 - w_im * tmp7);
    225     p_dst[fft_size / 2 - i].Re = 0.5f * (tmp5 - w_re * tmp7 + w_im * tmp6);
    226     p_dst[fft_size / 2 - i].Im = 0.5f * (-tmp8 - w_re * tmp6 - w_im * tmp7);
    227 
    228     tmp5 = tmp1 + tmp3;
    229     tmp6 = tmp1 - tmp3;
    230     tmp7 = tmp2 + tmp4;
    231     tmp8 = tmp2 - tmp4;
    232 
    233     p_dst[i + fft_size / 4].Re = 0.5f * (tmp5 - w_im * tmp7 - w_re * tmp6);
    234     p_dst[i + fft_size / 4].Im = 0.5f * (tmp8 + w_im * tmp6 - w_re * tmp7);
    235     p_dst[fft_size / 4 - i].Re = 0.5f * (tmp5 + w_im * tmp7 + w_re * tmp6);
    236     p_dst[fft_size / 4 - i].Im = 0.5f * (-tmp8 + w_im * tmp6 - w_re * tmp7);
    237 
    238     w_re_ptr += step;
    239     w_im_ptr -= step;
    240   }
    241   tmp1 = p_buf[fft_size / 8].Re;
    242   tmp2 = p_buf[fft_size / 8].Im;
    243   tmp3 = p_buf[3 * fft_size / 8].Re;
    244   tmp4 = p_buf[3 * fft_size / 8].Im;
    245 
    246   tmp5 = tmp1 + tmp3;
    247   tmp6 = tmp1 - tmp3;
    248   tmp7 = tmp2 + tmp4;
    249   tmp8 = tmp2 - tmp4;
    250 
    251   w_re = *w_re_ptr;
    252   w_im = *w_im_ptr;
    253 
    254   p_dst[fft_size / 8].Re = 0.5f * (tmp5 + w_re * tmp7 - w_im * tmp6);
    255   p_dst[fft_size / 8].Im = 0.5f * (tmp8 - w_re * tmp6 - w_im * tmp7);
    256   p_dst[3 * fft_size / 8].Re = 0.5f * (tmp5 - w_re * tmp7 + w_im * tmp6);
    257   p_dst[3 * fft_size / 8].Im = 0.5f * (-tmp8 - w_re * tmp6 - w_im * tmp7);
    258 
    259   p_dst[0].Re = p_buf[0].Re + p_buf[0].Im;
    260   p_dst[fft_size / 4].Re = p_buf[fft_size / 4].Re;
    261   p_dst[fft_size / 2].Re = p_buf[0].Re - p_buf[0].Im;
    262   p_dst[0].Im = 0.0f;
    263   p_dst[fft_size / 4].Im = -p_buf[fft_size / 4].Im;
    264   p_dst[fft_size / 2].Im = 0.0f;
    265 
    266   return OMX_Sts_NoErr;
    267 }
    268