Home | History | Annotate | Download | only in arm64
      1 /*
      2  *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 
     13 #include "dl/api/omxtypes.h"
     14 #include "dl/sp/api/armSP.h"
     15 #include "dl/sp/api/omxSP.h"
     16 
     17 extern void armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
     18     const OMX_FC32* pSrc,
     19     OMX_FC32* pDst,
     20     OMX_FC32* pTwiddle,
     21     long* subFFTNum,
     22     long* subFFTSize);
     23 
     24 extern void armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
     25     const OMX_FC32* pSrc,
     26     OMX_FC32* pDst,
     27     OMX_FC32* pTwiddle,
     28     long* subFFTNum,
     29     long* subFFTSize);
     30 
     31 extern void armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
     32     const OMX_FC32* pSrc,
     33     OMX_FC32* pDst,
     34     OMX_FC32* pTwiddle,
     35     long* subFFTNum,
     36     long* subFFTSize);
     37 
     38 extern void armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
     39     const OMX_FC32* pSrc,
     40     OMX_FC32* pDst,
     41     OMX_FC32* pTwiddle,
     42     long* subFFTNum,
     43     long* subFFTSize);
     44 
     45 extern void armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
     46     const OMX_FC32* pSrc,
     47     OMX_FC32* pDst,
     48     OMX_FC32* pTwiddle,
     49     long* subFFTNum,
     50     long* subFFTSize);
     51 
     52 extern void armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
     53     const OMX_FC32* pSrc,
     54     OMX_FC32* pDst,
     55     OMX_FC32* pTwiddle,
     56     long* subFFTNum,
     57     long* subFFTSize);
     58 
     59 extern void armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
     60     const OMX_FC32* pSrc,
     61     OMX_FC32* pDst,
     62     OMX_FC32* pTwiddle,
     63     long* subFFTNum,
     64     long* subFFTSize);
     65 
     66 extern void armSP_FFTInv_CCSToR_F32_preTwiddleRadix2(
     67     const OMX_F32* pSrc,
     68     const OMX_FC32* pTwiddle,
     69     OMX_F32* pBuf,
     70     long N);
     71 
     72 /*
     73  * Scale FFT data by 1/|length|. |length| must be a power of two
     74  */
     75 static inline ScaleRFFTData(OMX_F32* fftData, unsigned length) {
     76   float32_t* data = (float32_t*)fftData;
     77   float32_t scale = 2.0f / length;
     78 
     79   if (length >= 4) {
     80     /*
     81      * Do 4 float elements at a time because |length| is always a
     82      * multiple of 4 when |length| >= 4.
     83      *
     84      * TODO(rtoy): Figure out how to process 8 elements at a time
     85      * using intrinsics or replace this with inline assembly.
     86      */
     87     do {
     88       float32x4_t x = vld1q_f32(data);
     89 
     90       length -= 4;
     91       x = vmulq_n_f32(x, scale);
     92       vst1q_f32(data, x);
     93       data += 4;
     94     } while (length > 0);
     95   } else if (length == 2) {
     96     float32x2_t x = vld1_f32(data);
     97     x = vmul_n_f32(x, scale);
     98     vst1_f32(data, x);
     99   } else {
    100     fftData[0] *= scale;
    101   }
    102 }
    103 
    104 /**
    105  * Function:  omxSP_FFTInv_CCSToR_F32_Sfs
    106  *
    107  * Description:
    108  * These functions compute the inverse FFT for a conjugate-symmetric input
    109  * sequence.  Transform length is determined by the specification structure,
    110  * which must be initialized prior to calling the FFT function using
    111  * <FFTInit_R_F32>. For a transform of length M, the input sequence is
    112  * represented using a packed CCS vector of length M+2, and is organized
    113  * as follows:
    114  *
    115  *   Index:   0  1  2    3    4    5    . . .  M-2       M-1      M      M+1
    116  *   Comp:  R[0] 0  R[1] I[1] R[2] I[2] . . .  R[M/2-1]  I[M/2-1] R[M/2] 0
    117  *
    118  * where R[n] and I[n], respectively, denote the real and imaginary
    119  * components for FFT bin n. Bins are numbered from 0 to M/2, where M
    120  * is the FFT length.  Bin index 0 corresponds to the DC component,
    121  * and bin index M/2 corresponds to the foldover frequency.
    122  *
    123  * Input Arguments:
    124  *   pSrc - pointer to the complex-valued input sequence represented
    125  *          using CCS format, of length (2^order) + 2; must be aligned on a
    126  *          32-byte boundary.
    127  *   pFFTSpec - pointer to the preallocated and initialized
    128  *              specification structure
    129  *
    130  * Output Arguments:
    131  *   pDst - pointer to the real-valued output sequence, of length
    132  *          2^order ; must be aligned on a 32-byte boundary.
    133  *
    134  * Return Value:
    135  *
    136  *    OMX_Sts_NoErr - no error
    137 
    138  *    OMX_Sts_BadArgErr - bad arguments if one or more of the
    139  *      following is true:
    140  *    -    pSrc, pDst, or pFFTSpec is NULL
    141  *    -    pSrc or pDst is not aligned on a 32-byte boundary
    142  *
    143  */
    144 OMXResult omxSP_FFTInv_CCSToR_F32_Sfs(
    145     const OMX_F32* pSrc,
    146     OMX_F32* pDst,
    147     const OMXFFTSpec_R_F32* pFFTSpec) {
    148   ARMsFFTSpec_R_FC32* spec = (ARMsFFTSpec_R_FC32*)pFFTSpec;
    149   int order;
    150   long subFFTSize;
    151   long subFFTNum;
    152   OMX_FC32* pTwiddle;
    153   OMX_FC32* pOut;
    154   OMX_FC32* pComplexSrc;
    155   OMX_FC32* pComplexDst = (OMX_FC32*) pDst;
    156 
    157   /*
    158    * Check args are not NULL and the source and destination pointers
    159    * are properly aligned.
    160    */
    161   if (!validateParametersF32(pSrc, pDst, spec))
    162     return OMX_Sts_BadArgErr;
    163 
    164   /*
    165    * Preprocess the input before calling the complex inverse FFT. The
    166    * result is actually stored in the second half of the temp buffer
    167    * in pFFTSpec.
    168    */
    169   if (spec->N > 1)
    170     armSP_FFTInv_CCSToR_F32_preTwiddleRadix2(
    171         pSrc, spec->pTwiddle, spec->pBuf, spec->N);
    172 
    173   /*
    174    * Do a complex inverse FFT of half size.
    175    */
    176   order = fastlog2(spec->N) - 1;
    177 
    178   subFFTSize = 1;
    179   subFFTNum = spec->N >> 1;
    180   pTwiddle = spec->pTwiddle;
    181   /*
    182    * The pBuf is split in half. The first half is the temp buffer. The
    183    * second half holds the source data that was placed there by
    184    * armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe.
    185    */
    186   pOut = (OMX_FC32*) spec->pBuf;
    187   pComplexSrc = pOut + (1 << order);
    188 
    189 
    190   if (order > 3) {
    191     OMX_FC32* argDst;
    192 
    193     /*
    194      * Set up argDst and pOut appropriately so that pOut = pDst for
    195      * the very last FFT stage.
    196      */
    197     if ((order & 2) == 0) {
    198       argDst = pOut;
    199       pOut = pComplexDst;
    200     } else {
    201       argDst = pComplexDst;
    202     }
    203 
    204     /*
    205      * Odd order uses a radix 8 first stage; even order, a radix 4
    206      * first stage.
    207      */
    208     if (order & 1) {
    209       armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
    210           pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
    211     } else {
    212       armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
    213           pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
    214     }
    215 
    216     /*
    217      * Now use radix 4 stages to finish rest of the FFT
    218      */
    219     if (subFFTNum >= 4) {
    220       while (subFFTNum > 4) {
    221         OMX_FC32* tmp;
    222 
    223         armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
    224             argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
    225         /*
    226          * Swap argDst and pOut
    227          */
    228         tmp = pOut;
    229         pOut = argDst;
    230         argDst = tmp;
    231       }
    232 
    233       armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
    234           argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
    235     }
    236   } else if (order == 3) {
    237     armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
    238         pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
    239     armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
    240         pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
    241     armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
    242         pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
    243   } else if (order == 2) {
    244     armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
    245         pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
    246     armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
    247         pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
    248   } else if (order == 1) {
    249     armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
    250         pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
    251   } else {
    252     /* Order = 0 */
    253     *pComplexDst = *pComplexSrc;
    254   }
    255 
    256   ScaleRFFTData(pDst, spec->N);
    257   return OMX_Sts_NoErr;
    258 }
    259 
    260