Home | History | Annotate | Download | only in arm64
      1 //
      2 //  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 //
      4 //  Use of this source code is governed by a BSD-style license
      5 //  that can be found in the LICENSE file in the root of the source
      6 //  tree. An additional intellectual property rights grant can be found
      7 //  in the file PATENTS.  All contributing project authors may
      8 //  be found in the AUTHORS file in the root of the source tree.
      9 //
     10 //  This is a modification of armSP_FFT_CToC_SC32_Radix2_unsafe_s.s
     11 //  to support float instead of SC32.
     12 //
     13 
     14 // Description:
     15 // Compute a Radix 2 DIT in-order out-of-place FFT stage for an N point
     16 // complex signal.  This handles the general stage, not the first or last
     17 // stage.
     18 //
     19 //
     20 
     21 
     22 // Include standard headers
     23 
     24 #include "dl/api/arm/arm64COMM_s.h"
     25 #include "dl/api/arm/omxtypes_s.h"
     26 
     27 
     28 // Import symbols required from other files
     29 // (For example tables)
     30 
     31 
     32 
     33 // Set debugging level
     34 //DEBUG_ON    SETL {TRUE}
     35 
     36 
     37 
     38 // Guarding implementation by the processor name
     39 
     40 
     41 
     42 
     43 // Guarding implementation by the processor name
     44 
     45 //Input Registers
     46 
     47 #define pSrc            x0
     48 #define pDst            x1
     49 #define pTwiddle        x2
     50 #define	pSubFFTNum	x3
     51 #define pSubFFTSize	x4
     52 
     53 
     54 //Output Registers
     55 
     56 
     57 //Local Scratch Registers
     58 
     59 #define subFFTNum       x5
     60 #define subFFTSize      x6
     61 #define outPointStep    x8
     62 #define pointStep       x9
     63 #define pointStep32     w9
     64 #define grpCount        x10
     65 #define grpCount32      w10
     66 #define setCount        x13
     67 #define step            x15
     68 #define dstStep         x11
     69 
     70 // Neon Registers
     71 
     72 #define dW      v0.2s
     73 #define dX0     v2.2s
     74 #define dX1     v3.2s
     75 #define dX2     v4.2s
     76 #define dX3     v5.2s
     77 #define dY0     v6.2s
     78 #define dY1     v7.2s
     79 #define dY2     v8.2s
     80 #define dY3     v9.2s
     81 #define qT0     v10.2s
     82 #define qT1     v11.2s
     83 
     84         .macro FFTSTAGE scaled, inverse, name
     85 
     86         // Define stack arguments
     87 
     88         // Move args values into our work registers
     89         ldr     subFFTNum, [pSubFFTNum]
     90         ldr     subFFTSize, [pSubFFTSize]
     91 
     92         // Update grpCount and grpSize rightaway inorder to reuse pGrpCount
     93         // and pGrpSize regs
     94 
     95         LSR     subFFTNum,subFFTNum,#1                 //grpSize
     96         LSL     grpCount,subFFTSize,#1
     97 
     98 
     99         // pT0+1 increments pT0 by 8 bytes
    100         // pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes
    101         lsl     pointStep, subFFTNum, #2
    102 
    103         // update subFFTSize for the next stage
    104         MOV     subFFTSize,grpCount
    105 
    106         // pOut0+1 increments pOut0 by 8 bytes
    107         // pOut0+outPointStep == increment of 8*outPointStep bytes =
    108         //    4*size bytes
    109         smull   outPointStep, grpCount32, pointStep32
    110 
    111         LSL     pointStep,pointStep,#1
    112 
    113 
    114         rsb      step,pointStep,#16
    115         rsb      dstStep,outPointStep,#16
    116 
    117         // Loop on the groups
    118 
    119 radix2GrpLoop\name :
    120         lsr     setCount, pointStep, #3
    121         LD1     {dW},[pTwiddle],pointStep              //[wi | wr]
    122 
    123 
    124         // Loop on the sets
    125 
    126 
    127 radix2SetLoop\name :
    128 
    129 
    130         // point0: dX0-real part dX1-img part
    131         LD2    {dX0,dX1},[pSrc],pointStep
    132         // point1: dX2-real part dX3-img part
    133         LD2    {dX2,dX3},[pSrc],step
    134 
    135         SUBS    setCount,setCount,#2
    136 
    137         .ifeqs  "\inverse", "TRUE"
    138             fmul   qT0,dX2,dW[0]
    139             fmla   qT0,dX3,dW[1]                       // real part
    140             fmul   qT1,dX3,dW[0]
    141             fmls   qT1,dX2,dW[1]                       // imag part
    142 
    143         .else
    144 
    145             fmul   qT0,dX2,dW[0]
    146             fmls   qT0,dX3,dW[1]                       // real part
    147             fmul   qT1,dX3,dW[0]
    148             fmla   qT1,dX2,dW[1]                       // imag part
    149 
    150         .endif
    151 
    152         fsub    dY0,dX0,qT0
    153         fsub    dY1,dX1,qT1
    154         fadd    dY2,dX0,qT0
    155         fadd    dY3,dX1,qT1
    156 
    157         st2    {dY0,dY1},[pDst],outPointStep
    158         // dstStep = -outPointStep + 16
    159         st2    {dY2,dY3},[pDst],dstStep
    160 
    161         BGT     radix2SetLoop\name
    162 
    163         SUBS    grpCount,grpCount,#2
    164         ADD     pSrc,pSrc,pointStep
    165         BGT     radix2GrpLoop\name
    166 
    167 
    168         str     subFFTNum, [pSubFFTNum]
    169         str     subFFTSize, [pSubFFTSize]
    170         .endm
    171 
    172 
    173 
    174         M_START armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace,,d11
    175         FFTSTAGE "FALSE","FALSE",FWD
    176         M_END
    177 
    178 
    179 
    180         M_START armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace,,d11
    181         FFTSTAGE "FALSE","TRUE",INV
    182         M_END
    183 
    184 
    185         .end
    186