Home | History | Annotate | Download | only in arm64
      1 //
      2 //  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 //
      4 //  Use of this source code is governed by a BSD-style license
      5 //  that can be found in the LICENSE file in the root of the source
      6 //  tree. An additional intellectual property rights grant can be found
      7 //  in the file PATENTS.  All contributing project authors may
      8 //  be found in the AUTHORS file in the root of the source tree.
      9 //
     10 //
     11 //  This is a modification of armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.s
     12 //  to support float instead of SC32.
     13 //
     14 
     15 //
     16 // Description:
     17 // Compute the first-stage radix-4 FFT pass for an N-point complex signal
     18 //
     19 //
     20 
     21 
     22 // Include standard headers
     23 
     24 #include "dl/api/arm/arm64COMM_s.h"
     25 #include "dl/api/arm/omxtypes_s.h"
     26 
     27 // Import symbols required from other files
     28 // (For example tables)
     29 
     30 
     31 
     32 
     33 // Set debugging level
     34 //DEBUG_ON    SETL {TRUE}
     35 
     36 
     37 
     38 // Guarding implementation by the processor name
     39 
     40 
     41 
     42 // Guarding implementation by the processor name
     43 
     44 //Input Registers
     45 // pSrc / pDst are the ld2 / st2 base addresses used by FFTSTAGE (both are post-incremented).
     46 #define pSrc            x0
     47 #define pDst            x1
     48 #define pTwiddle        x2
     49 #define	pSubFFTNum	x3
     50 #define pSubFFTSize	x4
     51 // pSubFFTNum / pSubFFTSize are addresses: FFTSTAGE loads through them at entry and stores
     52 // through them at exit. pTwiddle is not referenced by FFTSTAGE.
     53 //Output Registers
     54 // (none defined: results leave through pDst, pSubFFTNum and pSubFFTSize)
     55 
     56 //Local Scratch Registers
     57 // Two pairs of names share a register: grpSize/setCount (x7) and pointStep/outPointStep (x8).
     58 #define subFFTNum       x5
     59 #define subFFTSize      x6
     60 #define grpSize         x7
     61 // Reuse grpSize as setCount
     62 #define setCount        x7
     63 #define pointStep       x8
     64 #define outPointStep    x8
     65 #define setStep         x9
     66 #define step1           x10
     67 #define step3           x11
     68 
     69 // Neon Registers
     70 // dX* are filled by ld2, dY* hold the first butterfly stage, dZ* are the values written by st2.
     71 #define dXr0    v0.2s
     72 #define dXi0    v1.2s
     73 #define dXr1    v2.2s
     74 #define dXi1    v3.2s
     75 #define dXr2    v4.2s
     76 #define dXi2    v5.2s
     77 #define dXr3    v6.2s
     78 #define dXi3    v7.2s
     79 #define dYr0    v8.2s
     80 #define dYi0    v9.2s
     81 #define dYr1    v10.2s
     82 #define dYi1    v11.2s
     83 #define dYr2    v12.2s
     84 #define dYi2    v13.2s
     85 #define dYr3    v14.2s
     86 #define dYi3    v15.2s
     87 #define dZr0    v16.2s
     88 #define dZi0    v17.2s
     89 #define dZr1    v18.2s
     90 #define dZi1    v19.2s
     91 #define dZr2    v20.2s
     92 #define dZi2    v21.2s
     93 #define dZr3    v22.2s
     94 #define dZi3    v23.2s
     95 
     96 
     97         .macro FFTSTAGE scaled, inverse, name
     98         // First-stage radix-4 butterfly pass: reads sets of four inputs from pSrc (pointStep
     99         // bytes apart) and writes the four results to pDst, 16 bytes per ld2/st2.
    100         // inverse == "TRUE" assembles the inverse butterfly, otherwise the forward one; name is the label suffix; scaled is unused in this body.
    101         // Move args values into our work registers
    102         ldr     subFFTNum, [pSubFFTNum]
    103         ldr     subFFTSize, [pSubFFTSize]
    104         // (the subFFTSize value loaded above is replaced by #4 below before it is ever read)
    105         // pointStep = 2*subFFTNum, used as the byte post-increment between data[0..3] of a set
    106         // NOTE(review): presumably a quarter of the buffer (subFFTNum == N on entry, 8 bytes per complex float) -- confirm against the caller
    107         // Note: outPointStep is the same register as pointStep for the first stage
    108 
    109         lsl     pointStep, subFFTNum, #1
    110 
    111         // Load the first set; the subFFTSize / subFFTNum updates are interleaved with the loads
    112         ld2     {dXr0,dXi0}, [pSrc], pointStep          // data[0]
    113 
    114         // subFFTSize = 4 is the value stored back for the next stage
    115         MOV     subFFTSize,#4
    116 
    117         // Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
    118         LSR     grpSize,subFFTNum,#2
    119         ld2     {dXr1,dXi1}, [pSrc], pointStep          //  data[1]
    120         MOV     subFFTNum,grpSize
    121         // subFFTNum now holds the original value / 4; this is what is stored back at the end
    122 
    123         // Calculate the step of input data for the next set
    124         // setStep = 16*grpSize, which equals 2*pointStep when the original subFFTNum is a multiple of 4
    125         lsl     setStep, grpSize, #4
    126         ld2     {dXr2,dXi2}, [pSrc], pointStep          //  data[2]
    127 
    128         // setStep = 3*pointStep
    129         ADD     setStep,setStep,pointStep
    130         // setStep = - 3*pointStep+16 : from data[3] back to data[0] of the set, plus the 16 bytes just consumed
    131         // NOTE(review): rsb is not an A64 mnemonic; presumably a macro from arm64COMM_s.h -- confirm
    132         rsb     setStep,setStep,#16
    133         //  data[3] & update pSrc for the next set
    134         ld2     {dXr3,dXi3}, [pSrc], setStep
    135 
    136         // step1 = 2*pointStep
    137         lsl     step1, pointStep, #1
    138 
    139         // Y0 = X0 + X2 for the first set (the loop expects this to be precomputed)
    140         fadd    dYr0, dXr0, dXr2
    141         fadd    dYi0, dXi0, dXi2
    142         // step3 = -pointStep
    143         neg     step3, pointStep
    144 
    145         // grp = 0 a special case since all the twiddle factors are 1
    146         // Loop on the sets : 2 sets at a time
    147 
    148 radix4fsGrpZeroSetLoop\name :
    149         // On entry to each iteration: dX0..dX3 hold the current set and dY0 = X0 + X2 is ready.
    150         // Loads for the next set (order data[0], data[2], data[1], data[3]) are interleaved
    151         // with the arithmetic of the current set.
    152         // Decrement setcount
    153         SUBS    setCount,setCount,#2
    154         // The flags set by this SUBS are the ones tested by the BEQ and BGT further down.
    155 
    156         // finish first stage of 4 point FFT
    157 
    158 
    159         // Y2 = X0 - X2
    160         fsub    dYr2, dXr0, dXr2
    161         fsub    dYi2, dXi0, dXi2
    162 
    163         ld2     {dXr0,dXi0}, [pSrc], step1              //  data[0]
    164         // Y1 = X1 + X3
    165         fadd    dYr1, dXr1, dXr3
    166         fadd    dYi1, dXi1, dXi3
    167         ld2     {dXr2,dXi2}, [pSrc], step3              //  data[2]
    168         // Y3 = X1 - X3
    169         fsub    dYr3, dXr1, dXr3
    170         fsub    dYi3, dXi1, dXi3
    171 
    172 
    173         // finish second stage of 4 point FFT
    174 
    175         .ifeqs "\inverse", "TRUE"
    176             // Inverse variant: stores Y0+Y1, Y2+j*Y3, Y0-Y1, Y2-j*Y3 in that order
    177             ld2     {dXr1,dXi1}, [pSrc], step1          //  data[1]
    178             // Z0 = Y0 + Y1
    179             fadd    dZr0, dYr0, dYr1
    180             fadd    dZi0, dYi0, dYi1
    181 
    182             //  data[3] & update pSrc for the next set, but not if it's the
    183             //  last iteration so that we don't read past the end of the
    184             //  input array.
    185             BEQ     radix4SkipLastUpdateInv\name
    186             ld2     {dXr3,dXi3}, [pSrc], setStep
    187 
    188 radix4SkipLastUpdateInv\name:
    189             FSUB    dZr3,dYr2,dYi3
    190 
    191             st2    {dZr0,dZi0},[pDst],outPointStep
    192             FADD    dZi3,dYi2,dYr3
    193 
    194             // Z1 = Y0 - Y1
    195             FSUB    dZr1, dYr0, dYr1
    196             FSUB    dZi1, dYi0, dYi1
    197             st2    {dZr3,dZi3},[pDst],outPointStep
    198 
    199             FADD    dZr2,dYr2,dYi3
    200             st2    {dZr1,dZi1},[pDst],outPointStep
    201             FSUB    dZi2,dYi2,dYr3
    202 
    203             //  Y0 = X0 + X2 using the inputs already loaded for the next set
    204             FADD    dYr0, dXr0, dXr2                    // u0 for next iteration
    205             FADD    dYi0, dXi0, dXi2
    206             st2    {dZr2,dZi2},[pDst],setStep
    207             // the setStep post-increment moves pDst to the first output slot of the next set
    208 
    209         .else
    210             // Forward variant: stores Y0+Y1, Y2-j*Y3, Y0-Y1, Y2+j*Y3 in that order
    211             ld2     {dXr1,dXi1}, [pSrc], step1          //  data[1]
    212             // Z0 = Y0 + Y1
    213             fadd    dZr0, dYr0, dYr1
    214             fadd    dZi0, dYi0, dYi1
    215 
    216             //  data[3] & update pSrc for the next set, but not if it's the
    217             //  last iteration so that we don't read past the end of the
    218             //  input array.
    219             BEQ     radix4SkipLastUpdateFwd\name
    220             ld2     {dXr3,dXi3}, [pSrc], setStep
    221 
    222 radix4SkipLastUpdateFwd\name:
    223             FADD    dZr2,dYr2,dYi3
    224 
    225             st2    {dZr0,dZi0},[pDst],outPointStep
    226             FSUB    dZi2,dYi2,dYr3
    227 
    228             // Z1 = Y0 - Y1
    229             fsub    dZr1, dYr0, dYr1
    230             fsub    dZi1, dYi0, dYi1
    231             st2    {dZr2,dZi2},[pDst],outPointStep
    232 
    233             FSUB    dZr3,dYr2,dYi3
    234             st2    {dZr1,dZi1},[pDst],outPointStep
    235             FADD    dZi3,dYi2,dYr3
    236 
    237             //  Y0 = X0 + X2 using the inputs already loaded for the next set
    238             fadd    dYr0, dXr0, dXr2                    // u0 for next iteration
    239             fadd    dYi0, dXi0, dXi2
    240             // the setStep post-increment moves pDst to the first output slot of the next set
    241             st2    {dZr3,dZi3},[pDst],setStep
    242 
    243         .endif
    244         // Loop while the setCount left by the SUBS at the top of the iteration is still positive
    245         BGT     radix4fsGrpZeroSetLoop\name
    246 
    247         // Save subFFTNum and subFFTSize for next stage
    248         str     subFFTNum, [pSubFFTNum]
    249         str     subFFTSize, [pSubFFTSize]
    250 
    251         .endm
    252 
    253 
    254 
    255         M_START armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace,,d15   // forward entry; NOTE(review): d15 presumably asks M_START to preserve d8-d15 (FFTSTAGE writes v8-v15) -- confirm in arm64COMM_s.h
    256         FFTSTAGE "FALSE","FALSE",fwd   // scaled=FALSE, inverse=FALSE (forward butterfly), label suffix fwd
    257         M_END
    258 
    259 
    260 
    261         M_START armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace,,d15   // inverse entry; NOTE(review): d15 presumably asks M_START to preserve d8-d15 (FFTSTAGE writes v8-v15) -- confirm in arm64COMM_s.h
    262         FFTSTAGE "FALSE","TRUE",inv   // scaled=FALSE, inverse=TRUE (inverse butterfly), label suffix inv
    263         M_END
    264 
    265 
    266         .end
    267