Home | History | Annotate | Download | only in neon
      1 @//
      2 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 @//
      4 @//  Use of this source code is governed by a BSD-style license
      5 @//  that can be found in the LICENSE file in the root of the source
      6 @//  tree. An additional intellectual property rights grant can be found
      7 @//  in the file PATENTS.  All contributing project authors may
      8 @//  be found in the AUTHORS file in the root of the source tree.
      9 @//
     10 @//  This is a modification of armSP_FFT_CToC_SC32_Radix2_unsafe_s.s
     11 @//  to support float instead of SC32.
     12 @//
     13 
     14 @// Description:
     15 @// Compute a Radix 2 DIT in-order out-of-place FFT stage for an N point
     16 @// complex signal.  This handles the general stage, not the first or last
     17 @// stage.
     18 @//
     19 @//
     20 
     21 
     22 @// Include standard headers
     23 
     24 #include "dl/api/arm/armCOMM_s.h"
     25 #include "dl/api/arm/omxtypes_s.h"
     26 
     27 
     28 @// Import symbols required from other files
     29 @// (For example tables)
     30 
     31 
     32 
     33 @// Set debugging level
     34 @//DEBUG_ON    SETL {TRUE}
     35 
     36 
     37 
     38 @// Guarding implementation by the processor name
     39 
     40 
     41 
     42 
     43 @// Guarding implementation by the processor name
     44 
     45 
     46 @// Input Registers
        @// All five are read by the FFTSTAGE macro and are also rewritten by
        @// it before it finishes (see the end of the macro body).
        @// Keep comments off the #define lines themselves: a trailing "@"
        @// would become part of the macro expansion.
     47 
        @// pSrc: address used by the VLD2 loads of the stage
     48 #define pSrc            r0
        @// pDst: address used by the VST2 stores of the stage
     49 #define pDst            r2
        @// pTwiddle: address used by the per-group twiddle load
     50 #define pTwiddle        r1
        @// subFFTNum: halved at the start of the stage; the halved value is
        @// what the stage calls grpSize
     51 #define subFFTNum       r6
        @// subFFTSize: doubled by the stage; the doubled value is also the
        @// initial group counter
     52 #define subFFTSize      r7
     53 
     54 
     55 @//Output Registers
     56 
     57 
     58 @// Local Scratch Registers (all overwritten by FFTSTAGE)
     59 
        @// outPointStep: post-increment applied after the first VST2, i.e. the
        @// byte distance between the two stores of one pass
     60 #define outPointStep    r3
        @// pointStep: post-increment applied after the first VLD2 and after
        @// the twiddle load
     61 #define pointStep       r4
        @// grpCount: outer (group) loop counter, decremented by 2 per group
     62 #define grpCount        r5
        @// setCount: inner (set) loop counter, decremented by 2 per pass
     63 #define setCount        r8
     64 @//const           RN  9
        @// step: 16 - pointStep, post-increment applied after the second VLD2
     65 #define step            r10
        @// dstStep: 16 - outPointStep, post-increment applied after the
        @// second VST2
     66 #define dstStep         r11
        @// pTable and pTmp both name r9.  FFTSTAGE only uses pTmp (to hold
        @// pDst while the two pointers are exchanged); pTable is not
        @// referenced by FFTSTAGE.
     67 #define pTable          r9
     68 #define pTmp            r9
     69 
     70 @// Neon Registers
        @// Every alias below is a 64-bit D register typed as F32, so each one
        @// holds two single-precision lanes.
     71 
        @// dW: twiddle factor for the current group, lanes [wi | wr]
     72 #define dW      D0.F32
        @// dX0/dX1: real and imaginary parts of point0
     73 #define dX0     D2.F32
     74 #define dX1     D3.F32
        @// dX2/dX3: real and imaginary parts of point1
     75 #define dX2     D4.F32
     76 #define dX3     D5.F32
        @// dY0/dY1: real and imaginary parts of point0 minus the twiddled point1
     77 #define dY0     D6.F32
     78 #define dY1     D7.F32
        @// dY2/dY3: real and imaginary parts of point0 plus the twiddled point1
     79 #define dY2     D8.F32
     80 #define dY3     D9.F32
        @// qT0/qT1: real and imaginary parts of the twiddled point1.  Despite
        @// the q prefix these are D registers (D10 and D11).
     81 #define qT0     D10.F32
     82 #define qT1     D11.F32
     83 
     84 
     85         .MACRO FFTSTAGE scaled, inverse, name
     86 
                @// FFTSTAGE expands to one general radix-2 stage working on
                @// complex single-precision data (real/imag de-interleaved by
                @// VLD2 and re-interleaved by VST2).
                @//   scaled  : accepted but not referenced in this macro body
                @//   inverse : "TRUE" multiplies point1 by the conjugate of the
                @//             twiddle; any other value multiplies by the
                @//             twiddle itself
                @//   name    : suffix pasted onto the two loop labels so that
                @//             each expansion gets its own labels
                @// Register effects: subFFTNum is halved, subFFTSize is
                @// doubled, pSrc and pDst are exchanged and rewound, pTwiddle
                @// is rewound.  r3-r5, r8-r11, D0, D2-D11 and the condition
                @// flags are overwritten.
     87         @// Define stack arguments (none are defined in this macro)
     88 
     89 
     90         @// Update grpCount and grpSize right away in order to reuse pGrpCount
     91         @// and pGrpSize regs
     92 
     93         LSR     subFFTNum,subFFTNum,#1                      @// grpSize = subFFTNum/2
     94         LSL     grpCount,subFFTSize,#1                      @// grpCount = 2*subFFTSize
     95 
     96 
     97         @// pointStep = 4*grpSize at this point.  This value is the
     98         @// multiplier for outPointStep below; it is doubled afterwards.
     99         MOV     pointStep,subFFTNum,LSL #2
    100 
    101         @// update subFFTSize for the next stage
    102         MOV     subFFTSize,grpCount
    103 
    104         @// outPointStep = grpCount*pointStep = 8*subFFTSize*grpSize, which
    105         @// the original note calls 4*size bytes.  It is the byte distance
    106         @// between the two stores of a pass.  SMULBB multiplies only the
                @// low halfwords of its two operands.
    107         SMULBB  outPointStep,grpCount,pointStep
    108         LSL     pointStep,pointStep,#1                      @// pointStep = 8*grpSize bytes
    109 
    110 
                @// Each load/store pair post-increments by +stride and then by
                @// (16 - stride), so a pointer advances by 16 bytes per pass.
    111         RSB      step,pointStep,#16                         @// step = 16 - pointStep
    112         RSB      dstStep,outPointStep,#16                   @// dstStep = 16 - outPointStep
    113 
    114         @// Loop on the groups (one twiddle factor per group)
    115 
    116 radix2GrpLoop\name :
    117         MOV      setCount,pointStep,LSR #3                  @// setCount = grpSize
    118         VLD1     dW,[pTwiddle],pointStep                @//[wi | wr], pTwiddle += pointStep
    119 
    120 
    121         @// Loop on the sets (two butterflies per pass, one per F32 lane)
    122 
    123 
    124 radix2SetLoop\name :
    125 
    126 
    127         @// point0: dX0-real part dX1-img part
    128         VLD2    {dX0,dX1},[pSrc],pointStep
    129         @// point1: dX2-real part dX3-img part
    130         VLD2    {dX2,dX3},[pSrc],step
    131 
                @// The flags set here are the ones tested by the BGT that
                @// closes the set loop; nothing in between is a flag-setting
                @// ARM instruction.
    132         SUBS    setCount,setCount,#2
    133 
    134         .ifeqs  "\inverse", "TRUE"
                    @// T = point1 * conj(W), with dW[0] = wr and dW[1] = wi
    135             VMUL   qT0,dX2,dW[0]
    136             VMLA   qT0,dX3,dW[1]                       @// real part
    137             VMUL   qT1,dX3,dW[0]
    138             VMLS   qT1,dX2,dW[1]                       @// imag part
    139 
    140         .else
    141 
                    @// T = point1 * W, with dW[0] = wr and dW[1] = wi
    142             VMUL   qT0,dX2,dW[0]
    143             VMLS   qT0,dX3,dW[1]                       @// real part
    144             VMUL   qT1,dX3,dW[0]
    145             VMLA   qT1,dX2,dW[1]                       @// imag part
    146 
    147         .endif
    148 
                @// Y0 = point0 - T, Y2 = point0 + T
    149         VSUB    dY0,dX0,qT0
    150         VSUB    dY1,dX1,qT1
    151         VADD    dY2,dX0,qT0
    152         VADD    dY3,dX1,qT1
    153 
                @// NOTE(review): the difference is written at pDst and the sum
                @// outPointStep bytes after it; confirm this order against the
                @// twiddle table convention used by the callers.
    154         VST2    {dY0,dY1},[pDst],outPointStep
    155         @// dstStep = -outPointStep + 16
    156         VST2    {dY2,dY3},[pDst],dstStep
    157 
    158         BGT     radix2SetLoop\name
    159 
    160         SUBS    grpCount,grpCount,#2
    161         ADD     pSrc,pSrc,pointStep                         @// skip the point1 half already read
    162         BGT     radix2GrpLoop\name
    163 
    164 
    165         @// Reset and Swap pSrc and pDst for the next stage
    166         MOV     pTmp,pDst
    167         @// new pDst = old pSrc - 8*size bytes; new pSrc = old pDst - 4*size bytes
    168         SUB     pDst,pSrc,outPointStep,LSL #1
    169         SUB     pSrc,pTmp,outPointStep
    170 
    171         @// Reset pTwiddle for the next stage
    172         @// pTwiddle -= 4*size bytes
    173         SUB     pTwiddle,pTwiddle,outPointStep
    174 
    175 
    176         .endm
    177 
    178 
    179 
    180         M_START armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe,r4
                @// Forward variant: expands FFTSTAGE with inverse = "FALSE", so
                @// point1 is multiplied by the twiddle itself.  FWD is the
                @// label suffix.  The first argument (scaled) is not used by
                @// FFTSTAGE.
                @// NOTE(review): FFTSTAGE overwrites r3-r11 and D0, D2-D11,
                @// while only r4 is passed to M_START here.  Presumably the
                @// callers of this unsafe routine tolerate the remaining
                @// clobbers; confirm against M_START in armCOMM_s.h.
    181         FFTSTAGE "FALSE","FALSE",FWD
    182         M_END
    183 
    184 
    185 
    186         M_START armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace_unsafe,r4
                @// Inverse variant: expands FFTSTAGE with inverse = "TRUE", so
                @// point1 is multiplied by the conjugate of the twiddle.  INV
                @// is the label suffix.  The first argument (scaled) is not
                @// used by FFTSTAGE.
                @// NOTE(review): FFTSTAGE overwrites r3-r11 and D0, D2-D11,
                @// while only r4 is passed to M_START here.  Presumably the
                @// callers of this unsafe routine tolerate the remaining
                @// clobbers; confirm against M_START in armCOMM_s.h.
    187         FFTSTAGE "FALSE","TRUE",INV
    188         M_END
    189 
    190 
    191         .end
    192