Home | History | Annotate | Download | only in neon
      1 @//
      2 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 @//
      4 @//  Use of this source code is governed by a BSD-style license
      5 @//  that can be found in the LICENSE file in the root of the source
      6 @//  tree. An additional intellectual property rights grant can be found
      7 @//  in the file PATENTS.  All contributing project authors may
      8 @//  be found in the AUTHORS file in the root of the source tree.
      9 @//
     10 @//  This is a modification of armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S
     11 @//  to support float instead of SC32.
     12 @//
     13 
     14 @//
     15 @// Description:
     16 @// Compute the last stage of a Radix 2 DIT in-order out-of-place FFT
     17 @// stage for a N point complex signal.
     18 @//
     19 @//
     20 
     21 
     22 @// Include standard headers
     23 
     24 #include "dl/api/arm/armCOMM_s.h"
     25 #include "dl/api/arm/omxtypes_s.h"
     26 
     27 
     28 @// Import symbols required from other files
     29 @// (For example tables)
     30 
     31 
     32 
     33 
     34 @// Set debugging level
     35 @//DEBUG_ON    SETL {TRUE}
     36 
     37 
     38 @// Guarding implementation by the processor name
     39 
     40 
     41 @//Input Registers
     42 
     43 #define pSrc            r0
     44 #define pDst            r2
     45 #define pTwiddle        r1
     46 #define subFFTNum       r6
     47 #define subFFTSize      r7
     48 
     49 
     50 @//Output Registers
     51 
     52 
     53 @//Local Scratch Registers
     54 
     55 
     56 #define outPointStep    r3
     57 #define grpCount        r4
     58 #define dstStep         r5
     59 #define pTmp            r4
     60 
     61 @// Neon Registers
     62 
     63 #define dWr     d0.f32
     64 #define dWi     d1.f32
     65 #define dXr0    d2.f32
     66 #define dXi0    d3.f32
     67 #define dXr1    d4.f32
     68 #define dXi1    d5.f32
     69 #define dYr0    d6.f32
     70 #define dYi0    d7.f32
     71 #define dYr1    d8.f32
     72 #define dYi1    d9.f32
     73 #define qT0     d10.f32
     74 #define qT1     d12.f32
     75 
     76         .MACRO FFTSTAGE scaled, inverse, name
     77 
     78 
     79         MOV     outPointStep,subFFTSize,LSL #3
     80         @// Update grpCount and grpSize rightaway
     81 
     82         MOV     subFFTNum,#1                            @//after the last stage
     83         LSL     grpCount,subFFTSize,#1
     84 
     85         @// update subFFTSize for the next stage
     86         MOV     subFFTSize,grpCount
     87 
     88         RSB      dstStep,outPointStep,#16
     89 
     90 
     91         @// Loop on 2 grps at a time for the last stage
     92 
     93 radix2lsGrpLoop\name :
     94         @ dWr = [pTwiddle[0].Re, pTwiddle[1].Re]
     95         @ dWi = [pTwiddle[0].Im, pTwiddle[1].Im]
     96         VLD2    {dWr,dWi},[pTwiddle :64]!
     97 
     98         @ dXr0 = [pSrc[0].Re, pSrc[2].Re]
     99         @ dXi0 = [pSrc[0].Im, pSrc[2].Im]
    100         @ dXr1 = [pSrc[1].Re, pSrc[3].Re]
    101         @ dXi1 = [pSrc[1].Im, pSrc[3].Im]
    102         VLD4    {dXr0,dXi0,dXr1,dXi1},[pSrc :128]!
    103         SUBS    grpCount,grpCount,#4                   @// grpCount is multiplied by 2
    104 
    105         .ifeqs  "\inverse", "TRUE"
    106             VMUL   qT0,dWr,dXr1
    107             VMLA   qT0,dWi,dXi1                       @// real part
    108             VMUL   qT1,dWr,dXi1
    109             VMLS   qT1,dWi,dXr1                       @// imag part
    110 
    111         .else
    112 
    113             VMUL   qT0,dWr,dXr1
    114             VMLS   qT0,dWi,dXi1                       @// real part
    115             VMUL   qT1,dWr,dXi1
    116             VMLA   qT1,dWi,dXr1                       @// imag part
    117 
    118         .endif
    119 
    120         VSUB    dYr0,dXr0,qT0
    121         VSUB    dYi0,dXi0,qT1
    122         VADD    dYr1,dXr0,qT0
    123         VADD    dYi1,dXi0,qT1
    124 
    125         VST2    {dYr0,dYi0},[pDst],outPointStep
    126         VST2    {dYr1,dYi1},[pDst],dstStep                  @// dstStep =  step = -outPointStep + 16
    127 
    128         BGT     radix2lsGrpLoop\name
    129 
    130 
    131         @// Reset and Swap pSrc and pDst for the next stage
    132         MOV     pTmp,pDst
    133         SUB     pDst,pSrc,outPointStep,LSL #1       @// pDst -= 4*size; pSrc -= 8*size bytes
    134         SUB     pSrc,pTmp,outPointStep
    135 
    136         @// Reset pTwiddle for the next stage
    137         SUB     pTwiddle,pTwiddle,outPointStep      @// pTwiddle -= 4*size bytes
    138 
    139         .endm
    140 
    141 
    142 
    143         M_START armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe,r4,""
    144         FFTSTAGE "FALSE","FALSE",fwd
    145         M_END
    146 
    147 
    148 
    149         M_START armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace_unsafe,r4
    150         FFTSTAGE "FALSE","TRUE",inv
    151         M_END
    152 
    153 	.end
    154