Home | History | Annotate | Download | only in neon
      1 @//
      2 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 @//
      4 @//  Use of this source code is governed by a BSD-style license
      5 @//  that can be found in the LICENSE file in the root of the source
      6 @//  tree. An additional intellectual property rights grant can be found
      7 @//  in the file PATENTS.  All contributing project authors may
      8 @//  be found in the AUTHORS file in the root of the source tree.
      9 @//
     10 @//  This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
     11 @//  to support float instead of SC32.
     12 @//
     13 
     14 @//
     15 @// Description:
     16 @// Compute a Radix 4 FFT stage for a N point complex signal
     17 @//
     18 @//
     19 
     20 
     21 @// Include standard headers
     22 
     23 #include "dl/api/arm/armCOMM_s.h"
     24 #include "dl/api/arm/omxtypes_s.h"
     25 
     26 @// Import symbols required from other files
     27 @// (For example tables)
     28 
     29 
     30 
     31 
     32 @// Set debugging level
     33 @//DEBUG_ON    SETL {TRUE}
     34 
     35 
     36 @// Guarding implementation by the processor name
     37 
     38 
     39 @// Import symbols required from other files
     40 @// (For example tables)
     41     @//IMPORT  armAAC_constTable
     42 
     43 @//Input Registers
     44 @// subFFTNum is only written (set to 1) by FFTSTAGE; subFFTSize is read and then multiplied by 4.
     45 #define pSrc            r0
     46 #define pDst            r2
     47 #define pTwiddle        r1
     48 #define subFFTNum       r6
     49 #define subFFTSize      r7
     50 
     51 
     52 
     53 @//Output Registers
     54 
     55 
     56 @//Local Scratch Registers
     57 @// grpCount and pTmp share r4: pTmp is only used after the group loop in FFTSTAGE has finished.
     58 #define outPointStep    r3
     59 #define grpCount        r4
     60 #define dstStep         r5
     61 #define grpTwStep       r8
     62 #define stepTwiddle     r9
     63 #define twStep          r10
     64 #define pTmp            r4
     65 #define step16          r11
     66 #define step24          r12
     67 
     68 
     69 @// Neon Registers
     70 @// dButterfly* and dX* below are two sets of names for the same registers D0-D7.
     71 #define dButterfly1Real02       D0.F32
     72 #define dButterfly1Imag02       D1.F32
     73 #define dButterfly1Real13       D2.F32
     74 #define dButterfly1Imag13       D3.F32
     75 #define dButterfly2Real02       D4.F32
     76 #define dButterfly2Imag02       D5.F32
     77 #define dButterfly2Real13       D6.F32
     78 #define dButterfly2Imag13       D7.F32
     79 #define dXr0                    D0.F32
     80 #define dXi0                    D1.F32
     81 #define dXr1                    D2.F32
     82 #define dXi1                    D3.F32
     83 #define dXr2                    D4.F32
     84 #define dXi2                    D5.F32
     85 #define dXr3                    D6.F32
     86 #define dXi3                    D7.F32
     87 @// First butterfly stage results: D16-D23, also addressed as qY0-qY3 (Q8-Q11).
     88 #define dYr0                    D16.F32
     89 #define dYi0                    D17.F32
     90 #define dYr1                    D18.F32
     91 #define dYi1                    D19.F32
     92 #define dYr2                    D20.F32
     93 #define dYi2                    D21.F32
     94 #define dYr3                    D22.F32
     95 #define dYi3                    D23.F32
     96 @// Twiddle factors. qT0-qT5 expand to D registers and are not referenced elsewhere in this file.
     97 #define dW1r                    D8.F32
     98 #define dW1i                    D9.F32
     99 #define dW2r                    D10.F32
    100 #define dW2i                    D11.F32
    101 #define dW3r                    D12.F32
    102 #define dW3i                    D13.F32
    103 #define qT0                     d14.f32
    104 #define qT1                     d16.F32
    105 #define qT2                     d18.F32
    106 #define qT3                     d20.f32
    107 #define qT4                     d22.f32
    108 #define qT5                     d24.f32
    109 @// Twiddle products and stored outputs: qZ0 (Q7) covers D14/D15, qZ1-qZ3 (Q13-Q15) cover D26-D31.
    110 #define dZr0                    D14.F32
    111 #define dZi0                    D15.F32
    112 #define dZr1                    D26.F32
    113 #define dZi1                    D27.F32
    114 #define dZr2                    D28.F32
    115 #define dZi2                    D29.F32
    116 #define dZr3                    D30.F32
    117 #define dZi3                    D31.F32
    118 @// Q-register names used by the VMOV/VADD/VSUB instructions in FFTSTAGE.
    119 #define qX0                     Q0.F32
    120 #define qY0                     Q8.F32
    121 #define qY1                     Q9.F32
    122 #define qY2                     Q10.F32
    123 #define qY3                     Q11.F32
    124 #define qZ0                     Q7.F32
    125 #define qZ1                     Q13.F32
    126 #define qZ2                     Q14.F32
    127 #define qZ3                     Q15.F32
    128 
    129 
    130 
    131         .MACRO FFTSTAGE scaled, inverse , name
    132         @// One radix-4 FFT stage on complex F32 data, read from pSrc and written to pDst.
    133         @// Define stack arguments
    134         @// Macro arguments: scaled is not referenced in the body; inverse equal to TRUE
    135         @// selects multiplication by the conjugated twiddle; name is appended to the labels.
    136         @// pOut0+1 increments pOut0 by 8 bytes
    137         @// pOut0+outPointStep == increment of 8*outPointStep bytes
    138         MOV     outPointStep,subFFTSize,LSL #3
    139 
    140         @// Set up grpCount right away (no grpSize register exists in this file)
    141 
    142         VLD2    {dW1r,dW1i},[pTwiddle :128]             @// [wi|wr]
    143         MOV     step16,#16
    144         LSL     grpCount,subFFTSize,#2                  @// grpCount = 4*subFFTSize
    145 
    146         VLD1    dW2r,[pTwiddle :64]                     @// [wi|wr]
    147         MOV     subFFTNum,#1                            @//after the last stage
    148 
    149         VLD1    dW3r,[pTwiddle :64],step16              @// [wi|wr]
    150         MOV     stepTwiddle,#0
    151 
    152         VLD1    dW2i,[pTwiddle :64]!                    @// [wi|wr]
    153         SUB     grpTwStep,stepTwiddle,#8                @// grpTwStep = -8 to start with
    154 
    155         @// update subFFTSize for the next stage
    156         MOV     subFFTSize,grpCount                     @// subFFTSize = 4 * its value on entry
    157         VLD1    dW3i,[pTwiddle :64],grpTwStep           @// [wi|wr]
    158         MOV     dstStep,outPointStep,LSL #1             @// dstStep = 2*outPointStep
    159 
    160         @// AC.r AC.i BD.r BD.i
    161         VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]!
    162         ADD     dstStep,dstStep,outPointStep            @// dstStep = 3*outPointStep
    163         RSB     dstStep,dstStep,#16                     @// dstStep = - 3*outPointStep+16
    164         MOV     step24,#24
    165 
    166         @// AC.r AC.i BD.r BD.i
    167         VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]!
    168         @// The two VLD4 loads above fetched 64 bytes (32 each, with writeback) before the loop;
    169         @// each loop iteration except the last one fetches the next 64 bytes the same way.
    170         @// Process two groups at a time
    171 
    172 radix4lsGrpLoop\name :
    173         @// After the VUZPs below, D0-D7 are read under their dXr*/dXi* names (same registers).
    174         VZIP    dW2r,dW2i
    175         ADD     stepTwiddle,stepTwiddle,#16
    176         VZIP    dW3r,dW3i
    177         ADD     grpTwStep,stepTwiddle,#4
    178         VUZP     dButterfly1Real13, dButterfly2Real13   @// B.r D.r
    179         SUB     twStep,stepTwiddle,#16                  @// -16+stepTwiddle
    180         VUZP     dButterfly1Imag13, dButterfly2Imag13   @// B.i D.i
    181         MOV     grpTwStep,grpTwStep,LSL #1
    182         VUZP     dButterfly1Real02, dButterfly2Real02   @// A.r C.r
    183         RSB     grpTwStep,grpTwStep,#0                  @// -8-2*stepTwiddle
    184 
    185 
    186         VUZP     dButterfly1Imag02, dButterfly2Imag02   @// A.i C.i
    187 
    188         @// The flags set by this SUBS feed ADDEQ/BEQ and the closing BGT; nothing in between sets flags.
    189         @// grpCount is multiplied by 4, and two groups are consumed per iteration, hence 8
    190         SUBS    grpCount,grpCount,#8
    191         @// Z1 = X1 * W1 (X1 * conj(W1) when inverse is TRUE)
    192         .ifeqs  "\inverse", "TRUE"
    193             VMUL   dZr1,dW1r,dXr1
    194             VMLA   dZr1,dW1i,dXi1                       @// real part
    195             VMUL   dZi1,dW1r,dXi1
    196             VMLS   dZi1,dW1i,dXr1                       @// imag part
    197 
    198         .else
    199 
    200             VMUL   dZr1,dW1r,dXr1
    201             VMLS   dZr1,dW1i,dXi1                       @// real part
    202             VMUL   dZi1,dW1r,dXi1
    203             VMLA   dZi1,dW1i,dXr1                       @// imag part
    204 
    205         .endif
    206         @// Reload W1 for the next iteration; pTwiddle is post-incremented by stepTwiddle.
    207         VLD2    {dW1r,dW1i},[pTwiddle :128],stepTwiddle      @// [wi|wr]
    208         @// Z2 = X2 * W2 (conjugated W2 when inverse); dW2r is reloaded once its last use is issued.
    209         .ifeqs  "\inverse", "TRUE"
    210             VMUL   dZr2,dW2r,dXr2
    211             VMLA   dZr2,dW2i,dXi2                       @// real part
    212             VMUL   dZi2,dW2r,dXi2
    213             VLD1   dW2r,[pTwiddle :64],step16           @// [wi|wr]
    214             VMLS   dZi2,dW2i,dXr2                       @// imag part
    215 
    216         .else
    217 
    218             VMUL   dZr2,dW2r,dXr2
    219             VMLS   dZr2,dW2i,dXi2                       @// real part
    220             VMUL   dZi2,dW2r,dXi2
    221             VLD1    dW2r,[pTwiddle :64],step16          @// [wi|wr]
    222             VMLA   dZi2,dW2i,dXr2                       @// imag part
    223 
    224         .endif
    225 
    226 
    227         VLD1    dW2i,[pTwiddle :64],twStep              @// [wi|wr]
    228 
    229         @// move qX0 so as to load for the next iteration
    230         VMOV     qZ0,qX0
    231         @// Z3 = X3 * W3 (conjugated W3 when inverse); dW3r is reloaded once its last use is issued.
    232         .ifeqs  "\inverse", "TRUE"
    233             VMUL   dZr3,dW3r,dXr3
    234             VMLA   dZr3,dW3i,dXi3                       @// real part
    235             VMUL   dZi3,dW3r,dXi3
    236             VLD1    dW3r,[pTwiddle :64],step24
    237             VMLS   dZi3,dW3i,dXr3                       @// imag part
    238 
    239         .else
    240 
    241             VMUL   dZr3,dW3r,dXr3
    242             VMLS   dZr3,dW3i,dXi3                       @// real part
    243             VMUL   dZi3,dW3r,dXi3
    244             VLD1    dW3r,[pTwiddle :64],step24
    245             VMLA   dZi3,dW3i,dXr3                       @// imag part
    246 
    247         .endif
    248 
    249         VLD1    dW3i,[pTwiddle :64],grpTwStep           @// [wi|wr]
    250 
    251         @// Don't do the load on the last iteration so we don't read past the end
    252         @// of pSrc.
    253         addeq   pSrc, pSrc, #64                         @// same 64-byte advance as the skipped loads; undone after the loop
    254         beq     radix4lsSkipRead\name
    255         @// AC.r AC.i BD.r BD.i
    256         VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]!
    257 
    258         @// AC.r AC.i BD.r BD.i
    259         VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]!
    260 radix4lsSkipRead\name:
    261 
    262         @// finish first stage of 4 point FFT
    263 
    264         VADD    qY0,qZ0,qZ2
    265         VSUB    qY2,qZ0,qZ2
    266         VADD    qY1,qZ1,qZ3
    267         VSUB    qY3,qZ1,qZ3
    268         @// NOTE(review): the second stage pairs qY2 with qY1 and qY0 with qY3; presumably this matches
    269         @// the sign convention of the twiddle table, which is defined outside this file - confirm.
    270         @// finish second stage of 4 point FFT
    271 
    272         .ifeqs  "\inverse", "TRUE"
    273 
    274             VSUB    qZ0,qY2,qY1
    275 
    276             VADD    dZr3,dYr0,dYi3
    277             VST2    {dZr0,dZi0},[pDst :128],outPointStep
    278             VSUB    dZi3,dYi0,dYr3
    279 
    280             VADD    qZ2,qY2,qY1
    281             VST2    {dZr3,dZi3},[pDst :128],outPointStep
    282 
    283             VSUB    dZr1,dYr0,dYi3
    284             VST2    {dZr2,dZi2},[pDst :128],outPointStep
    285             VADD    dZi1,dYi0,dYr3
    286 
    287             @// dstStep = -3*outPointStep + 16: back to the first output row, 16 bytes further on
    288             VST2    {dZr1,dZi1},[pDst :128],dstStep
    289 
    290 
    291         .else
    292 
    293             VSUB    qZ0,qY2,qY1
    294 
    295             VSUB    dZr1,dYr0,dYi3
    296             VST2    {dZr0,dZi0},[pDst :128],outPointStep
    297             VADD    dZi1,dYi0,dYr3
    298 
    299             VADD    qZ2,qY2,qY1
    300             VST2    {dZr1,dZi1},[pDst :128],outPointStep
    301 
    302             VADD    dZr3,dYr0,dYi3
    303             VST2    {dZr2,dZi2},[pDst :128],outPointStep
    304             VSUB    dZi3,dYi0,dYr3
    305 
    306             @// dstStep = -3*outPointStep + 16: back to the first output row, 16 bytes further on
    307             VST2    {dZr3,dZi3},[pDst :128],dstStep
    308 
    309 
    310         .endif
    311 
    312         BGT     radix4lsGrpLoop\name
    313         @// The loop repeats while grpCount is still positive after the SUBS at the top of the body.
    314 
    315         @// Reset and Swap pSrc and pDst for the next stage
    316         MOV     pTmp,pDst
    317         @// Extra increment done in final iteration of the loop
    318         SUB     pSrc,pSrc,#64
    319         @// pDst = pSrc - 4*outPointStep; pSrc = previous pDst - outPointStep
    320         SUB     pDst,pSrc,outPointStep,LSL #2
    321         SUB     pSrc,pTmp,outPointStep
    322         SUB     pTwiddle,pTwiddle,subFFTSize,LSL #1     @// subFFTSize here is the value updated during setup
    323         @// Extra increment done in final iteration of the loop
    324         SUB     pTwiddle,pTwiddle,#16
    325 
    326         .endm
    327         @// Forward entry point: expands FFTSTAGE with inverse = FALSE and label suffix fwd.
    328         @// M_START / M_END are not defined in this file (presumably they come from armCOMM_s.h).
    329         M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe,r4
    330         FFTSTAGE "FALSE","FALSE",fwd
    331         M_END
    332         @// Inverse entry point: expands FFTSTAGE with inverse = TRUE and label suffix inv,
    333         @// which selects the conjugated twiddle multiplies and the inverse output ordering.
    334         M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace_unsafe,r4
    335         FFTSTAGE "FALSE","TRUE",inv
    336         M_END
    337 
    338 
    339         .end
    340