Home | History | Annotate | Download | only in neon
      1 @//
      2 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 @//
      4 @//  Use of this source code is governed by a BSD-style license
      5 @//  that can be found in the LICENSE file in the root of the source
      6 @//  tree. An additional intellectual property rights grant can be found
      7 @//  in the file PATENTS.  All contributing project authors may
      8 @//  be found in the AUTHORS file in the root of the source tree.
      9 @//
     10 @//
     11 @//  This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
     12 @//  to support float instead of SC32.
     13 @//
     14 
     15 @//
     16 @// Description:
     17 @// Compute a Radix 4 FFT stage for a N point complex signal
     18 @//
     19 @//
     20 
     21 
     22 @// Include standard headers
     23 
     24 #include "dl/api/arm/armCOMM_s.h"
     25 #include "dl/api/arm/omxtypes_s.h"
     26 
     27 
     28 @// Import symbols required from other files
     29 @// (For example tables)
     30 
     31 
     32 
     33 
     34 @// Set debugging level
     35 @//DEBUG_ON    SETL {TRUE}
     36 
     37 
     38 
     39 @// Guarding implementation by the processor name
     40 
     41 
     42 
     43 
     44 @// Guarding implementation by the processor name
     45 
     46 
     47 @// Import symbols required from other files
     48 @// (For example tables)
     49 
     50 
     51 @// Input registers, read by the FFTSTAGE macro on entry.
     52 @// Each stage divides subFFTNum by 4 and multiplies subFFTSize by 4.
     53 #define pSrc            r0
     54 #define pDst            r2
     55 #define pTwiddle        r1
     56 #define subFFTNum       r6
     57 #define subFFTSize      r7
     58 
     59 
     60 
     61 @//Output Registers
     62 
     63 
     64 @// Local scratch registers. t1 shares r3 with grpCount: t1 is only written
     65 @// after the group loop has finished. setCount occupies r14 (lr).
     66 #define grpCount        r3
     67 #define pointStep       r4
     68 #define outPointStep    r5
     69 #define stepTwiddle     r12
     70 #define setCount        r14
     71 #define srcStep         r8
     72 #define setStep         r9
     73 #define dstStep         r10
     74 #define twStep          r11
     75 #define t1              r3
     76 
     77 @// Neon Registers
     78 @// Twiddle registers: each is filled by one VLD1 and commented as [wi | wr].
     79 #define dW1     D0.F32
     80 #define dW2     D1.F32
     81 #define dW3     D2.F32
     82 @// X/Y/Z data: r = real lanes, i = imag lanes. qT0..qT3 are unused by FFTSTAGE.
     83 #define dXr0    D4.F32
     84 #define dXi0    D5.F32
     85 #define dXr1    D6.F32
     86 #define dXi1    D7.F32
     87 #define dXr2    D8.F32
     88 #define dXi2    D9.F32
     89 #define dXr3    D10.F32
     90 #define dXi3    D11.F32
     91 #define dYr0    D12.F32
     92 #define dYi0    D13.F32
     93 #define dYr1    D14.F32
     94 #define dYi1    D15.F32
     95 #define dYr2    D16.F32
     96 #define dYi2    D17.F32
     97 #define dYr3    D18.F32
     98 #define dYi3    D19.F32
     99 #define qT0     d16.f32
    100 #define qT1     d18.f32
    101 #define qT2     d12.f32
    102 #define qT3     d14.f32
    103 #define dZr0    D20.F32
    104 #define dZi0    D21.F32
    105 #define dZr1    D22.F32
    106 #define dZi1    D23.F32
    107 #define dZr2    D24.F32
    108 #define dZi2    D25.F32
    109 #define dZr3    D26.F32
    110 #define dZi3    D27.F32
    111 @// Each Q alias below overlays the matching r/i D pair (Qn = D2n:D2n+1).
    112 #define qY0     Q6.F32
    113 #define qY1     Q7.F32
    114 #define qY2     Q8.F32
    115 #define qY3     Q9.F32
    116 #define qX0     Q2.F32
    117 #define qZ0     Q10.F32
    118 #define qZ1     Q11.F32
    119 #define qZ2     Q12.F32
    120 #define qZ3     Q13.F32
    121 
    122         .macro FFTSTAGE scaled, inverse , name
    123         @// One radix-4 FFT stage: loads from pSrc, stores to pDst.
    124         @//   scaled  - not referenced anywhere in this macro body
    125         @//   inverse - TRUE: multiply by conj(W) and store Z3 before Z1
    126         @//   name    - label suffix, keeps labels unique per expansion
    127         @// Update grpCount and grpSize right away in order to reuse
    128         @// pGrpCount and pGrpSize regs
    129 
    130         LSL     grpCount,subFFTSize,#2             @// grpCount = 4*subFFTSize
    131         LSR     subFFTNum,subFFTNum,#2             @// subFFTNum /= 4
    132         MOV     subFFTSize,grpCount                @// subFFTSize *= 4
    133 
    134         VLD1     dW1,[pTwiddle]                    @//[wi | wr]
    135         @// pT0+1 increments pT0 by 8 bytes
    136         @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
    137         MOV     pointStep,subFFTNum,LSL #1         @// pointStep = 2*subFFTNum
    138 
    139 
    140         @// pOut0+1 increments pOut0 by 8 bytes
    141         @// pOut0+outPointStep == increment of 8*outPointStep bytes
    142         @//   = 2*size bytes
    143 
    144         MOV     stepTwiddle,#0
    145         VLD1     dW2,[pTwiddle]                    @//[wi | wr]
    146         SMULBB  outPointStep,grpCount,pointStep    @// signed product of the low halfwords
    147         LSL     pointStep,pointStep,#2             @// 2*grpSize
    148 
    149         VLD1     dW3,[pTwiddle]                    @//[wi | wr]
    150         MOV     srcStep,pointStep,LSL #1           @// srcStep = 2*pointStep
    151         ADD     setStep,srcStep,pointStep          @// setStep = 3*pointStep
    152 
    153         RSB     setStep,setStep,#0                 @// setStep = - 3*pointStep
    154         SUB     srcStep,srcStep,#16                @// srcStep = 2*pointStep-16
    155 
    156         MOV     dstStep,outPointStep,LSL #1
    157         ADD     dstStep,dstStep,outPointStep       @// dstStep = 3*outPointStep
    158         @// dstStep = - 3*outPointStep+16
    159         RSB     dstStep,dstStep,#16
    160 
    161         @// Outer loop: one pass per group. The three twiddle registers are
    162         @// reloaded only at the bottom of this loop, not inside the set loop.
    163 radix4GrpLoop\name :
    164 
    165         VLD2    {dXr0,dXi0},[pSrc],pointStep       @//  data[0]
    166         ADD      stepTwiddle,stepTwiddle,pointStep
    167         VLD2    {dXr1,dXi1},[pSrc],pointStep       @//  data[1]
    168         @// set pTwiddle to the first point
    169         ADD      pTwiddle,pTwiddle,stepTwiddle
    170         VLD2    {dXr2,dXi2},[pSrc],pointStep       @//  data[2]
    171         MOV      twStep,stepTwiddle,LSL #2         @// twStep = 4*stepTwiddle for now
    172 
    173         @//  data[3] & update pSrc for the next set
    174         VLD2    {dXr3,dXi3},[pSrc],setStep
    175         SUB      twStep,stepTwiddle,twStep         @// twStep = -3*stepTwiddle
    176 
    177         MOV      setCount,pointStep,LSR #3         @// setCount = pointStep/8
    178         @// set pSrc to data[0] of the next set
    179         ADD     pSrc,pSrc,#16
    180         @// increment to data[1] of the next set
    181         ADD     pSrc,pSrc,pointStep
    182 
    183 
    184         @// Loop on the sets
    185 
    186 radix4SetLoop\name :
    187         @// Twiddle multiply, with W = lane 0 + j * lane 1 of each twiddle register:
    188         @//   TRUE branch computes Zn = Xn * conj(W); the else branch computes Zn = Xn * W.
    189         @// The loads of data[1] and data[2] for the next pass are interleaved with the math.
    190         .ifeqs  "\inverse", "TRUE"
    191             VMUL   dZr1,dXr1,dW1[0]
    192             VMUL   dZi1,dXi1,dW1[0]
    193             VMUL   dZr2,dXr2,dW2[0]
    194             VMUL   dZi2,dXi2,dW2[0]
    195             VMUL   dZr3,dXr3,dW3[0]
    196             VMUL   dZi3,dXi3,dW3[0]
    197 
    198             VMLA   dZr1,dXi1,dW1[1]                @// real part
    199             VMLS   dZi1,dXr1,dW1[1]                @// imag part
    200 
    201             @//  data[1] for next iteration
    202             VLD2    {dXr1,dXi1},[pSrc],pointStep
    203 
    204             VMLA   dZr2,dXi2,dW2[1]                @// real part
    205             VMLS   dZi2,dXr2,dW2[1]                @// imag part
    206 
    207             @//  data[2] for next iteration
    208             VLD2    {dXr2,dXi2},[pSrc],pointStep
    209 
    210             VMLA   dZr3,dXi3,dW3[1]                @// real part
    211             VMLS   dZi3,dXr3,dW3[1]                @// imag part
    212         .else
    213             VMUL   dZr1,dXr1,dW1[0]
    214             VMUL   dZi1,dXi1,dW1[0]
    215             VMUL   dZr2,dXr2,dW2[0]
    216             VMUL   dZi2,dXi2,dW2[0]
    217             VMUL   dZr3,dXr3,dW3[0]
    218             VMUL   dZi3,dXi3,dW3[0]
    219 
    220             VMLS   dZr1,dXi1,dW1[1]                @// real part
    221             VMLA   dZi1,dXr1,dW1[1]                @// imag part
    222 
    223             @//  data[1] for next iteration
    224             VLD2    {dXr1,dXi1},[pSrc],pointStep
    225 
    226             VMLS   dZr2,dXi2,dW2[1]                @// real part
    227             VMLA   dZi2,dXr2,dW2[1]                @// imag part
    228 
    229             @//  data[2] for next iteration
    230             VLD2    {dXr2,dXi2},[pSrc],pointStep
    231 
    232             VMLS   dZr3,dXi3,dW3[1]                @// real part
    233             VMLA   dZi3,dXr3,dW3[1]                @// imag part
    234         .endif
    235 
    236         @//  data[3] & update pSrc to data[0]
    237         @// But don't read on the very last iteration because that reads past
    238 	@// the end of pSrc. The last iteration is grpCount = 4, setCount = 2.
    239         cmp     grpCount, #4
    240         cmpeq   setCount, #2                      @// Test setCount if grpCount = 4
    241         @// These are executed only if both grpCount = 4 and setCount = 2
    242         addeq   pSrc, pSrc, setStep               @// same pointer update the skipped load makes
    243         beq     radix4SkipRead\name
    244         VLD2    {dXr3,dXi3},[pSrc],setStep
    245 radix4SkipRead\name:
    246         SUBS    setCount,setCount,#2               @// flags feed the BGT at the loop end
    247 
    248         @// finish first stage of 4 point FFT
    249         VADD    qY0,qX0,qZ2                        @// Y0 = X0 + Z2
    250         VSUB    qY2,qX0,qZ2                        @// Y2 = X0 - Z2
    251 
    252         @//  data[0] for next iteration
    253         VLD2    {dXr0,dXi0},[pSrc :128]!
    254         VADD    qY1,qZ1,qZ3                        @// Y1 = Z1 + Z3
    255         VSUB    qY3,qZ1,qZ3                        @// Y3 = Z1 - Z3
    256 
    257         @// finish second stage of 4 point FFT
    258 
    259         VSUB    qZ0,qY2,qY1                        @// Z0 = X0 - Z1 - Z2 - Z3
    260         @// NOTE(review): a plain radix-4 sum would be X0 + Z1 + Z2 + Z3. The form used
    261         @// here presumably relies on the twiddle table holding negated factors - confirm.
    262         .ifeqs  "\inverse", "TRUE"
    263             @// Store order in this branch: Z0, Z3, Z2, Z1
    264             VADD    dZr3,dYr0,dYi3
    265             VST2    {dZr0,dZi0},[pDst :128],outPointStep
    266             VSUB    dZi3,dYi0,dYr3                 @// Z3 = Y0 - j*Y3
    267 
    268             VADD    qZ2,qY2,qY1                    @// Z2 = Y2 + Y1
    269             VST2    {dZr3,dZi3},[pDst :128],outPointStep
    270 
    271             VSUB    dZr1,dYr0,dYi3
    272             VST2    {dZr2,dZi2},[pDst :128],outPointStep
    273             VADD    dZi1,dYi0,dYr3                 @// Z1 = Y0 + j*Y3
    274 
    275             VST2    {dZr1,dZi1},[pDst :128],dstStep
    276 
    277 
    278         .else
    279             @// Store order in this branch: Z0, Z1, Z2, Z3
    280             VSUB    dZr1,dYr0,dYi3
    281             VST2    {dZr0,dZi0},[pDst :128],outPointStep
    282             VADD    dZi1,dYi0,dYr3                 @// Z1 = Y0 + j*Y3
    283 
    284             VADD    qZ2,qY2,qY1                    @// Z2 = Y2 + Y1
    285             VST2    {dZr1,dZi1},[pDst :128],outPointStep
    286 
    287             VADD    dZr3,dYr0,dYi3
    288             VST2    {dZr2,dZi2},[pDst :128],outPointStep
    289             VSUB    dZi3,dYi0,dYr3                 @// Z3 = Y0 - j*Y3
    290 
    291             VST2    {dZr3,dZi3},[pDst :128],dstStep
    292 
    293 
    294         .endif
    295 
    296         @// increment to data[1] of the next set
    297         ADD     pSrc,pSrc,pointStep
    298         BGT     radix4SetLoop\name                 @// tests the flags set by SUBS setCount
    299         @// Twiddles for the next group: W1, W2, W3 are read stepTwiddle bytes apart;
    300         @// twStep then leaves pTwiddle one stepTwiddle below where W1 was read.
    301         VLD1     dW1,[pTwiddle :64],stepTwiddle    @//[wi | wr]
    302         @// subtract 4 since grpCount multiplied by 4
    303         SUBS    grpCount,grpCount,#4
    304         VLD1     dW2,[pTwiddle :64],stepTwiddle    @//[wi | wr]
    305         @// increment pSrc for the next grp
    306         ADD     pSrc,pSrc,srcStep
    307         VLD1     dW3,[pTwiddle :64],twStep         @//[wi | wr]
    308         BGT     radix4GrpLoop\name
    309 
    310 
    311         @// Reset and Swap pSrc and pDst for the next stage
    312         MOV     t1,pDst
    313         @// new pDst = old pSrc - 4*outPointStep; new pSrc = old pDst - outPointStep
    314         SUB     pDst,pSrc,outPointStep,LSL #2
    315         SUB     pSrc,t1,outPointStep
    316 
    317 
    318         .endm
    319         @// Forward stage entry point: FFTSTAGE expanded with inverse = FALSE.
    320         @// M_START/M_END are not defined in this file (presumably armCOMM_s.h).
    321         M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe,r4
    322             FFTSTAGE "FALSE","FALSE",FWD
    323         M_END
    324         @// Inverse stage entry point: FFTSTAGE expanded with inverse = TRUE, which
    325         @// selects the conj(W) twiddle multiply and the Z0, Z3, Z2, Z1 store order.
    326         M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe,r4
    327             FFTSTAGE "FALSE","TRUE",INV
    328         M_END
    329 
    330 
    331         .end
    332