Home | History | Annotate | Download | only in neon
      1 @//
      2 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 @//
      4 @//  Use of this source code is governed by a BSD-style license
      5 @//  that can be found in the LICENSE file in the root of the source
      6 @//  tree. An additional intellectual property rights grant can be found
      7 @//  in the file PATENTS.  All contributing project authors may
      8 @//  be found in the AUTHORS file in the root of the source tree.
      9 @//
     10 @//  This file was originally licensed as follows. It has been
     11 @//  relicensed with permission from the copyright holders.
     12 
     13 @//
     14 @//
     15 @// File Name:  armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.s
     16 @// OpenMAX DL: v1.0.2
     17 @// Last Modified Revision:   6740
     18 @// Last Modified Date:       Wed, 18 Jul 2007
     19 @//
     20 @// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
     21 @//
     22 @//
     23 @//
     24 @// Description:
     25 @// Compute a radix-2 FFT stage for an N-point complex signal
     26 @//
     27 @//
     28 
     29 
     30 @// Include standard headers
     31 
     32 #include "dl/api/arm/armCOMM_s.h"
     33 #include "dl/api/arm/omxtypes_s.h"
     34 
     35 
     36 @// Import symbols required from other files
     37 @// (For example tables)
     38 
     39 
     40 
     41 
     42 @// Set debugging level
     43 @//DEBUG_ON    SETL {TRUE}
     44 
     45 
     46 
     47 
     48 @// Guarding implementation by the processor name
     49 
     50 
     51 @// Input registers (read on entry by FFTSTAGE and rewritten before it ends)
     52 @// Note the order: pTwiddle is r1 and pDst is r2.
     53 #define pSrc                            r0
     54 #define pDst                            r2
     55 #define pTwiddle                        r1
     56 #define subFFTNum                       r6
     57 #define subFFTSize                      r7
     58 
     59 
     60 @// Output registers
     61 @// (no dedicated output register is defined; results are stored through pDst)
     62 
     63 @// Local scratch registers
     64 @// grpCount and pTmp both alias r4: pTmp is only used after the group loop has finished with grpCount.
     65 #define outPointStep                    r3
     66 #define grpCount                        r4
     67 #define dstStep                         r5
     68 #define twStep                          r8
     69 #define pTmp                            r4
     70 
     71 @// NEON registers
     72 @// dW1S32/dW1 and dW2S32/dW2 name the same registers (D0, D1) with 32-bit and 16-bit lane types.
     73 #define dW1S32                          D0.S32
     74 #define dW2S32                          D1.S32
     75 #define dW1                             D0.S16
     76 #define dW2                             D1.S16
     77 @// dX0-dX3: loaded input vectors, dY0-dY3: butterfly results, qT0/qT1: 32-bit product accumulators (Q5 = D10:D11, Q6 = D12:D13).
     78 #define dX0                             D2.S16
     79 #define dX1                             D3.S16
     80 #define dX2                             D4.S16
     81 #define dX3                             D5.S16
     82 #define dY0                             D6.S16
     83 #define dY1                             D7.S16
     84 #define dY2                             D8.S16
     85 #define dY3                             D9.S16
     86 #define qT0                             Q5.S32
     87 #define qT1                             Q6.S32
     88 
     89 
     90         .MACRO FFTSTAGE scaled, inverse, name
     91         @// FFTSTAGE: one radix-2 butterfly pass over vectors of 16-bit lanes, read
                @// sequentially from pSrc and written out of place through pDst.
                @//   scaled  : "TRUE" selects the halving butterfly (VHSUB/VHADD); any other
                @//             value selects plain VSUB/VADD.
                @//   inverse : "TRUE" flips the sign of both dW2 product terms (VMLAL <-> VMLSL),
                @//             i.e. multiplies by the conjugate twiddle if dW2 holds the imaginary
                @//             parts -- TODO confirm against the twiddle table layout.
                @//   name    : suffix appended to the grpLoop label so each expansion is unique.
                @// On exit subFFTSize is doubled, subFFTNum is halved, pSrc and pDst are swapped
                @// and rewound, and pTwiddle is rewound.
                @// Writes r0-r8, D0-D13 and the condition flags.
                @// NOTE(review): D8-D13 are not saved here -- presumably the caller of these
                @// _unsafe routines preserves them; confirm.
                @// NOTE(review): the pSrc/pDst rewinds at the end match the bytes consumed by the
                @// loop only when subFFTNum is 4 on entry -- presumably guaranteed by the caller.
     92         @// Define stack arguments
     93 
     94 
     95         @// Update grpCount and grpSize right away in order to reuse pGrpCount and pGrpSize regs
     96         @// grpCount = 2 * subFFTSize; it is the loop counter, decremented by 4 per pass
     97 
     98         LSL     grpCount,subFFTSize,#1
     99 
    100 
    101         @// subFFTSize is doubled for the next stage
    102         MOV     subFFTSize,grpCount
    103         @// outPointStep = product of the low halfwords of grpCount and subFFTNum; used below as the byte gap between the two stores of a butterfly
    104         @// pOut0+1 increments pOut0 by 8 bytes
    105         @// pOut0+outPointStep == increment of 4*outPointStep bytes = 2*size bytes
    106         SMULBB  outPointStep,grpCount,subFFTNum
    107         MOV     twStep,subFFTNum,LSL #1                     @// twStep = 2*subFFTNum bytes between twiddle loads
    108         LSR     subFFTNum,subFFTNum,#1                      @// subFFTNum halved (grpSize)
    109 
    110 
    111         RSB      dstStep,outPointStep,#8                    @// dstStep = 8 - outPointStep: step back, then 8 bytes on
    112 
    113 
    114         @// Note: pointStep is 8 in this case, so no extra register is needed (loads use [pSrc]! writeback)
    115         @// Loop on the groups: 2 groups at a time
    116 
    117 grpLoop\name:
    118         @// Load one 32-bit twiddle per group, replicated to both lanes; pTwiddle advances by twStep after each load
    119         VLD1     dW1S32[],[pTwiddle],twStep                @//[wi | wr]
    120         VLD1     dW2S32[],[pTwiddle],twStep                @// twiddle for grp1
    121 
    122         @// Process the sets for each grp:  2 sets at a time (no set looping required)
    123         @// Four consecutive 8-byte loads; pSrc advances by 32 bytes per pass
    124         VLD1    dX0,[pSrc]!            @// point0: of set0,set1 of grp0
    125         VLD1    dX1,[pSrc]!            @// point1: of set0,set1 of grp0
    126         VLD1    dX2,[pSrc]!            @// point0: of set0,set1 of grp1
    127         VLD1    dX3,[pSrc]!            @// point1: of set0,set1 of grp1
    128 
    129         SUBS    grpCount,grpCount,#4              @// loop counter -= 4; the flags are consumed by BGT at the bottom
    130         VUZP    dW1,dW2                           @// dW1 = even 16-bit lanes, dW2 = odd lanes (presumably re / im)
    131         VUZP    dX1,dX3                           @// same split for the two point1 vectors
    132         @// Widening multiply of point1 by the twiddle into 32-bit accumulators
    133         .ifeqs  "\inverse", "TRUE"
    134             VMULL   qT0,dX1,dW1
    135             VMLAL   qT0,dX3,dW2                       @// real part
    136             VMULL   qT1,dX3,dW1
    137             VMLSL   qT1,dX1,dW2                       @// imag part
    138 
    139         .ELSE
    140             VMULL   qT0,dX1,dW1
    141             VMLSL   qT0,dX3,dW2                       @// real part
    142             VMULL   qT1,dX3,dW1
    143             VMLAL   qT1,dX1,dW2                       @// imag part
    144 
    145         .ENDIF
    146         @// Rounding narrow by 15 bits back to 16-bit lanes
    147         VRSHRN  dX1,qT0,#15
    148         VRSHRN  dX3,qT1,#15
    149 
    150         VZIP    dX1,dX3                           @// re-interleave: dX1 = grp0 product, dX3 = grp1 product
    151 
    152         @// Butterfly: the difference goes to dY0/dY2 and the sum to dY1/dY3; the scaled variant halves both
    153         .ifeqs "\scaled", "TRUE"
    154 
    155             VHSUB    dY0,dX0,dX1
    156             VHADD    dY1,dX0,dX1
    157             VHSUB    dY2,dX2,dX3
    158             VHADD    dY3,dX2,dX3
    159 
    160         .ELSE
    161 
    162             VSUB    dY0,dX0,dX1
    163             VADD    dY1,dX0,dX1
    164             VSUB    dY2,dX2,dX3
    165             VADD    dY3,dX2,dX3
    166 
    167 
    168 
    169         .ENDIF
    170         @// Per group: store at pDst, move outPointStep bytes on, store again, then land 8 bytes past the first store
    171         VST1    dY0,[pDst],outPointStep             @// point0: of set0,set1 of grp0
    172         VST1    dY1,[pDst],dstStep                  @// dstStep = -outPointStep + 8
    173         VST1    dY2,[pDst],outPointStep             @// point0: of set0,set1 of grp1
    174         VST1    dY3,[pDst],dstStep                  @// point1: of set0,set1 of grp1
    175 
    176 
    177         BGT     grpLoop\name                        @// repeat while grpCount > 0 (signed)
    178 
    179 
    180         @// Reset and Swap pSrc and pDst for the next stage
    181         MOV     pTmp,pDst                           @// pTmp shares r4 with grpCount, which is finished by now
    182         SUB     pDst,pSrc,outPointStep,LSL #1       @// new pDst = final pSrc - 2*outPointStep
    183         SUB     pSrc,pTmp,outPointStep              @// new pSrc = final pDst - outPointStep
    184 
    185         @// Reset pTwiddle for the next stage
    186         SUB     pTwiddle,pTwiddle,outPointStep      @// rewind the twiddle pointer by outPointStep bytes
    187 
    188         .endm
    189 
    190         @// Forward, unscaled variant: expands FFTSTAGE with scaled="FALSE" and inverse="FALSE" (VSUB/VADD butterfly); loop label grpLoopFWD.
    191         @// M_START/M_END are not defined in this file -- presumably from armCOMM_s.h; TODO confirm what the r4 argument saves.
    192         M_START armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe,r4
    193         FFTSTAGE "FALSE","FALSE",FWD
    194         M_END
    195 
    196         @// Inverse, unscaled variant: expands FFTSTAGE with scaled="FALSE" and inverse="TRUE" (VSUB/VADD butterfly, dW2 product signs flipped); loop label grpLoopINV.
    197         @// M_START/M_END are not defined in this file -- presumably from armCOMM_s.h; TODO confirm what the r4 argument saves.
    198         M_START armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe,r4
    199         FFTSTAGE "FALSE","TRUE",INV
    200         M_END
    201 
    202         @// Forward, scaled variant: expands FFTSTAGE with scaled="TRUE" and inverse="FALSE" (halving VHSUB/VHADD butterfly); loop label grpLoopFWDSFS.
    203         @// M_START/M_END are not defined in this file -- presumably from armCOMM_s.h; TODO confirm what the r4 argument saves.
    204         M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe,r4
    205         FFTSTAGE "TRUE","FALSE",FWDSFS
    206         M_END
    207 
    208         @// Inverse, scaled variant: expands FFTSTAGE with scaled="TRUE" and inverse="TRUE" (halving VHSUB/VHADD butterfly, dW2 product signs flipped); loop label grpLoopINVSFS.
    209         @// M_START/M_END are not defined in this file -- presumably from armCOMM_s.h; TODO confirm what the r4 argument saves.
    210         M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe,r4
    211         FFTSTAGE "TRUE","TRUE",INVSFS
    212         M_END
    213 
    214 
    215 
    216     .END
    217