Home | History | Annotate | Download | only in neon
      1 @//
      2 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 @//
      4 @//  Use of this source code is governed by a BSD-style license
      5 @//  that can be found in the LICENSE file in the root of the source
      6 @//  tree. An additional intellectual property rights grant can be found
      7 @//  in the file PATENTS.  All contributing project authors may
      8 @//  be found in the AUTHORS file in the root of the source tree.
      9 @//
     10 @//  This file was originally licensed as follows. It has been
     11 @//  relicensed with permission from the copyright holders.
     12 @//
     13 
     14 @//
     15 @// File Name:  armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
     16 @// OpenMAX DL: v1.0.2
     17 @// Last Modified Revision:   7767
     18 @// Last Modified Date:       Thu, 27 Sep 2007
     19 @//
     20 @// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
     21 @//
     22 @//
     23 @//
     24 @// Description:
     25 @// Compute a Radix 4 FFT stage for a N point complex signal
     26 @//
     27 
     28 
     29 
     30 
     31 @// Include standard headers
     32 
     33 #include "dl/api/arm/armCOMM_s.h"
     34 #include "dl/api/arm/omxtypes_s.h"
     35 
     36 
     37 @// Import symbols required from other files
     38 @// (For example tables)
     39 
     40 
     41 
     42 
     43 @// Set debugging level
     44 @//DEBUG_ON    SETL {TRUE}
     45 
     46 
     47 
     48 @// Guarding implementation by the processor name
     49 
     50 
     51 
     52 
     53 @// Guarding implementation by the processor name
     54 
     55 
     56 @// Import symbols required from other files
     57 @// (For example tables)
     58 
     59 
     60 @// Input registers: read and updated by the FFTSTAGE macro
        @//   pSrc       base register of the VLD2 data loads
        @//   pDst       base register of the VST2 result stores
        @//   pTwiddle   base register of the VLD1 twiddle loads
        @//   subFFTNum  shifted right by 2 at the top of the stage
        @//   subFFTSize shifted left by 2 at the top of the stage
     61 
     62 #define pSrc		r0
     63 #define pDst		r2
     64 #define pTwiddle	r1
     65 #define subFFTNum	r6
     66 #define subFFTSize	r7
     67 
     68 
     69 
     70 @// Output Registers (no separate aliases are declared here)
     71 
     72 
     73 @// Local scratch registers used inside FFTSTAGE
        @// Note: t1 shares r3 with grpCount (t1 is only used after the group
        @// loop has finished) and setCount lives in r14.
     74 
     75 #define grpCount	r3
     76 #define pointStep	r4
     77 #define outPointStep	r5
     78 #define stepTwiddle	r12
     79 #define setCount	r14
     80 #define srcStep		r8
     81 #define setStep		r9
     82 #define dstStep		r10
     83 #define twStep		r11
     84 #define t1		r3
     85 
     86 @// Neon Registers
        @// dW1..dW3: the three twiddle factors, each loaded as a [wi | wr] pair
     87 
     88 #define dW1	D0.S32
     89 #define dW2	D1.S32
     90 #define dW3	D2.S32
     91 
        @// dXrN/dXiN: the four butterfly inputs as loaded by VLD2 (two points per register)
        @// dYrN/dYiN: intermediate sums/differences of the butterfly
        @// NOTE(review): D8-D15 are callee-saved under the AAPCS and are written
        @// by FFTSTAGE - confirm the callers save them.
     92 #define dXr0	D4.S32
     93 #define dXi0	D5.S32
     94 #define dXr1	D6.S32
     95 #define dXi1	D7.S32
     96 #define dXr2	D8.S32
     97 #define dXi2	D9.S32
     98 #define dXr3	D10.S32
     99 #define dXi3	D11.S32
    100 #define dYr0	D12.S32
    101 #define dYi0	D13.S32
    102 #define dYr1	D14.S32
    103 #define dYi1	D15.S32
    104 #define dYr2	D16.S32
    105 #define dYi2	D17.S32
    106 #define dYr3	D18.S32
    107 #define dYi3	D19.S32
        @// qT0..qT3: 64-bit product accumulators. They overlay the dY registers
        @// (Q8 = D16:D17, Q9 = D18:D19, Q6 = D12:D13, Q7 = D14:D15), so they must
        @// be narrowed into dZ before the dY values are produced.
    108 #define qT0	Q8.S64
    109 #define qT1	Q9.S64
    110 #define qT2	Q6.S64
    111 #define qT3	Q7.S64
    112 
        @// dZrN/dZiN: narrowed twiddle products first, then the butterfly outputs
    113 #define dZr0	D20.S32
    114 #define dZi0	D21.S32
    115 #define dZr1	D22.S32
    116 #define dZi1	D23.S32
    117 #define dZr2	D24.S32
    118 #define dZi2	D25.S32
    119 #define dZr3	D26.S32
    120 #define dZi3	D27.S32
    121 
        @// Quad views of the {real, imaginary} D pairs above:
        @// qYn = {dYrn, dYin}, qX0 = {dXr0, dXi0}, qZn = {dZrn, dZin}
    122 #define qY0	Q6.S32
    123 #define qY1	Q7.S32
    124 #define qY2	Q8.S32
    125 #define qY3	Q9.S32
    126 #define qX0	Q2.S32
    127 #define qZ0	Q10.S32
    128 #define qZ1	Q11.S32
    129 #define qZ2	Q12.S32
    130 #define qZ3	Q13.S32
    131 
    132 
    133         .MACRO FFTSTAGE scaled, inverse , name
                @// FFTSTAGE: one out-of-place radix-4 stage over 32-bit complex data.
                @//   scaled  - "TRUE" selects the halving butterfly (VHADD/VHSUB at both
                @//             add/sub levels); any other value uses VADD/VSUB.
                @//   inverse - "TRUE" multiplies by the conjugated twiddle and exchanges
                @//             the +j/-j outputs; any other value gives the forward form.
                @//   name    - suffix appended to the grpLoop/setLoop labels so that each
                @//             expansion of the macro gets its own labels.
                @// In:  pSrc, pDst, pTwiddle, subFFTNum, subFFTSize.
                @// Out: subFFTNum is divided by 4, subFFTSize is multiplied by 4, and
                @//      pSrc/pDst are rewound and exchanged for the following stage.
                @//      pTwiddle is back at its entry value after every group.
                @// Writes r0-r12, r14 and NEON d0-d2, d4-d27.
                @// Each setLoop pass handles two points per input, so the code assumes
                @// the updated subFFTNum is even and at least 2.
                @// NOTE(review): d8-d15 and r14 are written and not saved inside this
                @// macro - confirm the surrounding M_START/M_END or the callers do so.
    134 
    135         @// Define stack arguments
    136 
    137 
    138         @// Update grpCount, subFFTNum and subFFTSize right away so that the input registers can be reused
    139 
    140         LSL     grpCount,subFFTSize,#2
    141         LSR     subFFTNum,subFFTNum,#2
    142         MOV     subFFTSize,grpCount
    143 
    144         VLD1     dW1,[pTwiddle]                             @//[wi | wr]
    145         @// pT0+1 increments pT0 by 8 bytes
    146         @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
    147         MOV     pointStep,subFFTNum,LSL #1
    148 
    149 
    150         @// pOut0+1 increments pOut0 by 8 bytes
    151         @// pOut0+outPointStep == increment of 8*outPointStep bytes = 2*size bytes
    152 
                @// The first group uses the same (first) table entry for all three twiddles
    153         MOV     stepTwiddle,#0
    154         VLD1     dW2,[pTwiddle]                             @//[wi | wr]
    155         SMULBB  outPointStep,grpCount,pointStep             @// 16x16 multiply: only the low halfwords of both operands are used
    156         LSL     pointStep,pointStep,#2                      @// pointStep = 8*subFFTNum bytes
    157 
    158         VLD1     dW3,[pTwiddle]                             @//[wi | wr]
    159         MOV     srcStep,pointStep,LSL #1                    @// srcStep = 2*pointStep
    160         ADD     setStep,srcStep,pointStep                   @// setStep = 3*pointStep
    161         @//RSB     setStep,setStep,#16                         @// setStep = - 3*pointStep+16
    162         RSB     setStep,setStep,#0                         @// setStep = - 3*pointStep
    163         SUB     srcStep,srcStep,#16                         @// srcStep = 2*pointStep-16
    164 
    165         MOV     dstStep,outPointStep,LSL #1
    166         ADD     dstStep,dstStep,outPointStep                @// dstStep = 3*outPointStep
    167         RSB     dstStep,dstStep,#16                          @// dstStep = - 3*outPointStep+16
    168 
    169 
    170 
    171 grpLoop\name :
    172 
                @// Preload the four inputs of the first set of this group and, interleaved
                @// with the loads, advance the twiddle stride for the twiddles of the next group
    173         VLD2    {dXr0,dXi0},[pSrc],pointStep                @//  data[0]
    174         ADD      stepTwiddle,stepTwiddle,pointStep
    175         VLD2    {dXr1,dXi1},[pSrc],pointStep                @//  data[1]
    176         ADD      pTwiddle,pTwiddle,stepTwiddle              @// set pTwiddle to the first point
    177         VLD2    {dXr2,dXi2},[pSrc],pointStep                @//  data[2]
    178         MOV      twStep,stepTwiddle,LSL #2
    179 
    180         VLD2    {dXr3,dXi3},[pSrc],setStep                  @//  data[3] & update pSrc for the next set
    181         SUB      twStep,stepTwiddle,twStep                  @// twStep = -3*stepTwiddle
    182 
    183         MOV      setCount,pointStep,LSR #3                  @// setCount = pointStep/8 = points per input in this group
    184         ADD     pSrc,pSrc,#16                         @// set pSrc to data[0] of the next set
    185         ADD     pSrc,pSrc,pointStep                   @// increment to data[1] of the next set
    186 
    187 
    188         @// Loop on the sets
                @// Software pipelined: each pass consumes the inputs loaded by the previous
                @// pass (or by the group preload) and loads the inputs for the next one.
    189 
    190 setLoop\name :
    191 
    192 
    193 
    194         SUBS    setCount,setCount,#2                    @// decrement the loop counter; flags are consumed by the BGT at the loop end
    195 
                @// Z1 = data[1] * W1 (conjugated twiddle when inverse), 64-bit products
    196         .ifeqs  "\inverse", "TRUE"
    197             VMULL   qT0,dXr1,dW1[0]
    198             VMLAL   qT0,dXi1,dW1[1]                       @// real part
    199             VMULL   qT1,dXi1,dW1[0]
    200             VMLSL   qT1,dXr1,dW1[1]                       @// imag part
    201 
    202         .else
    203             VMULL   qT0,dXr1,dW1[0]
    204             VMLSL   qT0,dXi1,dW1[1]                       @// real part
    205             VMULL   qT1,dXi1,dW1[0]
    206             VMLAL   qT1,dXr1,dW1[1]                       @// imag part
    207 
    208         .endif
    209 
    210         VLD2    {dXr1,dXi1},[pSrc],pointStep              @//  data[1] for next iteration
    211 
                @// Z2 = data[2] * W2
    212         .ifeqs  "\inverse", "TRUE"
    213             VMULL   qT2,dXr2,dW2[0]
    214             VMLAL   qT2,dXi2,dW2[1]                       @// real part
    215             VMULL   qT3,dXi2,dW2[0]
    216             VMLSL   qT3,dXr2,dW2[1]                       @// imag part
    217 
    218         .else
    219             VMULL   qT2,dXr2,dW2[0]
    220             VMLSL   qT2,dXi2,dW2[1]                       @// real part
    221             VMULL   qT3,dXi2,dW2[0]
    222             VMLAL   qT3,dXr2,dW2[1]                       @// imag part
    223 
    224         .endif
    225 
                @// Narrow Z1 with rounding (>>31) before qT0/qT1 are reused for Z3
    226         VRSHRN  dZr1,qT0,#31
    227         VRSHRN  dZi1,qT1,#31
    228         VLD2    {dXr2,dXi2},[pSrc],pointStep              @//  data[2] for next iteration
    229 
    230 
                @// Z3 = data[3] * W3
    231         .ifeqs  "\inverse", "TRUE"
    232             VMULL   qT0,dXr3,dW3[0]
    233             VMLAL   qT0,dXi3,dW3[1]                       @// real part
    234             VMULL   qT1,dXi3,dW3[0]
    235             VMLSL   qT1,dXr3,dW3[1]                       @// imag part
    236 
    237         .else
    238             VMULL   qT0,dXr3,dW3[0]
    239             VMLSL   qT0,dXi3,dW3[1]                       @// real part
    240             VMULL   qT1,dXi3,dW3[0]
    241             VMLAL   qT1,dXr3,dW3[1]                       @// imag part
    242 
    243         .endif
    244 
    245         VRSHRN  dZr2,qT2,#31
    246         VRSHRN  dZi2,qT3,#31
    247 
    248 
    249         VRSHRN  dZr3,qT0,#31
    250         VRSHRN  dZi3,qT1,#31
                @// NOTE(review): on the last set of a group this preload addresses the
                @// first 16 bytes after the group; for the last group that appears to be
                @// 16 bytes past the end of the source data (the value is never used) -
                @// confirm the source buffer tolerates the over-read.
    251         VLD2    {dXr3,dXi3},[pSrc],setStep            @//  data[3] & update pSrc to data[0]
    252 
                @// Butterfly. With Y0 = X0+Z2, Y2 = X0-Z2, Y1 = Z1+Z3, Y3 = Z1-Z3 the
                @// outputs are Y2-Y1, Y2+Y1 and Y0 -/+ j*Y3.
                @// NOTE(review): this is a DFT of the inputs only if the twiddle table
                @// holds negated factors; presumably it does - confirm against the table.
    253         .ifeqs "\scaled", "TRUE"
    254 
    255             @// finish first stage of 4 point FFT
    256             VHADD    qY0,qX0,qZ2
    257             VHSUB    qY2,qX0,qZ2
    258 
                    @// NOTE(review): no :128 alignment hint here, unlike the unscaled path below
    259             VLD2    {dXr0,dXi0},[pSrc]!          @//  data[0] for next iteration
    260             VHADD    qY1,qZ1,qZ3
    261             VHSUB    qY3,qZ1,qZ3
    262 
    263             @// finish second stage of 4 point FFT
    264 
    265             VHSUB    qZ0,qY2,qY1
    266 
    267 
                    @// Stores go out in the order Z0, Z3, Z2, Z1 when inverse and
                    @// Z0, Z1, Z2, Z3 otherwise, outPointStep bytes apart
    268             .ifeqs  "\inverse", "TRUE"
    269 
    270                 VHADD    dZr3,dYr0,dYi3
    271                 VST2    {dZr0,dZi0},[pDst :128],outPointStep
    272                 VHSUB    dZi3,dYi0,dYr3
    273 
    274                 VHADD    qZ2,qY2,qY1
    275                 VST2    {dZr3,dZi3},[pDst :128],outPointStep
    276 
    277                 VHSUB    dZr1,dYr0,dYi3
    278                 VST2    {dZr2,dZi2},[pDst :128],outPointStep
    279                 VHADD    dZi1,dYi0,dYr3
    280 
    281                 VST2    {dZr1,dZi1},[pDst :128],dstStep
    282 
    283 
    284             .else
    285 
    286                 VHSUB    dZr1,dYr0,dYi3
    287                 VST2    {dZr0,dZi0},[pDst :128],outPointStep
    288                 VHADD    dZi1,dYi0,dYr3
    289 
    290                 VHADD    qZ2,qY2,qY1
    291                 VST2    {dZr1,dZi1},[pDst :128],outPointStep
    292 
    293                 VHADD    dZr3,dYr0,dYi3
    294                 VST2    {dZr2,dZi2},[pDst :128],outPointStep
    295                 VHSUB    dZi3,dYi0,dYr3
    296 
    297                 VST2    {dZr3,dZi3},[pDst :128],dstStep
    298 
    299 
    300             .endif
    301 
    302 
    303         .else
    304 
    305             @// finish first stage of 4 point FFT
    306             VADD    qY0,qX0,qZ2
    307             VSUB    qY2,qX0,qZ2
    308 
    309             VLD2    {dXr0,dXi0},[pSrc :128]!          @//  data[0] for next iteration
    310             VADD    qY1,qZ1,qZ3
    311             VSUB    qY3,qZ1,qZ3
    312 
    313             @// finish second stage of 4 point FFT
    314 
    315             VSUB    qZ0,qY2,qY1
    316 
    317 
                    @// Same store order as in the scaled path
    318             .ifeqs  "\inverse", "TRUE"
    319 
    320                 VADD    dZr3,dYr0,dYi3
    321                 VST2    {dZr0,dZi0},[pDst :128],outPointStep
    322                 VSUB    dZi3,dYi0,dYr3
    323 
    324                 VADD    qZ2,qY2,qY1
    325                 VST2    {dZr3,dZi3},[pDst :128],outPointStep
    326 
    327                 VSUB    dZr1,dYr0,dYi3
    328                 VST2    {dZr2,dZi2},[pDst :128],outPointStep
    329                 VADD    dZi1,dYi0,dYr3
    330 
    331                 VST2    {dZr1,dZi1},[pDst :128],dstStep
    332 
    333 
    334             .else
    335 
    336                 VSUB    dZr1,dYr0,dYi3
    337                 VST2    {dZr0,dZi0},[pDst :128],outPointStep
    338                 VADD    dZi1,dYi0,dYr3
    339 
    340                 VADD    qZ2,qY2,qY1
    341                 VST2    {dZr1,dZi1},[pDst :128],outPointStep
    342 
    343                 VADD    dZr3,dYr0,dYi3
    344                 VST2    {dZr2,dZi2},[pDst :128],outPointStep
    345                 VSUB    dZi3,dYi0,dYr3
    346 
    347                 VST2    {dZr3,dZi3},[pDst :128],dstStep
    348 
    349 
    350             .endif
    351 
    352         .endif
    353 
    354         ADD     pSrc,pSrc,pointStep                         @// increment to data[1] of the next set
    355         BGT     setLoop\name                                @// uses the flags of the SUBS at the loop top (nothing in between sets flags)
    356 
    357 
                @// Load the twiddles of the next group from pTwiddle + stepTwiddle,
                @// + 2*stepTwiddle and + 3*stepTwiddle; the final post-increment by twStep
                @// returns pTwiddle to its value at group entry minus stepTwiddle.
                @// NOTE(review): this reload also runs after the last group - confirm
                @// the twiddle table covers those three reads.
    358         VLD1     dW1,[pTwiddle :64],stepTwiddle                  @//[wi | wr]
    359         SUBS    grpCount,grpCount,#4                    @// subtract 4 since grpCount multiplied by 4
    360         VLD1     dW2,[pTwiddle :64],stepTwiddle                  @//[wi | wr]
    361         ADD     pSrc,pSrc,srcStep                       @// increment pSrc for the next grp
    362         VLD1     dW3,[pTwiddle :64],twStep                       @//[wi | wr]
    363         BGT     grpLoop\name
    364 
    365 
    366         @// Reset and Swap pSrc and pDst for the next stage
    367         MOV     t1,pDst
    368         SUB     pDst,pSrc,outPointStep,LSL #2                  @// new pDst = pSrc - 4*outPointStep bytes
    369         SUB     pSrc,t1,outPointStep                           @// new pSrc = old pDst - outPointStep bytes
    370 
    371 
    372         .endm
    373 
    374 
    375         M_START armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe,r4
                    @// Forward, unscaled stage: FFTSTAGE with scaled="FALSE", inverse="FALSE";
                    @// the loop labels get the FWD suffix.
                    @// NOTE(review): M_START/M_END come from armCOMM_s.h (not visible here);
                    @// FFTSTAGE writes r0-r12, r14 and d8-d15 - confirm what ",r4" preserves.
    376             FFTSTAGE "FALSE","FALSE",FWD
    377         M_END
    378 
    379 
    380         M_START armSP_FFTInv_CToC_SC32_Radix4_OutOfPlace_unsafe,r4
                    @// Inverse, unscaled stage: FFTSTAGE with scaled="FALSE", inverse="TRUE";
                    @// the loop labels get the INV suffix.
                    @// NOTE(review): M_START/M_END come from armCOMM_s.h (not visible here);
                    @// FFTSTAGE writes r0-r12, r14 and d8-d15 - confirm what ",r4" preserves.
    381             FFTSTAGE "FALSE","TRUE",INV
    382         M_END
    383 
    384 
    385         M_START armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe,r4
                    @// Forward, scaled stage: FFTSTAGE with scaled="TRUE" (halving adds and
                    @// subtracts), inverse="FALSE"; the loop labels get the FWDSFS suffix.
                    @// NOTE(review): M_START/M_END come from armCOMM_s.h (not visible here);
                    @// FFTSTAGE writes r0-r12, r14 and d8-d15 - confirm what ",r4" preserves.
    386             FFTSTAGE "TRUE","FALSE",FWDSFS
    387         M_END
    388 
    389 
    390         M_START armSP_FFTInv_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe,r4
                    @// Inverse, scaled stage: FFTSTAGE with scaled="TRUE" (halving adds and
                    @// subtracts), inverse="TRUE"; the loop labels get the INVSFS suffix.
                    @// NOTE(review): M_START/M_END come from armCOMM_s.h (not visible here);
                    @// FFTSTAGE writes r0-r12, r14 and d8-d15 - confirm what ",r4" preserves.
    391             FFTSTAGE "TRUE","TRUE",INVSFS
    392         M_END
    393 
    394 
    395 	.end
    396