Home | History | Annotate | Download | only in neon
      1 @//
      2 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 @//
      4 @//  Use of this source code is governed by a BSD-style license
      5 @//  that can be found in the LICENSE file in the root of the source
      6 @//  tree. An additional intellectual property rights grant can be found
      7 @//  in the file PATENTS.  All contributing project authors may
      8 @//  be found in the AUTHORS file in the root of the source tree.
      9 @//
     10 @//  This file was originally licensed as follows. It has been
     11 @//  relicensed with permission from the copyright holders.
     12 @//
     13 
     14 @//
     15 @// File Name:  omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
     16 @// OpenMAX DL: v1.0.2
     17 @// Last Modified Revision:   7810
     18 @// Last Modified Date:       Thu, 04 Oct 2007
     19 @//
     20 @// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
     21 @//
     22 @//
     23 @//
     24 @// Description:
     25 @// Compute FFT for a real signal
     26 @//
     27 
     28 
     29 
     30 @// Include standard headers
     31 
     32 #include "dl/api/arm/armCOMM_s.h"
     33 #include "dl/api/arm/omxtypes_s.h"
     34 
     35 
     36 @// Import symbols required from other files
     37 @// (For example tables)
     38 
     39         .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe
     40         .extern  armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe
     41         .extern  armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe
     42         .extern  armSP_FFTFwd_CToC_SC32_Radix8_fs_OutOfPlace_unsafe
     43         .extern  armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe
     44         .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe
     45         .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe
     46         .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe
     47         .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
     48         .extern  armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe
     49 
     50 @// Set debugging level
     51 @//DEBUG_ON    SETL {TRUE}
     52 
     53 
     54 
     55 @// Guarding implementation by the processor name
     56 
     57 
     58 
     59     @// Guarding implementation by the processor name
     60 
     61 @// Import symbols required from other files
     62 @// (For example tables)
     63         .extern  armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe
     64         .extern  armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
     65         .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe
     66         .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe
     67 
     68 
     69 @//Input Registers
@// The four arguments of the function arrive in r0-r3.
     70 
     71 #define pSrc            r0
     72 #define pDst            r1
     73 #define pFFTSpec        r2
     74 #define scale           r3
     75 
     76 
     77 @// Output registers
@// result shares r0 with pSrc; it is only written at the End label.
     78 #define result          r0
     79 
     80 @//Local Scratch Registers
@// Several names below map to the same physical register, so their live
@// ranges must never overlap:
@//   r1 : pDst, argTwiddle          r2 : pFFTSpec, argDst, diffMinusOne
@//   r3 : scale, step               r4 : argScale, tmpOrder, pTwiddle, x0r, step1
@//   r5 : pOut, x0i, pTwiddleTmp    r6 : subFFTNum, N, subFFTSizeTmp
@//   r8 : count, twStep             r9 : diff, zero
@// order lives in r14 (the link register), so it is overwritten by any BL.
     81 
     82 #define argTwiddle      r1
     83 #define argDst          r2
     84 #define argScale        r4
     85 #define tmpOrder        r4
     86 #define pTwiddle        r4
     87 #define pOut            r5
     88 #define subFFTSize      r7
     89 #define subFFTNum       r6
     90 #define N               r6
     91 #define order           r14
     92 #define diff            r9
     93 @// Total number of radix stages required to complete the FFT
     94 #define count           r8
     95 #define x0r             r4
     96 #define x0i             r5
     97 #define diffMinusOne    r2
     98 #define subFFTSizeTmp   r6
     99 #define step            r3
    100 #define step1           r4
    101 #define twStep          r8
    102 #define zero            r9
    103 #define pTwiddleTmp     r5
    104 #define t0              r10
    105 
    106 @// Neon registers
@// As with the core registers, many names alias one another:
@//   d2  : dZero, dX0r              d3  : dShift, dX0i, dX1
@//   d4  : dX1r, dW0                d5  : dX1i, dW1
@//   d14 : dW0r, dY0r, dY0rS64      d15 : dW0i, dY0i, dY0iS64
@//   d16 : dW1r, dY1r               d17 : dW1i, dY1i
@// The Q registers overlay D register pairs: qT0 (q5) is d10:d11, i.e. dY0/dY1,
@// and qT1 (q6) is d12:d13, i.e. dY2/dY3; qT2 (q9) is d18:d19 and qT3 (q10)
@// is d20:d21.
    107 
    108 #define dX0       d0.s32
    109 #define dzero     d1.s32
    110 #define dZero     d2.s32
    111 #define dShift    d3.s32
    112 #define dX0r      d2.s32
    113 #define dX0i      d3.s32
    114 #define dX1r      d4.s32
    115 #define dX1i      d5.s32
    116 #define dT0       d6.s32
    117 #define dT1       d7.s32
    118 #define dT2       d8.s32
    119 #define dT3       d9.s32
    120 #define qT0       q5.s64
    121 #define qT1       q6.s64
    122 #define dW0r      d14.s32
    123 #define dW0i      d15.s32
    124 #define dW1r      d16.s32
    125 #define dW1i      d17.s32
    126 #define dY0r      d14.s32
    127 #define dY0i      d15.s32
    128 #define dY1r      d16.s32
    129 #define dY1i      d17.s32
    130 #define dY0rS64   d14.s64
    131 #define dY0iS64   d15.s64
    132 #define qT2       q9.s64
    133 #define qT3       q10.s64
    134 @// Names for handling the last three elements
@// NOTE(review): dX1, dW0, dW1 and dY0-dY3 do not appear to be referenced by
@// the function body in this file - confirm before relying on them.
    135 #define dX1       d3.s32
    136 #define dW0       d4.s32
    137 #define dW1       d5.s32
    138 #define dY0       d10.s32
    139 #define dY1       d11.s32
    140 #define dY2       d12.s32
    141 #define dY3       d13.s32
    142 
    142 
    143     @// Allocate stack memory required by the function
    144 
    145         M_ALLOC4        diffOnStack, 4
    146 
                @// ------------------------------------------------------------------
                @// omxSP_FFTFwd_RToCCS_S32_Sfs
                @//
                @// FFT of a real signal (see the file description). For N > 1 the
                @// code runs an N/2 point complex FFT through the external
                @// armSP_FFTFwd_CToC_SC32_* stage routines and then applies a final
                @// complex-to-real fixup that writes F(0), F(N/2) and the remaining
                @// bins through argDst.
                @//
                @// In:    r0 = pSrc      input pointer
                @//        r1 = pDst      output pointer
                @//        r2 = pFFTSpec  spec structure; N, pTwiddle and pBuf are
                @//                       read from offsets 0, 8 and 12
                @//        r3 = scale     right-shift amount; handed to the scaled
                @//                       stages, any remainder is applied at FFTEnd
                @// Out:   r0 = OMX_Sts_NoErr (the only value returned here)
                @// Stack: one local, diffOnStack, declared by M_ALLOC4 above
                @//
                @// NOTE(review): the stage routines are defined elsewhere. This code
                @// assumes they keep pSrc, argDst, argTwiddle, subFFTNum, subFFTSize
                @// and argScale consistent between calls - confirm in their sources.
                @// NOTE(review): r11 and d15 passed to M_START are presumably the
                @// highest core and NEON registers to preserve - confirm in
                @// armCOMM_s.h.
                @// ------------------------------------------------------------------
    147     @// Write function header
    148         M_START     omxSP_FFTFwd_RToCCS_S32_Sfs,r11,d15
    149 
    150 @ Structure offsets for the FFTSpec
    151         .set    ARMsFFTSpec_N, 0
    152         .set    ARMsFFTSpec_pBitRev, 4
    153         .set    ARMsFFTSpec_pTwiddle, 8
    154         .set    ARMsFFTSpec_pBuf, 12
    155 
    156         @// Define stack arguments
    157 
    158         @// Read the size from structure and take log
    159         LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
    160 
    161         @// Read other structure parameters
    162         LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
    163         LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
    164 
    165         @//  N=1 Treat separately
                @// (taken for any N <= 1): the single input word is rounding-shifted
                @// right by scale and stored followed by two zero words.
    166         CMP     N,#1
    167         BGT     sizeGreaterThanOne
    168         VLD1    dX0[0],[pSrc]
    169         RSB     scale,scale,#0                        @// to use VRSHL for right shift by a variable
    170         MOV     zero,#0
    171         VMOV    dShift[0],scale
    172         VMOV    dzero[0],zero
    173         VRSHL   dX0,dShift
    174         VMOV    dZero[0],zero
    175         VST3    {dX0[0],dzero[0],dZero[0]},[pDst]
    176 
    177         B       End
    178 
    179 
    180 
    181 sizeGreaterThanOne:
    182         @// Do a N/2 point complex FFT including the scaling
    183 
    184         MOV     N,N,ASR #1                          @// N/2 point complex FFT
    185 
    186         CLZ     order,N                             @// N = 2^order
    187         RSB     order,order,#31                     @// order = 31 - clz(N)
    188         MOV     subFFTSize,#1
                @// subFFTNum and N are the same register (r6), so subFFTNum already
                @// holds the halved N and the MOV below is not needed.
    189         @//MOV     subFFTNum,N
    190 
    191         CMP     order,#3
    192         BGT     orderGreaterthan3                   @// order > 3
    193 
    194         CMP     order,#1
    195         BGE     orderGreaterthan0                   @// order > 0
                @// order = 0: no FFT stage runs, so the whole of scale is stored as
                @// the shift still to be applied at FFTEnd. The single complex input
                @// is copied to pOut and the fixup then reads it from there.
    196         M_STR   scale, diffOnStack,LT               @// order = 0
    197         VLD1    dX0,[pSrc]
    198         VST1    dX0,[pOut]
    199         MOV     pSrc,pOut
    200         MOV     argDst,pDst
                @// Flags are still those of CMP order,#1 (LT); nothing in between
                @// sets them, assuming M_STR leaves the flags alone.
    201         BLT     FFTEnd
    202 
    203 orderGreaterthan0:
    204         @// set the buffers appropriately for various orders
                @// Reached with order = 1, 2 or 3.
    205         CMP     order,#2
    206         MOVEQ   argDst,pDst
    207         MOVNE   argDst,pOut
    208         MOVNE   pOut,pDst                           @// Pass the first stage destination in RN5
    209         MOV     argTwiddle,pTwiddle
    210 
                @// diff = scale - order is the shift that the stages cannot absorb;
                @// it is kept on the stack for FFTEnd.
    211         SUBS     diff,scale,order
    212         M_STR   diff,diffOnStack
    213         MOVGT   scale,order
    214         @// Now scale <= order
    215 
    216         CMP     order,#1
    217         BGT     orderGreaterthan1
                @// order = 1: scale - 1 == 0 selects the scaled first stage, a
                @// negative result selects the unscaled one.
                @// NOTE(review): BLLT is evaluated after BLEQ returns, so it depends
                @// on the flags the callee leaves behind - confirm.
    218         SUBS    scale,scale,#1
    219         BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe  @// order = 1
    220         BLLT    armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe      @// order = 1
    221         B       FFTEnd
    222 
    223 orderGreaterthan1:
    224         CMP     order,#2
    225         MOV     argScale,scale                      @// MOV does not disturb the CMP flags
    226         BGT     orderGreaterthan2
                @// order = 2: first and last radix-2 stages. argScale is decremented
                @// before each stage; the scaled variant is chosen while it has not
                @// gone negative.
    227         SUBS    argScale,argScale,#1
    228         BLGE    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe      @// order =2
    229         BLLT    armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe
    230         SUBS    argScale,argScale,#1
    231         BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe
    232         BLLT    armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
    233         B       FFTEnd
    234 
    235 orderGreaterthan2:@// order =3
                @// Three radix-2 stages: first, middle and last, each scaled or
                @// unscaled according to the running argScale.
    236         SUBS    argScale,argScale,#1
    237         BLGE    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe
    238         BLLT    armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe
    239         SUBS    argScale,argScale,#1
    240         BLGE    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
    241         BLLT    armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe
    242         SUBS    argScale,argScale,#1
    243         BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe
    244         BLLT    armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
    245         B       FFTEnd
    246 
    247 
    248 
    249 orderGreaterthan3:
    250         @// check scale = 0 or scale = order
    251         SUBS    diff, scale, order                 @// scale > order
    252         MOVGT   scale,order                        @// clamp scale to order
    253         BGE     specialScaleCase                   @// scale = 0 or scale = order
    254         CMP     scale,#0
    255         BEQ     specialScaleCase
    256         B       generalScaleCase
    257 
    258 specialScaleCase:@//  scale = 0 or scale = order  and order >= 4 (only reached from orderGreaterthan3)
    259 
                @// Bit 1 of order selects which buffer receives the first stage.
                @// NOTE(review): this looks like the parity of the stage count
                @// (order >> 1 stages of radix 4/8) - confirm.
    260         TST     order, #2                           @// Set input args to fft stages
    261         MOVEQ   argDst,pDst
    262         MOVNE   argDst,pOut
    263         MOVNE   pOut,pDst                           @// Pass the first stage destination in RN5
    264         MOV     argTwiddle,pTwiddle
    265 
                @// diff = scale - order from above: >= 0 means every stage is
                @// scaled, negative here means scale = 0 and no stage is scaled.
    266         CMP      diff,#0
    267         M_STR    diff, diffOnStack
    268         BGE      scaleEqualsOrder
    269 
    270         @//check for even or odd order
    271         @// NOTE: The following combination of BL's would work fine even though the first
    272         @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
    273         @// armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
    274 
    275         TST     order,#0x00000001
    276         BLEQ    armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe
    277         BLNE    armSP_FFTFwd_CToC_SC32_Radix8_fs_OutOfPlace_unsafe
    278 
    279         CMP        subFFTNum,#4
    280         BLT     FFTEnd
    281 
    282 
                @// BEQ tests the CMP subFFTNum,#4 made just before the loop or at
                @// the loop bottom: equal means only the last stage remains.
    283 unscaledRadix4Loop:
    284         BEQ        lastStageUnscaledRadix4
    285          BL        armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe
    286          CMP        subFFTNum,#4
    287          B        unscaledRadix4Loop
    288 
    289 lastStageUnscaledRadix4:
    290         BL      armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe
    291         B        FFTEnd
    292 
    293 
    294 scaleEqualsOrder:
    295         @//check for even or odd order
    296         @// NOTE: The following combination of BL's would work fine even though the first
    297         @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
    298         @// armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
                @// NOTE(review): the routine called here is the Sfs variant; the
                @// note above names the unscaled one - confirm the same holds.
    299 
    300         TST     order,#0x00000001
    301         BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe
    302         BLNE    armSP_FFTFwd_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe
    303 
    304         CMP        subFFTNum,#4
    305         BLT     FFTEnd
    306 
    307 
                @// Same loop shape as unscaledRadix4Loop, using the scaled stages.
    308 scaledRadix4Loop:
    309         BEQ        lastStageScaledRadix4
    310          BL        armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe
    311          CMP        subFFTNum,#4
    312          B        scaledRadix4Loop
    313 
    314 lastStageScaledRadix4:
    315         BL      armSP_FFTFwd_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe
    316         B        FFTEnd
    317 
    318 generalScaleCase:@// 0 < scale < order and order >= 4 (only reached from orderGreaterthan3)
    319         @// Determine the correct destination buffer
                @// Here diff is redefined as order - scale. If it is even the
                @// remaining work is done with radix-4 stages, otherwise with
                @// radix-2 stages, which gives the stage count below.
    320         SUB     diff,order,scale
    321         TST     diff,#0x01
    322         ADDEQ   count, scale,diff,lsr #1         @// count = scale + (order - scale)/2
    323         MOVNE   count, order
    324         TST     count, #0x01                     @// Is count even or odd ?
    325 
    326         MOVEQ   argDst,pDst                     @// Set input args to fft stages
    327         MOVNE   argDst,pOut
    328         MOVNE   pOut,pDst                       @// Pass the first stage destination in RN5
    329         MOV     argTwiddle,pTwiddle
    330 
                @// Saved because diff (r9) is reloaded after the scaled stages.
    331         M_STR   diff, diffOnStack
    332 
    333         MOV     argScale,scale                  @// Put scale in RN4 so as to save and restore
    334         BL      armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe     @// scaled first stage
    335         SUBS    argScale,argScale,#1
    336 
                @// Runs one scaled radix-2 stage per remaining unit of argScale.
                @// Flags on entry come from the SUBS above or from the BGT that
                @// re-enters the loop, so they do not depend on the callee.
    337 scaledRadix2Loop:
    338         BLGT    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
    339         SUBS    argScale,argScale,#1            @// save and restore scale (RN4) in the scaled stages
    340         BGT     scaledRadix2Loop
    341 
    342 
    343         M_LDR   diff, diffOnStack
    344         @//check for even or odd order
                @// (more precisely: whether diff = order - scale is even or odd)
    345         TST     diff,#0x00000001
    346         BEQ     generalUnscaledRadix4Loop
    347         B       unscaledRadix2Loop
    348 
                @// Unscaled radix-4 stages until subFFTNum reaches 4, then the
                @// last-stage variant. This path skips FFTEnd: no shift is left.
    349 generalUnscaledRadix4Loop:
    350         CMP        subFFTNum,#4
    351          BEQ        generalLastStageUnscaledRadix4
    352          BL        armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe
    353          B        generalUnscaledRadix4Loop
    354 
    355 generalLastStageUnscaledRadix4:
    356         BL      armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe
    357         B        finalComplexToRealFixup
    358 
    359 
                @// Unscaled radix-2 stages until subFFTNum reaches 2, then the
                @// last-stage variant.
    360 unscaledRadix2Loop:
    361         CMP        subFFTNum,#2
    362          BEQ        generalLastStageUnscaledRadix2
    363          BL        armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe
    364          B        unscaledRadix2Loop
    365 
    366 generalLastStageUnscaledRadix2:
    367         BL      armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
    368         B        finalComplexToRealFixup
    369 
    370 
    371 FFTEnd:@// Does only the scaling
    372 
                @// On the paths that reach this label the stored value is
                @// scale - order (or scale itself when order = 0), i.e. the part of
                @// the requested shift that the stages did not apply.
    373         M_LDR   diff, diffOnStack
    374         CMP     diff,#0
    375         BLE     finalComplexToRealFixup
    376 
    377         RSB     diff,diff,#0                        @// to use VRSHL for right shift by a variable
    378         VDUP    dShift,diff
    379 
    380         @// save subFFTSize and use tmpsubfftsize in the following loop
    381         MOV    subFFTSizeTmp,subFFTSize                 @// subFFTSizeTmp same reg as subFFTNum
    382 
                @// Each iteration shifts one 8-byte element (two s32 lanes) in
                @// place. The NEON instructions do not touch the flags, so BGT
                @// still sees the result of SUBS.
    383 scaleFFTData:@// N = subFFTSize  ; dataptr = pDst  ; scale = diff
    384         VLD1    {dX0},[pSrc]            @// pSrc contains pDst pointer
    385         SUBS    subFFTSizeTmp,subFFTSizeTmp,#1
    386         VRSHL   dX0,dShift
    387         VST1    {dX0},[pSrc]!
    388 
    389         BGT     scaleFFTData
    390 
    391         SUB     pSrc,pSrc,subFFTSize,LSL #3             @// reset pSrc for final fixup
    392 
    393         @//  change the logic so that output after scaling is in pOut and not in pDst
    394         @//  finally store from pOut to pDst
    395         @//  change branch "End" to branch "finalComplexToRealFixup" in the above
    396         @//  chk the code below for multiplication by j factor
    397 
    398 finalComplexToRealFixup:
    399 
    400 
    401         @// F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
    402         @// 1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
    403         @// 1/2[2a+j0] - j [0+j2b]
    404         @// (a+b, 0)
    405 
    406         @// F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
    407         @// 1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
    408         @// 1/2[2a+j0] + j [0+j2b]
    409         @// (a-b, 0)
    410 
    411         @// F(0) and F(N/2)
                @// Lane 0 of dX0r/dX0i receives the real/imaginary word of Z(0) and
                @// pSrc moves on by 8 bytes; lane 1 of both is cleared so that the
                @// stored pairs carry a zero second word.
    412         VLD2    {dX0r[0],dX0i[0]},[pSrc]!
    413         MOV     zero,#0
    414         VMOV    dX0r[1],zero
    415         MOV     step,subFFTSize,LSL #3                  @// step = N/2 * 8 bytes
    416         VMOV    dX0i[1],zero
    417         SUB     twStep,step,subFFTSize,LSL #1           @// twStep = 3N/8 * 8 bytes pointing to W^1
    418 
    419         VADD    dY0r,dX0r,dX0i                          @// F(0) = ((Z0.r+Z0.i) , 0)
    420         MOV     step1,subFFTSize,LSL #2                 @// step1 = N/2 * 4 bytes
    421         VSUB    dY0i,dX0r,dX0i                            @// F(N/2) = ((Z0.r-Z0.i) , 0)
                @// These flags are consumed by BLT End / BEQ lastElement below; the
                @// instructions in between do not set flags.
    422         SUBS    subFFTSize,subFFTSize,#2
    423 
                @// Store F(0), jump argDst ahead by step to store F(N/2), then come
                @// back so that argDst ends up 8 bytes past its starting value.
    424         VST1    dY0r,[argDst],step
    425         ADD     pTwiddleTmp,argTwiddle,#8                @// W^2
    426         VST1    dY0i,[argDst]!
    427         ADD     argTwiddle,argTwiddle,twStep             @// W^1
    428 
    429         VDUP    dzero,zero
    430         SUB     argDst,argDst,step
    431 
    432         BLT     End                                    @// subFFTSize was 1: nothing more to write
    433         BEQ     lastElement                            @// subFFTSize was 2: only the middle element is left
    434         SUB     step,step,#24
    435         SUB     step1,step1,#8                         @// (N/4-1)*8 bytes
    436 
    437         @// F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
    438         @// Note: W^k is stored as negative values in the table
    439         @// Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1) since both of them
    440         @// require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
    441 
    442 
    443 evenOddButterflyLoop:
    444 
    445 
                @// Each load pair below reads once at the current pointer, once at
                @// an offset (step1 or step) further on, and then steps back, so the
                @// pointer ends up advanced by just the size of one load. The
                @// offsets shrink every iteration.
    446         VLD1    dW0r,[argTwiddle],step1
    447         VLD1    dW1r,[argTwiddle]!
    448 
    449         VLD2    {dX0r,dX0i},[pSrc],step
    450         SUB     argTwiddle,argTwiddle,step1
    451         VLD2    {dX1r,dX1i},[pSrc]!
    452 
    453 
    454 
    455         SUB     step1,step1,#8                          @// (N/4-2)*8 bytes
    456         VLD1    dW0i,[pTwiddleTmp],step1
    457         VLD1    dW1i,[pTwiddleTmp]!
    458         SUB     pSrc,pSrc,step
    459 
    460         SUB     pTwiddleTmp,pTwiddleTmp,step1
                @// Swap the two lanes of the far-end data so that lane order
                @// matches the near-end data.
    461         VREV64  dX1r,dX1r
    462         VREV64  dX1i,dX1i
                @// Loop counter; BGT at the bottom uses these flags (nothing in
                @// between sets them).
    463         SUBS    subFFTSize,subFFTSize,#4
    464 
    465 
    466 
    467         VSUB    dT2,dX0r,dX1r                            @// a-c
    468         SUB     step1,step1,#8
    469         VADD    dT3,dX0i,dX1i                            @// b+d
    470         VADD    dT0,dX0r,dX1r                           @// a+c
    471         VSUB    dT1,dX0i,dX1i                            @// b-d
                @// Halving add with zero: dT0 = (a+c)/2, dT1 = (b-d)/2
    472         VHADD   dT0,dT0,dzero
    473         VHADD   dT1,dT1,dzero
    474 
                @// Interleave the lanes of each twiddle register pair.
                @// NOTE(review): presumably this gathers real parts in dW?r and
                @// imaginary parts in dW?i - confirm against the table layout.
    475         VZIP    dW1r,dW1i
    476         vzip    dW0r,dW0i
    477 
    478 
                @// 32x32 -> 64 bit multiply-accumulate; the results are narrowed
                @// back to 32 bits with rounding by the VRSHRN #32 below.
    479         VMULL   qT0,dW1r,dT2
    480         VMLAL   qT0,dW1i,dT3
    481         VMULL   qT1,dW1r,dT3
    482         VMLSL   qT1,dW1i,dT2
    483 
    484         VMULL   qT2,dW0r,dT2
    485         VMLSL   qT2,dW0i,dT3
    486         VMULL   qT3,dW0r,dT3
    487         VMLAL   qT3,dW0i,dT2
    488 
    489 
    490         VRSHRN  dX1r,qT0,#32
    491         VRSHRN  dX1i,qT1,#32
    492 
    493         VSUB    dY1r,dT0,dX1i                           @// F(N/2 -1)
    494         VADD    dY1i,dT1,dX1r
    495         VNEG    dY1i,dY1i
    496 
                @// Put the far-end results back into memory order.
    497         VREV64  dY1r,dY1r
    498         VREV64  dY1i,dY1i
    499 
    500 
    501         VRSHRN  dX0r,qT2,#32
    502         VRSHRN  dX0i,qT3,#32
    503 
    504 
    505         VSUB    dY0r,dT0,dX0i                           @// F(1)
    506         VADD    dY0i,dT1,dX0r
    507 
    508 
                @// Same store pattern as the loads: near end, far end, step back.
    509         VST2    {dY0r,dY0i},[argDst],step
    510         VST2    {dY1r,dY1i},[argDst]!
    511         SUB     argDst,argDst,step
    512         SUB     step,step,#32                            @// (N/2-4)*8 bytes
    513 
    514 
    515         BGT     evenOddButterflyLoop
    516 
    517         SUB     pSrc,pSrc,#8                @// set both the ptrs to the last element
    518         SUB     argDst,argDst,#8
    519 
    520 
    521 
    522         @// Last element can be expanded as follows
    523         @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
    524         @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
    525         @// 1/2[2a+j0] + j (c+jd) [0+j2b]
    526         @// (a-bc, -bd)
    527         @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
    528 
    529 lastElement:
                @// Load (a, b), store a, negate the register, then store -b.
    530         VLD1    dX0r,[pSrc]
    531 
    532         VST1    dX0r[0],[argDst]!
    533         VNEG    dX0r,dX0r
    534         VST1    dX0r[1],[argDst]!
    535 
    536 
    537 
    538 
    539 
    540 
    541 End:
    542         @// Set return value
    543         MOV     result, #OMX_Sts_NoErr
    544 
    545         @// Write function tail
    546         M_END
    547 
    548         .end
    549 
    550