@//
@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@//  Use of this source code is governed by a BSD-style license
@//  that can be found in the LICENSE file in the root of the source
@//  tree. An additional intellectual property rights grant can be found
@//  in the file PATENTS.  All contributing project authors may
@//  be found in the AUTHORS file in the root of the source tree.
@//
@//  This is a modification of
@//  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S to support float
@//  instead of SC32.
@//

@//
@// Description:
@// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT
@// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
@// It implements the "scaled"(by 1/2) version of the above formula.
@//
@// Build notes:
@//   * GNU assembler syntax for 32-bit ARM with VFP single-precision
@//     instructions.
@//   * Register aliases are C preprocessor defines, so this file has to be
@//     run through cpp before it is assembled.
@//
@// Interface of the generated routine (see FFTSTAGE below):
@//   In:    r0 (pSrc)     - source data, read as 8-byte {re, im} f32 pairs
@//          r2 (pFFTSpec) - spec structure; the fields at the ARMsFFTSpec_*
@//                          offsets defined in this file are loaded from it
@//   Out:   results are stored through pOut1, which starts at
@//          pBuf + N/2*8 bytes (pBuf and N are read from the spec).
@//          r1 (pDst) is never used as a store address by this stage.
@//   Clobb: r0-r7, r12, r14, s0-s13, s15, integer condition flags
@//   NOTE(review): the routine is named "unsafe" and M_START is only given
@//   r4; which registers M_START/M_END actually preserve is defined in
@//   armCOMM_s.h and is not visible here - confirm that callers save the
@//   remaining callee-saved registers (r5-r7) themselves.
@//


@// Include standard headers

#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"

@//        M_VARIANTS ARM1136JS

@// Import symbols required from other files
@// (For example tables)


@// Set debugging level
@//DEBUG_ON    SETL {TRUE}



@// Guarding implementation by the processor name

@//     IF  ARM1136JS

@//Input Registers

#define pSrc            r0
#define pDst            r1
#define pFFTSpec        r2


@// Output registers
#define result          r0

@//Local Scratch Registers
@//
@// Several aliases deliberately share one physical register.  The ones that
@// matter for the code below:
@//   r1  : pDst / argTwiddle
@//   r2  : pFFTSpec / pOut1   (pOut1 is written only after all spec loads)
@//   r6  : N / step1          (also the temporary used to build 0.5)
@//   r12 : twStep / t0
@//   r14 : pTwiddleTmp        (this is the link register)
@// Do not reorder instructions without re-checking these live ranges.


#define argTwiddle      r1
#define argDst          r2
#define argScale        r4
#define pTwiddle        r4
#define pOut            r5
#define subFFTSize      r7
#define subFFTNum       r6
#define N               r6
#define order           r14
#define diff            r9
#define count           r8
#define diffMinusOne    r2
#define round           r3

#define pOut1           r2
#define size            r7
#define step            r3
#define step1           r6
#define twStep          r12
#define pTwiddleTmp     r14
#define t0              r12

@// VFP single-precision aliases.  y0*, w1* and y1* all name s6/s7: the
@// second twiddle is consumed before y1 is produced in the same registers.
#define x0r     s0
#define x0i     s1
#define x1r     s2
#define x1i     s3
#define w0r     s4
#define w0i     s5
#define y0r     s6
#define y0i     s7
#define w1r     s6
#define w1i     s7
#define y1r     s6        /*@// w1r,w1i*/
#define y1i     s7
#define st0     s8
#define st1     s9
#define st2     s10
#define st3     s11
#define st4     s12
#define st5     s13
@// half = 0.5
#define half    s15


@// FFTSTAGE scaled, inverse, name
@//
@// Emits the body of the pre-twiddle stage.  "name" is appended to the local
@// labels so that several expansions can coexist; the "scaled" and "inverse"
@// arguments are accepted but not referenced inside the body.
@//
@// Flow:
@//   1. Z(0) is computed without a twiddle multiply and stored.
@//   2. If N/2 - 2 is negative the stage is done; if it is zero only the
@//      final (middle) element remains.
@//   3. Otherwise the loop runs (N/2 - 2) / 2 times, each pass producing one
@//      output from the low end and one from the high end of the range.
@//   4. The remaining middle element is written as its conjugate.

        .macro FFTSTAGE scaled, inverse,name

        @// Initialize half now.
        @// 0x3f000000 is the IEEE-754 single-precision encoding of 0.5.
        movw    N, #0x0000
        movt    N, #0x3f00
        vmov.f32 half, N                        @// half = 0.5

        @// Read the size from structure and take log
        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]

        @// Read other structure parameters
        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]


        MOV     size,N,ASR #1                   @// preserve the contents of N

        MOV     step,size,LSL #3                @// step = N/2 * 8 bytes
        ADD     pTwiddleTmp,pTwiddle,#8         @// W^2

        @// pOut1 shares r2 with pFFTSpec: the spec pointer is dead from here.
        ADD     pOut1,pOut,step                 @// pOut1 = pOut+ N/2*8 bytes
        @// twStep = 3N/8 * 8 bytes pointing to W^1
        SUB     twStep,step,size,LSL #1
        @// step1 shares r6 with N: N itself is not needed past this point.
        MOV     step1,size,LSL #2               @// step1 = N/4 * 8 = N/2*4 bytes
        SUB     step1,step1,#8                  @// (N/4-1)*8 bytes
        ADD     argTwiddle,pTwiddle,twStep      @// W^1

        @// Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
        @// Note: W^(k) is stored as negated value and also need to
        @// conjugate the values from the table

        @// Z(0) : no need of twiddle multiply
        @// Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }


        add     pSrc, step                      @// step = N/2*8 bytes
        vldm.f32 pSrc, {x1r, x1i}               @// {x1r, x1i} = [pSrc, step]
        sub     pSrc, step
        vldm.f32 pSrc!, {x0r, x0i}

        @// Sets the flags tested by the BLT/BEQ pair further down.  The
        @// instructions in between are VFP arithmetic or vstm and are not
        @// flag-setting forms.
        SUBS    size,size,#2

        vadd.f32 st0, x0r, x1r                  @// a+c
        vsub.f32 st1, x0r, x1r                  @// a-c
        vmov.f32 x0r, st0
        vmov.f32 x1r, st1
        vsub.f32 st0, x0i, x1i                  @// b-d
        vadd.f32 x1i, x0i, x1i                  @// b+d
        vmov.f32 x0i, st0


        vsub.f32 x0r,x0r,x1i                    @// Z(0).r
        vadd.f32 x0i,x0i,x1r                    @// Z(0).i

        vmul.f32 x0r, half
        vmul.f32 x0i, half
        vstm.f32 pOut1!, {x0r, x0i}             @// pOut1 = pOut+ N/2*8 bytes

        BLT     end\name                        @// N/2 - 2 < 0 : nothing left
        BEQ     lastElement\name                @// N/2 - 2 == 0: middle only

        ASR     size,size,#1                    @// loop count = (N/2 - 2) / 2
evenOddButterflyLoop\name:

        @// step shrinks by two complex elements per pass, so pSrc + step
        @// and pOut1 + step keep addressing the mirrored element.
        SUB     step,step,#16                   @// (N/2-2)*8 bytes

        add     pSrc, step                      @// (N/2-1)*8 bytes
        vldm.f32 pSrc, {x1r, x1i}               @// {x1r, x1i} = [pSrc, step]
        sub     pSrc, step
        vldm.f32 pSrc!, {x0r, x0i}
        add     argTwiddle, step1
        vldm.f32 argTwiddle, {w1r, w1i}         @// {w1r, w1i} = [argTwiddle, step]
        sub     argTwiddle, step1
        vldm.f32 argTwiddle!, {w0r, w0i}

        SUB     step1,step1,#8
        SUBS    size,size,#1                    @// flags consumed by BGT below


        vsub.f32 st2,x0r,x1r                    @// a-c
        vadd.f32 st3,x0i,x1i                    @// b+d
        vadd.f32 st0,x0r,x1r                    @// a+c
        vsub.f32 st1,x0i,x1i                    @// b-d

        @// (x1r, x1i) = (w1r*st2 - w1i*st3, w1r*st3 + w1i*st2)
        vmul.f32 x1r,w1r,st2
        vmul.f32 x1i,w1r,st3
        vmls.f32 x1r,w1i,st3
        vmla.f32 x1i,w1i,st2

        @// y1r/y1i overwrite w1r/w1i (same registers); w1 is dead here.
        vadd.f32 y1r,st0,x1i                    @// F(N/2 -1)
        vsub.f32 y1i,x1r,st1                    @// y1r,y1i same as w1r, w1i


        @// (x0r, x0i) = (w0r*st2 + w0i*st3, w0r*st3 - w0i*st2)
        vmul.f32 x0r,w0r,st2
        vmul.f32 x0i,w0r,st3
        vmla.f32 x0r,w0i,st3
        vmls.f32 x0i,w0i,st2


        vadd.f32 st4,st0,x0i                    @// F(1)
        vsub.f32 st5,st1,x0r


        @// Apply the 1/2 scaling to both outputs.
        vmul.f32 y1r, half
        vmul.f32 y1i, half
        vmul.f32 st4, half
        vmul.f32 st5, half
        add     pOut1, step                     @// (N/2-1)*8 bytes
        vstm.f32 pOut1, {y1r, y1i}              @// {y1r,y1i} = [pOut1, step]
        sub     pOut1, step
        vstm.f32 pOut1!, {st4, st5}

        MOV     t0,argTwiddle                   @// swap ptr for even and odd twiddles
        MOV     argTwiddle,pTwiddleTmp
        MOV     pTwiddleTmp,t0

        BGT     evenOddButterflyLoop\name


        @// Last element can be expanded as follows
        @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)]
        @// (since W^k is stored as -ve)
        @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
        @// 1/2[2a+j0] + j (c-jd) [0+j2b]
        @// (a+bc, -bd)
        @// Since (c,d) = (0,1) for the last element, result is just (a,-b)

lastElement\name:
        vldm.f32 pSrc, {x0r, x0i}

        vneg.f32 x0i, x0i
        vstm.f32 pOut1, {x0r, x0i}
end\name:


        .endm


        @// Structure offsets for FFTSpec
        .set    ARMsFFTSpec_N, 0
        .set    ARMsFFTSpec_pBitRev, 4
        .set    ARMsFFTSpec_pTwiddle, 8
        .set    ARMsFFTSpec_pBuf, 12


        @// Entry point: the single expansion of FFTSTAGE, with "Inv" as the
        @// label suffix.
        M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_vfp,r4
        FFTSTAGE "FALSE","TRUE",Inv
        M_END

@// ENDIF                                                           @//ARM1136JS


@// Guarding implementation by the processor name



        .end