@//
@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@//  Use of this source code is governed by a BSD-style license
@//  that can be found in the LICENSE file in the root of the source
@//  tree. An additional intellectual property rights grant can be found
@//  in the file PATENTS.  All contributing project authors may
@//  be found in the AUTHORS file in the root of the source tree.
@//
@//  This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
@//  to support float instead of SC32.
@//

@//
@// Description:
@// Compute FFT for a real signal
@//
@// omxSP_FFTFwd_RToCCS_F32_Sfs_vfp
@//   In:   r0 = pSrc      real input samples
@//         r1 = pDst      output buffer
@//         r2 = pFFTSpec  FFT spec; the fields read here are N (offset 0),
@//                        pTwiddle (offset 8) and pBuf (offset 12)
@//   Out:  r0 = OMX_Sts_NoErr on every path (including the unsupported
@//              N <= 1 case, where nothing is written to pDst)
@//
@// Method: the real input is run through an N/2 point complex FFT
@// (radix 2/4/8 first stage followed by radix 4 stages), then the
@// complex-to-real fixup below turns Z(k) into F(k).
@//


@// Include standard headers

#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"

@//        M_VARIANTS ARM1136JS

@// Import symbols required from other files
@// (For example tables)

        .extern  armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
        .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
        .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
        .extern  armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp

@// Set debugging level
@//DEBUG_ON    SETL {TRUE}



@// Guarding implementation by the processor name

@//    IF  ARM1136JS

@//Input Registers

#define pSrc            r0
#define pDst            r1
#define pFFTSpec        r2


@// Output registers
#define result          r0

@//Local Scratch Registers

@// N=1 case
#define scaleMinusOne   r2
#define rnd             r2
#define zero            r8
#define Zero            r9


#define argTwiddle      r1
#define argDst          r2
#define argScale        r4
#define pTwiddle        r4
#define pOut            r5
#define subFFTSize      r7
#define subFFTNum       r6
#define N               r6
#define order           r14
#define diff            r9
#define count           r8
#define diffMinusOne    r10
#define round           r3

#define step            r3
#define step1           r6
#define twStep          r12
#define pTwiddleTmp     r14
#define t0              r12
#define t1              r14              /*@// pTwiddleTmp*/
#define t2              r0
#define t3              r1               /*@// pSrc,argTwiddle*/
#define t4              r6
#define t5              r7               /*@// step1,subFFTSize*/

#define x0r     s0
#define x0i     s1
#define y0r     s2
#define y0i     s3
#define x1r     s4
#define x1i     s5
#define w1r     s2
#define w1i     s3
#define w0r     s6
#define w0i     s7
#define y1r     s2              /*@// w1r,w1i*/
#define y1i     s3
#define st0     s8
#define st1     s9
#define st2     s10
#define st3     s11
#define st4     s12
#define st5     s13
#define half    s15




    @// Allocate stack memory required by the function



    @// Write function header
        M_START     omxSP_FFTFwd_RToCCS_F32_Sfs_vfp,r11

@// Structure offsets for FFTSpec
        .set    ARMsFFTSpec_N, 0
        .set    ARMsFFTSpec_pBitRev, 4
        .set    ARMsFFTSpec_pTwiddle, 8
        .set    ARMsFFTSpec_pBuf, 12

        @// Define stack arguments

        @// Setup half value: 0x3f000000 is the bit pattern of 0.5f
        movw    N, #0                   @// Use N as a temp.
        movt    N, #0x3f00
        vmov.f32 half, N

        @// Read the size from structure and take log
        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]

        @// Read other structure parameters
        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

        @// N=1 Treat separately
        CMP     N,#1
        BGT     sizeGreaterThanOne
        @// N<=1 is not supported: nothing is computed or stored, but the
        @// status returned is still OMX_Sts_NoErr.
        @// Set return value
        MOV     result, #OMX_Sts_NoErr
        B       FunctionEnd

sizeGreaterThanOne:
        @// Do a N/2 point complex FFT including the scaling

        MOV     N,N,ASR #1                          @// N/2 point complex FFT
        CLZ     order,N                             @// N = 2^order
        RSB     order,order,#31
        MOV     subFFTSize,#1
        @//MOV     subFFTNum,N


        @// The flags from this CMP select all three cases below:
        @//   order > 1  : multi-stage FFT
        @//   order < 1  : single complex point, just copy it to pBuf
        @//   order == 1 : one radix 2 stage
        CMP     order,#1
        BGT     orderGreaterthan1                   @// order > 1
        vldmlt.f32 pSrc, {x0r, x0i}
        vstmlt.f32 pOut, {x0r, x0i}
        MOVLT   pSrc,pOut
        MOVLT   argDst,pDst
        BLT     FFTEnd

        MOV     argDst,pOut                         @// Set input args to fft stages
        MOV     pOut,pDst                           @// Set input args to fft stages
        MOV     argTwiddle,pTwiddle

        BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
        B       finalComplexToRealFixup

orderGreaterthan1:

        TST     order, #2                           @// Set input args to fft stages
        MOVEQ   argDst,pDst
        MOVNE   argDst,pOut
        MOVNE   pOut,pDst                           @// Pass the first stage dest in RN5
        MOV     argTwiddle,pTwiddle

        @//check for even or odd order

        @// NOTE: The following combination of BL's would work fine
        @// even though the first BL would corrupt the flags. This is
        @// because the end of the "grpZeroSetLoop" loop inside
        @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp sets
        @// the Z flag to EQ

        TST     order,#0x00000001
        BLEQ    armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
        BLNE    armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp

unscaledRadix4Loop:
        CMP     subFFTNum,#1
        BEQ     FFTEnd
        BL      armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
        B       unscaledRadix4Loop

FFTEnd:
finalComplexToRealFixup:

        @// NOTE(review): from here on the code reads pSrc (complex FFT
        @// result), argDst (destination), argTwiddle and subFFTSize as
        @// left by the stage routines above; the byte counts in the
        @// comments below assume subFFTSize == N/2. Confirm against the
        @// stage sources.

        @// step  = N/2 * 8 bytes
        MOV     step,subFFTSize,LSL #3
        @// twStep = 3N/8 * 8 bytes pointing to W^1
        SUB     twStep,step,subFFTSize,LSL #1
        @// step1 = N/4 * 8 = N/2*4 bytes
        MOV     step1,subFFTSize,LSL #2
        @// (N/4-1)*8 bytes
        SUB     step1,step1,#8

        @// F(0) = 1/2 [Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
        @// 1/2 [(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)]
        @// 1/2 [2a+j0] - j [0+j2b]
        @// (a+b, 0)

        @// F(N/2) =1/2 [Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
        @// 1/2 [(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)]
        @// 1/2 [2a+j0] + j [0+j2b]
        @// (a-b, 0)

        @// F(0) and F(N/2)
        vldm.f32 pSrc!, {x0r, x0i}
        vadd.f32 y0r,x0r,x0i                    @// F(0)   = (Z0.r+Z0.i , 0)
        vsub.f32 x0r,x0r,x0i                    @// F(N/2) = (Z0.r-Z0.i , 0)

        @// Both imaginary parts must be exactly 0.0. Do not form the zero
        @// as x - x: that yields NaN when x is NaN or +/-Inf, and y0i (s3)
        @// has not been written by this function yet (it holds whatever
        @// the caller or the stage routines left there), while x0i is raw
        @// input. Build +0.0 in a core register instead. pTwiddleTmp (r14)
        @// is free here: it is overwritten by the ADD further down before
        @// it is ever read.
        MOV     pTwiddleTmp, #0
        vmov.f32 y0i, pTwiddleTmp
        vmov.f32 x0i, pTwiddleTmp

        add      argDst, step
        vstm.f32 argDst, {x0r, x0i}             @// {x0r,x0i}->[argDst, step]
        sub      argDst, step
        vstm.f32 argDst!, {y0r, y0i}

        @// Flags from this SUBS are consumed by BLT/BEQ below; the two
        @// ADDs in between do not set flags.
        SUBS    subFFTSize,subFFTSize,#2

        ADD     pTwiddleTmp,argTwiddle,#8       @// W^2
        ADD     argTwiddle,argTwiddle,twStep    @// W^1
        BLT     End
        BEQ     lastElement


        @// F(k) = 1/2 [Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)]
        @// Process 2 elements at a time. E.g: F(1) and F(N/2-1) since
        @// both of them require Z(1) and Z(N/2-1)

        ASR     subFFTSize,subFFTSize,#1
evenOddButterflyLoop:

        SUB     step,step,#16                   @// (N/2-2)*8 bytes

        add      pSrc, step
        vldm.f32 pSrc, {x1r, x1i}               @// {x1r, x1i} = [pSrc, step]
        sub      pSrc, step
        vldm.f32 pSrc!, {x0r, x0i}
        add      argTwiddle, step1
        vldm.f32 argTwiddle, {w1r, w1i}         @// {w1r, w1i} = [argTwiddle, step1]
        sub      argTwiddle, step1
        vldm.f32 argTwiddle!, {w0r, w0i}        @// {w0r, w0i} = [argTwiddle], #8

        SUB     step1,step1,#8
        @// Loop counter; the flags survive the VFP arithmetic and MOVs
        @// below and are tested by the BGT at the bottom of the loop.
        SUBS    subFFTSize,subFFTSize,#1

        vsub.f32 st2,x0r,x1r                    @// a-c
        vadd.f32 st3,x0i,x1i                    @// b+d
        vadd.f32 st0,x0r,x1r                    @// a+c
        vsub.f32 st1,x0i,x1i                    @// b-d

        vmul.f32 x1r,w1r,st2
        vmul.f32 x1i,w1r,st3
        vmla.f32 x1r,w1i,st3                    @// x1r = w1r*st2 + w1i*st3
        @//RSB     x1r,x1r,#0
        vmls.f32 x1i,w1i,st2                    @// x1i = w1r*st3 - w1i*st2

        @// y1r/y1i alias w1r/w1i (s2/s3); the twiddle is no longer needed.
        vsub.f32 y1r, st0, x1i
        vadd.f32 y1i, x1r, st1
        vneg.f32 y1i, y1i

        vmul.f32 x0r,w0r,st2
        vmul.f32 x0i,w0r,st3
        vmls.f32 x0r,w0i,st3                    @// x0r = w0r*st2 - w0i*st3
        vmla.f32 x0i,w0i,st2                    @// x0i = w0r*st3 + w0i*st2

        vsub.f32 st4,st0,x0i                    @// F(1)
        vadd.f32 st5,x0r,st1


        @// Apply the 1/2 factor of the F(k) formula
        vmul.f32 y1r, half
        vmul.f32 y1i, half
        vmul.f32 st4, half
        vmul.f32 st5, half

        add      argDst, step
        vstm.f32 argDst, {y1r, y1i}             @// {y1r,y1i} -> [argDst,step]
        sub      argDst, step
        vstm.f32 argDst!, {st4, st5}


        MOV     t0,argTwiddle                   @// swap ptr for even and odd twiddles
        MOV     argTwiddle,pTwiddleTmp
        MOV     pTwiddleTmp,t0

        BGT     evenOddButterflyLoop

        @// Last element can be expanded as follows
        @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
        @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
        @// 1/2[2a+j0] + j (c+jd) [0+j2b]
        @// (a-bc, -bd)
        @// The code below stores (a, -b), i.e. the expression above
        @// evaluated with (c, d) = (0, 1).

lastElement:
        vldm.f32 pSrc, {x0r, x0i}
        vneg.f32 x0i, x0i
        vstm.f32 argDst, {x0r, x0i}

End:
        @// Set return value
        MOV     result, #OMX_Sts_NoErr

FunctionEnd:
        @// Write function tail
        M_END

@//    ENDIF                                           @//ARM1136JS


@// Guarding implementation by the processor name



        .end