@//
@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@//  Use of this source code is governed by a BSD-style license
@//  that can be found in the LICENSE file in the root of the source
@//  tree. An additional intellectual property rights grant can be found
@//  in the file PATENTS.  All contributing project authors may
@//  be found in the AUTHORS file in the root of the source tree.
@//
@//  This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.S
@//  to support float instead of SC32.
@//

@//
@// Description:
@// Compute a Radix 4 FFT stage for a N point complex signal
@//
@// The stage is written once as the FFTSTAGE macro and instantiated twice
@// at the bottom of the file: once for the forward transform and once for
@// the inverse transform. Only scalar VFP registers s0-s13 are used for
@// the floating point work.
@//


@// Include standard headers

#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"

@//        M_VARIANTS ARM1136JS

@// Import symbols required from other files
@// (For example tables)




@// Set debugging level
@//DEBUG_ON    SETL {TRUE}



@// Guarding implementation by the processor name

@//    IF  ARM1136JS

@//Input Registers
@// pSrc          : input data; each complex element is two floats (8 bytes)
@// pDst          : output data, same element layout
@// pTwiddle      : base of the twiddle coefficient table
@// subFFTNum     : read on entry, divided by 4 by the macro
@// subFFTSize    : read on entry, multiplied by 4 by the macro

#define pSrc            r0
#define pDst            r2
#define pTwiddle        r1
#define subFFTNum       r6
#define subFFTSize      r7



@//Output Registers
@// None are declared. On exit from FFTSTAGE, subFFTNum and subFFTSize hold
@// their updated values and pSrc/pDst have been swapped (see macro end).


@//Local Scratch Registers
@// grpCount and step share r12; outPointStep and t1 share r3.

#define grpCount        r12
#define step            r12                          /*@// Reuse grpCount*/
#define outPointStep    r3
#define setCount        r8
#define diff            r9
#define pointStep       r14

#define t1              r3                           /*@// Reuse outPointStep*/

@// Real and Imaginary parts used in the inner grp loop
#define x0r s0
#define x0i s1
#define x1r s2
#define x1i s3
#define x2r s4
#define x2i s5
#define x3r s6
#define x3i s7

@// Temporary reg to hold the twiddle multiplies

#define t0r s8
#define t0i s9
#define t2r s10
#define t2i s11
#define sr  s12
#define si  s13



@// FFTSTAGE scaled, inverse, name
@//
@//   scaled  : not referenced anywhere in the macro body.
@//   inverse : "TRUE" selects the conjugate-twiddle multiplies and the
@//             inverse output ordering; any other string selects the
@//             forward variant.
@//   name    : suffix appended to the setLoop/grpLoop labels so that each
@//             expansion of the macro gets unique labels.
@//
@//   Reads   : pSrc, pDst, pTwiddle, subFFTNum, subFFTSize
@//   Writes  : subFFTNum, subFFTSize, pSrc, pDst, r3, r8, r9, r12, r14,
@//             s0-s13, condition flags

        .MACRO FFTSTAGE scaled, inverse , name

        @// Define stack arguments
        @// (none: the macro body does not access the stack)


        @// Update grpCount and grpSize right away in order to reuse
        @// pGrpCount and pGrpSize regs

        LSL     grpCount,subFFTSize,#2                  @// grpCount = 4 * subFFTSize
        lsr     subFFTNum, subFFTNum, #2                @// subFFTNum /= 4
        mov     subFFTSize, grpCount                    @// subFFTSize *= 4 (for the caller)


        @// pT0+1 increments pT0 by 8 bytes
        @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
        mov     pointStep, subFFTNum, lsl #1            @// pointStep = 2 * subFFTNum


        @// pOut0+1 increments pOut0 by 8 bytes
        @// pOut0+outPointStep == increment of 8*outPointStep bytes = 2*size
        @// bytes

        @// Use setCount as dummy.  It's set correctly below.
        @// smull writes the low word of grpCount * pointStep to outPointStep
        @// and the high word to setCount; only the low word is used.
        smull   outPointStep, setCount, grpCount, pointStep

        LSL     pointStep,pointStep,#2                  @// 2*grpSize (now 8 * subFFTNum)


        MOV     setCount,pointStep,LSR #3               @// setCount = pointStep / 8 = subFFTNum

        @// Interchange grpLoop and setLoop
        @// The outer loop walks the sets (setCount iterations, 8 bytes
        @// apart); the inner loop walks the groups from the last one
        @// back to the first.

setLoop\name:

        MOV     step,#0
        @// Set pSrc and pDst for the grpLoop

        SUB     diff,outPointStep,pointStep

        @// NOTE(review): no stack save of setCount is performed here;
        @// setCount stays in its register for the whole macro.

        ADD     pSrc,pSrc,diff,LSL #2                   @// pSrc += (grpCount-1)*grpStep
        ADD     pDst,pDst,diff                          @// pDst += (grpCount-1)*setCount
        ADD     step,step,diff                          @// step += (grpCount-1)*setCount (step was 0, so step = diff)



        @// Loop on the grps
        @// step doubles as the twiddle byte offset and as the loop counter:
        @// it is decreased by pointStep per pass and the loop ends once it
        @// goes negative.

grpLoop\name:



        @// butterfly loop
        @// The four inputs are pointStep bytes apart starting at pSrc; the
        @// three coefficients are at pTwiddle + step, + 2*step and + 3*step.
        add      pSrc, pointStep
        vldm.f32 pSrc, {x3r, x3i}                       @// data[1]
        add      pTwiddle, step
        vldm.f32 pTwiddle, {x1r, x1i}                   @// coef[1]
        add      pTwiddle, step
        vldm.f32 pTwiddle, {x2r, x2i}                   @// coef[2]
        add      pSrc, pointStep
        vldm.f32 pSrc, {x0r, x0i}                       @// data[2]

        @// do first complex multiply
        @// forward: x1 = data[1] * coef[1]
        @// inverse: x1 = data[1] * conj(coef[1])
        vmul.f32 t0r, x3r, x1r
        vmul.f32 t0i, x3i, x1r

        .ifeqs  "\inverse", "TRUE"
            vmla.f32 t0r, x3i, x1i
            vmls.f32 t0i, x3r, x1i
            vmov.f32 x1r, t0r
            vmov.f32 x1i, t0i
        .else
            vmls.f32 t0r, x3i, x1i
            vmla.f32 t0i, x3r, x1i
            vmov.f32 x1r, t0r
            vmov.f32 x1i, t0i
        .endif

        @// x3 is free again after the first multiply, so it receives
        @// coef[3]; pTwiddle is stepped forward and back around the load.
        add      pTwiddle, pTwiddle, step
        vldm     pTwiddle, {x3r, x3i}                   @// coef[3]
        sub      pTwiddle, pTwiddle, step

        @// do second complex multiply
        @// forward: x2 = data[2] * coef[2]
        @// inverse: x2 = data[2] * conj(coef[2])
        vmul.f32 t0r, x0r, x2r
        vmul.f32 t0i, x0i, x2r

        .ifeqs  "\inverse", "TRUE"
            vmla.f32 t0r, x0i, x2i
            vmls.f32 t0i, x0r, x2i
            vmov.f32 x2r, t0r
            vmov.f32 x2i, t0i
        .else
            vmls.f32 t0r, x0i, x2i
            vmla.f32 t0i, x0r, x2i
            vmov.f32 x2r, t0r
            vmov.f32 x2i, t0i
        .endif

        add      pSrc, pointStep
        vldm     pSrc, {x0r, x0i}                       @// data[3]
        sub      pSrc, pointStep

        SUB      pTwiddle,pTwiddle,step,LSL #1          @// reset pTwiddle
        @// The flags set here are consumed by the SUBGE/BGE at the bottom
        @// of the loop. No instruction in between may update the flags.
        SUBS     step,step,pointStep                    @// decrement loop counter

        @// do third complex multiply
        @// forward: x3 = data[3] * coef[3]
        @// inverse: x3 = data[3] * conj(coef[3])
        SUB      pSrc,pSrc,pointStep,LSL #1             @// reset pSrc to data[0]
        vmul.f32 t0r, x0r, x3r
        vmul.f32 t0i, x0i, x3r

        .ifeqs  "\inverse", "TRUE"
            vmla.f32 t0r, x0i, x3i
            vmls.f32 t0i, x0r, x3i
            vmov.f32 x3r, t0r
            vmov.f32 x3i, t0i
        .else
            vmls.f32 t0r, x0i, x3i
            vmla.f32 t0i, x0r, x3i
            vmov.f32 x3r, t0r
            vmov.f32 x3i, t0i
        .endif

        vldm     pSrc, {x0r, x0i}                       @// data[0]

        @// finish first stage of 4 point FFT
        @// Each difference is formed as (sum - 2*operand) using sr/si as
        @// scratch. No scaling is applied in this float version.
        vadd.f32 x0r,x0r,x2r                            @// x0 = x0 + x2 (u0)
        vadd.f32 x0i,x0i,x2i

        vadd.f32 sr, x2r, x2r
        vadd.f32 si, x2i, x2i
        vsub.f32 x2r,x0r,sr                             @// x2 = x0 - x2 (u1)
        vsub.f32 x2i,x0i,si

        vadd.f32 x1r,x1r,x3r                            @// x1 = x1 + x3 (u2)
        vadd.f32 x1i,x1i,x3i

        vadd.f32 sr, x3r, x3r
        vadd.f32 si, x3i, x3i
        vsub.f32 x3r,x1r,sr                             @// x3 = x1 - x3 (u3)
        vsub.f32 x3i,x1i,si


        @// finish second stage of 4 point FFT

        @// y0 = u1-u2 since twiddles are stored as -ve values
        vsub.f32 x2r,x2r,x1r
        vsub.f32 x2i,x2i,x1i

        vadd.f32 sr, x1r, x1r
        vadd.f32 si, x1i, x1i
        vadd.f32 x1r,x2r,sr                             @// y2 = u1+u2
        vadd.f32 x1i,x2i,si
        vstm     pDst, {x2r, x2i}                       @// store y0

        vsub.f32 x0r,x0r,x3i                            @// y3 = u0+ju3
        vadd.f32 x0i,x0i,x3r

        vadd.f32 sr, x3r, x3r
        vadd.f32 si, x3i, x3i
        vadd.f32 t2r,x0r,si                             @// y1 = u0-ju3
        vsub.f32 t2i,x0i,sr                             @// t2 is s10/s11 here, distinct from x2 (s4/s5)

        @// The four results go to pDst, pDst + outPointStep,
        @// pDst + 2*outPointStep and pDst + 3*outPointStep. The forward and
        @// inverse variants swap which of u0-ju3 / u0+ju3 lands in the
        @// second and fourth positions.
        .ifeqs  "\inverse", "TRUE"
            add  pDst, outPointStep
            vstm pDst, {t2r, t2i}                       @// store y1 (u0-ju3)
            add  pDst, outPointStep
            vstm pDst, {x1r, x1i}                       @// store y2
            add  pDst, outPointStep
            vstm pDst, {x0r, x0i}                       @// store y3 (u0+ju3)
            sub  pDst, outPointStep
        .else
            add  pDst, outPointStep
            vstm pDst, {x0r, x0i}                       @// second output: u0+ju3
            add  pDst, outPointStep
            vstm pDst, {x1r, x1i}                       @// store y2
            add  pDst, outPointStep
            vstm pDst, {t2r, t2i}                       @// fourth output: u0-ju3
            sub  pDst, outPointStep
        .endif

        SUB     pDst,pDst,outPointStep, LSL #1          @// reset pDst
        @// The GE conditions below test the flags from SUBS step above.
        @// On the final pass (step went negative) the pointers are left
        @// untouched.
        @// update the pDst for the next grp
        SUBGE   pDst,pDst,pointStep
        @// update the pSrc for the next grp
        SUBGE   pSrc,pSrc,pointStep,LSL #2


        BGE     grpLoop\name

        ADD     pSrc,pSrc,#8                            @// pSrc += 1; for the next set
        ADD     pDst,pDst,#8                            @// pDst += 1; for the next set

        SUBS    setCount,setCount,#1                    @// decrement loop counter


        BGT     setLoop\name

        @// Reset and Swap pSrc and pDst for the next stage
        @// Both pointers advanced by 8 bytes per set, i.e. 8 * subFFTNum in
        @// total, which is subtracted again here. t1 shares r3 with
        @// outPointStep, which is no longer needed.
        MOV     t1,pDst
        SUB     pDst,pSrc,subFFTNum,LSL #3
        SUB     pSrc,t1,subFFTNum,LSL #3

        .endm


        @// Forward and inverse entry points. M_START / M_END come from
        @// armCOMM_s.h.
        @// NOTE(review): the r4 argument presumably names the highest
        @// register M_START preserves; the macro also writes r6-r9, r12
        @// and r14, so confirm the register contract against armCOMM_s.h
        @// and the callers of these unsafe routines.

        M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp,r4
        FFTSTAGE "FALSE","FALSE",FWD
        M_END

        M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp,r4
        FFTSTAGE "FALSE","TRUE",INV
        M_END


@//    ENDIF                                                           @//ARM1136JS



@// Guarding implementation by the processor name

    .end