@//
@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@//  Use of this source code is governed by a BSD-style license
@//  that can be found in the LICENSE file in the root of the source
@//  tree. An additional intellectual property rights grant can be found
@//  in the file PATENTS.  All contributing project authors may
@//  be found in the AUTHORS file in the root of the source tree.
@//
@//  This is a modification of armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S
@//  to support float instead of SC32.
@//

@//
@// Description:
@// Compute the first stage of a radix-8 FFT for an N-point complex signal.
@//

@// Standard headers

#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"

@//        M_VARIANTS ARM1136JS

@// No symbols (for example tables) are imported from other files.

@// Debugging level
@//DEBUG_ON    SETL {TRUE}

@// Guarding implementation by the processor name
@//    IF  ARM1136JS

@// ---------------------------------------------------------------------
@// Core-register aliases
@// ---------------------------------------------------------------------

@// Input registers

#define pSrc            r0
#define pDst            r2
#define pTwiddle        r1
#define subFFTNum       r6
#define subFFTSize      r7
#define pPingPongBuf    r5

@// Output registers: none are named separately.

@// Local scratch registers.
@// grpSize and setCount deliberately name the same register (r14).

#define grpSize         r14
#define step1           r3
#define step2           r8
#define setCount        r14             /*@// Reuse grpSize as setCount*/
#define pointStep       r12

#define t0              r4

@// ---------------------------------------------------------------------
@// VFP single-precision register aliases (real and imaginary parts)
@// ---------------------------------------------------------------------
@// t1r/t1i name the same registers as x2r/x2i (s4/s5).
@// t3r/t3i are separate registers (s8/s9) used as temporaries.
@// sr/si are scratch registers; roothalf holds a constant.

#define x0r             s0
#define x0i             s1
#define x1r             s2
#define x1i             s3
#define x2r             s4
#define x2i             s5
#define x3r             s6
#define x3i             s7
#define t3r             s8              /*@// Temporarily hold x3r and x3i*/
#define t3i             s9
#define t1r             s4
#define t1i             s5
#define sr              s10
#define si              s11
#define roothalf        s12

        @// Define macros to load/store two float regs from/to the stack.
@// M_VSTM: store the float register pair {r0, r1} to the stack slot p.
@// The slot address is built in t0 as sp + _Workspace + <p>_F, so t0 is
@// overwritten by every use.
        .macro M_VSTM r0, r1, p
        .set    _Offset, _Workspace + \p\()_F
        add     t0, sp, #_Offset
        vstm.f32 t0, {\r0, \r1}
        .endm

@// M_VLDM: load the float register pair {r0, r1} from the stack slot p.
@// Uses t0 for the slot address in the same way as M_VSTM.
        .macro M_VLDM r0, r1, p
        .set    _Offset, _Workspace + \p\()_F
        add     t0, sp, #_Offset
        vldm.f32 t0, {\r0, \r1}
        .endm

@// M_BFLY_FC32: in-place complex sum/difference of a and b:
@//     (aRe, aIm) <- a + b
@//     (bRe, bIm) <- a - b        formed as (a + b) - 2*b
@// Overwrites the scratch registers sr/si with 2*b.
        .macro M_BFLY_FC32 aRe, aIm, bRe, bIm
        vadd.f32 \aRe, \aRe, \bRe
        vadd.f32 \aIm, \aIm, \bIm
        vadd.f32 sr, \bRe, \bRe
        vadd.f32 si, \bIm, \bIm
        vsub.f32 \bRe, \aRe, sr
        vsub.f32 \bIm, \aIm, si
        .endm

@// FFTSTAGE: body of the radix-8 first stage.
@//   scaled  - not referenced inside this macro body
@//   inverse - "TRUE" selects the inverse output ordering, anything else
@//             selects the forward ordering
@//   name    - suffix appended to the loop label to keep it unique
@// On exit pSrc points at the start of the data just written through pDst,
@// and pDst has been replaced with pPingPongBuf.
        .macro FFTSTAGE scaled, inverse , name

        @// Update subFFTSize, subFFTNum and grpSize right away so that the
        @// registers can be reused.  setCount shares a register with
        @// grpSize, so it starts at the new grpSize = subFFTNum / 8.
        mov     subFFTSize, #8
        lsr     grpSize, subFFTNum, #3
        mov     subFFTNum, grpSize

        @// Byte strides.  Each complex point is two floats (8 bytes).
        lsl     pointStep, grpSize, #3        @// pointStep = 8 * grpSize
        lsl     step1, grpSize, #4            @// step1 = 2 * pointStep
        lsl     step2, pointStep, #3
        sub     step2, step2, pointStep       @// step2 = 7 * pointStep

        @// grp = 0 is a special case since all the twiddle factors are 1;
        @// the only constant needed is sqrt(1/2).
        movw    t0, #0x04f3
        movt    t0, #0x3f35
        vmov.f32 roothalf, t0                 @// roothalf = sqrt(1/2)

        @// Loop on the sets
grpZeroSetLoop\name:

        @// Load four points spaced step1 apart, starting at pSrc.
        @// Registers x0, x1, x2, x3 receive inputs 0, 2, 4, 6 of the set.
        vldm.f32 pSrc, {x0r, x0i}
        add     pSrc, pSrc, step1
        vldm.f32 pSrc, {x1r, x1i}
        add     pSrc, pSrc, step1
        vldm.f32 pSrc, {x2r, x2i}
        add     pSrc, pSrc, step1
        vldm.f32 pSrc, {x3r, x3i}
        add     pSrc, pSrc, step1

        @// 4*step1 - step2 = pointStep: move on to input 1 of the set.
        sub     pSrc, pSrc, step2

        @// First stage of the 8 point FFT; results are parked on the stack.
        M_BFLY_FC32 x0r, x0i, x2r, x2i        @// x0 = u0, x2 = u1
        M_VSTM  x0r, x0i, pU0
        M_VSTM  x2r, x2i, pU1

        M_BFLY_FC32 x1r, x1i, x3r, x3i        @// x1 = u4, x3 = u5
        M_VSTM  x1r, x1i, pU4
        M_VSTM  x3r, x3i, pU5

        @// Registers x0, x1, x2, x3 now receive inputs 1, 3, 5, 7.
        vldm.f32 pSrc, {x0r, x0i}
        add     pSrc, pSrc, step1
        vldm.f32 pSrc, {x1r, x1i}
        add     pSrc, pSrc, step1
        vldm.f32 pSrc, {x2r, x2i}
        add     pSrc, pSrc, step1
        vldm.f32 pSrc, {x3r, x3i}
        add     pSrc, pSrc, #8

        @// Net effect of the whole load sequence: pSrc advanced by 8 bytes,
        @// i.e. to input 0 of the next set.
        sub     pSrc, pSrc, step2

        M_BFLY_FC32 x0r, x0i, x2r, x2i        @// x0 = u2, x2 = u3
        M_VSTM  x2r, x2i, pU3

        M_BFLY_FC32 x1r, x1i, x3r, x3i        @// x1 = u6, x3 = u7

        @// Second and third stage of the 8 point FFT
        M_VSTM  x3r, x3i, pU7
        M_VLDM  x2r, x2i, pU0

        @// Decrement setCount.  The flags set here are consumed by the bgt
        @// at the bottom of the loop; no instruction in between is
        @// flag-setting.
        subs    setCount, setCount, #1
        M_VLDM  x3r, x3i, pU4

        M_BFLY_FC32 x0r, x0i, x1r, x1i        @// x0 = v4, x1 = v6
        M_BFLY_FC32 x2r, x2i, x3r, x3i        @// x2 = v0, x3 = v2
        M_BFLY_FC32 x2r, x2i, x0r, x0i        @// x2 = y0, x0 = y4

        vstm.f32 pDst, {x2r, x2i}             @// store y0
        add     pDst, pDst, step1

        @// x3 = (v2.r + v6.i, v2.i - v6.r)
        @// t1 = (v2.r - v6.i, v2.i + v6.r)    (t1 lives in the x2 registers)
        @// These two are y2 and y6; which is which depends on \inverse.
        vadd.f32 x3r, x3r, x1i
        vsub.f32 x3i, x3i, x1r

        vadd.f32 sr, x1r, x1r
        vadd.f32 si, x1i, x1i
        vsub.f32 t1r, x3r, si
        vadd.f32 t1i, x3i, sr

        .ifeqs  "\inverse", "TRUE"
            vstm.f32 pDst, {t1r, t1i}         @// store y2
            add     pDst, pDst, step1
            vstm.f32 pDst, {x0r, x0i}         @// store y4
            add     pDst, pDst, step1
            vstm.f32 pDst, {x3r, x3i}         @// store y6
            add     pDst, pDst, step1
        .else
            vstm.f32 pDst, {x3r, x3i}         @// store y2
            add     pDst, pDst, step1
            vstm.f32 pDst, {x0r, x0i}         @// store y4
            add     pDst, pDst, step1
            vstm.f32 pDst, {t1r, t1i}         @// store y6
            add     pDst, pDst, step1
        .endif

        sub     pDst, pDst, step2             @// set pDst to y1

        M_VLDM  x0r, x0i, pU1                 @// Load u1, u5, u7 (u3 follows)
        M_VLDM  x1r, x1i, pU5
        M_VLDM  x3r, x3i, pU7

        @// x0 = (u1.r - u5.i, u1.i + u5.r)
        @// t1 = (u1.r + u5.i, u1.i - u5.r)    (t1 lives in the x2 registers)
        vsub.f32 x0r, x0r, x1i
        vadd.f32 x0i, x0i, x1r
        vadd.f32 sr, x1r, x1r
        vadd.f32 si, x1i, x1i
        vadd.f32 t1r, x0r, si
        vsub.f32 t1i, x0i, sr

        M_VLDM  x1r, x1i, pU3

        @// x1 = (u3.r - u7.i, u3.i + u7.r)    called v5 below
        @// t3 = (u3.r + u7.i, u3.i - u7.r)    called v7 below
        vsub.f32 x1r, x1r, x3i
        vadd.f32 x1i, x1i, x3r

        vadd.f32 sr, x3r, x3r
        vadd.f32 si, x3i, x3i
        vadd.f32 t3r, x1r, si
        vsub.f32 t3i, x1i, sr

        @// Form v7 as (v7.i + v7.r, v7.i - v7.r)
        vadd.f32 x3r, t3i, t3r
        vsub.f32 x3i, t3i, t3r

        @// Form v5 as (v5.r - v5.i, v5.r + v5.i); the second component is
        @// built as (v5.r - v5.i) + 2*v5.i.
        vsub.f32 x1r, x1r, x1i
        vadd.f32 x1i, x1i, x1i
        vadd.f32 x1i, x1r, x1i

        vmul.f32 x3r, x3r, roothalf           @// (v7.i + v7.r)*(1/sqrt(2))
        vmul.f32 x3i, x3i, roothalf           @// (v7.i - v7.r)*(1/sqrt(2))
        vmul.f32 x1r, x1r, roothalf           @// (v5.r - v5.i)*(1/sqrt(2))
        vmul.f32 x1i, x1i, roothalf           @// (v5.r + v5.i)*(1/sqrt(2))

        @// x2 = t1 + x3, x3 = t1 - x3
        M_BFLY_FC32 x2r, x2i, x3r, x3i

        @// x0 = x0 - x1, then x1 = (x0 - x1) + 2*x1 = original x0 + x1
        vsub.f32 x0r, x0r, x1r
        vsub.f32 x0i, x0i, x1i

        vadd.f32 sr, x1r, x1r
        vadd.f32 si, x1i, x1i
        vadd.f32 x1r, x0r, sr
        vadd.f32 x1i, x0i, si

        .ifeqs  "\inverse", "TRUE"
            vstm.f32 pDst, {x1r, x1i}         @// store y1
            add     pDst, pDst, step1
            vstm.f32 pDst, {x3r, x3i}         @// store y3
            add     pDst, pDst, step1
            vstm.f32 pDst, {x0r, x0i}         @// store y5
            add     pDst, pDst, step1
            vstm.f32 pDst, {x2r, x2i}         @// store y7
            add     pDst, pDst, #8
        .else
            vstm.f32 pDst, {x2r, x2i}         @// store y1
            add     pDst, pDst, step1
            vstm.f32 pDst, {x0r, x0i}         @// store y3
            add     pDst, pDst, step1
            vstm.f32 pDst, {x3r, x3i}         @// store y5
            add     pDst, pDst, step1
            vstm.f32 pDst, {x1r, x1i}         @// store y7
            add     pDst, pDst, #8
        .endif

        sub     pDst, pDst, step2             @// update pDst for the next set

        bgt     grpZeroSetLoop\name           @// flags from the subs above

        @// Reset pSrc for the next stage: pDst has advanced 8 bytes per set,
        @// so stepping back by pointStep gives the start of the output.
        sub     pSrc, pDst, pointStep
        mov     pDst, pPingPongBuf

        .endm


        @// Allocate stack memory required by the function

        @// Ensure 8 byte alignment to use M_VLDM
        M_ALLOC8    pU0, 8
        M_ALLOC8    pU1, 8
        M_ALLOC8    pU3, 8
        M_ALLOC8    pU4, 8
        M_ALLOC8    pU5, 8
        M_ALLOC8    pU7, 8

        @// Forward transform entry point
        M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp,r4
            FFTSTAGE "FALSE","FALSE",FWD
        M_END

        @// Allocate stack memory required by the function

        @// Ensure 8 byte alignment to use M_VLDM
        M_ALLOC8    pU0, 8
        M_ALLOC8    pU1, 8
        M_ALLOC8    pU3, 8
        M_ALLOC8    pU4, 8
        M_ALLOC8    pU5, 8
        M_ALLOC8    pU7, 8

        @// Inverse transform entry point
        M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp,r4
            FFTSTAGE "FALSE","TRUE",INV
        M_END

@// ENDIF                                                    @//ARM1136JS

@// Guarding implementation by the processor name

    .end