1 /* 2 * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 * 10 */ 11 12 #include <stdbool.h> 13 #include <stdint.h> 14 15 #include "dl/api/omxtypes.h" 16 #include "dl/sp/api/omxSP.h" 17 #include "dl/sp/api/x86SP.h" 18 #include "dl/sp/src/x86/x86SP_SSE_Math.h" 19 20 extern OMX_F32* x86SP_F32_radix2_kernel_OutOfPlace( 21 const OMX_F32 *src, 22 OMX_F32 *buf1, 23 OMX_F32 *buf2, 24 const OMX_F32 *twiddle, 25 OMX_INT n, 26 bool forward_fft); 27 28 extern OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace_sse( 29 const OMX_F32 *src, 30 OMX_F32 *buf1, 31 OMX_F32 *buf2, 32 const OMX_F32 *twiddle, 33 OMX_INT n, 34 bool forward_fft); 35 36 /** 37 * A two-for-one algorithm is used here to do the real ifft: 38 * 39 * Input X[k], (k = 0, ..., N - 1) 40 * Output x[n] = IDFT(N, k){X} 41 * X' is complex conjugate of X 42 * A[k] = (X[k] + X'[N/2 - k]) / 2 43 * B[k] = (X[k] - X'[N/2 - k]) / 2 * W[k], (W = exp(j*2*PI*k/N); 44 * k = 0, ..., N/2 - 1) 45 * Z[k] = A[k] + j * B[k], (k = 0, ..., N/2 - 1) 46 * z[n] = IDFT(N/2, k){Z} 47 * x[2n] = Re(z[n]), (n = 0, ..., N/2 - 1) 48 * x[2n + 1] = Im(z[n]), (n = 0, ..., N/2 - 1) 49 */ 50 51 /** 52 * This function is the first permutation of two-for-one IFFT algorithm. 53 * We move the division by 2 to the last step in the implementation, so: 54 * A[k] = (X[k] + X'[N/2 - k]) 55 * B[k] = (X[k] - X'[N/2 - k]) * W[k], (k = 0, ..., N/2 - 1) 56 * Z[k] = (A[k] + j * B[k]) / 2, (k = 0, ..., N/2 - 1) 57 */ 58 static void RevbinPermuteInv(const OMX_F32 *in, 59 OMX_F32 *out, 60 const OMX_F32 *twiddle, 61 OMX_INT n) { 62 OMX_INT i; 63 OMX_INT j; 64 OMX_INT i_by_2; 65 OMX_INT j_by_2; 66 OMX_INT n_by_2 = n >> 1; 67 OMX_INT n_by_4 = n >> 2; 68 69 OMX_FC32 big_a; 70 OMX_FC32 big_b; 71 OMX_FC32 temp; 72 const OMX_F32 *tw; 73 74 for (i = 2, j = n - 2; i < n_by_2; i += 2, j -= 2) { 75 // A[k] = (X[k] + X'[N/2 - k]) 76 big_a.Re = in[i] + in[j]; 77 big_a.Im = in[i + 1] - in[j + 1]; 78 79 // temp = (X[k] - X'[N/2 - k]) 80 temp.Re = in[i] - in[j]; 81 temp.Im = in[i + 1] + in[j + 1]; 82 83 i_by_2 = i >> 1; 84 j_by_2 = j >> 1; 85 86 // W[k] 87 tw = twiddle + i_by_2; 88 89 // B[k] = (X[k] - X'[N/2 - k]) * W[k] 90 big_b.Re = temp.Re * tw[0] + temp.Im * tw[n]; 91 big_b.Im = temp.Re * tw[n] - temp.Im * tw[0]; 92 93 // Convert split format to interleaved format. 94 // Z[k] = (A[k] + j * B[k]) (k = 0, ..., N/2 - 1) 95 // The scaling of 1/2 will be merged into to the scaling in 96 // the last step before the output in omxSP_FFTInv_CCSToR_F32_Sfs. 97 out[i_by_2] = big_a.Re + big_b.Im; 98 out[i_by_2 + n_by_2] = big_b.Re + big_a.Im; 99 out[j_by_2] = big_a.Re - big_b.Im; 100 out[j_by_2 + n_by_2] = big_b.Re - big_a.Im; 101 } 102 103 // The n_by_2 complex point 104 out[n_by_4] = 2.0f * in[n_by_2]; 105 out[n_by_4 + n_by_2] = -2.0f * in[n_by_2 + 1]; 106 107 // The first complex point 108 out[0] = in[0] + in[n]; 109 out[n_by_2] = in[0] - in[n]; 110 } 111 112 // Sse version of RevbinPermuteInv function. 113 static void RevbinPermuteInvSse(const OMX_F32 *in, 114 OMX_F32 *out, 115 const OMX_F32 *twiddle, 116 OMX_INT n) { 117 OMX_INT i; 118 OMX_INT j; 119 OMX_INT n_by_2 = n >> 1; 120 OMX_INT n_by_4 = n >> 2; 121 const OMX_F32 *tw; 122 const OMX_F32 *pi; 123 const OMX_F32 *pj; 124 125 VC v_i; 126 VC v_j; 127 VC v_big_a; 128 VC v_big_b; 129 VC v_temp; 130 VC v_tw; 131 132 for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) { 133 pi = in + (i << 1); 134 pj = in + (j << 1); 135 VC_LOAD_INTERLEAVE(&v_i, pi); 136 137 v_j.real = _mm_set_ps(pj[0], pj[2], pj[4], pj[6]); 138 v_j.imag = _mm_set_ps(pj[1], pj[3], pj[5], pj[7]); 139 140 // A[k] = (X[k] + X'[N/2 - k]) 141 VC_ADD_SUB(&v_big_a, &v_i, &v_j); 142 143 // temp = (X[k] - X'[N/2 - k]) 144 VC_SUB_ADD(&v_temp, &v_i, &v_j); 145 146 // W[k] 147 tw = twiddle + i; 148 VC_LOAD_SPLIT(&v_tw, tw, n); 149 150 // B[k] = (X[k] - X'[N/2 - k]) * W[k] 151 VC_CONJ_MUL(&v_big_b, &v_temp, &v_tw); 152 153 // Convert split format to interleaved format. 154 // Z[k] = (A[k] + j * B[k]) (k = 0, ..., N/2 - 1) 155 // The scaling of 1/2 will be merged into to the scaling in 156 // the last step before the output in omxSP_FFTInv_CCSToR_F32_Sfs. 157 VC_ADD_X_STORE_SPLIT((out + i), &v_big_a, &v_big_b, n_by_2); 158 159 VC_SUB_X_INVERSE_STOREU_SPLIT((out + j), &v_big_a, &v_big_b, n_by_2); 160 } 161 162 // The n_by_2 complex point 163 out[n_by_4] = 2.0f * in[n_by_2]; 164 out[n_by_4 + n_by_2] = -2.0f * in[n_by_2 + 1]; 165 166 // The first complex point 167 out[0] = in[0] + in[n]; 168 out[n_by_2] = in[0] - in[n]; 169 } 170 171 OMXResult omxSP_FFTInv_CCSToR_F32_Sfs(const OMX_F32 *pSrc, OMX_F32 *pDst, 172 const OMXFFTSpec_R_F32 *pFFTSpec) { 173 OMX_INT n; 174 OMX_INT n_by_2; 175 OMX_INT n_by_4; 176 OMX_INT i; 177 const OMX_F32 *twiddle; 178 OMX_F32 *buf; 179 OMX_F32 *in = (OMX_F32*) pSrc; 180 181 const X86FFTSpec_R_FC32 *pFFTStruct = (const X86FFTSpec_R_FC32*) pFFTSpec; 182 183 // Input must be 32 byte aligned 184 if (!pSrc || !pDst || (const uintptr_t)pSrc & 31 || (uintptr_t)pDst & 31) 185 return OMX_Sts_BadArgErr; 186 187 n = pFFTStruct->N; 188 189 // This is to handle the case of order == 1. 190 if (n == 2) { 191 pDst[0] = (pSrc[0] + pSrc[2]) / 2; 192 pDst[1] = (pSrc[0] - pSrc[2]) / 2; 193 return OMX_Sts_NoErr; 194 } 195 196 n_by_2 = n >> 1; 197 n_by_4 = n >> 2; 198 buf = pFFTStruct->pBuf1; 199 200 twiddle = pFFTStruct->pTwiddle; 201 202 if (n < 8) 203 RevbinPermuteInv(in, buf, twiddle, n); 204 else 205 RevbinPermuteInvSse(in, buf, twiddle, n); 206 207 if (n_by_2 < 16) { 208 buf = x86SP_F32_radix2_kernel_OutOfPlace( 209 buf, 210 pFFTStruct->pBuf2, 211 buf, 212 twiddle, 213 n_by_2, 214 0); 215 } else { 216 buf = x86SP_F32_radix4_kernel_OutOfPlace_sse( 217 buf, 218 pFFTStruct->pBuf2, 219 buf, 220 twiddle, 221 n_by_2, 222 0); 223 } 224 225 // Scale the result by 1/n. 226 // It contains a scaling factor of 1/2 in 227 // RevbinPermuteInv/RevbinPermuteInvSse. 228 OMX_F32 factor = 1.0f / n; 229 230 if (n < 8) { 231 for (i = 0; i < n_by_2; i++) { 232 pDst[i << 1] = buf[i] * factor; 233 pDst[(i << 1) + 1] = buf[i + n_by_2] * factor; 234 } 235 } else { 236 OMX_F32 *base; 237 OMX_F32 *dst; 238 VC temp0; 239 VC temp1; 240 __m128 mFactor = _mm_load1_ps(&factor); 241 242 // Two things are done in this loop: 243 // 1 Get the result scaled; 2 Change the format from split to interleaved. 244 for (i = 0; i < n_by_2; i += 4) { 245 base = buf + i; 246 dst = pDst + (i << 1); 247 VC_LOAD_SPLIT(&temp0, base, n_by_2); 248 VC_MUL_F(&temp1, &temp0, mFactor); 249 VC_STORE_INTERLEAVE(dst, &temp1); 250 } 251 } 252 253 return OMX_Sts_NoErr; 254 } 255