// src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms.c

/*
 *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 *
 */

#include "dl/api/omxtypes.h"

// Radix-4 butterfly shared by every group of the inverse pass.
//
// re[k] / im[k] hold the real and imaginary parts of operand k (k = 0..3).
// With a = op0 + op2, b = op0 - op2, c = op1 + op3, d = op1 - op3, the four
// results are
//   dst              <- a + c
//   dst + quarter_n  <- b + j*d
//   dst + 2*quarter_n<- a - c
//   dst + 3*quarter_n<- b - j*d
// Each result is stored in split form: the real part at offset 0 and the
// imaginary part `n` floats later.
static void inv_radix4_butterfly_fc32(const OMX_F32 re[4],
                                      const OMX_F32 im[4],
                                      OMX_F32 *dst,
                                      OMX_INT quarter_n,
                                      OMX_INT n) {
  OMX_F32 *dst1 = dst + quarter_n;
  OMX_F32 *dst2 = dst1 + quarter_n;
  OMX_F32 *dst3 = dst2 + quarter_n;

  // a = op0 + op2, b = op0 - op2
  const OMX_F32 a_re = re[0] + re[2];
  const OMX_F32 a_im = im[0] + im[2];
  const OMX_F32 b_re = re[0] - re[2];
  const OMX_F32 b_im = im[0] - im[2];

  // c = op1 + op3, d = op1 - op3
  const OMX_F32 c_re = re[1] + re[3];
  const OMX_F32 c_im = im[1] + im[3];
  const OMX_F32 d_re = re[1] - re[3];
  const OMX_F32 d_im = im[1] - im[3];

  // a + c
  dst[0] = a_re + c_re;
  dst[n] = a_im + c_im;

  // a - c
  dst2[0] = a_re - c_re;
  dst2[n] = a_im - c_im;

  // b + j*d
  dst1[0] = b_re - d_im;
  dst1[n] = b_im + d_re;

  // b - j*d
  dst3[0] = b_re + d_im;
  dst3[n] = b_im - d_re;
}

// One radix-4 pass of the inverse complex-to-complex float FFT (plain C
// version; "ms" in the name presumably stands for "middle stage" - confirm
// against the callers).
//
// Data layout, as used by the indexing below:
//   in / out : split complex; element k has its real part at [k] and its
//              imaginary part at [k + n].
//   twiddle  : split complex; real part at [k], imaginary part at [k + 2*n].
//
// Parameters:
//   in       - input buffer; group g starts at in + g * sub_num and is read
//              as four consecutive runs of sub_num / 4 elements.
//   out      - output buffer; butterfly number j (counted across all groups)
//              writes its four results at out + j + k * (n / 4), k = 0..3.
//   twiddle  - twiddle table; group g uses the entries at offsets
//              g * (sub_num / 2), 2 * g * (sub_num / 2), 3 * g * (sub_num / 2).
//   n        - distance in floats between the real and imaginary planes
//              (presumably the FFT length - confirm against the callers).
//   sub_size - number of groups to process.
//   sub_num  - number of input elements per group.
void x86SP_FFT_CToC_FC32_Inv_Radix4_ms(
    const OMX_F32 *in,
    OMX_F32 *out,
    const OMX_F32 *twiddle,
    OMX_INT n,
    OMX_INT sub_size,
    OMX_INT sub_num) {
  const OMX_INT tw_step = sub_num >> 1;
  const OMX_INT run_len = sub_num >> 2;
  const OMX_INT quarter_n = n >> 2;
  const OMX_INT tw_im = n << 1;
  OMX_F32 *dst = out;
  OMX_F32 re[4];
  OMX_F32 im[4];
  OMX_INT g;
  OMX_INT s;

  // Group 0: operands are used as-is, no twiddle multiplication is applied.
  for (s = 0; s < run_len; ++s, ++dst) {
    const OMX_F32 *x0 = in + s;
    const OMX_F32 *x1 = x0 + run_len;
    const OMX_F32 *x2 = x1 + run_len;
    const OMX_F32 *x3 = x2 + run_len;

    // Gather all inputs before anything is written to the output.
    re[0] = x0[0];
    im[0] = x0[n];
    re[1] = x1[0];
    im[1] = x1[n];
    re[2] = x2[0];
    im[2] = x2[n];
    re[3] = x3[0];
    im[3] = x3[n];

    inv_radix4_butterfly_fc32(re, im, dst, quarter_n, n);
  }

  // Remaining groups: operands 1..3 are first multiplied by the complex
  // conjugate of their twiddle factor.
  for (g = 1; g < sub_size; ++g) {
    const OMX_F32 *w1 = twiddle + g * tw_step;
    const OMX_F32 *w2 = w1 + g * tw_step;
    const OMX_F32 *w3 = w2 + g * tw_step;

    for (s = 0; s < run_len; ++s, ++dst) {
      const OMX_F32 *x0 = in + s + g * sub_num;
      const OMX_F32 *x1 = x0 + run_len;
      const OMX_F32 *x2 = x1 + run_len;
      const OMX_F32 *x3 = x2 + run_len;

      // Operand 0 is passed through untouched.
      re[0] = x0[0];
      im[0] = x0[n];

      // op1 = conj(w1) * x1
      re[1] = w1[0] * x1[0] + w1[tw_im] * x1[n];
      im[1] = w1[0] * x1[n] - w1[tw_im] * x1[0];

      // op2 = conj(w2) * x2
      re[2] = w2[0] * x2[0] + w2[tw_im] * x2[n];
      im[2] = w2[0] * x2[n] - w2[tw_im] * x2[0];

      // op3 = conj(w3) * x3
      re[3] = w3[0] * x3[0] + w3[tw_im] * x3[n];
      im[3] = w3[0] * x3[n] - w3[tw_im] * x3[0];

      inv_radix4_butterfly_fc32(re, im, dst, quarter_n, n);
    }
  }
}