Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 
     13 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
     14 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/fft.h"
     15 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h"
     16 
     17 // Tables are defined in transform_tables.c file.
     18 // Cosine table 1 in Q14.
     19 extern const int16_t WebRtcIsacfix_kCosTab1[FRAMESAMPLES/2];
     20 // Sine table 1 in Q14.
     21 extern const int16_t WebRtcIsacfix_kSinTab1[FRAMESAMPLES/2];
     22 // Sine table 2 in Q14.
     23 extern const int16_t WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4];
     24 
     25 static inline int32_t ComplexMulAndFindMaxNeon(int16_t* inre1Q9,
     26                                                int16_t* inre2Q9,
     27                                                int32_t* outreQ16,
     28                                                int32_t* outimQ16) {
     29   int k;
     30   const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0];
     31   const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0];
     32   // 0.5 / sqrt(240) in Q19 is round((.5 / sqrt(240)) * (2^19)) = 16921.
     33   // Use "16921 << 5" and vqdmulh, instead of ">> 26" as in the C code.
     34   int32_t fact  = 16921 << 5;
     35   int32x4_t factq = vdupq_n_s32(fact);
     36   uint32x4_t max_r = vdupq_n_u32(0);
     37   uint32x4_t max_i = vdupq_n_u32(0);
     38 
     39   for (k = 0; k < FRAMESAMPLES/2; k += 8) {
     40     int16x8_t tmpr = vld1q_s16(kCosTab);
     41     int16x8_t tmpi = vld1q_s16(kSinTab);
     42     int16x8_t inre1 = vld1q_s16(inre1Q9);
     43     int16x8_t inre2 = vld1q_s16(inre2Q9);
     44     kCosTab += 8;
     45     kSinTab += 8;
     46     inre1Q9 += 8;
     47     inre2Q9 += 8;
     48 
     49     // Use ">> 26", instead of ">> 7", ">> 16" and then ">> 3" as in the C code.
     50     int32x4_t tmp0 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre1));
     51     int32x4_t tmp1 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre2));
     52     tmp0 = vmlal_s16(tmp0, vget_low_s16(tmpi), vget_low_s16(inre2));
     53     tmp1 = vmlsl_s16(tmp1, vget_low_s16(tmpi), vget_low_s16(inre1));
     54 #if defined(WEBRTC_ARCH_ARM64)
     55     int32x4_t tmp2 = vmull_high_s16(tmpr, inre1);
     56     int32x4_t tmp3 = vmull_high_s16(tmpr, inre2);
     57     tmp2 = vmlal_high_s16(tmp2, tmpi, inre2);
     58     tmp3 = vmlsl_high_s16(tmp3, tmpi, inre1);
     59 #else
     60     int32x4_t tmp2 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre1));
     61     int32x4_t tmp3 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre2));
     62     tmp2 = vmlal_s16(tmp2, vget_high_s16(tmpi), vget_high_s16(inre2));
     63     tmp3 = vmlsl_s16(tmp3, vget_high_s16(tmpi), vget_high_s16(inre1));
     64 #endif
     65 
     66     int32x4_t outr_0 = vqdmulhq_s32(tmp0, factq);
     67     int32x4_t outr_1 = vqdmulhq_s32(tmp2, factq);
     68     int32x4_t outi_0 = vqdmulhq_s32(tmp1, factq);
     69     int32x4_t outi_1 = vqdmulhq_s32(tmp3, factq);
     70     vst1q_s32(outreQ16, outr_0);
     71     outreQ16 += 4;
     72     vst1q_s32(outreQ16, outr_1);
     73     outreQ16 += 4;
     74     vst1q_s32(outimQ16, outi_0);
     75     outimQ16 += 4;
     76     vst1q_s32(outimQ16, outi_1);
     77     outimQ16 += 4;
     78 
     79     // Find the absolute maximum in the vectors.
     80     tmp0 = vabsq_s32(outr_0);
     81     tmp1 = vabsq_s32(outr_1);
     82     tmp2 = vabsq_s32(outi_0);
     83     tmp3 = vabsq_s32(outi_1);
     84     // vabs doesn't change the value of 0x80000000.
     85     // Use u32 so we don't lose the value 0x80000000.
     86     max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
     87     max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
     88     max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
     89     max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));
     90   }
     91 
     92   max_r = vmaxq_u32(max_r, max_i);
     93 #if defined(WEBRTC_ARCH_ARM64)
     94   uint32_t maximum = vmaxvq_u32(max_r);
     95 #else
     96   uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
     97   max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
     98   uint32_t maximum = vget_lane_u32(max32x2_r, 0);
     99 #endif
    100 
    101   return (int32_t)maximum;
    102 }
    103 
    104 static inline void PreShiftW32toW16Neon(int32_t* inre,
    105                                         int32_t* inim,
    106                                         int16_t* outre,
    107                                         int16_t* outim,
    108                                         int32_t sh) {
    109   int k;
    110   int32x4_t sh32x4 = vdupq_n_s32(sh);
    111   for (k = 0; k < FRAMESAMPLES/2; k += 16) {
    112     int32x4x4_t inre32x4x4 = vld4q_s32(inre);
    113     int32x4x4_t inim32x4x4 = vld4q_s32(inim);
    114     inre += 16;
    115     inim += 16;
    116     inre32x4x4.val[0] = vrshlq_s32(inre32x4x4.val[0], sh32x4);
    117     inre32x4x4.val[1] = vrshlq_s32(inre32x4x4.val[1], sh32x4);
    118     inre32x4x4.val[2] = vrshlq_s32(inre32x4x4.val[2], sh32x4);
    119     inre32x4x4.val[3] = vrshlq_s32(inre32x4x4.val[3], sh32x4);
    120     inim32x4x4.val[0] = vrshlq_s32(inim32x4x4.val[0], sh32x4);
    121     inim32x4x4.val[1] = vrshlq_s32(inim32x4x4.val[1], sh32x4);
    122     inim32x4x4.val[2] = vrshlq_s32(inim32x4x4.val[2], sh32x4);
    123     inim32x4x4.val[3] = vrshlq_s32(inim32x4x4.val[3], sh32x4);
    124     int16x4x4_t outre16x4x4;
    125     int16x4x4_t outim16x4x4;
    126     outre16x4x4.val[0]  = vmovn_s32(inre32x4x4.val[0]);
    127     outre16x4x4.val[1]  = vmovn_s32(inre32x4x4.val[1]);
    128     outre16x4x4.val[2]  = vmovn_s32(inre32x4x4.val[2]);
    129     outre16x4x4.val[3]  = vmovn_s32(inre32x4x4.val[3]);
    130     outim16x4x4.val[0]  = vmovn_s32(inim32x4x4.val[0]);
    131     outim16x4x4.val[1]  = vmovn_s32(inim32x4x4.val[1]);
    132     outim16x4x4.val[2]  = vmovn_s32(inim32x4x4.val[2]);
    133     outim16x4x4.val[3]  = vmovn_s32(inim32x4x4.val[3]);
    134     vst4_s16(outre, outre16x4x4);
    135     vst4_s16(outim, outim16x4x4);
    136     outre += 16;
    137     outim += 16;
    138   }
    139 }
    140 
    141 static inline void PostShiftAndSeparateNeon(int16_t* inre,
    142                                             int16_t* inim,
    143                                             int16_t* outre,
    144                                             int16_t* outim,
    145                                             int32_t sh) {
    146   int k;
    147   int16_t* inre1 = inre;
    148   int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4];
    149   int16_t* inim1 = inim;
    150   int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4];
    151   int16_t* outre1 = outre;
    152   int16_t* outre2 = &outre[FRAMESAMPLES/2 - 4];
    153   int16_t* outim1 = outim;
    154   int16_t* outim2 = &outim[FRAMESAMPLES/2 - 4];
    155   const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0];
    156   const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 -4];
    157   // By vshl, we effectively did "<< (-sh - 23)", instead of "<< (-sh)",
    158   // ">> 14" and then ">> 9" as in the C code.
    159   int32x4_t shift = vdupq_n_s32(-sh - 23);
    160 
    161   for (k = 0; k < FRAMESAMPLES/4; k += 4) {
    162     int16x4_t tmpi = vld1_s16(kSinTab1);
    163     kSinTab1 += 4;
    164     int16x4_t tmpr = vld1_s16(kSinTab2);
    165     kSinTab2 -= 4;
    166     int16x4_t inre_0 = vld1_s16(inre1);
    167     inre1 += 4;
    168     int16x4_t inre_1 = vld1_s16(inre2);
    169     inre2 -= 4;
    170     int16x4_t inim_0 = vld1_s16(inim1);
    171     inim1 += 4;
    172     int16x4_t inim_1 = vld1_s16(inim2);
    173     inim2 -= 4;
    174     tmpr = vneg_s16(tmpr);
    175     inre_1 = vrev64_s16(inre_1);
    176     inim_1 = vrev64_s16(inim_1);
    177     tmpr = vrev64_s16(tmpr);
    178 
    179     int16x4_t xr = vqadd_s16(inre_0, inre_1);
    180     int16x4_t xi = vqsub_s16(inim_0, inim_1);
    181     int16x4_t yr = vqadd_s16(inim_0, inim_1);
    182     int16x4_t yi = vqsub_s16(inre_1, inre_0);
    183 
    184     int32x4_t outr0 = vmull_s16(tmpr, xr);
    185     int32x4_t outi0 = vmull_s16(tmpi, xr);
    186     int32x4_t outr1 = vmull_s16(tmpi, yr);
    187     int32x4_t outi1 = vmull_s16(tmpi, yi);
    188     outr0 = vmlsl_s16(outr0, tmpi, xi);
    189     outi0 = vmlal_s16(outi0, tmpr, xi);
    190     outr1 = vmlal_s16(outr1, tmpr, yi);
    191     outi1 = vmlsl_s16(outi1, tmpr, yr);
    192 
    193     outr0 = vshlq_s32(outr0, shift);
    194     outi0 = vshlq_s32(outi0, shift);
    195     outr1 = vshlq_s32(outr1, shift);
    196     outi1 = vshlq_s32(outi1, shift);
    197     outr1 = vnegq_s32(outr1);
    198 
    199     int16x4_t outre_0  = vmovn_s32(outr0);
    200     int16x4_t outim_0  = vmovn_s32(outi0);
    201     int16x4_t outre_1  = vmovn_s32(outr1);
    202     int16x4_t outim_1  = vmovn_s32(outi1);
    203     outre_1 = vrev64_s16(outre_1);
    204     outim_1 = vrev64_s16(outim_1);
    205 
    206     vst1_s16(outre1, outre_0);
    207     outre1 += 4;
    208     vst1_s16(outim1, outim_0);
    209     outim1 += 4;
    210     vst1_s16(outre2, outre_1);
    211     outre2 -= 4;
    212     vst1_s16(outim2, outim_1);
    213     outim2 -= 4;
    214   }
    215 }
    216 
    217 void WebRtcIsacfix_Time2SpecNeon(int16_t* inre1Q9,
    218                                  int16_t* inre2Q9,
    219                                  int16_t* outreQ7,
    220                                  int16_t* outimQ7) {
    221   int32_t tmpreQ16[FRAMESAMPLES/2], tmpimQ16[FRAMESAMPLES/2];
    222   int32_t max;
    223   int32_t sh;
    224 
    225   // Multiply with complex exponentials and combine into one complex vector.
    226   // And find the maximum.
    227   max = ComplexMulAndFindMaxNeon(inre1Q9, inre2Q9, tmpreQ16, tmpimQ16);
    228 
    229   sh = (int32_t)WebRtcSpl_NormW32(max);
    230   sh = sh - 24;
    231 
    232   // If sh becomes >= 0, then we should shift sh steps to the left,
    233   // and the domain will become Q(16 + sh).
    234   // If sh becomes < 0, then we should shift -sh steps to the right,
    235   // and the domain will become Q(16 + sh).
    236   PreShiftW32toW16Neon(tmpreQ16, tmpimQ16, inre1Q9, inre2Q9, sh);
    237 
    238   // Get DFT.
    239   WebRtcIsacfix_FftRadix16Fastest(inre1Q9, inre2Q9, -1);
    240 
    241   // If sh >= 0, shift sh steps to the right,
    242   // If sh < 0, shift -sh steps to the left.
    243   // Use symmetry to separate into two complex vectors
    244   // and center frames in time around zero.
    245   PostShiftAndSeparateNeon(inre1Q9, inre2Q9, outreQ7, outimQ7, sh);
    246 }
    247 
    248 static inline int32_t TransformAndFindMaxNeon(int16_t* inre,
    249                                               int16_t* inim,
    250                                               int32_t* outre,
    251                                               int32_t* outim) {
    252   int k;
    253   int16_t* inre1 = inre;
    254   int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4];
    255   int16_t* inim1 = inim;
    256   int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4];
    257   int32_t* outre1 = outre;
    258   int32_t* outre2 = &outre[FRAMESAMPLES/2 - 4];
    259   int32_t* outim1 = outim;
    260   int32_t* outim2 = &outim[FRAMESAMPLES/2 - 4];
    261   const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0];
    262   const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 - 4];
    263   uint32x4_t max_r = vdupq_n_u32(0);
    264   uint32x4_t max_i = vdupq_n_u32(0);
    265 
    266   // Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code.
    267   for (k = 0; k < FRAMESAMPLES/4; k += 4) {
    268     int16x4_t tmpi = vld1_s16(kSinTab1);
    269     kSinTab1 += 4;
    270     int16x4_t tmpr = vld1_s16(kSinTab2);
    271     kSinTab2 -= 4;
    272     int16x4_t inre_0 = vld1_s16(inre1);
    273     inre1 += 4;
    274     int16x4_t inre_1 = vld1_s16(inre2);
    275     inre2 -= 4;
    276     int16x4_t inim_0 = vld1_s16(inim1);
    277     inim1 += 4;
    278     int16x4_t inim_1 = vld1_s16(inim2);
    279     inim2 -= 4;
    280     tmpr = vneg_s16(tmpr);
    281     inre_1 = vrev64_s16(inre_1);
    282     inim_1 = vrev64_s16(inim_1);
    283     tmpr = vrev64_s16(tmpr);
    284 
    285     int32x4_t xr = vmull_s16(tmpr, inre_0);
    286     int32x4_t xi = vmull_s16(tmpr, inim_0);
    287     int32x4_t yr = vmull_s16(tmpr, inim_1);
    288     int32x4_t yi = vmull_s16(tmpi, inim_1);
    289     xr = vmlal_s16(xr, tmpi, inim_0);
    290     xi = vmlsl_s16(xi, tmpi, inre_0);
    291     yr = vmlal_s16(yr, tmpi, inre_1);
    292     yi = vmlsl_s16(yi, tmpr, inre_1);
    293     yr = vnegq_s32(yr);
    294 
    295     xr = vshrq_n_s32(xr, 5);
    296     xi = vshrq_n_s32(xi, 5);
    297     yr = vshrq_n_s32(yr, 5);
    298     yi = vshrq_n_s32(yi, 5);
    299 
    300     int32x4_t outr0 = vsubq_s32(xr, yi);
    301     int32x4_t outr1 = vaddq_s32(xr, yi);
    302     int32x4_t outi0 = vaddq_s32(xi, yr);
    303     int32x4_t outi1 = vsubq_s32(yr, xi);
    304 
    305     // Find the absolute maximum in the vectors.
    306     int32x4_t tmp0 = vabsq_s32(outr0);
    307     int32x4_t tmp1 = vabsq_s32(outr1);
    308     int32x4_t tmp2 = vabsq_s32(outi0);
    309     int32x4_t tmp3 = vabsq_s32(outi1);
    310     // vabs doesn't change the value of 0x80000000.
    311     // Use u32 so we don't lose the value 0x80000000.
    312     max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
    313     max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
    314     max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
    315     max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));
    316 
    317     // Store the vectors.
    318     outr1 = vrev64q_s32(outr1);
    319     outi1 = vrev64q_s32(outi1);
    320     int32x4_t outr_1 = vcombine_s32(vget_high_s32(outr1), vget_low_s32(outr1));
    321     int32x4_t outi_1 = vcombine_s32(vget_high_s32(outi1), vget_low_s32(outi1));
    322 
    323     vst1q_s32(outre1, outr0);
    324     outre1 += 4;
    325     vst1q_s32(outim1, outi0);
    326     outim1 += 4;
    327     vst1q_s32(outre2, outr_1);
    328     outre2 -= 4;
    329     vst1q_s32(outim2, outi_1);
    330     outim2 -= 4;
    331   }
    332 
    333   max_r = vmaxq_u32(max_r, max_i);
    334 #if defined(WEBRTC_ARCH_ARM64)
    335   uint32_t maximum = vmaxvq_u32(max_r);
    336 #else
    337   uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
    338   max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
    339   uint32_t maximum = vget_lane_u32(max32x2_r, 0);
    340 #endif
    341 
    342   return (int32_t)maximum;
    343 }
    344 
    345 static inline void PostShiftAndDivideAndDemodulateNeon(int16_t* inre,
    346                                                        int16_t* inim,
    347                                                        int32_t* outre1,
    348                                                        int32_t* outre2,
    349                                                        int32_t sh) {
    350   int k;
    351   int16_t* p_inre = inre;
    352   int16_t* p_inim = inim;
    353   int32_t* p_outre1 = outre1;
    354   int32_t* p_outre2 = outre2;
    355   const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0];
    356   const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0];
    357   int32x4_t shift = vdupq_n_s32(-sh - 16);
    358   // Divide through by the normalizing constant:
    359   // scale all values with 1/240, i.e. with 273 in Q16.
    360   // 273/65536 ~= 0.0041656
    361   // 1/240 ~= 0.0041666
    362   int16x8_t scale = vdupq_n_s16(273);
    363   // Sqrt(240) in Q11 is round(15.49193338482967 * 2048) = 31727.
    364   int factQ19 = 31727 << 16;
    365   int32x4_t fact = vdupq_n_s32(factQ19);
    366 
    367   for (k = 0; k < FRAMESAMPLES/2; k += 8) {
    368     int16x8_t inre16x8 = vld1q_s16(p_inre);
    369     int16x8_t inim16x8 = vld1q_s16(p_inim);
    370     p_inre += 8;
    371     p_inim += 8;
    372     int16x8_t tmpr = vld1q_s16(kCosTab);
    373     int16x8_t tmpi = vld1q_s16(kSinTab);
    374     kCosTab += 8;
    375     kSinTab += 8;
    376     // By vshl and vmull, we effectively did "<< (-sh - 16)",
    377     // instead of "<< (-sh)" and ">> 16" as in the C code.
    378     int32x4_t outre1_0 = vmull_s16(vget_low_s16(inre16x8), vget_low_s16(scale));
    379     int32x4_t outre2_0 = vmull_s16(vget_low_s16(inim16x8), vget_low_s16(scale));
    380 #if defined(WEBRTC_ARCH_ARM64)
    381     int32x4_t outre1_1 = vmull_high_s16(inre16x8, scale);
    382     int32x4_t outre2_1 = vmull_high_s16(inim16x8, scale);
    383 #else
    384     int32x4_t outre1_1 = vmull_s16(vget_high_s16(inre16x8),
    385                                    vget_high_s16(scale));
    386     int32x4_t outre2_1 = vmull_s16(vget_high_s16(inim16x8),
    387                                    vget_high_s16(scale));
    388 #endif
    389 
    390     outre1_0 = vshlq_s32(outre1_0, shift);
    391     outre1_1 = vshlq_s32(outre1_1, shift);
    392     outre2_0 = vshlq_s32(outre2_0, shift);
    393     outre2_1 = vshlq_s32(outre2_1, shift);
    394 
    395     // Demodulate and separate.
    396     int32x4_t tmpr_0 = vmovl_s16(vget_low_s16(tmpr));
    397     int32x4_t tmpi_0 = vmovl_s16(vget_low_s16(tmpi));
    398 #if defined(WEBRTC_ARCH_ARM64)
    399     int32x4_t tmpr_1 = vmovl_high_s16(tmpr);
    400     int32x4_t tmpi_1 = vmovl_high_s16(tmpi);
    401 #else
    402     int32x4_t tmpr_1 = vmovl_s16(vget_high_s16(tmpr));
    403     int32x4_t tmpi_1 = vmovl_s16(vget_high_s16(tmpi));
    404 #endif
    405 
    406     int64x2_t xr0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre1_0));
    407     int64x2_t xi0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre2_0));
    408     int64x2_t xr2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre1_1));
    409     int64x2_t xi2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre2_1));
    410     xr0 = vmlsl_s32(xr0, vget_low_s32(tmpi_0), vget_low_s32(outre2_0));
    411     xi0 = vmlal_s32(xi0, vget_low_s32(tmpi_0), vget_low_s32(outre1_0));
    412     xr2 = vmlsl_s32(xr2, vget_low_s32(tmpi_1), vget_low_s32(outre2_1));
    413     xi2 = vmlal_s32(xi2, vget_low_s32(tmpi_1), vget_low_s32(outre1_1));
    414 
    415 #if defined(WEBRTC_ARCH_ARM64)
    416     int64x2_t xr1 = vmull_high_s32(tmpr_0, outre1_0);
    417     int64x2_t xi1 = vmull_high_s32(tmpr_0, outre2_0);
    418     int64x2_t xr3 = vmull_high_s32(tmpr_1, outre1_1);
    419     int64x2_t xi3 = vmull_high_s32(tmpr_1, outre2_1);
    420     xr1 = vmlsl_high_s32(xr1, tmpi_0, outre2_0);
    421     xi1 = vmlal_high_s32(xi1, tmpi_0, outre1_0);
    422     xr3 = vmlsl_high_s32(xr3, tmpi_1, outre2_1);
    423     xi3 = vmlal_high_s32(xi3, tmpi_1, outre1_1);
    424 #else
    425     int64x2_t xr1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre1_0));
    426     int64x2_t xi1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre2_0));
    427     int64x2_t xr3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre1_1));
    428     int64x2_t xi3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre2_1));
    429     xr1 = vmlsl_s32(xr1, vget_high_s32(tmpi_0), vget_high_s32(outre2_0));
    430     xi1 = vmlal_s32(xi1, vget_high_s32(tmpi_0), vget_high_s32(outre1_0));
    431     xr3 = vmlsl_s32(xr3, vget_high_s32(tmpi_1), vget_high_s32(outre2_1));
    432     xi3 = vmlal_s32(xi3, vget_high_s32(tmpi_1), vget_high_s32(outre1_1));
    433 #endif
    434 
    435     outre1_0 = vcombine_s32(vrshrn_n_s64(xr0, 10), vrshrn_n_s64(xr1, 10));
    436     outre2_0 = vcombine_s32(vrshrn_n_s64(xi0, 10), vrshrn_n_s64(xi1, 10));
    437     outre1_1 = vcombine_s32(vrshrn_n_s64(xr2, 10), vrshrn_n_s64(xr3, 10));
    438     outre2_1 = vcombine_s32(vrshrn_n_s64(xi2, 10), vrshrn_n_s64(xi3, 10));
    439     outre1_0 = vqdmulhq_s32(outre1_0, fact);
    440     outre2_0 = vqdmulhq_s32(outre2_0, fact);
    441     outre1_1 = vqdmulhq_s32(outre1_1, fact);
    442     outre2_1 = vqdmulhq_s32(outre2_1, fact);
    443 
    444     vst1q_s32(p_outre1, outre1_0);
    445     p_outre1 += 4;
    446     vst1q_s32(p_outre1, outre1_1);
    447     p_outre1 += 4;
    448     vst1q_s32(p_outre2, outre2_0);
    449     p_outre2 += 4;
    450     vst1q_s32(p_outre2, outre2_1);
    451     p_outre2 += 4;
    452   }
    453 }
    454 
    455 void WebRtcIsacfix_Spec2TimeNeon(int16_t* inreQ7,
    456                                  int16_t* inimQ7,
    457                                  int32_t* outre1Q16,
    458                                  int32_t* outre2Q16) {
    459   int32_t max;
    460   int32_t sh;
    461 
    462   max = TransformAndFindMaxNeon(inreQ7, inimQ7, outre1Q16, outre2Q16);
    463 
    464 
    465   sh = (int32_t)WebRtcSpl_NormW32(max);
    466   sh = sh - 24;
    467   // If sh becomes >= 0, then we should shift sh steps to the left,
    468   // and the domain will become Q(16 + sh).
    469   // If sh becomes < 0, then we should shift -sh steps to the right,
    470   // and the domain will become Q(16 + sh).
    471 
    472   // "Fastest" vectors.
    473   PreShiftW32toW16Neon(outre1Q16, outre2Q16, inreQ7, inimQ7, sh);
    474 
    475   // Get IDFT.
    476   WebRtcIsacfix_FftRadix16Fastest(inreQ7, inimQ7, 1);
    477 
    478   PostShiftAndDivideAndDemodulateNeon(inreQ7, inimQ7, outre1Q16, outre2Q16, sh);
    479 }
    480