1 /* 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <arm_neon.h> 12 13 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h" 14 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/fft.h" 15 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h" 16 17 // Tables are defined in transform_tables.c file. 18 // Cosine table 1 in Q14. 19 extern const int16_t WebRtcIsacfix_kCosTab1[FRAMESAMPLES/2]; 20 // Sine table 1 in Q14. 21 extern const int16_t WebRtcIsacfix_kSinTab1[FRAMESAMPLES/2]; 22 // Sine table 2 in Q14. 23 extern const int16_t WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4]; 24 25 static inline int32_t ComplexMulAndFindMaxNeon(int16_t* inre1Q9, 26 int16_t* inre2Q9, 27 int32_t* outreQ16, 28 int32_t* outimQ16) { 29 int k; 30 const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0]; 31 const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0]; 32 // 0.5 / sqrt(240) in Q19 is round((.5 / sqrt(240)) * (2^19)) = 16921. 33 // Use "16921 << 5" and vqdmulh, instead of ">> 26" as in the C code. 34 int32_t fact = 16921 << 5; 35 int32x4_t factq = vdupq_n_s32(fact); 36 uint32x4_t max_r = vdupq_n_u32(0); 37 uint32x4_t max_i = vdupq_n_u32(0); 38 39 for (k = 0; k < FRAMESAMPLES/2; k += 8) { 40 int16x8_t tmpr = vld1q_s16(kCosTab); 41 int16x8_t tmpi = vld1q_s16(kSinTab); 42 int16x8_t inre1 = vld1q_s16(inre1Q9); 43 int16x8_t inre2 = vld1q_s16(inre2Q9); 44 kCosTab += 8; 45 kSinTab += 8; 46 inre1Q9 += 8; 47 inre2Q9 += 8; 48 49 // Use ">> 26", instead of ">> 7", ">> 16" and then ">> 3" as in the C code. 50 int32x4_t tmp0 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre1)); 51 int32x4_t tmp1 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre2)); 52 tmp0 = vmlal_s16(tmp0, vget_low_s16(tmpi), vget_low_s16(inre2)); 53 tmp1 = vmlsl_s16(tmp1, vget_low_s16(tmpi), vget_low_s16(inre1)); 54 #if defined(WEBRTC_ARCH_ARM64) 55 int32x4_t tmp2 = vmull_high_s16(tmpr, inre1); 56 int32x4_t tmp3 = vmull_high_s16(tmpr, inre2); 57 tmp2 = vmlal_high_s16(tmp2, tmpi, inre2); 58 tmp3 = vmlsl_high_s16(tmp3, tmpi, inre1); 59 #else 60 int32x4_t tmp2 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre1)); 61 int32x4_t tmp3 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre2)); 62 tmp2 = vmlal_s16(tmp2, vget_high_s16(tmpi), vget_high_s16(inre2)); 63 tmp3 = vmlsl_s16(tmp3, vget_high_s16(tmpi), vget_high_s16(inre1)); 64 #endif 65 66 int32x4_t outr_0 = vqdmulhq_s32(tmp0, factq); 67 int32x4_t outr_1 = vqdmulhq_s32(tmp2, factq); 68 int32x4_t outi_0 = vqdmulhq_s32(tmp1, factq); 69 int32x4_t outi_1 = vqdmulhq_s32(tmp3, factq); 70 vst1q_s32(outreQ16, outr_0); 71 outreQ16 += 4; 72 vst1q_s32(outreQ16, outr_1); 73 outreQ16 += 4; 74 vst1q_s32(outimQ16, outi_0); 75 outimQ16 += 4; 76 vst1q_s32(outimQ16, outi_1); 77 outimQ16 += 4; 78 79 // Find the absolute maximum in the vectors. 80 tmp0 = vabsq_s32(outr_0); 81 tmp1 = vabsq_s32(outr_1); 82 tmp2 = vabsq_s32(outi_0); 83 tmp3 = vabsq_s32(outi_1); 84 // vabs doesn't change the value of 0x80000000. 85 // Use u32 so we don't lose the value 0x80000000. 86 max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0)); 87 max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2)); 88 max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1)); 89 max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3)); 90 } 91 92 max_r = vmaxq_u32(max_r, max_i); 93 #if defined(WEBRTC_ARCH_ARM64) 94 uint32_t maximum = vmaxvq_u32(max_r); 95 #else 96 uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r)); 97 max32x2_r = vpmax_u32(max32x2_r, max32x2_r); 98 uint32_t maximum = vget_lane_u32(max32x2_r, 0); 99 #endif 100 101 return (int32_t)maximum; 102 } 103 104 static inline void PreShiftW32toW16Neon(int32_t* inre, 105 int32_t* inim, 106 int16_t* outre, 107 int16_t* outim, 108 int32_t sh) { 109 int k; 110 int32x4_t sh32x4 = vdupq_n_s32(sh); 111 for (k = 0; k < FRAMESAMPLES/2; k += 16) { 112 int32x4x4_t inre32x4x4 = vld4q_s32(inre); 113 int32x4x4_t inim32x4x4 = vld4q_s32(inim); 114 inre += 16; 115 inim += 16; 116 inre32x4x4.val[0] = vrshlq_s32(inre32x4x4.val[0], sh32x4); 117 inre32x4x4.val[1] = vrshlq_s32(inre32x4x4.val[1], sh32x4); 118 inre32x4x4.val[2] = vrshlq_s32(inre32x4x4.val[2], sh32x4); 119 inre32x4x4.val[3] = vrshlq_s32(inre32x4x4.val[3], sh32x4); 120 inim32x4x4.val[0] = vrshlq_s32(inim32x4x4.val[0], sh32x4); 121 inim32x4x4.val[1] = vrshlq_s32(inim32x4x4.val[1], sh32x4); 122 inim32x4x4.val[2] = vrshlq_s32(inim32x4x4.val[2], sh32x4); 123 inim32x4x4.val[3] = vrshlq_s32(inim32x4x4.val[3], sh32x4); 124 int16x4x4_t outre16x4x4; 125 int16x4x4_t outim16x4x4; 126 outre16x4x4.val[0] = vmovn_s32(inre32x4x4.val[0]); 127 outre16x4x4.val[1] = vmovn_s32(inre32x4x4.val[1]); 128 outre16x4x4.val[2] = vmovn_s32(inre32x4x4.val[2]); 129 outre16x4x4.val[3] = vmovn_s32(inre32x4x4.val[3]); 130 outim16x4x4.val[0] = vmovn_s32(inim32x4x4.val[0]); 131 outim16x4x4.val[1] = vmovn_s32(inim32x4x4.val[1]); 132 outim16x4x4.val[2] = vmovn_s32(inim32x4x4.val[2]); 133 outim16x4x4.val[3] = vmovn_s32(inim32x4x4.val[3]); 134 vst4_s16(outre, outre16x4x4); 135 vst4_s16(outim, outim16x4x4); 136 outre += 16; 137 outim += 16; 138 } 139 } 140 141 static inline void PostShiftAndSeparateNeon(int16_t* inre, 142 int16_t* inim, 143 int16_t* outre, 144 int16_t* outim, 145 int32_t sh) { 146 int k; 147 int16_t* inre1 = inre; 148 int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4]; 149 int16_t* inim1 = inim; 150 int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4]; 151 int16_t* outre1 = outre; 152 int16_t* outre2 = &outre[FRAMESAMPLES/2 - 4]; 153 int16_t* outim1 = outim; 154 int16_t* outim2 = &outim[FRAMESAMPLES/2 - 4]; 155 const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0]; 156 const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 -4]; 157 // By vshl, we effectively did "<< (-sh - 23)", instead of "<< (-sh)", 158 // ">> 14" and then ">> 9" as in the C code. 159 int32x4_t shift = vdupq_n_s32(-sh - 23); 160 161 for (k = 0; k < FRAMESAMPLES/4; k += 4) { 162 int16x4_t tmpi = vld1_s16(kSinTab1); 163 kSinTab1 += 4; 164 int16x4_t tmpr = vld1_s16(kSinTab2); 165 kSinTab2 -= 4; 166 int16x4_t inre_0 = vld1_s16(inre1); 167 inre1 += 4; 168 int16x4_t inre_1 = vld1_s16(inre2); 169 inre2 -= 4; 170 int16x4_t inim_0 = vld1_s16(inim1); 171 inim1 += 4; 172 int16x4_t inim_1 = vld1_s16(inim2); 173 inim2 -= 4; 174 tmpr = vneg_s16(tmpr); 175 inre_1 = vrev64_s16(inre_1); 176 inim_1 = vrev64_s16(inim_1); 177 tmpr = vrev64_s16(tmpr); 178 179 int16x4_t xr = vqadd_s16(inre_0, inre_1); 180 int16x4_t xi = vqsub_s16(inim_0, inim_1); 181 int16x4_t yr = vqadd_s16(inim_0, inim_1); 182 int16x4_t yi = vqsub_s16(inre_1, inre_0); 183 184 int32x4_t outr0 = vmull_s16(tmpr, xr); 185 int32x4_t outi0 = vmull_s16(tmpi, xr); 186 int32x4_t outr1 = vmull_s16(tmpi, yr); 187 int32x4_t outi1 = vmull_s16(tmpi, yi); 188 outr0 = vmlsl_s16(outr0, tmpi, xi); 189 outi0 = vmlal_s16(outi0, tmpr, xi); 190 outr1 = vmlal_s16(outr1, tmpr, yi); 191 outi1 = vmlsl_s16(outi1, tmpr, yr); 192 193 outr0 = vshlq_s32(outr0, shift); 194 outi0 = vshlq_s32(outi0, shift); 195 outr1 = vshlq_s32(outr1, shift); 196 outi1 = vshlq_s32(outi1, shift); 197 outr1 = vnegq_s32(outr1); 198 199 int16x4_t outre_0 = vmovn_s32(outr0); 200 int16x4_t outim_0 = vmovn_s32(outi0); 201 int16x4_t outre_1 = vmovn_s32(outr1); 202 int16x4_t outim_1 = vmovn_s32(outi1); 203 outre_1 = vrev64_s16(outre_1); 204 outim_1 = vrev64_s16(outim_1); 205 206 vst1_s16(outre1, outre_0); 207 outre1 += 4; 208 vst1_s16(outim1, outim_0); 209 outim1 += 4; 210 vst1_s16(outre2, outre_1); 211 outre2 -= 4; 212 vst1_s16(outim2, outim_1); 213 outim2 -= 4; 214 } 215 } 216 217 void WebRtcIsacfix_Time2SpecNeon(int16_t* inre1Q9, 218 int16_t* inre2Q9, 219 int16_t* outreQ7, 220 int16_t* outimQ7) { 221 int32_t tmpreQ16[FRAMESAMPLES/2], tmpimQ16[FRAMESAMPLES/2]; 222 int32_t max; 223 int32_t sh; 224 225 // Multiply with complex exponentials and combine into one complex vector. 226 // And find the maximum. 227 max = ComplexMulAndFindMaxNeon(inre1Q9, inre2Q9, tmpreQ16, tmpimQ16); 228 229 sh = (int32_t)WebRtcSpl_NormW32(max); 230 sh = sh - 24; 231 232 // If sh becomes >= 0, then we should shift sh steps to the left, 233 // and the domain will become Q(16 + sh). 234 // If sh becomes < 0, then we should shift -sh steps to the right, 235 // and the domain will become Q(16 + sh). 236 PreShiftW32toW16Neon(tmpreQ16, tmpimQ16, inre1Q9, inre2Q9, sh); 237 238 // Get DFT. 239 WebRtcIsacfix_FftRadix16Fastest(inre1Q9, inre2Q9, -1); 240 241 // If sh >= 0, shift sh steps to the right, 242 // If sh < 0, shift -sh steps to the left. 243 // Use symmetry to separate into two complex vectors 244 // and center frames in time around zero. 245 PostShiftAndSeparateNeon(inre1Q9, inre2Q9, outreQ7, outimQ7, sh); 246 } 247 248 static inline int32_t TransformAndFindMaxNeon(int16_t* inre, 249 int16_t* inim, 250 int32_t* outre, 251 int32_t* outim) { 252 int k; 253 int16_t* inre1 = inre; 254 int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4]; 255 int16_t* inim1 = inim; 256 int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4]; 257 int32_t* outre1 = outre; 258 int32_t* outre2 = &outre[FRAMESAMPLES/2 - 4]; 259 int32_t* outim1 = outim; 260 int32_t* outim2 = &outim[FRAMESAMPLES/2 - 4]; 261 const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0]; 262 const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 - 4]; 263 uint32x4_t max_r = vdupq_n_u32(0); 264 uint32x4_t max_i = vdupq_n_u32(0); 265 266 // Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code. 267 for (k = 0; k < FRAMESAMPLES/4; k += 4) { 268 int16x4_t tmpi = vld1_s16(kSinTab1); 269 kSinTab1 += 4; 270 int16x4_t tmpr = vld1_s16(kSinTab2); 271 kSinTab2 -= 4; 272 int16x4_t inre_0 = vld1_s16(inre1); 273 inre1 += 4; 274 int16x4_t inre_1 = vld1_s16(inre2); 275 inre2 -= 4; 276 int16x4_t inim_0 = vld1_s16(inim1); 277 inim1 += 4; 278 int16x4_t inim_1 = vld1_s16(inim2); 279 inim2 -= 4; 280 tmpr = vneg_s16(tmpr); 281 inre_1 = vrev64_s16(inre_1); 282 inim_1 = vrev64_s16(inim_1); 283 tmpr = vrev64_s16(tmpr); 284 285 int32x4_t xr = vmull_s16(tmpr, inre_0); 286 int32x4_t xi = vmull_s16(tmpr, inim_0); 287 int32x4_t yr = vmull_s16(tmpr, inim_1); 288 int32x4_t yi = vmull_s16(tmpi, inim_1); 289 xr = vmlal_s16(xr, tmpi, inim_0); 290 xi = vmlsl_s16(xi, tmpi, inre_0); 291 yr = vmlal_s16(yr, tmpi, inre_1); 292 yi = vmlsl_s16(yi, tmpr, inre_1); 293 yr = vnegq_s32(yr); 294 295 xr = vshrq_n_s32(xr, 5); 296 xi = vshrq_n_s32(xi, 5); 297 yr = vshrq_n_s32(yr, 5); 298 yi = vshrq_n_s32(yi, 5); 299 300 int32x4_t outr0 = vsubq_s32(xr, yi); 301 int32x4_t outr1 = vaddq_s32(xr, yi); 302 int32x4_t outi0 = vaddq_s32(xi, yr); 303 int32x4_t outi1 = vsubq_s32(yr, xi); 304 305 // Find the absolute maximum in the vectors. 306 int32x4_t tmp0 = vabsq_s32(outr0); 307 int32x4_t tmp1 = vabsq_s32(outr1); 308 int32x4_t tmp2 = vabsq_s32(outi0); 309 int32x4_t tmp3 = vabsq_s32(outi1); 310 // vabs doesn't change the value of 0x80000000. 311 // Use u32 so we don't lose the value 0x80000000. 312 max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0)); 313 max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2)); 314 max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1)); 315 max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3)); 316 317 // Store the vectors. 318 outr1 = vrev64q_s32(outr1); 319 outi1 = vrev64q_s32(outi1); 320 int32x4_t outr_1 = vcombine_s32(vget_high_s32(outr1), vget_low_s32(outr1)); 321 int32x4_t outi_1 = vcombine_s32(vget_high_s32(outi1), vget_low_s32(outi1)); 322 323 vst1q_s32(outre1, outr0); 324 outre1 += 4; 325 vst1q_s32(outim1, outi0); 326 outim1 += 4; 327 vst1q_s32(outre2, outr_1); 328 outre2 -= 4; 329 vst1q_s32(outim2, outi_1); 330 outim2 -= 4; 331 } 332 333 max_r = vmaxq_u32(max_r, max_i); 334 #if defined(WEBRTC_ARCH_ARM64) 335 uint32_t maximum = vmaxvq_u32(max_r); 336 #else 337 uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r)); 338 max32x2_r = vpmax_u32(max32x2_r, max32x2_r); 339 uint32_t maximum = vget_lane_u32(max32x2_r, 0); 340 #endif 341 342 return (int32_t)maximum; 343 } 344 345 static inline void PostShiftAndDivideAndDemodulateNeon(int16_t* inre, 346 int16_t* inim, 347 int32_t* outre1, 348 int32_t* outre2, 349 int32_t sh) { 350 int k; 351 int16_t* p_inre = inre; 352 int16_t* p_inim = inim; 353 int32_t* p_outre1 = outre1; 354 int32_t* p_outre2 = outre2; 355 const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0]; 356 const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0]; 357 int32x4_t shift = vdupq_n_s32(-sh - 16); 358 // Divide through by the normalizing constant: 359 // scale all values with 1/240, i.e. with 273 in Q16. 360 // 273/65536 ~= 0.0041656 361 // 1/240 ~= 0.0041666 362 int16x8_t scale = vdupq_n_s16(273); 363 // Sqrt(240) in Q11 is round(15.49193338482967 * 2048) = 31727. 364 int factQ19 = 31727 << 16; 365 int32x4_t fact = vdupq_n_s32(factQ19); 366 367 for (k = 0; k < FRAMESAMPLES/2; k += 8) { 368 int16x8_t inre16x8 = vld1q_s16(p_inre); 369 int16x8_t inim16x8 = vld1q_s16(p_inim); 370 p_inre += 8; 371 p_inim += 8; 372 int16x8_t tmpr = vld1q_s16(kCosTab); 373 int16x8_t tmpi = vld1q_s16(kSinTab); 374 kCosTab += 8; 375 kSinTab += 8; 376 // By vshl and vmull, we effectively did "<< (-sh - 16)", 377 // instead of "<< (-sh)" and ">> 16" as in the C code. 378 int32x4_t outre1_0 = vmull_s16(vget_low_s16(inre16x8), vget_low_s16(scale)); 379 int32x4_t outre2_0 = vmull_s16(vget_low_s16(inim16x8), vget_low_s16(scale)); 380 #if defined(WEBRTC_ARCH_ARM64) 381 int32x4_t outre1_1 = vmull_high_s16(inre16x8, scale); 382 int32x4_t outre2_1 = vmull_high_s16(inim16x8, scale); 383 #else 384 int32x4_t outre1_1 = vmull_s16(vget_high_s16(inre16x8), 385 vget_high_s16(scale)); 386 int32x4_t outre2_1 = vmull_s16(vget_high_s16(inim16x8), 387 vget_high_s16(scale)); 388 #endif 389 390 outre1_0 = vshlq_s32(outre1_0, shift); 391 outre1_1 = vshlq_s32(outre1_1, shift); 392 outre2_0 = vshlq_s32(outre2_0, shift); 393 outre2_1 = vshlq_s32(outre2_1, shift); 394 395 // Demodulate and separate. 396 int32x4_t tmpr_0 = vmovl_s16(vget_low_s16(tmpr)); 397 int32x4_t tmpi_0 = vmovl_s16(vget_low_s16(tmpi)); 398 #if defined(WEBRTC_ARCH_ARM64) 399 int32x4_t tmpr_1 = vmovl_high_s16(tmpr); 400 int32x4_t tmpi_1 = vmovl_high_s16(tmpi); 401 #else 402 int32x4_t tmpr_1 = vmovl_s16(vget_high_s16(tmpr)); 403 int32x4_t tmpi_1 = vmovl_s16(vget_high_s16(tmpi)); 404 #endif 405 406 int64x2_t xr0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre1_0)); 407 int64x2_t xi0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre2_0)); 408 int64x2_t xr2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre1_1)); 409 int64x2_t xi2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre2_1)); 410 xr0 = vmlsl_s32(xr0, vget_low_s32(tmpi_0), vget_low_s32(outre2_0)); 411 xi0 = vmlal_s32(xi0, vget_low_s32(tmpi_0), vget_low_s32(outre1_0)); 412 xr2 = vmlsl_s32(xr2, vget_low_s32(tmpi_1), vget_low_s32(outre2_1)); 413 xi2 = vmlal_s32(xi2, vget_low_s32(tmpi_1), vget_low_s32(outre1_1)); 414 415 #if defined(WEBRTC_ARCH_ARM64) 416 int64x2_t xr1 = vmull_high_s32(tmpr_0, outre1_0); 417 int64x2_t xi1 = vmull_high_s32(tmpr_0, outre2_0); 418 int64x2_t xr3 = vmull_high_s32(tmpr_1, outre1_1); 419 int64x2_t xi3 = vmull_high_s32(tmpr_1, outre2_1); 420 xr1 = vmlsl_high_s32(xr1, tmpi_0, outre2_0); 421 xi1 = vmlal_high_s32(xi1, tmpi_0, outre1_0); 422 xr3 = vmlsl_high_s32(xr3, tmpi_1, outre2_1); 423 xi3 = vmlal_high_s32(xi3, tmpi_1, outre1_1); 424 #else 425 int64x2_t xr1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre1_0)); 426 int64x2_t xi1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre2_0)); 427 int64x2_t xr3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre1_1)); 428 int64x2_t xi3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre2_1)); 429 xr1 = vmlsl_s32(xr1, vget_high_s32(tmpi_0), vget_high_s32(outre2_0)); 430 xi1 = vmlal_s32(xi1, vget_high_s32(tmpi_0), vget_high_s32(outre1_0)); 431 xr3 = vmlsl_s32(xr3, vget_high_s32(tmpi_1), vget_high_s32(outre2_1)); 432 xi3 = vmlal_s32(xi3, vget_high_s32(tmpi_1), vget_high_s32(outre1_1)); 433 #endif 434 435 outre1_0 = vcombine_s32(vrshrn_n_s64(xr0, 10), vrshrn_n_s64(xr1, 10)); 436 outre2_0 = vcombine_s32(vrshrn_n_s64(xi0, 10), vrshrn_n_s64(xi1, 10)); 437 outre1_1 = vcombine_s32(vrshrn_n_s64(xr2, 10), vrshrn_n_s64(xr3, 10)); 438 outre2_1 = vcombine_s32(vrshrn_n_s64(xi2, 10), vrshrn_n_s64(xi3, 10)); 439 outre1_0 = vqdmulhq_s32(outre1_0, fact); 440 outre2_0 = vqdmulhq_s32(outre2_0, fact); 441 outre1_1 = vqdmulhq_s32(outre1_1, fact); 442 outre2_1 = vqdmulhq_s32(outre2_1, fact); 443 444 vst1q_s32(p_outre1, outre1_0); 445 p_outre1 += 4; 446 vst1q_s32(p_outre1, outre1_1); 447 p_outre1 += 4; 448 vst1q_s32(p_outre2, outre2_0); 449 p_outre2 += 4; 450 vst1q_s32(p_outre2, outre2_1); 451 p_outre2 += 4; 452 } 453 } 454 455 void WebRtcIsacfix_Spec2TimeNeon(int16_t* inreQ7, 456 int16_t* inimQ7, 457 int32_t* outre1Q16, 458 int32_t* outre2Q16) { 459 int32_t max; 460 int32_t sh; 461 462 max = TransformAndFindMaxNeon(inreQ7, inimQ7, outre1Q16, outre2Q16); 463 464 465 sh = (int32_t)WebRtcSpl_NormW32(max); 466 sh = sh - 24; 467 // If sh becomes >= 0, then we should shift sh steps to the left, 468 // and the domain will become Q(16 + sh). 469 // If sh becomes < 0, then we should shift -sh steps to the right, 470 // and the domain will become Q(16 + sh). 471 472 // "Fastest" vectors. 473 PreShiftW32toW16Neon(outre1Q16, outre2Q16, inreQ7, inimQ7, sh); 474 475 // Get IDFT. 476 WebRtcIsacfix_FftRadix16Fastest(inreQ7, inimQ7, 1); 477 478 PostShiftAndDivideAndDemodulateNeon(inreQ7, inimQ7, outre1Q16, outre2Q16, sh); 479 } 480