1 /* 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "nsx_core.h" 12 13 #include <arm_neon.h> 14 #include <assert.h> 15 16 // Update the noise estimation information. 17 static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset) { 18 int i = 0; 19 const int16_t kExp2Const = 11819; // Q13 20 int16_t* ptr_noiseEstLogQuantile = NULL; 21 int16_t* ptr_noiseEstQuantile = NULL; 22 int16x4_t kExp2Const16x4 = vdup_n_s16(kExp2Const); 23 int32x4_t twentyOne32x4 = vdupq_n_s32(21); 24 int32x4_t constA32x4 = vdupq_n_s32(0x1fffff); 25 int32x4_t constB32x4 = vdupq_n_s32(0x200000); 26 27 int16_t tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset, 28 inst->magnLen); 29 30 // Guarantee a Q-domain as high as possible and still fit in int16 31 inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(kExp2Const, 32 tmp16, 33 21); 34 35 int32x4_t qNoise32x4 = vdupq_n_s32(inst->qNoise); 36 37 for (ptr_noiseEstLogQuantile = &inst->noiseEstLogQuantile[offset], 38 ptr_noiseEstQuantile = &inst->noiseEstQuantile[0]; 39 ptr_noiseEstQuantile < &inst->noiseEstQuantile[inst->magnLen - 3]; 40 ptr_noiseEstQuantile += 4, ptr_noiseEstLogQuantile += 4) { 41 42 // tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const, 43 // inst->noiseEstLogQuantile[offset + i]); 44 int16x4_t v16x4 = vld1_s16(ptr_noiseEstLogQuantile); 45 int32x4_t v32x4B = vmull_s16(v16x4, kExp2Const16x4); 46 47 // tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac 48 int32x4_t v32x4A = vandq_s32(v32x4B, constA32x4); 49 v32x4A = vorrq_s32(v32x4A, constB32x4); 50 51 // tmp16 = (int16_t) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21); 52 v32x4B = vshrq_n_s32(v32x4B, 21); 53 54 // tmp16 -= 21;// shift 21 to get result in Q0 55 v32x4B = vsubq_s32(v32x4B, twentyOne32x4); 56 57 // tmp16 += (int16_t) inst->qNoise; 58 // shift to get result in Q(qNoise) 59 v32x4B = vaddq_s32(v32x4B, qNoise32x4); 60 61 // if (tmp16 < 0) { 62 // tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16); 63 // } else { 64 // tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16); 65 // } 66 v32x4B = vshlq_s32(v32x4A, v32x4B); 67 68 // tmp16 = WebRtcSpl_SatW32ToW16(tmp32no1); 69 v16x4 = vqmovn_s32(v32x4B); 70 71 //inst->noiseEstQuantile[i] = tmp16; 72 vst1_s16(ptr_noiseEstQuantile, v16x4); 73 } 74 75 // Last iteration: 76 77 // inst->quantile[i]=exp(inst->lquantile[offset+i]); 78 // in Q21 79 int32_t tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const, 80 *ptr_noiseEstLogQuantile); 81 int32_t tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac 82 83 tmp16 = (int16_t) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21); 84 tmp16 -= 21;// shift 21 to get result in Q0 85 tmp16 += (int16_t) inst->qNoise; //shift to get result in Q(qNoise) 86 if (tmp16 < 0) { 87 tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16); 88 } else { 89 tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16); 90 } 91 *ptr_noiseEstQuantile = WebRtcSpl_SatW32ToW16(tmp32no1); 92 } 93 94 // Noise Estimation 95 static void NoiseEstimationNeon(NsxInst_t* inst, 96 uint16_t* magn, 97 uint32_t* noise, 98 int16_t* q_noise) { 99 int16_t lmagn[HALF_ANAL_BLOCKL], counter, countDiv; 100 int16_t countProd, delta, zeros, frac; 101 int16_t log2, tabind, logval, tmp16, tmp16no1, tmp16no2; 102 const int16_t log2_const = 22713; 103 const int16_t width_factor = 21845; 104 105 int i, s, offset; 106 107 tabind = inst->stages - inst->normData; 108 assert(tabind < 9); 109 assert(tabind > -9); 110 if (tabind < 0) { 111 logval = -WebRtcNsx_kLogTable[-tabind]; 112 } else { 113 logval = WebRtcNsx_kLogTable[tabind]; 114 } 115 116 int16x8_t logval_16x8 = vdupq_n_s16(logval); 117 118 // lmagn(i)=log(magn(i))=log(2)*log2(magn(i)) 119 // magn is in Q(-stages), and the real lmagn values are: 120 // real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages) 121 // lmagn in Q8 122 for (i = 0; i < inst->magnLen; i++) { 123 if (magn[i]) { 124 zeros = WebRtcSpl_NormU32((uint32_t)magn[i]); 125 frac = (int16_t)((((uint32_t)magn[i] << zeros) 126 & 0x7FFFFFFF) >> 23); 127 assert(frac < 256); 128 // log2(magn(i)) 129 log2 = (int16_t)(((31 - zeros) << 8) 130 + WebRtcNsx_kLogTableFrac[frac]); 131 // log2(magn(i))*log(2) 132 lmagn[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15); 133 // + log(2^stages) 134 lmagn[i] += logval; 135 } else { 136 lmagn[i] = logval; 137 } 138 } 139 140 int16x4_t Q3_16x4 = vdup_n_s16(3); 141 int16x8_t WIDTHQ8_16x8 = vdupq_n_s16(WIDTH_Q8); 142 int16x8_t WIDTHFACTOR_16x8 = vdupq_n_s16(width_factor); 143 144 int16_t factor = FACTOR_Q7; 145 if (inst->blockIndex < END_STARTUP_LONG) 146 factor = FACTOR_Q7_STARTUP; 147 148 // Loop over simultaneous estimates 149 for (s = 0; s < SIMULT; s++) { 150 offset = s * inst->magnLen; 151 152 // Get counter values from state 153 counter = inst->noiseEstCounter[s]; 154 assert(counter < 201); 155 countDiv = WebRtcNsx_kCounterDiv[counter]; 156 countProd = (int16_t)WEBRTC_SPL_MUL_16_16(counter, countDiv); 157 158 // quant_est(...) 159 int16_t deltaBuff[8]; 160 int16x4_t tmp16x4_0; 161 int16x4_t tmp16x4_1; 162 int16x4_t countDiv_16x4 = vdup_n_s16(countDiv); 163 int16x8_t countProd_16x8 = vdupq_n_s16(countProd); 164 int16x8_t tmp16x8_0 = vdupq_n_s16(countDiv); 165 int16x8_t prod16x8 = vqrdmulhq_s16(WIDTHFACTOR_16x8, tmp16x8_0); 166 int16x8_t tmp16x8_1; 167 int16x8_t tmp16x8_2; 168 int16x8_t tmp16x8_3; 169 int16x8_t tmp16x8_4; 170 int16x8_t tmp16x8_5; 171 int32x4_t tmp32x4; 172 173 for (i = 0; i < inst->magnLen - 7; i += 8) { 174 // Compute delta. 175 // Smaller step size during startup. This prevents from using 176 // unrealistic values causing overflow. 177 tmp16x8_0 = vdupq_n_s16(factor); 178 vst1q_s16(deltaBuff, tmp16x8_0); 179 180 int j; 181 for (j = 0; j < 8; j++) { 182 if (inst->noiseEstDensity[offset + i + j] > 512) { 183 // Get values for deltaBuff by shifting intead of dividing. 184 int factor = WebRtcSpl_NormW16(inst->noiseEstDensity[offset + i + j]); 185 deltaBuff[j] = (int16_t)(FACTOR_Q16 >> (14 - factor)); 186 } 187 } 188 189 // Update log quantile estimate 190 191 // tmp16 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14); 192 tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[0]), countDiv_16x4); 193 tmp16x4_1 = vshrn_n_s32(tmp32x4, 14); 194 tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[4]), countDiv_16x4); 195 tmp16x4_0 = vshrn_n_s32(tmp32x4, 14); 196 tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // Keep for several lines. 197 198 // prepare for the "if" branch 199 // tmp16 += 2; 200 // tmp16_1 = (Word16)(tmp16>>2); 201 tmp16x8_1 = vrshrq_n_s16(tmp16x8_0, 2); 202 203 // inst->noiseEstLogQuantile[offset+i] + tmp16_1; 204 tmp16x8_2 = vld1q_s16(&inst->noiseEstLogQuantile[offset + i]); // Keep 205 tmp16x8_1 = vaddq_s16(tmp16x8_2, tmp16x8_1); // Keep for several lines 206 207 // Prepare for the "else" branch 208 // tmp16 += 1; 209 // tmp16_1 = (Word16)(tmp16>>1); 210 tmp16x8_0 = vrshrq_n_s16(tmp16x8_0, 1); 211 212 // tmp16_2 = (Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16_1,3,1); 213 tmp32x4 = vmull_s16(vget_low_s16(tmp16x8_0), Q3_16x4); 214 tmp16x4_1 = vshrn_n_s32(tmp32x4, 1); 215 216 // tmp16_2 = (Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16_1,3,1); 217 tmp32x4 = vmull_s16(vget_high_s16(tmp16x8_0), Q3_16x4); 218 tmp16x4_0 = vshrn_n_s32(tmp32x4, 1); 219 220 // inst->noiseEstLogQuantile[offset + i] - tmp16_2; 221 tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // keep 222 tmp16x8_0 = vsubq_s16(tmp16x8_2, tmp16x8_0); 223 224 // logval is the smallest fixed point representation we can have. Values 225 // below that will correspond to values in the interval [0, 1], which 226 // can't possibly occur. 227 tmp16x8_0 = vmaxq_s16(tmp16x8_0, logval_16x8); 228 229 // Do the if-else branches: 230 tmp16x8_3 = vld1q_s16(&lmagn[i]); // keep for several lines 231 tmp16x8_5 = vsubq_s16(tmp16x8_3, tmp16x8_2); 232 __asm__("vcgt.s16 %q0, %q1, #0"::"w"(tmp16x8_4), "w"(tmp16x8_5)); 233 __asm__("vbit %q0, %q1, %q2":: 234 "w"(tmp16x8_2), "w"(tmp16x8_1), "w"(tmp16x8_4)); 235 __asm__("vbif %q0, %q1, %q2":: 236 "w"(tmp16x8_2), "w"(tmp16x8_0), "w"(tmp16x8_4)); 237 vst1q_s16(&inst->noiseEstLogQuantile[offset + i], tmp16x8_2); 238 239 // Update density estimate 240 // tmp16_1 + tmp16_2 241 tmp16x8_1 = vld1q_s16(&inst->noiseEstDensity[offset + i]); 242 tmp16x8_0 = vqrdmulhq_s16(tmp16x8_1, countProd_16x8); 243 tmp16x8_0 = vaddq_s16(tmp16x8_0, prod16x8); 244 245 // lmagn[i] - inst->noiseEstLogQuantile[offset + i] 246 tmp16x8_3 = vsubq_s16(tmp16x8_3, tmp16x8_2); 247 tmp16x8_3 = vabsq_s16(tmp16x8_3); 248 tmp16x8_4 = vcgtq_s16(WIDTHQ8_16x8, tmp16x8_3); 249 __asm__("vbit %q0, %q1, %q2":: 250 "w"(tmp16x8_1), "w"(tmp16x8_0), "w"(tmp16x8_4)); 251 vst1q_s16(&inst->noiseEstDensity[offset + i], tmp16x8_1); 252 } // End loop over magnitude spectrum 253 254 // Last iteration over magnitude spectrum: 255 // compute delta 256 if (inst->noiseEstDensity[offset + i] > 512) { 257 // Get values for deltaBuff by shifting intead of dividing. 258 int factor = WebRtcSpl_NormW16(inst->noiseEstDensity[offset + i]); 259 delta = (int16_t)(FACTOR_Q16 >> (14 - factor)); 260 } else { 261 delta = FACTOR_Q7; 262 if (inst->blockIndex < END_STARTUP_LONG) { 263 // Smaller step size during startup. This prevents from using 264 // unrealistic values causing overflow. 265 delta = FACTOR_Q7_STARTUP; 266 } 267 } 268 // update log quantile estimate 269 tmp16 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14); 270 if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) { 271 // +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2 272 // CounterDiv=1/(inst->counter[s]+1) in Q15 273 tmp16 += 2; 274 tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2); 275 inst->noiseEstLogQuantile[offset + i] += tmp16no1; 276 } else { 277 tmp16 += 1; 278 tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1); 279 // *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2 280 tmp16no2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1); 281 inst->noiseEstLogQuantile[offset + i] -= tmp16no2; 282 if (inst->noiseEstLogQuantile[offset + i] < logval) { 283 // logval is the smallest fixed point representation we can have. 284 // Values below that will correspond to values in the interval 285 // [0, 1], which can't possibly occur. 286 inst->noiseEstLogQuantile[offset + i] = logval; 287 } 288 } 289 290 // update density estimate 291 if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i]) 292 < WIDTH_Q8) { 293 tmp16no1 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( 294 inst->noiseEstDensity[offset + i], countProd, 15); 295 tmp16no2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( 296 width_factor, countDiv, 15); 297 inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2; 298 } 299 300 301 if (counter >= END_STARTUP_LONG) { 302 inst->noiseEstCounter[s] = 0; 303 if (inst->blockIndex >= END_STARTUP_LONG) { 304 UpdateNoiseEstimateNeon(inst, offset); 305 } 306 } 307 inst->noiseEstCounter[s]++; 308 309 } // end loop over simultaneous estimates 310 311 // Sequentially update the noise during startup 312 if (inst->blockIndex < END_STARTUP_LONG) { 313 UpdateNoiseEstimateNeon(inst, offset); 314 } 315 316 for (i = 0; i < inst->magnLen; i++) { 317 noise[i] = (uint32_t)(inst->noiseEstQuantile[i]); // Q(qNoise) 318 } 319 (*q_noise) = (int16_t)inst->qNoise; 320 } 321 322 // Filter the data in the frequency domain, and create spectrum. 323 static void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) { 324 325 // (1) Filtering. 326 327 // Fixed point C code for the next block is as follows: 328 // for (i = 0; i < inst->magnLen; i++) { 329 // inst->real[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i], 330 // (int16_t)(inst->noiseSupFilter[i]), 14); // Q(normData-stages) 331 // inst->imag[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i], 332 // (int16_t)(inst->noiseSupFilter[i]), 14); // Q(normData-stages) 333 // } 334 335 int16_t* ptr_real = &inst->real[0]; 336 int16_t* ptr_imag = &inst->imag[0]; 337 uint16_t* ptr_noiseSupFilter = &inst->noiseSupFilter[0]; 338 339 // Filter the rest in the frequency domain. 340 for (; ptr_real < &inst->real[inst->magnLen - 1];) { 341 // Loop unrolled once. Both pointers are incremented by 4 twice. 342 __asm__ __volatile__( 343 "vld1.16 d20, [%[ptr_real]]\n\t" 344 "vld1.16 d22, [%[ptr_imag]]\n\t" 345 "vld1.16 d23, [%[ptr_noiseSupFilter]]!\n\t" 346 "vmull.s16 q10, d20, d23\n\t" 347 "vmull.s16 q11, d22, d23\n\t" 348 "vshrn.s32 d20, q10, #14\n\t" 349 "vshrn.s32 d22, q11, #14\n\t" 350 "vst1.16 d20, [%[ptr_real]]!\n\t" 351 "vst1.16 d22, [%[ptr_imag]]!\n\t" 352 353 "vld1.16 d18, [%[ptr_real]]\n\t" 354 "vld1.16 d24, [%[ptr_imag]]\n\t" 355 "vld1.16 d25, [%[ptr_noiseSupFilter]]!\n\t" 356 "vmull.s16 q9, d18, d25\n\t" 357 "vmull.s16 q12, d24, d25\n\t" 358 "vshrn.s32 d18, q9, #14\n\t" 359 "vshrn.s32 d24, q12, #14\n\t" 360 "vst1.16 d18, [%[ptr_real]]!\n\t" 361 "vst1.16 d24, [%[ptr_imag]]!\n\t" 362 363 // Specify constraints. 364 :[ptr_imag]"+r"(ptr_imag), 365 [ptr_real]"+r"(ptr_real), 366 [ptr_noiseSupFilter]"+r"(ptr_noiseSupFilter) 367 : 368 :"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", 369 "q9", "q10", "q11", "q12" 370 ); 371 } 372 373 // Filter the last pair of elements in the frequency domain. 374 *ptr_real = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*ptr_real, 375 (int16_t)(*ptr_noiseSupFilter), 14); // Q(normData-stages) 376 *ptr_imag = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*ptr_imag, 377 (int16_t)(*ptr_noiseSupFilter), 14); // Q(normData-stages) 378 379 // (2) Create spectrum. 380 381 // Fixed point C code for the rest of the function is as follows: 382 // freq_buf[0] = inst->real[0]; 383 // freq_buf[1] = -inst->imag[0]; 384 // for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) { 385 // tmp16 = (inst->anaLen << 1) - j; 386 // freq_buf[j] = inst->real[i]; 387 // freq_buf[j + 1] = -inst->imag[i]; 388 // freq_buf[tmp16] = inst->real[i]; 389 // freq_buf[tmp16 + 1] = inst->imag[i]; 390 // } 391 // freq_buf[inst->anaLen] = inst->real[inst->anaLen2]; 392 // freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2]; 393 394 freq_buf[0] = inst->real[0]; 395 freq_buf[1] = -inst->imag[0]; 396 397 int offset = -16; 398 int16_t* ptr_realImag1 = &freq_buf[2]; 399 int16_t* ptr_realImag2 = ptr_realImag2 = &freq_buf[(inst->anaLen << 1) - 8]; 400 ptr_real = &inst->real[1]; 401 ptr_imag = &inst->imag[1]; 402 for (; ptr_real < &inst->real[inst->anaLen2 - 11];) { 403 // Loop unrolled once. All pointers are incremented twice. 404 __asm__ __volatile__( 405 "vld1.16 d22, [%[ptr_real]]!\n\t" 406 "vld1.16 d23, [%[ptr_imag]]!\n\t" 407 // Negate and interleave: 408 "vmov.s16 d20, d22\n\t" 409 "vneg.s16 d21, d23\n\t" 410 "vzip.16 d20, d21\n\t" 411 // Write 8 elements to &freq_buf[j] 412 "vst1.16 {d20, d21}, [%[ptr_realImag1]]!\n\t" 413 // Interleave and reverse elements: 414 "vzip.16 d22, d23\n\t" 415 "vrev64.32 d18, d23\n\t" 416 "vrev64.32 d19, d22\n\t" 417 // Write 8 elements to &freq_buf[tmp16] 418 "vst1.16 {d18, d19}, [%[ptr_realImag2]], %[offset]\n\t" 419 420 "vld1.16 d22, [%[ptr_real]]!\n\t" 421 "vld1.16 d23, [%[ptr_imag]]!\n\t" 422 // Negate and interleave: 423 "vmov.s16 d20, d22\n\t" 424 "vneg.s16 d21, d23\n\t" 425 "vzip.16 d20, d21\n\t" 426 // Write 8 elements to &freq_buf[j] 427 "vst1.16 {d20, d21}, [%[ptr_realImag1]]!\n\t" 428 // Interleave and reverse elements: 429 "vzip.16 d22, d23\n\t" 430 "vrev64.32 d18, d23\n\t" 431 "vrev64.32 d19, d22\n\t" 432 // Write 8 elements to &freq_buf[tmp16] 433 "vst1.16 {d18, d19}, [%[ptr_realImag2]], %[offset]\n\t" 434 435 // Specify constraints. 436 :[ptr_imag]"+r"(ptr_imag), 437 [ptr_real]"+r"(ptr_real), 438 [ptr_realImag1]"+r"(ptr_realImag1), 439 [ptr_realImag2]"+r"(ptr_realImag2) 440 :[offset]"r"(offset) 441 :"d18", "d19", "d20", "d21", "d22", "d23" 442 ); 443 } 444 for (ptr_realImag2 += 6; 445 ptr_real <= &inst->real[inst->anaLen2]; 446 ptr_real += 1, ptr_imag += 1, ptr_realImag1 += 2, ptr_realImag2 -= 2) { 447 *ptr_realImag1 = *ptr_real; 448 *(ptr_realImag1 + 1) = -(*ptr_imag); 449 *ptr_realImag2 = *ptr_real; 450 *(ptr_realImag2 + 1) = *ptr_imag; 451 } 452 453 freq_buf[inst->anaLen] = inst->real[inst->anaLen2]; 454 freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2]; 455 } 456 457 // Denormalize the input buffer. 458 static __inline void DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor) { 459 int16_t* ptr_real = &inst->real[0]; 460 int16_t* ptr_in = &in[0]; 461 462 __asm__ __volatile__("vdup.32 q10, %0" :: 463 "r"((int32_t)(factor - inst->normData)) : "q10"); 464 for (; ptr_real < &inst->real[inst->anaLen];) { 465 466 // Loop unrolled once. Both pointers are incremented. 467 __asm__ __volatile__( 468 // tmp32 = WEBRTC_SPL_SHIFT_W32((int32_t)in[j], 469 // factor - inst->normData); 470 "vld2.16 {d24, d25}, [%[ptr_in]]!\n\t" 471 "vmovl.s16 q12, d24\n\t" 472 "vshl.s32 q12, q10\n\t" 473 // inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0 474 "vqmovn.s32 d24, q12\n\t" 475 "vst1.16 d24, [%[ptr_real]]!\n\t" 476 477 // tmp32 = WEBRTC_SPL_SHIFT_W32((int32_t)in[j], 478 // factor - inst->normData); 479 "vld2.16 {d22, d23}, [%[ptr_in]]!\n\t" 480 "vmovl.s16 q11, d22\n\t" 481 "vshl.s32 q11, q10\n\t" 482 // inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0 483 "vqmovn.s32 d22, q11\n\t" 484 "vst1.16 d22, [%[ptr_real]]!\n\t" 485 486 // Specify constraints. 487 :[ptr_in]"+r"(ptr_in), 488 [ptr_real]"+r"(ptr_real) 489 : 490 :"d22", "d23", "d24", "d25" 491 ); 492 } 493 } 494 495 // For the noise supress process, synthesis, read out fully processed segment, 496 // and update synthesis buffer. 497 static void SynthesisUpdateNeon(NsxInst_t* inst, 498 int16_t* out_frame, 499 int16_t gain_factor) { 500 int16_t* ptr_real = &inst->real[0]; 501 int16_t* ptr_syn = &inst->synthesisBuffer[0]; 502 int16_t* ptr_window = &inst->window[0]; 503 504 // synthesis 505 __asm__ __volatile__("vdup.16 d24, %0" : : "r"(gain_factor) : "d24"); 506 // Loop unrolled once. All pointers are incremented in the assembly code. 507 for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) { 508 __asm__ __volatile__( 509 // Load variables. 510 "vld1.16 d22, [%[ptr_real]]!\n\t" 511 "vld1.16 d23, [%[ptr_window]]!\n\t" 512 "vld1.16 d25, [%[ptr_syn]]\n\t" 513 // tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( 514 // inst->window[i], inst->real[i], 14); // Q0, window in Q14 515 "vmull.s16 q11, d22, d23\n\t" 516 "vrshrn.i32 d22, q11, #14\n\t" 517 // tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13); 518 "vmull.s16 q11, d24, d22\n\t" 519 // tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0 520 "vqrshrn.s32 d22, q11, #13\n\t" 521 // inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16( 522 // inst->synthesisBuffer[i], tmp16b); // Q0 523 "vqadd.s16 d25, d22\n\t" 524 "vst1.16 d25, [%[ptr_syn]]!\n\t" 525 526 // Load variables. 527 "vld1.16 d26, [%[ptr_real]]!\n\t" 528 "vld1.16 d27, [%[ptr_window]]!\n\t" 529 "vld1.16 d28, [%[ptr_syn]]\n\t" 530 // tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( 531 // inst->window[i], inst->real[i], 14); // Q0, window in Q14 532 "vmull.s16 q13, d26, d27\n\t" 533 "vrshrn.i32 d26, q13, #14\n\t" 534 // tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13); 535 "vmull.s16 q13, d24, d26\n\t" 536 // tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0 537 "vqrshrn.s32 d26, q13, #13\n\t" 538 // inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16( 539 // inst->synthesisBuffer[i], tmp16b); // Q0 540 "vqadd.s16 d28, d26\n\t" 541 "vst1.16 d28, [%[ptr_syn]]!\n\t" 542 543 // Specify constraints. 544 :[ptr_real]"+r"(ptr_real), 545 [ptr_window]"+r"(ptr_window), 546 [ptr_syn]"+r"(ptr_syn) 547 : 548 :"d22", "d23", "d24", "d25", "d26", "d27", "d28", "q11", "q12", "q13" 549 ); 550 } 551 552 int16_t* ptr_out = &out_frame[0]; 553 ptr_syn = &inst->synthesisBuffer[0]; 554 // read out fully processed segment 555 for (; ptr_syn < &inst->synthesisBuffer[inst->blockLen10ms];) { 556 // Loop unrolled once. Both pointers are incremented in the assembly code. 557 __asm__ __volatile__( 558 // out_frame[i] = inst->synthesisBuffer[i]; // Q0 559 "vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t" 560 "vld1.16 {d24, d25}, [%[ptr_syn]]!\n\t" 561 "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t" 562 "vst1.16 {d24, d25}, [%[ptr_out]]!\n\t" 563 :[ptr_syn]"+r"(ptr_syn), 564 [ptr_out]"+r"(ptr_out) 565 : 566 :"d22", "d23", "d24", "d25" 567 ); 568 } 569 570 // Update synthesis buffer. 571 // C code: 572 // WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer, 573 // inst->synthesisBuffer + inst->blockLen10ms, 574 // inst->anaLen - inst->blockLen10ms); 575 ptr_out = &inst->synthesisBuffer[0], 576 ptr_syn = &inst->synthesisBuffer[inst->blockLen10ms]; 577 for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) { 578 // Loop unrolled once. Both pointers are incremented in the assembly code. 579 __asm__ __volatile__( 580 "vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t" 581 "vld1.16 {d24, d25}, [%[ptr_syn]]!\n\t" 582 "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t" 583 "vst1.16 {d24, d25}, [%[ptr_out]]!\n\t" 584 :[ptr_syn]"+r"(ptr_syn), 585 [ptr_out]"+r"(ptr_out) 586 : 587 :"d22", "d23", "d24", "d25" 588 ); 589 } 590 591 // C code: 592 // WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer 593 // + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms); 594 __asm__ __volatile__("vdup.16 q10, %0" : : "r"(0) : "q10"); 595 for (; ptr_out < &inst->synthesisBuffer[inst->anaLen];) { 596 // Loop unrolled once. Pointer is incremented in the assembly code. 597 __asm__ __volatile__( 598 "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t" 599 "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t" 600 :[ptr_out]"+r"(ptr_out) 601 : 602 :"d20", "d21" 603 ); 604 } 605 } 606 607 // Update analysis buffer for lower band, and window data before FFT. 608 static void AnalysisUpdateNeon(NsxInst_t* inst, 609 int16_t* out, 610 int16_t* new_speech) { 611 612 int16_t* ptr_ana = &inst->analysisBuffer[inst->blockLen10ms]; 613 int16_t* ptr_out = &inst->analysisBuffer[0]; 614 615 // For lower band update analysis buffer. 616 // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer, 617 // inst->analysisBuffer + inst->blockLen10ms, 618 // inst->anaLen - inst->blockLen10ms); 619 for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms];) { 620 // Loop unrolled once, so both pointers are incremented by 8 twice. 621 __asm__ __volatile__( 622 "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t" 623 "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t" 624 "vld1.16 {d22, d23}, [%[ptr_ana]]!\n\t" 625 "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t" 626 :[ptr_ana]"+r"(ptr_ana), 627 [ptr_out]"+r"(ptr_out) 628 : 629 :"d20", "d21", "d22", "d23" 630 ); 631 } 632 633 // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer 634 // + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms); 635 for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen];) { 636 // Loop unrolled once, so both pointers are incremented by 8 twice. 637 __asm__ __volatile__( 638 "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t" 639 "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t" 640 "vld1.16 {d22, d23}, [%[ptr_ana]]!\n\t" 641 "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t" 642 :[ptr_ana]"+r"(ptr_ana), 643 [ptr_out]"+r"(ptr_out) 644 : 645 :"d20", "d21", "d22", "d23" 646 ); 647 } 648 649 // Window data before FFT 650 int16_t* ptr_window = &inst->window[0]; 651 ptr_out = &out[0]; 652 ptr_ana = &inst->analysisBuffer[0]; 653 for (; ptr_out < &out[inst->anaLen];) { 654 655 // Loop unrolled once, so all pointers are incremented by 4 twice. 656 __asm__ __volatile__( 657 "vld1.16 d20, [%[ptr_ana]]!\n\t" 658 "vld1.16 d21, [%[ptr_window]]!\n\t" 659 // out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( 660 // inst->window[i], inst->analysisBuffer[i], 14); // Q0 661 "vmull.s16 q10, d20, d21\n\t" 662 "vrshrn.i32 d20, q10, #14\n\t" 663 "vst1.16 d20, [%[ptr_out]]!\n\t" 664 665 "vld1.16 d22, [%[ptr_ana]]!\n\t" 666 "vld1.16 d23, [%[ptr_window]]!\n\t" 667 // out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( 668 // inst->window[i], inst->analysisBuffer[i], 14); // Q0 669 "vmull.s16 q11, d22, d23\n\t" 670 "vrshrn.i32 d22, q11, #14\n\t" 671 "vst1.16 d22, [%[ptr_out]]!\n\t" 672 673 // Specify constraints. 674 :[ptr_ana]"+r"(ptr_ana), 675 [ptr_window]"+r"(ptr_window), 676 [ptr_out]"+r"(ptr_out) 677 : 678 :"d20", "d21", "d22", "d23", "q10", "q11" 679 ); 680 } 681 } 682 683 // Create a complex number buffer (out[]) as the intput (in[]) interleaved with 684 // zeros, and normalize it. 685 static __inline void CreateComplexBufferNeon(NsxInst_t* inst, 686 int16_t* in, 687 int16_t* out) { 688 int16_t* ptr_out = &out[0]; 689 int16_t* ptr_in = &in[0]; 690 691 __asm__ __volatile__("vdup.16 d25, %0" : : "r"(0) : "d25"); 692 __asm__ __volatile__("vdup.16 q10, %0" : : "r"(inst->normData) : "q10"); 693 for (; ptr_in < &in[inst->anaLen];) { 694 695 // Loop unrolled once, so ptr_in is incremented by 8 twice, 696 // and ptr_out is incremented by 8 four times. 697 __asm__ __volatile__( 698 // out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData) 699 "vld1.16 {d22, d23}, [%[ptr_in]]!\n\t" 700 "vshl.s16 q11, q10\n\t" 701 "vmov d24, d23\n\t" 702 703 // out[j + 1] = 0; // Insert zeros in imaginary part 704 "vmov d23, d25\n\t" 705 "vst2.16 {d22, d23}, [%[ptr_out]]!\n\t" 706 "vst2.16 {d24, d25}, [%[ptr_out]]!\n\t" 707 708 // out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData) 709 "vld1.16 {d22, d23}, [%[ptr_in]]!\n\t" 710 "vshl.s16 q11, q10\n\t" 711 "vmov d24, d23\n\t" 712 713 // out[j + 1] = 0; // Insert zeros in imaginary part 714 "vmov d23, d25\n\t" 715 "vst2.16 {d22, d23}, [%[ptr_out]]!\n\t" 716 "vst2.16 {d24, d25}, [%[ptr_out]]!\n\t" 717 718 // Specify constraints. 719 :[ptr_in]"+r"(ptr_in), 720 [ptr_out]"+r"(ptr_out) 721 : 722 :"d22", "d23", "d24", "d25", "q10", "q11" 723 ); 724 } 725 } 726 727 void WebRtcNsx_InitNeon(void) { 728 WebRtcNsx_NoiseEstimation = NoiseEstimationNeon; 729 WebRtcNsx_PrepareSpectrum = PrepareSpectrumNeon; 730 WebRtcNsx_SynthesisUpdate = SynthesisUpdateNeon; 731 WebRtcNsx_AnalysisUpdate = AnalysisUpdateNeon; 732 WebRtcNsx_Denormalize = DenormalizeNeon; 733 WebRtcNsx_CreateComplexBuffer = CreateComplexBufferNeon; 734 } 735