/*
 * Copyright (C) 2010, Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#if ENABLE(WEB_AUDIO)

#include "core/platform/audio/VectorMath.h"

#include "wtf/Assertions.h"

#if OS(DARWIN)
#include <Accelerate/Accelerate.h>
#endif

#ifdef __SSE2__
#include <emmintrin.h>
#endif

#if HAVE(ARM_NEON_INTRINSICS)
#include <arm_neon.h>
#endif

#include <math.h>
#include <stdint.h> // For uintptr_t, used in the pointer-alignment checks below.
#include <algorithm>

namespace WebCore {

namespace VectorMath {

#if OS(DARWIN)
// On the Mac we use the highly optimized versions in Accelerate.framework.
// In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h>, which defines macros with the same names as
// our namespaced functions, so we must handle this case differently. Other architectures (64-bit, ARM, etc.) do not include this header file.
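// The wrappers below forward to the corresponding vDSP routines; as with
// vDSP, strides are measured in floats (not bytes), and framesToProcess is
// the number of elements to process.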

void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#else
    vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#endif
}

void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}

void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}

void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    DSPSplitComplex sc1;
    DSPSplitComplex sc2;
    DSPSplitComplex dest;
    sc1.realp = const_cast<float*>(real1P);
    sc1.imagp = const_cast<float*>(imag1P);
    sc2.realp = const_cast<float*>(real2P);
    sc2.imagp = const_cast<float*>(imag2P);
    dest.realp = realDestP;
    dest.imagp = imagDestP;
#if defined(__ppc__) || defined(__i386__)
    ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#else
    vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#endif
}

void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess);
}

void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess);
}

void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess);
}

void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess);
}
#else

void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#ifdef __SSE2__
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP += k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now sourceP is aligned, use SSE.
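        // Process four frames per iteration with the macro below; the
        // remaining zero to three frames fall through to the scalar loop
        // at the end of the function.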
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        __m128 pSource;
        __m128 dest;
        __m128 temp;
        __m128 mScale = _mm_set_ps1(k);

        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

#define SSE2_MULT_ADD(loadInstr, storeInstr)        \
            while (destP < endP)                    \
            {                                       \
                pSource = _mm_load_ps(sourceP);     \
                temp = _mm_mul_ps(pSource, mScale); \
                dest = _mm_##loadInstr##_ps(destP); \
                dest = _mm_add_ps(dest, temp);      \
                _mm_##storeInstr##_ps(destP, dest); \
                sourceP += 4;                       \
                destP += 4;                         \
            }

        if (destAligned)
            SSE2_MULT_ADD(load, store)
        else
            SSE2_MULT_ADD(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t k = vdupq_n_f32(*scale);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            float32x4_t dest = vld1q_f32(destP);

            dest = vmlaq_f32(dest, source, k);
            vst1q_f32(destP, dest);

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP += *sourceP * *scale;
        sourceP += sourceStride;
        destP += destStride;
        n--;
    }
}

void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#ifdef __SSE2__
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now the sourceP address is aligned; apply SSE.
        int group = n / 4;
        __m128 mScale = _mm_set_ps1(k);
        __m128* pSource;
        __m128* pDest;
        __m128 dest;

        if (reinterpret_cast<uintptr_t>(destP) & 0x0F) {
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                dest = _mm_mul_ps(*pSource, mScale);
                _mm_storeu_ps(destP, dest);

                sourceP += 4;
                destP += 4;
            }
        } else {
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_mul_ps(*pSource, mScale);

                sourceP += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }
    } else { // If strides are not 1, fall back to the scalar algorithm.
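        // Note: this else-block is closed by the matching brace inside the
        // #ifdef __SSE2__ guard at the end of the function, so the scalar
        // loop below serves as both the strided path and the generic fallback.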
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmulq_n_f32(source, k));

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    float k = *scale;
    while (n--) {
        *destP = k * *sourceP;
        sourceP += sourceStride;
        destP += destStride;
    }
#ifdef __SSE2__
    }
#endif
}

void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#ifdef __SSE2__
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now the source1P address is aligned; apply SSE.
        int group = n / 4;
        __m128* pSource1;
        __m128* pSource2;
        __m128* pDest;
        __m128 source2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

        if (source2Aligned && destAligned) { // All aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, *pSource2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                dest = _mm_add_ps(*pSource1, *pSource2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, source2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (!source2Aligned && !destAligned) { // Neither source2 nor dest aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                dest = _mm_add_ps(*pSource1, source2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for the remaining frames (fewer than four).
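        // For example, with framesToProcess == 10 and source1P already
        // 16-byte aligned: no peel frames run, group == 2 vector iterations
        // cover eight frames, and n %= 4 leaves two for the loop below.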
        n %= 4;
        while (n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }
    } else { // If strides are not 1, fall back to the scalar algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vaddq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = *source1P + *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
    }
#ifdef __SSE2__
    }
#endif
}

void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#ifdef __SSE2__
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
            *destP = *source1P * *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now the source1P address is aligned; apply SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;
        __m128 pSource1;
        __m128 pSource2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

#define SSE2_MULT(loadInstr, storeInstr)                   \
            while (destP < endP)                           \
            {                                              \
                pSource1 = _mm_load_ps(source1P);          \
                pSource2 = _mm_##loadInstr##_ps(source2P); \
                dest = _mm_mul_ps(pSource1, pSource2);     \
                _mm_##storeInstr##_ps(destP, dest);        \
                source1P += 4;                             \
                source2P += 4;                             \
                destP += 4;                                \
            }

        if (source2Aligned && destAligned) // Both aligned.
            SSE2_MULT(load, store)
        else if (source2Aligned && !destAligned) // source2 is aligned but dest is not.
            SSE2_MULT(load, storeu)
        else if (!source2Aligned && destAligned) // dest is aligned but source2 is not.
            SSE2_MULT(loadu, store)
        else // Neither aligned.
            SSE2_MULT(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vmulq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP = *source1P * *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
        n--;
    }
}

void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    unsigned i = 0;
#ifdef __SSE2__
    // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
    // Otherwise, fall through to the scalar code below.
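    // The complex product is computed componentwise:
    // (a + bi)(c + di) = (ac - bd) + (ad + bc)i.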
    if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) {

        unsigned endSize = framesToProcess - framesToProcess % 4;
        while (i < endSize) {
            __m128 real1 = _mm_load_ps(real1P + i);
            __m128 real2 = _mm_load_ps(real2P + i);
            __m128 imag1 = _mm_load_ps(imag1P + i);
            __m128 imag2 = _mm_load_ps(imag2P + i);
            __m128 real = _mm_mul_ps(real1, real2);
            real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2));
            __m128 imag = _mm_mul_ps(real1, imag2);
            imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2));
            _mm_store_ps(realDestP + i, real);
            _mm_store_ps(imagDestP + i, imag);
            i += 4;
        }
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    unsigned endSize = framesToProcess - framesToProcess % 4;
    while (i < endSize) {
        float32x4_t real1 = vld1q_f32(real1P + i);
        float32x4_t real2 = vld1q_f32(real2P + i);
        float32x4_t imag1 = vld1q_f32(imag1P + i);
        float32x4_t imag2 = vld1q_f32(imag2P + i);

        float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
        float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);

        vst1q_f32(realDestP + i, realResult);
        vst1q_f32(imagDestP + i, imagResult);

        i += 4;
    }
#endif
    for (; i < framesToProcess; ++i) {
        // Read and compute the results before storing them, in case the
        // destination is the same as one of the sources.
        float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i];
        float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i];

        realDestP[i] = realResult;
        imagDestP[i] = imagResult;
    }
}

void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    int n = framesToProcess;
    float sum = 0;

#ifdef __SSE2__
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            float sample = *sourceP;
            sum += sample * sample;
            sourceP++;
            n--;
        }

        // Now sourceP is aligned, use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mSum = _mm_setzero_ps();

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            source = _mm_mul_ps(source, source);
            mSum = _mm_add_ps(mSum, source);
            sourceP += 4;
        }

        // Summarize the SSE results.
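        // Each lane of mSum holds the sum of every fourth squared sample;
        // adding the four lanes folds them into the scalar total.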
        const float* groupSumP = reinterpret_cast<float*>(&mSum);
        sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourSum = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourSum = vmlaq_f32(fourSum, source, source);
            sourceP += 4;
        }
        float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));

        float groupSum[2];
        vst1_f32(groupSum, twoSum);
        sum += groupSum[0] + groupSum[1];

        n = tailFrames;
    }
#endif

    while (n--) {
        float sample = *sourceP;
        sum += sample * sample;
        sourceP += sourceStride;
    }

    ASSERT(sumP);
    *sumP = sum;
}

void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    int n = framesToProcess;
    float max = 0;

#ifdef __SSE2__
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            max = std::max(max, fabsf(*sourceP));
            sourceP++;
            n--;
        }

        // Now sourceP is aligned, use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mMax = _mm_setzero_ps();
        int mask = 0x7FFFFFFF;
        __m128 mMask = _mm_set1_ps(*reinterpret_cast<float*>(&mask));

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            // Calculate the absolute value by ANDing source with the mask, which clears the sign bit.
            source = _mm_and_ps(source, mMask);
            mMax = _mm_max_ps(mMax, source);
            sourceP += 4;
        }

        // Get the max from the SSE results.
        const float* groupMaxP = reinterpret_cast<float*>(&mMax);
        max = std::max(max, groupMaxP[0]);
        max = std::max(max, groupMaxP[1]);
        max = std::max(max, groupMaxP[2]);
        max = std::max(max, groupMaxP[3]);

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourMax = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourMax = vmaxq_f32(fourMax, vabsq_f32(source));
            sourceP += 4;
        }
        float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));

        float groupMax[2];
        vst1_f32(groupMax, twoMax);
        max = std::max(groupMax[0], groupMax[1]);

        n = tailFrames;
    }
#endif

    while (n--) {
        max = std::max(max, fabsf(*sourceP));
        sourceP += sourceStride;
    }

    ASSERT(maxP);
    *maxP = max;
}

void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;
    float lowThreshold = *lowThresholdP;
    float highThreshold = *highThresholdP;

    // FIXME: Optimize for SSE2.
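#ifdef __SSE2__
    // A minimal SSE2 sketch for the common stride-1 case, addressing the
    // FIXME above; it uses unaligned loads/stores for simplicity rather than
    // the alignment peeling the other functions perform.
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        __m128 low = _mm_set1_ps(lowThreshold);
        __m128 high = _mm_set1_ps(highThreshold);
        while (destP < endP) {
            // Clamp four frames at once: min against the high threshold, then max against the low.
            __m128 source = _mm_loadu_ps(sourceP);
            _mm_storeu_ps(destP, _mm_max_ps(_mm_min_ps(source, high), low));
            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif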
#if HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t low = vdupq_n_f32(lowThreshold);
        float32x4_t high = vdupq_n_f32(highThreshold);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));
            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold);
        sourceP += sourceStride;
        destP += destStride;
    }
}

#endif // OS(DARWIN)

} // namespace VectorMath

} // namespace WebCore

#endif // ENABLE(WEB_AUDIO)