1 /* 2 * Copyright (C) 2010, Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY 17 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 20 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 */ 24 25 #include "config.h" 26 27 #if ENABLE(WEB_AUDIO) 28 29 #include "platform/audio/VectorMath.h" 30 #include "wtf/Assertions.h" 31 #include "wtf/CPU.h" 32 #include <stdint.h> 33 34 #if OS(MACOSX) 35 #include <Accelerate/Accelerate.h> 36 #endif 37 38 #if CPU(X86) || CPU(X86_64) 39 #include <emmintrin.h> 40 #endif 41 42 #if HAVE(ARM_NEON_INTRINSICS) 43 #include <arm_neon.h> 44 #endif 45 46 #include <math.h> 47 #include <algorithm> 48 49 namespace blink { 50 51 namespace VectorMath { 52 53 #if OS(MACOSX) 54 // On the Mac we use the highly optimized versions in Accelerate.framework 55 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h> which defines macros of the same name as 56 // our namespaced function names, so we must handle this case differently. Other architectures (64bit, ARM, etc.) do not include this header file. 57 58 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) 59 { 60 #if CPU(X86) 61 ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess); 62 #else 63 vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess); 64 #endif 65 } 66 67 void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) 68 { 69 #if CPU(X86) 70 ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); 71 #else 72 vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); 73 #endif 74 } 75 76 void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) 77 { 78 #if CPU(X86) 79 ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); 80 #else 81 vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); 82 #endif 83 } 84 85 void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess) 86 { 87 DSPSplitComplex sc1; 88 DSPSplitComplex sc2; 89 DSPSplitComplex dest; 90 sc1.realp = const_cast<float*>(real1P); 91 sc1.imagp = const_cast<float*>(imag1P); 92 sc2.realp = const_cast<float*>(real2P); 93 sc2.imagp = const_cast<float*>(imag2P); 94 dest.realp = realDestP; 95 dest.imagp = imagDestP; 96 #if CPU(X86) 97 ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1); 98 #else 99 vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1); 100 #endif 101 } 102 103 void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) 104 { 105 vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess); 106 } 107 108 void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess) 109 { 110 vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess); 111 } 112 113 void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess) 114 { 115 vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess); 116 } 117 118 void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess) 119 { 120 vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess); 121 } 122 #else 123 124 void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) 125 { 126 int n = framesToProcess; 127 128 #if CPU(X86) || CPU(X86_64) 129 if ((sourceStride == 1) && (destStride == 1)) { 130 float k = *scale; 131 132 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 133 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { 134 *destP += k * *sourceP; 135 sourceP++; 136 destP++; 137 n--; 138 } 139 140 // Now the sourceP is aligned, use SSE. 141 int tailFrames = n % 4; 142 const float* endP = destP + n - tailFrames; 143 144 __m128 pSource; 145 __m128 dest; 146 __m128 temp; 147 __m128 mScale = _mm_set_ps1(k); 148 149 bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); 150 151 #define SSE2_MULT_ADD(loadInstr, storeInstr) \ 152 while (destP < endP) \ 153 { \ 154 pSource = _mm_load_ps(sourceP); \ 155 temp = _mm_mul_ps(pSource, mScale); \ 156 dest = _mm_##loadInstr##_ps(destP); \ 157 dest = _mm_add_ps(dest, temp); \ 158 _mm_##storeInstr##_ps(destP, dest); \ 159 sourceP += 4; \ 160 destP += 4; \ 161 } 162 163 if (destAligned) 164 SSE2_MULT_ADD(load, store) 165 else 166 SSE2_MULT_ADD(loadu, storeu) 167 168 n = tailFrames; 169 } 170 #elif HAVE(ARM_NEON_INTRINSICS) 171 if ((sourceStride == 1) && (destStride == 1)) { 172 int tailFrames = n % 4; 173 const float* endP = destP + n - tailFrames; 174 175 float32x4_t k = vdupq_n_f32(*scale); 176 while (destP < endP) { 177 float32x4_t source = vld1q_f32(sourceP); 178 float32x4_t dest = vld1q_f32(destP); 179 180 dest = vmlaq_f32(dest, source, k); 181 vst1q_f32(destP, dest); 182 183 sourceP += 4; 184 destP += 4; 185 } 186 n = tailFrames; 187 } 188 #endif 189 while (n) { 190 *destP += *sourceP * *scale; 191 sourceP += sourceStride; 192 destP += destStride; 193 n--; 194 } 195 } 196 197 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) 198 { 199 int n = framesToProcess; 200 201 #if CPU(X86) || CPU(X86_64) 202 if ((sourceStride == 1) && (destStride == 1)) { 203 float k = *scale; 204 205 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 206 while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) { 207 *destP = k * *sourceP; 208 sourceP++; 209 destP++; 210 n--; 211 } 212 213 // Now the sourceP address is aligned and start to apply SSE. 214 int group = n / 4; 215 __m128 mScale = _mm_set_ps1(k); 216 __m128* pSource; 217 __m128* pDest; 218 __m128 dest; 219 220 221 if (reinterpret_cast<size_t>(destP) & 0x0F) { 222 while (group--) { 223 pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP)); 224 dest = _mm_mul_ps(*pSource, mScale); 225 _mm_storeu_ps(destP, dest); 226 227 sourceP += 4; 228 destP += 4; 229 } 230 } else { 231 while (group--) { 232 pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP)); 233 pDest = reinterpret_cast<__m128*>(destP); 234 *pDest = _mm_mul_ps(*pSource, mScale); 235 236 sourceP += 4; 237 destP += 4; 238 } 239 } 240 241 // Non-SSE handling for remaining frames which is less than 4. 242 n %= 4; 243 while (n) { 244 *destP = k * *sourceP; 245 sourceP++; 246 destP++; 247 n--; 248 } 249 } else { // If strides are not 1, rollback to normal algorithm. 250 #elif HAVE(ARM_NEON_INTRINSICS) 251 if ((sourceStride == 1) && (destStride == 1)) { 252 float k = *scale; 253 int tailFrames = n % 4; 254 const float* endP = destP + n - tailFrames; 255 256 while (destP < endP) { 257 float32x4_t source = vld1q_f32(sourceP); 258 vst1q_f32(destP, vmulq_n_f32(source, k)); 259 260 sourceP += 4; 261 destP += 4; 262 } 263 n = tailFrames; 264 } 265 #endif 266 float k = *scale; 267 while (n--) { 268 *destP = k * *sourceP; 269 sourceP += sourceStride; 270 destP += destStride; 271 } 272 #if CPU(X86) || CPU(X86_64) 273 } 274 #endif 275 } 276 277 void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) 278 { 279 int n = framesToProcess; 280 281 #if CPU(X86) || CPU(X86_64) 282 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { 283 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 284 while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) { 285 *destP = *source1P + *source2P; 286 source1P++; 287 source2P++; 288 destP++; 289 n--; 290 } 291 292 // Now the source1P address is aligned and start to apply SSE. 293 int group = n / 4; 294 __m128* pSource1; 295 __m128* pSource2; 296 __m128* pDest; 297 __m128 source2; 298 __m128 dest; 299 300 bool source2Aligned = !(reinterpret_cast<size_t>(source2P) & 0x0F); 301 bool destAligned = !(reinterpret_cast<size_t>(destP) & 0x0F); 302 303 if (source2Aligned && destAligned) { // all aligned 304 while (group--) { 305 pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P)); 306 pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P)); 307 pDest = reinterpret_cast<__m128*>(destP); 308 *pDest = _mm_add_ps(*pSource1, *pSource2); 309 310 source1P += 4; 311 source2P += 4; 312 destP += 4; 313 } 314 315 } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned 316 while (group--) { 317 pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P)); 318 pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P)); 319 dest = _mm_add_ps(*pSource1, *pSource2); 320 _mm_storeu_ps(destP, dest); 321 322 source1P += 4; 323 source2P += 4; 324 destP += 4; 325 } 326 327 } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned 328 while (group--) { 329 pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P)); 330 source2 = _mm_loadu_ps(source2P); 331 pDest = reinterpret_cast<__m128*>(destP); 332 *pDest = _mm_add_ps(*pSource1, source2); 333 334 source1P += 4; 335 source2P += 4; 336 destP += 4; 337 } 338 } else if (!source2Aligned && !destAligned) { // both source2 and dest not aligned 339 while (group--) { 340 pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P)); 341 source2 = _mm_loadu_ps(source2P); 342 dest = _mm_add_ps(*pSource1, source2); 343 _mm_storeu_ps(destP, dest); 344 345 source1P += 4; 346 source2P += 4; 347 destP += 4; 348 } 349 } 350 351 // Non-SSE handling for remaining frames which is less than 4. 352 n %= 4; 353 while (n) { 354 *destP = *source1P + *source2P; 355 source1P++; 356 source2P++; 357 destP++; 358 n--; 359 } 360 } else { // if strides are not 1, rollback to normal algorithm 361 #elif HAVE(ARM_NEON_INTRINSICS) 362 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { 363 int tailFrames = n % 4; 364 const float* endP = destP + n - tailFrames; 365 366 while (destP < endP) { 367 float32x4_t source1 = vld1q_f32(source1P); 368 float32x4_t source2 = vld1q_f32(source2P); 369 vst1q_f32(destP, vaddq_f32(source1, source2)); 370 371 source1P += 4; 372 source2P += 4; 373 destP += 4; 374 } 375 n = tailFrames; 376 } 377 #endif 378 while (n--) { 379 *destP = *source1P + *source2P; 380 source1P += sourceStride1; 381 source2P += sourceStride2; 382 destP += destStride; 383 } 384 #if CPU(X86) || CPU(X86_64) 385 } 386 #endif 387 } 388 389 void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) 390 { 391 392 int n = framesToProcess; 393 394 #if CPU(X86) || CPU(X86_64) 395 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { 396 // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 397 while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) { 398 *destP = *source1P * *source2P; 399 source1P++; 400 source2P++; 401 destP++; 402 n--; 403 } 404 405 // Now the source1P address aligned and start to apply SSE. 406 int tailFrames = n % 4; 407 const float* endP = destP + n - tailFrames; 408 __m128 pSource1; 409 __m128 pSource2; 410 __m128 dest; 411 412 bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F); 413 bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); 414 415 #define SSE2_MULT(loadInstr, storeInstr) \ 416 while (destP < endP) \ 417 { \ 418 pSource1 = _mm_load_ps(source1P); \ 419 pSource2 = _mm_##loadInstr##_ps(source2P); \ 420 dest = _mm_mul_ps(pSource1, pSource2); \ 421 _mm_##storeInstr##_ps(destP, dest); \ 422 source1P += 4; \ 423 source2P += 4; \ 424 destP += 4; \ 425 } 426 427 if (source2Aligned && destAligned) // Both aligned. 428 SSE2_MULT(load, store) 429 else if (source2Aligned && !destAligned) // Source2 is aligned but dest not. 430 SSE2_MULT(load, storeu) 431 else if (!source2Aligned && destAligned) // Dest is aligned but source2 not. 432 SSE2_MULT(loadu, store) 433 else // Neither aligned. 434 SSE2_MULT(loadu, storeu) 435 436 n = tailFrames; 437 } 438 #elif HAVE(ARM_NEON_INTRINSICS) 439 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { 440 int tailFrames = n % 4; 441 const float* endP = destP + n - tailFrames; 442 443 while (destP < endP) { 444 float32x4_t source1 = vld1q_f32(source1P); 445 float32x4_t source2 = vld1q_f32(source2P); 446 vst1q_f32(destP, vmulq_f32(source1, source2)); 447 448 source1P += 4; 449 source2P += 4; 450 destP += 4; 451 } 452 n = tailFrames; 453 } 454 #endif 455 while (n) { 456 *destP = *source1P * *source2P; 457 source1P += sourceStride1; 458 source2P += sourceStride2; 459 destP += destStride; 460 n--; 461 } 462 } 463 464 void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess) 465 { 466 unsigned i = 0; 467 #if CPU(X86) || CPU(X86_64) 468 // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned. 469 // Otherwise, fall through to the scalar code below. 470 if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F) 471 && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F) 472 && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F) 473 && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F) 474 && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F) 475 && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) { 476 477 unsigned endSize = framesToProcess - framesToProcess % 4; 478 while (i < endSize) { 479 __m128 real1 = _mm_load_ps(real1P + i); 480 __m128 real2 = _mm_load_ps(real2P + i); 481 __m128 imag1 = _mm_load_ps(imag1P + i); 482 __m128 imag2 = _mm_load_ps(imag2P + i); 483 __m128 real = _mm_mul_ps(real1, real2); 484 real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2)); 485 __m128 imag = _mm_mul_ps(real1, imag2); 486 imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2)); 487 _mm_store_ps(realDestP + i, real); 488 _mm_store_ps(imagDestP + i, imag); 489 i += 4; 490 } 491 } 492 #elif HAVE(ARM_NEON_INTRINSICS) 493 unsigned endSize = framesToProcess - framesToProcess % 4; 494 while (i < endSize) { 495 float32x4_t real1 = vld1q_f32(real1P + i); 496 float32x4_t real2 = vld1q_f32(real2P + i); 497 float32x4_t imag1 = vld1q_f32(imag1P + i); 498 float32x4_t imag2 = vld1q_f32(imag2P + i); 499 500 float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2); 501 float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2); 502 503 vst1q_f32(realDestP + i, realResult); 504 vst1q_f32(imagDestP + i, imagResult); 505 506 i += 4; 507 } 508 #endif 509 for (; i < framesToProcess; ++i) { 510 // Read and compute result before storing them, in case the 511 // destination is the same as one of the sources. 512 float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i]; 513 float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i]; 514 515 realDestP[i] = realResult; 516 imagDestP[i] = imagResult; 517 } 518 } 519 520 void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess) 521 { 522 int n = framesToProcess; 523 float sum = 0; 524 525 #if CPU(X86) || CPU(X86_64) 526 if (sourceStride == 1) { 527 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 528 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { 529 float sample = *sourceP; 530 sum += sample * sample; 531 sourceP++; 532 n--; 533 } 534 535 // Now the sourceP is aligned, use SSE. 536 int tailFrames = n % 4; 537 const float* endP = sourceP + n - tailFrames; 538 __m128 source; 539 __m128 mSum = _mm_setzero_ps(); 540 541 while (sourceP < endP) { 542 source = _mm_load_ps(sourceP); 543 source = _mm_mul_ps(source, source); 544 mSum = _mm_add_ps(mSum, source); 545 sourceP += 4; 546 } 547 548 // Summarize the SSE results. 549 const float* groupSumP = reinterpret_cast<float*>(&mSum); 550 sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; 551 552 n = tailFrames; 553 } 554 #elif HAVE(ARM_NEON_INTRINSICS) 555 if (sourceStride == 1) { 556 int tailFrames = n % 4; 557 const float* endP = sourceP + n - tailFrames; 558 559 float32x4_t fourSum = vdupq_n_f32(0); 560 while (sourceP < endP) { 561 float32x4_t source = vld1q_f32(sourceP); 562 fourSum = vmlaq_f32(fourSum, source, source); 563 sourceP += 4; 564 } 565 float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum)); 566 567 float groupSum[2]; 568 vst1_f32(groupSum, twoSum); 569 sum += groupSum[0] + groupSum[1]; 570 571 n = tailFrames; 572 } 573 #endif 574 575 while (n--) { 576 float sample = *sourceP; 577 sum += sample * sample; 578 sourceP += sourceStride; 579 } 580 581 ASSERT(sumP); 582 *sumP = sum; 583 } 584 585 void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess) 586 { 587 int n = framesToProcess; 588 float max = 0; 589 590 #if CPU(X86) || CPU(X86_64) 591 if (sourceStride == 1) { 592 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 593 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { 594 max = std::max(max, fabsf(*sourceP)); 595 sourceP++; 596 n--; 597 } 598 599 // Now the sourceP is aligned, use SSE. 600 int tailFrames = n % 4; 601 const float* endP = sourceP + n - tailFrames; 602 __m128 source; 603 __m128 mMax = _mm_setzero_ps(); 604 int mask = 0x7FFFFFFF; 605 __m128 mMask = _mm_set1_ps(*reinterpret_cast<float*>(&mask)); 606 607 while (sourceP < endP) { 608 source = _mm_load_ps(sourceP); 609 // Calculate the absolute value by anding source with mask, the sign bit is set to 0. 610 source = _mm_and_ps(source, mMask); 611 mMax = _mm_max_ps(mMax, source); 612 sourceP += 4; 613 } 614 615 // Get max from the SSE results. 616 const float* groupMaxP = reinterpret_cast<float*>(&mMax); 617 max = std::max(max, groupMaxP[0]); 618 max = std::max(max, groupMaxP[1]); 619 max = std::max(max, groupMaxP[2]); 620 max = std::max(max, groupMaxP[3]); 621 622 n = tailFrames; 623 } 624 #elif HAVE(ARM_NEON_INTRINSICS) 625 if (sourceStride == 1) { 626 int tailFrames = n % 4; 627 const float* endP = sourceP + n - tailFrames; 628 629 float32x4_t fourMax = vdupq_n_f32(0); 630 while (sourceP < endP) { 631 float32x4_t source = vld1q_f32(sourceP); 632 fourMax = vmaxq_f32(fourMax, vabsq_f32(source)); 633 sourceP += 4; 634 } 635 float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax)); 636 637 float groupMax[2]; 638 vst1_f32(groupMax, twoMax); 639 max = std::max(groupMax[0], groupMax[1]); 640 641 n = tailFrames; 642 } 643 #endif 644 645 while (n--) { 646 max = std::max(max, fabsf(*sourceP)); 647 sourceP += sourceStride; 648 } 649 650 ASSERT(maxP); 651 *maxP = max; 652 } 653 654 void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess) 655 { 656 int n = framesToProcess; 657 float lowThreshold = *lowThresholdP; 658 float highThreshold = *highThresholdP; 659 660 // FIXME: Optimize for SSE2. 661 #if HAVE(ARM_NEON_INTRINSICS) 662 if ((sourceStride == 1) && (destStride == 1)) { 663 int tailFrames = n % 4; 664 const float* endP = destP + n - tailFrames; 665 666 float32x4_t low = vdupq_n_f32(lowThreshold); 667 float32x4_t high = vdupq_n_f32(highThreshold); 668 while (destP < endP) { 669 float32x4_t source = vld1q_f32(sourceP); 670 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low)); 671 sourceP += 4; 672 destP += 4; 673 } 674 n = tailFrames; 675 } 676 #endif 677 while (n--) { 678 *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold); 679 sourceP += sourceStride; 680 destP += destStride; 681 } 682 } 683 684 #endif // OS(MACOSX) 685 686 } // namespace VectorMath 687 688 } // namespace blink 689 690 #endif // ENABLE(WEB_AUDIO) 691