/*
 * Copyright (C) 2010, Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.  Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#if ENABLE(WEB_AUDIO)

#include "platform/audio/VectorMath.h"
#include "wtf/Assertions.h"
#include "wtf/CPU.h"
#include <stdint.h>

#if OS(MACOSX)
#include <Accelerate/Accelerate.h>
#endif

#if CPU(X86) || CPU(X86_64)
#include <emmintrin.h>
#endif

#if HAVE(ARM_NEON_INTRINSICS)
#include <arm_neon.h>
#endif

#include <math.h>
#include <algorithm>

namespace WebCore {

namespace VectorMath {

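// Each function below applies an element-wise operation over framesToProcess
// samples, reading and writing with the given strides (a stride of 1 means a
// contiguous buffer):
//
//   vsmul   : destP[i] = sourceP[i] * *scale
//   vadd    : destP[i] = source1P[i] + source2P[i]
//   vmul    : destP[i] = source1P[i] * source2P[i]
//   zvmul   : element-wise multiply of two split-complex (real/imag) arrays
//   vsma    : destP[i] += sourceP[i] * *scale
//   vmaxmgv : *maxP = maximum absolute value over the input
//   vsvesq  : *sumP = sum of squared values over the input
//   vclip   : destP[i] = clamp(sourceP[i], *lowThresholdP, *highThresholdP)
//
// A minimal usage sketch (illustrative only -- "gain", "buffer" and
// "bufferLength" are hypothetical names, not part of this file):
//
//   float gain = 0.5f;
//   VectorMath::vsmul(buffer, 1, &gain, buffer, 1, bufferLength);
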
#if OS(MACOSX)
// On the Mac we use the highly optimized versions in Accelerate.framework.
// In 32-bit mode (__ppc__ or __i386__), <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h>, which defines macros with the same names as
// our namespaced functions, so we must handle this case differently. Other architectures (64-bit, ARM, etc.) do not include this header file.

void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
#if CPU(X86)
    ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#else
    vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#endif
}

void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if CPU(X86)
    ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}

void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if CPU(X86)
    ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}

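// Element-wise complex multiplication on split-complex (separate real/imaginary) data:
//   (real1 + i*imag1) * (real2 + i*imag2) = (real1*real2 - imag1*imag2) + i*(real1*imag2 + imag1*real2)
// The const_casts are needed only because DSPSplitComplex holds non-const
// pointers; the source data is not modified. The trailing argument of 1 asks
// vDSP_zvmul for a normal (non-conjugated) multiply.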
void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    DSPSplitComplex sc1;
    DSPSplitComplex sc2;
    DSPSplitComplex dest;
    sc1.realp = const_cast<float*>(real1P);
    sc1.imagp = const_cast<float*>(imag1P);
    sc2.realp = const_cast<float*>(real2P);
    sc2.imagp = const_cast<float*>(imag2P);
    dest.realp = realDestP;
    dest.imagp = imagDestP;
#if CPU(X86)
    ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#else
    vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#endif
}

void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess);
}

void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess);
}

void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess);
}

void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess);
}
#else

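// vsma: scalar multiply and add, destP[i] += sourceP[i] * *scale.
// With unit strides, sourceP is first advanced to a 16-byte boundary so the
// vector loop can use aligned loads; the scalar loop at the end handles
// non-unit strides and the leftover tail frames.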
void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP += k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now sourceP is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        __m128 pSource;
        __m128 dest;
        __m128 temp;
        __m128 mScale = _mm_set_ps1(k);

        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

#define SSE2_MULT_ADD(loadInstr, storeInstr)        \
            while (destP < endP)                    \
            {                                       \
                pSource = _mm_load_ps(sourceP);     \
                temp = _mm_mul_ps(pSource, mScale); \
                dest = _mm_##loadInstr##_ps(destP); \
                dest = _mm_add_ps(dest, temp);      \
                _mm_##storeInstr##_ps(destP, dest); \
                sourceP += 4;                       \
                destP += 4;                         \
            }

        if (destAligned)
            SSE2_MULT_ADD(load, store)
        else
            SSE2_MULT_ADD(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t k = vdupq_n_f32(*scale);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            float32x4_t dest = vld1q_f32(destP);

            dest = vmlaq_f32(dest, source, k);
            vst1q_f32(destP, dest);

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP += *sourceP * *scale;
        sourceP += sourceStride;
        destP += destStride;
        n--;
    }
}

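// vsmul: multiply by a scalar, destP[i] = sourceP[i] * *scale.
// Note the control flow on x86/x86_64: the "} else {" opened inside the first
// #if block is closed by the lone "}" in the second #if at the end of the
// function, so the trailing scalar loop only runs there when a stride is not 1.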
void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now sourceP is aligned; start applying SSE.
        int group = n / 4;
        __m128 mScale = _mm_set_ps1(k);
        __m128* pSource;
        __m128* pDest;
        __m128 dest;

        if (reinterpret_cast<uintptr_t>(destP) & 0x0F) {
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                dest = _mm_mul_ps(*pSource, mScale);
                _mm_storeu_ps(destP, dest);

                sourceP += 4;
                destP += 4;
            }
        } else {
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_mul_ps(*pSource, mScale);

                sourceP += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }
    } else { // If the strides are not 1, fall back to the scalar algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmulq_n_f32(source, k));

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    float k = *scale;
    while (n--) {
        *destP = k * *sourceP;
        sourceP += sourceStride;
        destP += destStride;
    }
#if CPU(X86) || CPU(X86_64)
    }
#endif
}

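// vadd: element-wise addition, destP[i] = source1P[i] + source2P[i].
// With unit strides, source1P is aligned first, then one of four SSE loops is
// chosen depending on whether source2P and destP are also 16-byte aligned.
// The #if-spanning "} else {" structure matches the one in vsmul() above.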
void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now source1P is aligned; start applying SSE.
        int group = n / 4;
        __m128* pSource1;
        __m128* pSource2;
        __m128* pDest;
        __m128 source2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

        if (source2Aligned && destAligned) { // All aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, *pSource2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }

        } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                dest = _mm_add_ps(*pSource1, *pSource2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }

        } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, source2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (!source2Aligned && !destAligned) { // Neither source2 nor dest aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                dest = _mm_add_ps(*pSource1, source2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }
    } else { // If the strides are not 1, fall back to the scalar algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vaddq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = *source1P + *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
    }
#if CPU(X86) || CPU(X86_64)
    }
#endif
}

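// vmul: element-wise multiplication, destP[i] = source1P[i] * source2P[i].
// The SSE2_MULT macro below expands into one loop per combination of
// aligned/unaligned access for source2P and destP; source1P is always aligned
// by the prologue loop.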
void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
            *destP = *source1P * *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now source1P is aligned; start applying SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;
        __m128 pSource1;
        __m128 pSource2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

#define SSE2_MULT(loadInstr, storeInstr)                   \
            while (destP < endP)                           \
            {                                              \
                pSource1 = _mm_load_ps(source1P);          \
                pSource2 = _mm_##loadInstr##_ps(source2P); \
                dest = _mm_mul_ps(pSource1, pSource2);     \
                _mm_##storeInstr##_ps(destP, dest);        \
                source1P += 4;                             \
                source2P += 4;                             \
                destP += 4;                                \
            }

        if (source2Aligned && destAligned) // Both aligned.
            SSE2_MULT(load, store)
        else if (source2Aligned && !destAligned) // Source2 is aligned but dest is not.
            SSE2_MULT(load, storeu)
        else if (!source2Aligned && destAligned) // Dest is aligned but source2 is not.
            SSE2_MULT(loadu, store)
        else // Neither is aligned.
            SSE2_MULT(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vmulq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP = *source1P * *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
        n--;
    }
}

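// zvmul: element-wise complex multiplication of split-complex arrays:
//   realDestP[i] = real1P[i] * real2P[i] - imag1P[i] * imag2P[i]
//   imagDestP[i] = real1P[i] * imag2P[i] + imag1P[i] * real2P[i]
// The SSE path is only taken when all six pointers are 16-byte aligned; the
// NEON path computes the same thing with multiply-accumulate (vmlaq_f32) and
// multiply-subtract (vmlsq_f32).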
void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    unsigned i = 0;
#if CPU(X86) || CPU(X86_64)
    // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
    // Otherwise, fall through to the scalar code below.
    if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) {

        unsigned endSize = framesToProcess - framesToProcess % 4;
        while (i < endSize) {
            __m128 real1 = _mm_load_ps(real1P + i);
            __m128 real2 = _mm_load_ps(real2P + i);
            __m128 imag1 = _mm_load_ps(imag1P + i);
            __m128 imag2 = _mm_load_ps(imag2P + i);
            __m128 real = _mm_mul_ps(real1, real2);
            real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2));
            __m128 imag = _mm_mul_ps(real1, imag2);
            imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2));
            _mm_store_ps(realDestP + i, real);
            _mm_store_ps(imagDestP + i, imag);
            i += 4;
        }
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    unsigned endSize = framesToProcess - framesToProcess % 4;
    while (i < endSize) {
        float32x4_t real1 = vld1q_f32(real1P + i);
        float32x4_t real2 = vld1q_f32(real2P + i);
        float32x4_t imag1 = vld1q_f32(imag1P + i);
        float32x4_t imag2 = vld1q_f32(imag2P + i);

        float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
        float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);

        vst1q_f32(realDestP + i, realResult);
        vst1q_f32(imagDestP + i, imagResult);

        i += 4;
    }
#endif
    for (; i < framesToProcess; ++i) {
        // Read and compute result before storing them, in case the
        // destination is the same as one of the sources.
        float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i];
        float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i];

        realDestP[i] = realResult;
        imagDestP[i] = imagResult;
    }
}

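// vsvesq: sum of squared values, *sumP = sum over i of sourceP[i] * sourceP[i].
// The vector paths keep four partial sums in a register and reduce them to a
// single float once the main loop is done.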
void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    int n = framesToProcess;
    float sum = 0;

#if CPU(X86) || CPU(X86_64)
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            float sample = *sourceP;
            sum += sample * sample;
            sourceP++;
            n--;
        }

        // Now sourceP is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mSum = _mm_setzero_ps();

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            source = _mm_mul_ps(source, source);
            mSum = _mm_add_ps(mSum, source);
            sourceP += 4;
        }

        // Sum the four partial results from the SSE register.
        const float* groupSumP = reinterpret_cast<float*>(&mSum);
        sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourSum = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourSum = vmlaq_f32(fourSum, source, source);
            sourceP += 4;
        }
        float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));

        float groupSum[2];
        vst1_f32(groupSum, twoSum);
        sum += groupSum[0] + groupSum[1];

        n = tailFrames;
    }
#endif

    while (n--) {
        float sample = *sourceP;
        sum += sample * sample;
        sourceP += sourceStride;
    }

    ASSERT(sumP);
    *sumP = sum;
}

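// vmaxmgv: maximum magnitude, *maxP = max over i of fabsf(sourceP[i]).
// The SSE path takes the absolute value by masking off the sign bit with
// 0x7FFFFFFF; the NEON path uses vabsq_f32.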
void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    int n = framesToProcess;
    float max = 0;

#if CPU(X86) || CPU(X86_64)
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            max = std::max(max, fabsf(*sourceP));
            sourceP++;
            n--;
        }

        // Now sourceP is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mMax = _mm_setzero_ps();
        int mask = 0x7FFFFFFF;
        __m128 mMask = _mm_set1_ps(*reinterpret_cast<float*>(&mask));

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            // Calculate the absolute value by ANDing source with the mask, which clears the sign bit.
            source = _mm_and_ps(source, mMask);
            mMax = _mm_max_ps(mMax, source);
            sourceP += 4;
        }

        // Get the max from the SSE results.
        const float* groupMaxP = reinterpret_cast<float*>(&mMax);
        max = std::max(max, groupMaxP[0]);
        max = std::max(max, groupMaxP[1]);
        max = std::max(max, groupMaxP[2]);
        max = std::max(max, groupMaxP[3]);

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourMax = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourMax = vmaxq_f32(fourMax, vabsq_f32(source));
            sourceP += 4;
        }
        float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));

        float groupMax[2];
        vst1_f32(groupMax, twoMax);
        max = std::max(groupMax[0], groupMax[1]);

        n = tailFrames;
    }
#endif

    while (n--) {
        max = std::max(max, fabsf(*sourceP));
        sourceP += sourceStride;
    }

    ASSERT(maxP);
    *maxP = max;
}

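// vclip: clamp each sample to [*lowThresholdP, *highThresholdP],
//   destP[i] = max(min(sourceP[i], highThreshold), lowThreshold).
// Only a NEON fast path exists so far; see the FIXME about SSE2 below.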
void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;
    float lowThreshold = *lowThresholdP;
    float highThreshold = *highThresholdP;

    // FIXME: Optimize for SSE2.
#if HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t low = vdupq_n_f32(lowThreshold);
        float32x4_t high = vdupq_n_f32(highThreshold);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));
            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold);
        sourceP += sourceStride;
        destP += destStride;
    }
}

#endif // OS(MACOSX)

} // namespace VectorMath

} // namespace WebCore

#endif // ENABLE(WEB_AUDIO)