/*
 * Copyright (C) 2010, Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.  Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#if ENABLE(WEB_AUDIO)

#include "core/platform/audio/VectorMath.h"

#include "wtf/Assertions.h"

#if OS(DARWIN)
#include <Accelerate/Accelerate.h>
#endif

#ifdef __SSE2__
#include <emmintrin.h>
#endif

#if HAVE(ARM_NEON_INTRINSICS)
#include <arm_neon.h>
#endif

#include <math.h>
#include <algorithm>

namespace WebCore {

namespace VectorMath {

#if OS(DARWIN)
// On the Mac we use the highly optimized versions in Accelerate.framework.
// In 32-bit mode (__ppc__ or __i386__), <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h>, which defines macros with the same names
// as our namespaced functions, so we must handle this case differently. Other architectures (64-bit, ARM, etc.) do not include this header file.
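//
// Illustrative call (hypothetical caller-side names "buffer", "gain", and "numberOfFrames"): scale a buffer in place with unit strides:
//     float gain = 0.5f;
//     VectorMath::vsmul(buffer, 1, &gain, buffer, 1, numberOfFrames);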

void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#else
    vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#endif
}

void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}

void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}

void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    DSPSplitComplex sc1;
    DSPSplitComplex sc2;
    DSPSplitComplex dest;
    sc1.realp = const_cast<float*>(real1P);
    sc1.imagp = const_cast<float*>(imag1P);
    sc2.realp = const_cast<float*>(real2P);
    sc2.imagp = const_cast<float*>(imag2P);
    dest.realp = realDestP;
    dest.imagp = imagDestP;
#if defined(__ppc__) || defined(__i386__)
    ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#else
    vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#endif
}

void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess);
}

void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess);
}

void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess);
}

void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess);
}
#else

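// vsma: vector scalar multiply-add. For each of the framesToProcess values (honoring the strides),
// destP[n] += *scale * sourceP[n].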
void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#ifdef __SSE2__
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If sourceP is not 16-byte aligned, process the first few frames (at most three) separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP += k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now that sourceP is aligned, use SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        __m128 pSource;
        __m128 dest;
        __m128 temp;
        __m128 mScale = _mm_set_ps1(k);

        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

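        // SSE2_MULT_ADD expands to the main 4-wide multiply-add loop. sourceP is known to be aligned here, so it is
        // always loaded with _mm_load_ps; the load/store flavor used for destP is chosen by the macro arguments below.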
#define SSE2_MULT_ADD(loadInstr, storeInstr)        \
            while (destP < endP)                    \
            {                                       \
                pSource = _mm_load_ps(sourceP);     \
                temp = _mm_mul_ps(pSource, mScale); \
                dest = _mm_##loadInstr##_ps(destP); \
                dest = _mm_add_ps(dest, temp);      \
                _mm_##storeInstr##_ps(destP, dest); \
                sourceP += 4;                       \
                destP += 4;                         \
            }

        if (destAligned)
            SSE2_MULT_ADD(load, store)
        else
            SSE2_MULT_ADD(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t k = vdupq_n_f32(*scale);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            float32x4_t dest = vld1q_f32(destP);

            dest = vmlaq_f32(dest, source, k);
            vst1q_f32(destP, dest);

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP += *sourceP * *scale;
        sourceP += sourceStride;
        destP += destStride;
        n--;
    }
}

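// vsmul: vector scalar multiply. For each of the framesToProcess values (honoring the strides),
// destP[n] = *scale * sourceP[n].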
void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#ifdef __SSE2__
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If sourceP is not 16-byte aligned, process the first few frames (at most three) separately.
        while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now that sourceP is aligned, use SSE.
        int group = n / 4;
        __m128 mScale = _mm_set_ps1(k);
        __m128* pSource;
        __m128* pDest;
        __m128 dest;

        if (reinterpret_cast<size_t>(destP) & 0x0F) {
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                dest = _mm_mul_ps(*pSource, mScale);
                _mm_storeu_ps(destP, dest);

                sourceP += 4;
                destP += 4;
            }
        } else {
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_mul_ps(*pSource, mScale);

                sourceP += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }
    } else { // If strides are not 1, fall back to the scalar loop.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmulq_n_f32(source, k));

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    float k = *scale;
    while (n--) {
        *destP = k * *sourceP;
        sourceP += sourceStride;
        destP += destStride;
    }
#ifdef __SSE2__
    }
#endif
}

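// vadd: element-wise addition. For each of the framesToProcess values (honoring the strides),
// destP[n] = source1P[n] + source2P[n].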
void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#ifdef __SSE2__
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If source1P is not 16-byte aligned, process the first few frames (at most three) separately.
        while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now that source1P is aligned, use SSE.
        int group = n / 4;
        __m128* pSource1;
        __m128* pSource2;
        __m128* pDest;
        __m128 source2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<size_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<size_t>(destP) & 0x0F);

        if (source2Aligned && destAligned) { // Both source2 and dest are aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, *pSource2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (source2Aligned && !destAligned) { // source2 is aligned but dest is not.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                dest = _mm_add_ps(*pSource1, *pSource2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (!source2Aligned && destAligned) { // source2 is not aligned but dest is.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, source2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (!source2Aligned && !destAligned) { // Neither source2 nor dest is aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                dest = _mm_add_ps(*pSource1, source2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }
    } else { // If strides are not 1, fall back to the scalar loop.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vaddq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = *source1P + *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
    }
#ifdef __SSE2__
    }
#endif
}

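// vmul: element-wise multiplication. For each of the framesToProcess values (honoring the strides),
// destP[n] = source1P[n] * source2P[n].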
void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#ifdef __SSE2__
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If source1P is not 16-byte aligned, process the first few frames (at most three) separately.
        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
            *destP = *source1P * *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now that source1P is aligned, use SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;
        __m128 pSource1;
        __m128 pSource2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

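        // SSE2_MULT expands to the main 4-wide multiply loop. source1P is known to be aligned here, so it is always
        // loaded with _mm_load_ps; the load/store flavor for source2P and destP is chosen by the macro arguments below.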
#define SSE2_MULT(loadInstr, storeInstr)                   \
            while (destP < endP)                           \
            {                                              \
                pSource1 = _mm_load_ps(source1P);          \
                pSource2 = _mm_##loadInstr##_ps(source2P); \
                dest = _mm_mul_ps(pSource1, pSource2);     \
                _mm_##storeInstr##_ps(destP, dest);        \
                source1P += 4;                             \
                source2P += 4;                             \
                destP += 4;                                \
            }

        if (source2Aligned && destAligned) // Both aligned.
            SSE2_MULT(load, store)
        else if (source2Aligned && !destAligned) // Source2 is aligned but dest not.
            SSE2_MULT(load, storeu)
        else if (!source2Aligned && destAligned) // Dest is aligned but source2 not.
            SSE2_MULT(loadu, store)
        else // Neither aligned.
            SSE2_MULT(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vmulq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP = *source1P * *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
        n--;
    }
}

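// zvmul: element-wise multiplication of two split-complex vectors:
//     (real1 + i * imag1) * (real2 + i * imag2) = (real1 * real2 - imag1 * imag2) + i * (real1 * imag2 + imag1 * real2)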
void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    unsigned i = 0;
#ifdef __SSE2__
    // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
    // Otherwise, fall through to the scalar code below.
    if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) {

        unsigned endSize = framesToProcess - framesToProcess % 4;
        while (i < endSize) {
            __m128 real1 = _mm_load_ps(real1P + i);
            __m128 real2 = _mm_load_ps(real2P + i);
            __m128 imag1 = _mm_load_ps(imag1P + i);
            __m128 imag2 = _mm_load_ps(imag2P + i);
            __m128 real = _mm_mul_ps(real1, real2);
            real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2));
            __m128 imag = _mm_mul_ps(real1, imag2);
            imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2));
            _mm_store_ps(realDestP + i, real);
            _mm_store_ps(imagDestP + i, imag);
            i += 4;
        }
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    unsigned endSize = framesToProcess - framesToProcess % 4;
    while (i < endSize) {
        float32x4_t real1 = vld1q_f32(real1P + i);
        float32x4_t real2 = vld1q_f32(real2P + i);
        float32x4_t imag1 = vld1q_f32(imag1P + i);
        float32x4_t imag2 = vld1q_f32(imag2P + i);

        float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
        float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);

        vst1q_f32(realDestP + i, realResult);
        vst1q_f32(imagDestP + i, imagResult);

        i += 4;
    }
#endif
    for (; i < framesToProcess; ++i) {
        // Read and compute the result before storing it, in case the
        // destination is the same as one of the sources.
        float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i];
        float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i];

        realDestP[i] = realResult;
        imagDestP[i] = imagResult;
    }
}

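// vsvesq: sum of squared values. *sumP is set to the sum of sourceP[n]^2 over the framesToProcess values,
// honoring sourceStride.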
void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    int n = framesToProcess;
    float sum = 0;

#ifdef __SSE2__
    if (sourceStride == 1) {
        // If sourceP is not 16-byte aligned, process the first few frames (at most three) separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            float sample = *sourceP;
            sum += sample * sample;
            sourceP++;
            n--;
        }

        // Now that sourceP is aligned, use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mSum = _mm_setzero_ps();

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            source = _mm_mul_ps(source, source);
            mSum = _mm_add_ps(mSum, source);
            sourceP += 4;
        }

        // Sum the four SSE lanes.
        const float* groupSumP = reinterpret_cast<float*>(&mSum);
        sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourSum = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourSum = vmlaq_f32(fourSum, source, source);
            sourceP += 4;
        }
        float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));

        float groupSum[2];
        vst1_f32(groupSum, twoSum);
        sum += groupSum[0] + groupSum[1];

        n = tailFrames;
    }
#endif

    while (n--) {
        float sample = *sourceP;
        sum += sample * sample;
        sourceP += sourceStride;
    }

    ASSERT(sumP);
    *sumP = sum;
}

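// vmaxmgv: maximum magnitude. *maxP is set to the largest |sourceP[n]| over the framesToProcess values,
// honoring sourceStride.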
void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    int n = framesToProcess;
    float max = 0;

#ifdef __SSE2__
    if (sourceStride == 1) {
        // If sourceP is not 16-byte aligned, process the first few frames (at most three) separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            max = std::max(max, fabsf(*sourceP));
            sourceP++;
            n--;
        }

        // Now that sourceP is aligned, use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mMax = _mm_setzero_ps();
        int mask = 0x7FFFFFFF;
        __m128 mMask = _mm_set1_ps(*reinterpret_cast<float*>(&mask));

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            // Compute the absolute value by ANDing source with the mask, which clears the sign bit.
            source = _mm_and_ps(source, mMask);
            mMax = _mm_max_ps(mMax, source);
            sourceP += 4;
        }

        // Take the max of the four SSE lanes.
        const float* groupMaxP = reinterpret_cast<float*>(&mMax);
        max = std::max(max, groupMaxP[0]);
        max = std::max(max, groupMaxP[1]);
        max = std::max(max, groupMaxP[2]);
        max = std::max(max, groupMaxP[3]);

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourMax = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourMax = vmaxq_f32(fourMax, vabsq_f32(source));
            sourceP += 4;
        }
        float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));

        float groupMax[2];
        vst1_f32(groupMax, twoMax);
        max = std::max(groupMax[0], groupMax[1]);

        n = tailFrames;
    }
#endif

    while (n--) {
        max = std::max(max, fabsf(*sourceP));
        sourceP += sourceStride;
    }

    ASSERT(maxP);
    *maxP = max;
}

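// vclip: clamp each value into [*lowThresholdP, *highThresholdP]. For each of the framesToProcess values (honoring the strides),
// destP[n] = max(min(sourceP[n], *highThresholdP), *lowThresholdP).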
void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;
    float lowThreshold = *lowThresholdP;
    float highThreshold = *highThresholdP;

    // FIXME: Optimize for SSE2.
#if HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t low = vdupq_n_f32(lowThreshold);
        float32x4_t high = vdupq_n_f32(highThreshold);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));
            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold);
        sourceP += sourceStride;
        destP += destStride;
    }
}

#endif // OS(DARWIN)

} // namespace VectorMath

} // namespace WebCore

#endif // ENABLE(WEB_AUDIO)