Home | History | Annotate | Download | only in audio
      1 /*
      2  * Copyright (C) 2011 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  *
      8  * 1.  Redistributions of source code must retain the above copyright
      9  *     notice, this list of conditions and the following disclaimer.
     10  * 2.  Redistributions in binary form must reproduce the above copyright
     11  *     notice, this list of conditions and the following disclaimer in the
     12  *     documentation and/or other materials provided with the distribution.
     13  * 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
     14  *     its contributors may be used to endorse or promote products derived
     15  *     from this software without specific prior written permission.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
     18  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     20  * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
     21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 
     29 #include "config.h"
     30 
     31 #if ENABLE(WEB_AUDIO)
     32 
     33 #include "platform/audio/SincResampler.h"
     34 
     35 #include "platform/audio/AudioBus.h"
     36 #include "wtf/CPU.h"
     37 #include "wtf/MathExtras.h"
     38 
     39 #if CPU(X86) || CPU(X86_64)
     40 #include <emmintrin.h>
     41 #endif
     42 
     43 using namespace std;
     44 
     45 // Input buffer layout, dividing the total buffer into regions (r0 - r5):
     46 //
     47 // |----------------|----------------------------------------------------------------|----------------|
     48 //
     49 //                                              blockSize + kernelSize / 2
     50 //                   <-------------------------------------------------------------------------------->
     51 //                                                  r0
     52 //
     53 //   kernelSize / 2   kernelSize / 2                                 kernelSize / 2     kernelSize / 2
     54 // <---------------> <--------------->                              <---------------> <--------------->
     55 //         r1                r2                                             r3                r4
     56 //
     57 //                                              blockSize
     58 //                                     <-------------------------------------------------------------->
     59 //                                                  r5
     60 
     61 // The Algorithm:
     62 //
     63 // 1) Consume input frames into r0 (r1 is zero-initialized).
     64 // 2) Position kernel centered at start of r0 (r2) and generate output frames until kernel is centered at start of r4.
     65 //    or we've finished generating all the output frames.
     66 // 3) Copy r3 to r1 and r4 to r2.
     67 // 4) Consume input frames into r5 (zero-pad if we run out of input).
     68 // 5) Goto (2) until all of input is consumed.
     69 //
     70 // note: we're glossing over how the sub-sample handling works with m_virtualSourceIndex, etc.
     71 
     72 namespace WebCore {
     73 
     74 SincResampler::SincResampler(double scaleFactor, unsigned kernelSize, unsigned numberOfKernelOffsets)
     75     : m_scaleFactor(scaleFactor)
     76     , m_kernelSize(kernelSize)
     77     , m_numberOfKernelOffsets(numberOfKernelOffsets)
     78     , m_kernelStorage(m_kernelSize * (m_numberOfKernelOffsets + 1))
     79     , m_virtualSourceIndex(0)
     80     , m_blockSize(512)
     81     , m_inputBuffer(m_blockSize + m_kernelSize) // See input buffer layout above.
     82     , m_source(0)
     83     , m_sourceFramesAvailable(0)
     84     , m_sourceProvider(0)
     85     , m_isBufferPrimed(false)
     86 {
     87     initializeKernel();
     88 }
     89 
     90 void SincResampler::initializeKernel()
     91 {
     92     // Blackman window parameters.
     93     double alpha = 0.16;
     94     double a0 = 0.5 * (1.0 - alpha);
     95     double a1 = 0.5;
     96     double a2 = 0.5 * alpha;
     97 
     98     // sincScaleFactor is basically the normalized cutoff frequency of the low-pass filter.
     99     double sincScaleFactor = m_scaleFactor > 1.0 ? 1.0 / m_scaleFactor : 1.0;
    100 
    101     // The sinc function is an idealized brick-wall filter, but since we're windowing it the
    102     // transition from pass to stop does not happen right away. So we should adjust the
    103     // lowpass filter cutoff slightly downward to avoid some aliasing at the very high-end.
    104     // FIXME: this value is empirical and to be more exact should vary depending on m_kernelSize.
    105     sincScaleFactor *= 0.9;
    106 
    107     int n = m_kernelSize;
    108     int halfSize = n / 2;
    109 
    110     // Generates a set of windowed sinc() kernels.
    111     // We generate a range of sub-sample offsets from 0.0 to 1.0.
    112     for (unsigned offsetIndex = 0; offsetIndex <= m_numberOfKernelOffsets; ++offsetIndex) {
    113         double subsampleOffset = static_cast<double>(offsetIndex) / m_numberOfKernelOffsets;
    114 
    115         for (int i = 0; i < n; ++i) {
    116             // Compute the sinc() with offset.
    117             double s = sincScaleFactor * piDouble * (i - halfSize - subsampleOffset);
    118             double sinc = !s ? 1.0 : sin(s) / s;
    119             sinc *= sincScaleFactor;
    120 
    121             // Compute Blackman window, matching the offset of the sinc().
    122             double x = (i - subsampleOffset) / n;
    123             double window = a0 - a1 * cos(twoPiDouble * x) + a2 * cos(twoPiDouble * 2.0 * x);
    124 
    125             // Window the sinc() function and store at the correct offset.
    126             m_kernelStorage[i + offsetIndex * m_kernelSize] = sinc * window;
    127         }
    128     }
    129 }
    130 
    131 void SincResampler::consumeSource(float* buffer, unsigned numberOfSourceFrames)
    132 {
    133     ASSERT(m_sourceProvider);
    134     if (!m_sourceProvider)
    135         return;
    136 
    137     // Wrap the provided buffer by an AudioBus for use by the source provider.
    138     RefPtr<AudioBus> bus = AudioBus::create(1, numberOfSourceFrames, false);
    139 
    140     // FIXME: Find a way to make the following const-correct:
    141     bus->setChannelMemory(0, buffer, numberOfSourceFrames);
    142 
    143     m_sourceProvider->provideInput(bus.get(), numberOfSourceFrames);
    144 }
    145 
    146 namespace {
    147 
    148 // BufferSourceProvider is an AudioSourceProvider wrapping an in-memory buffer.
    149 
    150 class BufferSourceProvider FINAL : public AudioSourceProvider {
    151 public:
    152     BufferSourceProvider(const float* source, size_t numberOfSourceFrames)
    153         : m_source(source)
    154         , m_sourceFramesAvailable(numberOfSourceFrames)
    155     {
    156     }
    157 
    158     // Consumes samples from the in-memory buffer.
    159     virtual void provideInput(AudioBus* bus, size_t framesToProcess) OVERRIDE
    160     {
    161         ASSERT(m_source && bus);
    162         if (!m_source || !bus)
    163             return;
    164 
    165         float* buffer = bus->channel(0)->mutableData();
    166 
    167         // Clamp to number of frames available and zero-pad.
    168         size_t framesToCopy = min(m_sourceFramesAvailable, framesToProcess);
    169         memcpy(buffer, m_source, sizeof(float) * framesToCopy);
    170 
    171         // Zero-pad if necessary.
    172         if (framesToCopy < framesToProcess)
    173             memset(buffer + framesToCopy, 0, sizeof(float) * (framesToProcess - framesToCopy));
    174 
    175         m_sourceFramesAvailable -= framesToCopy;
    176         m_source += framesToCopy;
    177     }
    178 
    179 private:
    180     const float* m_source;
    181     size_t m_sourceFramesAvailable;
    182 };
    183 
    184 } // namespace
    185 
    186 void SincResampler::process(const float* source, float* destination, unsigned numberOfSourceFrames)
    187 {
    188     // Resample an in-memory buffer using an AudioSourceProvider.
    189     BufferSourceProvider sourceProvider(source, numberOfSourceFrames);
    190 
    191     unsigned numberOfDestinationFrames = static_cast<unsigned>(numberOfSourceFrames / m_scaleFactor);
    192     unsigned remaining = numberOfDestinationFrames;
    193 
    194     while (remaining) {
    195         unsigned framesThisTime = min(remaining, m_blockSize);
    196         process(&sourceProvider, destination, framesThisTime);
    197 
    198         destination += framesThisTime;
    199         remaining -= framesThisTime;
    200     }
    201 }
    202 
    203 void SincResampler::process(AudioSourceProvider* sourceProvider, float* destination, size_t framesToProcess)
    204 {
    205     bool isGood = sourceProvider && m_blockSize > m_kernelSize && m_inputBuffer.size() >= m_blockSize + m_kernelSize && !(m_kernelSize % 2);
    206     ASSERT(isGood);
    207     if (!isGood)
    208         return;
    209 
    210     m_sourceProvider = sourceProvider;
    211 
    212     unsigned numberOfDestinationFrames = framesToProcess;
    213 
    214     // Setup various region pointers in the buffer (see diagram above).
    215     float* r0 = m_inputBuffer.data() + m_kernelSize / 2;
    216     float* r1 = m_inputBuffer.data();
    217     float* r2 = r0;
    218     float* r3 = r0 + m_blockSize - m_kernelSize / 2;
    219     float* r4 = r0 + m_blockSize;
    220     float* r5 = r0 + m_kernelSize / 2;
    221 
    222     // Step (1)
    223     // Prime the input buffer at the start of the input stream.
    224     if (!m_isBufferPrimed) {
    225         consumeSource(r0, m_blockSize + m_kernelSize / 2);
    226         m_isBufferPrimed = true;
    227     }
    228 
    229     // Step (2)
    230 
    231     while (numberOfDestinationFrames) {
    232         while (m_virtualSourceIndex < m_blockSize) {
    233             // m_virtualSourceIndex lies in between two kernel offsets so figure out what they are.
    234             int sourceIndexI = static_cast<int>(m_virtualSourceIndex);
    235             double subsampleRemainder = m_virtualSourceIndex - sourceIndexI;
    236 
    237             double virtualOffsetIndex = subsampleRemainder * m_numberOfKernelOffsets;
    238             int offsetIndex = static_cast<int>(virtualOffsetIndex);
    239 
    240             float* k1 = m_kernelStorage.data() + offsetIndex * m_kernelSize;
    241             float* k2 = k1 + m_kernelSize;
    242 
    243             // Initialize input pointer based on quantized m_virtualSourceIndex.
    244             float* inputP = r1 + sourceIndexI;
    245 
    246             // We'll compute "convolutions" for the two kernels which straddle m_virtualSourceIndex
    247             float sum1 = 0;
    248             float sum2 = 0;
    249 
    250             // Figure out how much to weight each kernel's "convolution".
    251             double kernelInterpolationFactor = virtualOffsetIndex - offsetIndex;
    252 
    253             // Generate a single output sample.
    254             int n = m_kernelSize;
    255 
    256 #define CONVOLVE_ONE_SAMPLE      \
    257             input = *inputP++;   \
    258             sum1 += input * *k1; \
    259             sum2 += input * *k2; \
    260             ++k1;                \
    261             ++k2;
    262 
    263             {
    264                 float input;
    265 
    266 #if CPU(X86) || CPU(X86_64)
    267                 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed seperately.
    268                 while ((reinterpret_cast<uintptr_t>(inputP) & 0x0F) && n) {
    269                     CONVOLVE_ONE_SAMPLE
    270                     n--;
    271                 }
    272 
    273                 // Now the inputP is aligned and start to apply SSE.
    274                 float* endP = inputP + n - n % 4;
    275                 __m128 mInput;
    276                 __m128 mK1;
    277                 __m128 mK2;
    278                 __m128 mul1;
    279                 __m128 mul2;
    280 
    281                 __m128 sums1 = _mm_setzero_ps();
    282                 __m128 sums2 = _mm_setzero_ps();
    283                 bool k1Aligned = !(reinterpret_cast<uintptr_t>(k1) & 0x0F);
    284                 bool k2Aligned = !(reinterpret_cast<uintptr_t>(k2) & 0x0F);
    285 
    286 #define LOAD_DATA(l1, l2)                        \
    287                 mInput = _mm_load_ps(inputP);    \
    288                 mK1 = _mm_##l1##_ps(k1);         \
    289                 mK2 = _mm_##l2##_ps(k2);
    290 
    291 #define CONVOLVE_4_SAMPLES                       \
    292                 mul1 = _mm_mul_ps(mInput, mK1);  \
    293                 mul2 = _mm_mul_ps(mInput, mK2);  \
    294                 sums1 = _mm_add_ps(sums1, mul1); \
    295                 sums2 = _mm_add_ps(sums2, mul2); \
    296                 inputP += 4;                     \
    297                 k1 += 4;                         \
    298                 k2 += 4;
    299 
    300                 if (k1Aligned && k2Aligned) { // both aligned
    301                     while (inputP < endP) {
    302                         LOAD_DATA(load, load)
    303                         CONVOLVE_4_SAMPLES
    304                     }
    305                 } else if (!k1Aligned && k2Aligned) { // only k2 aligned
    306                     while (inputP < endP) {
    307                         LOAD_DATA(loadu, load)
    308                         CONVOLVE_4_SAMPLES
    309                     }
    310                 } else if (k1Aligned && !k2Aligned) { // only k1 aligned
    311                     while (inputP < endP) {
    312                         LOAD_DATA(load, loadu)
    313                         CONVOLVE_4_SAMPLES
    314                     }
    315                 } else { // both non-aligned
    316                     while (inputP < endP) {
    317                         LOAD_DATA(loadu, loadu)
    318                         CONVOLVE_4_SAMPLES
    319                     }
    320                 }
    321 
    322                 // Summarize the SSE results to sum1 and sum2.
    323                 float* groupSumP = reinterpret_cast<float*>(&sums1);
    324                 sum1 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
    325                 groupSumP = reinterpret_cast<float*>(&sums2);
    326                 sum2 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
    327 
    328                 n %= 4;
    329                 while (n) {
    330                     CONVOLVE_ONE_SAMPLE
    331                     n--;
    332                 }
    333 #else
    334                 // FIXME: add ARM NEON optimizations for the following. The scalar code-path can probably also be optimized better.
    335 
    336                 // Optimize size 32 and size 64 kernels by unrolling the while loop.
    337                 // A 20 - 30% speed improvement was measured in some cases by using this approach.
    338 
    339                 if (n == 32) {
    340                     CONVOLVE_ONE_SAMPLE // 1
    341                     CONVOLVE_ONE_SAMPLE // 2
    342                     CONVOLVE_ONE_SAMPLE // 3
    343                     CONVOLVE_ONE_SAMPLE // 4
    344                     CONVOLVE_ONE_SAMPLE // 5
    345                     CONVOLVE_ONE_SAMPLE // 6
    346                     CONVOLVE_ONE_SAMPLE // 7
    347                     CONVOLVE_ONE_SAMPLE // 8
    348                     CONVOLVE_ONE_SAMPLE // 9
    349                     CONVOLVE_ONE_SAMPLE // 10
    350                     CONVOLVE_ONE_SAMPLE // 11
    351                     CONVOLVE_ONE_SAMPLE // 12
    352                     CONVOLVE_ONE_SAMPLE // 13
    353                     CONVOLVE_ONE_SAMPLE // 14
    354                     CONVOLVE_ONE_SAMPLE // 15
    355                     CONVOLVE_ONE_SAMPLE // 16
    356                     CONVOLVE_ONE_SAMPLE // 17
    357                     CONVOLVE_ONE_SAMPLE // 18
    358                     CONVOLVE_ONE_SAMPLE // 19
    359                     CONVOLVE_ONE_SAMPLE // 20
    360                     CONVOLVE_ONE_SAMPLE // 21
    361                     CONVOLVE_ONE_SAMPLE // 22
    362                     CONVOLVE_ONE_SAMPLE // 23
    363                     CONVOLVE_ONE_SAMPLE // 24
    364                     CONVOLVE_ONE_SAMPLE // 25
    365                     CONVOLVE_ONE_SAMPLE // 26
    366                     CONVOLVE_ONE_SAMPLE // 27
    367                     CONVOLVE_ONE_SAMPLE // 28
    368                     CONVOLVE_ONE_SAMPLE // 29
    369                     CONVOLVE_ONE_SAMPLE // 30
    370                     CONVOLVE_ONE_SAMPLE // 31
    371                     CONVOLVE_ONE_SAMPLE // 32
    372                 } else if (n == 64) {
    373                     CONVOLVE_ONE_SAMPLE // 1
    374                     CONVOLVE_ONE_SAMPLE // 2
    375                     CONVOLVE_ONE_SAMPLE // 3
    376                     CONVOLVE_ONE_SAMPLE // 4
    377                     CONVOLVE_ONE_SAMPLE // 5
    378                     CONVOLVE_ONE_SAMPLE // 6
    379                     CONVOLVE_ONE_SAMPLE // 7
    380                     CONVOLVE_ONE_SAMPLE // 8
    381                     CONVOLVE_ONE_SAMPLE // 9
    382                     CONVOLVE_ONE_SAMPLE // 10
    383                     CONVOLVE_ONE_SAMPLE // 11
    384                     CONVOLVE_ONE_SAMPLE // 12
    385                     CONVOLVE_ONE_SAMPLE // 13
    386                     CONVOLVE_ONE_SAMPLE // 14
    387                     CONVOLVE_ONE_SAMPLE // 15
    388                     CONVOLVE_ONE_SAMPLE // 16
    389                     CONVOLVE_ONE_SAMPLE // 17
    390                     CONVOLVE_ONE_SAMPLE // 18
    391                     CONVOLVE_ONE_SAMPLE // 19
    392                     CONVOLVE_ONE_SAMPLE // 20
    393                     CONVOLVE_ONE_SAMPLE // 21
    394                     CONVOLVE_ONE_SAMPLE // 22
    395                     CONVOLVE_ONE_SAMPLE // 23
    396                     CONVOLVE_ONE_SAMPLE // 24
    397                     CONVOLVE_ONE_SAMPLE // 25
    398                     CONVOLVE_ONE_SAMPLE // 26
    399                     CONVOLVE_ONE_SAMPLE // 27
    400                     CONVOLVE_ONE_SAMPLE // 28
    401                     CONVOLVE_ONE_SAMPLE // 29
    402                     CONVOLVE_ONE_SAMPLE // 30
    403                     CONVOLVE_ONE_SAMPLE // 31
    404                     CONVOLVE_ONE_SAMPLE // 32
    405                     CONVOLVE_ONE_SAMPLE // 33
    406                     CONVOLVE_ONE_SAMPLE // 34
    407                     CONVOLVE_ONE_SAMPLE // 35
    408                     CONVOLVE_ONE_SAMPLE // 36
    409                     CONVOLVE_ONE_SAMPLE // 37
    410                     CONVOLVE_ONE_SAMPLE // 38
    411                     CONVOLVE_ONE_SAMPLE // 39
    412                     CONVOLVE_ONE_SAMPLE // 40
    413                     CONVOLVE_ONE_SAMPLE // 41
    414                     CONVOLVE_ONE_SAMPLE // 42
    415                     CONVOLVE_ONE_SAMPLE // 43
    416                     CONVOLVE_ONE_SAMPLE // 44
    417                     CONVOLVE_ONE_SAMPLE // 45
    418                     CONVOLVE_ONE_SAMPLE // 46
    419                     CONVOLVE_ONE_SAMPLE // 47
    420                     CONVOLVE_ONE_SAMPLE // 48
    421                     CONVOLVE_ONE_SAMPLE // 49
    422                     CONVOLVE_ONE_SAMPLE // 50
    423                     CONVOLVE_ONE_SAMPLE // 51
    424                     CONVOLVE_ONE_SAMPLE // 52
    425                     CONVOLVE_ONE_SAMPLE // 53
    426                     CONVOLVE_ONE_SAMPLE // 54
    427                     CONVOLVE_ONE_SAMPLE // 55
    428                     CONVOLVE_ONE_SAMPLE // 56
    429                     CONVOLVE_ONE_SAMPLE // 57
    430                     CONVOLVE_ONE_SAMPLE // 58
    431                     CONVOLVE_ONE_SAMPLE // 59
    432                     CONVOLVE_ONE_SAMPLE // 60
    433                     CONVOLVE_ONE_SAMPLE // 61
    434                     CONVOLVE_ONE_SAMPLE // 62
    435                     CONVOLVE_ONE_SAMPLE // 63
    436                     CONVOLVE_ONE_SAMPLE // 64
    437                 } else {
    438                     while (n--) {
    439                         // Non-optimized using actual while loop.
    440                         CONVOLVE_ONE_SAMPLE
    441                     }
    442                 }
    443 #endif
    444             }
    445 
    446             // Linearly interpolate the two "convolutions".
    447             double result = (1.0 - kernelInterpolationFactor) * sum1 + kernelInterpolationFactor * sum2;
    448 
    449             *destination++ = result;
    450 
    451             // Advance the virtual index.
    452             m_virtualSourceIndex += m_scaleFactor;
    453 
    454             --numberOfDestinationFrames;
    455             if (!numberOfDestinationFrames)
    456                 return;
    457         }
    458 
    459         // Wrap back around to the start.
    460         m_virtualSourceIndex -= m_blockSize;
    461 
    462         // Step (3) Copy r3 to r1 and r4 to r2.
    463         // This wraps the last input frames back to the start of the buffer.
    464         memcpy(r1, r3, sizeof(float) * (m_kernelSize / 2));
    465         memcpy(r2, r4, sizeof(float) * (m_kernelSize / 2));
    466 
    467         // Step (4)
    468         // Refresh the buffer with more input.
    469         consumeSource(r5, m_blockSize);
    470     }
    471 }
    472 
    473 } // namespace WebCore
    474 
    475 #endif // ENABLE(WEB_AUDIO)
    476