1 /* 2 * Copyright (C) 2011 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of 14 * its contributors may be used to endorse or promote products derived 15 * from this software without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY 18 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY 21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 #include "config.h" 30 31 #if ENABLE(WEB_AUDIO) 32 33 #include "platform/audio/SincResampler.h" 34 35 #include "platform/audio/AudioBus.h" 36 #include "wtf/CPU.h" 37 #include "wtf/MathExtras.h" 38 39 #if CPU(X86) || CPU(X86_64) 40 #include <emmintrin.h> 41 #endif 42 43 using namespace std; 44 45 // Input buffer layout, dividing the total buffer into regions (r0 - r5): 46 // 47 // |----------------|----------------------------------------------------------------|----------------| 48 // 49 // blockSize + kernelSize / 2 50 // <--------------------------------------------------------------------------------> 51 // r0 52 // 53 // kernelSize / 2 kernelSize / 2 kernelSize / 2 kernelSize / 2 54 // <---------------> <---------------> <---------------> <---------------> 55 // r1 r2 r3 r4 56 // 57 // blockSize 58 // <--------------------------------------------------------------> 59 // r5 60 61 // The Algorithm: 62 // 63 // 1) Consume input frames into r0 (r1 is zero-initialized). 64 // 2) Position kernel centered at start of r0 (r2) and generate output frames until kernel is centered at start of r4. 65 // or we've finished generating all the output frames. 66 // 3) Copy r3 to r1 and r4 to r2. 67 // 4) Consume input frames into r5 (zero-pad if we run out of input). 68 // 5) Goto (2) until all of input is consumed. 69 // 70 // note: we're glossing over how the sub-sample handling works with m_virtualSourceIndex, etc. 71 72 namespace WebCore { 73 74 SincResampler::SincResampler(double scaleFactor, unsigned kernelSize, unsigned numberOfKernelOffsets) 75 : m_scaleFactor(scaleFactor) 76 , m_kernelSize(kernelSize) 77 , m_numberOfKernelOffsets(numberOfKernelOffsets) 78 , m_kernelStorage(m_kernelSize * (m_numberOfKernelOffsets + 1)) 79 , m_virtualSourceIndex(0) 80 , m_blockSize(512) 81 , m_inputBuffer(m_blockSize + m_kernelSize) // See input buffer layout above. 82 , m_source(0) 83 , m_sourceFramesAvailable(0) 84 , m_sourceProvider(0) 85 , m_isBufferPrimed(false) 86 { 87 initializeKernel(); 88 } 89 90 void SincResampler::initializeKernel() 91 { 92 // Blackman window parameters. 93 double alpha = 0.16; 94 double a0 = 0.5 * (1.0 - alpha); 95 double a1 = 0.5; 96 double a2 = 0.5 * alpha; 97 98 // sincScaleFactor is basically the normalized cutoff frequency of the low-pass filter. 99 double sincScaleFactor = m_scaleFactor > 1.0 ? 1.0 / m_scaleFactor : 1.0; 100 101 // The sinc function is an idealized brick-wall filter, but since we're windowing it the 102 // transition from pass to stop does not happen right away. So we should adjust the 103 // lowpass filter cutoff slightly downward to avoid some aliasing at the very high-end. 104 // FIXME: this value is empirical and to be more exact should vary depending on m_kernelSize. 105 sincScaleFactor *= 0.9; 106 107 int n = m_kernelSize; 108 int halfSize = n / 2; 109 110 // Generates a set of windowed sinc() kernels. 111 // We generate a range of sub-sample offsets from 0.0 to 1.0. 112 for (unsigned offsetIndex = 0; offsetIndex <= m_numberOfKernelOffsets; ++offsetIndex) { 113 double subsampleOffset = static_cast<double>(offsetIndex) / m_numberOfKernelOffsets; 114 115 for (int i = 0; i < n; ++i) { 116 // Compute the sinc() with offset. 117 double s = sincScaleFactor * piDouble * (i - halfSize - subsampleOffset); 118 double sinc = !s ? 1.0 : sin(s) / s; 119 sinc *= sincScaleFactor; 120 121 // Compute Blackman window, matching the offset of the sinc(). 122 double x = (i - subsampleOffset) / n; 123 double window = a0 - a1 * cos(twoPiDouble * x) + a2 * cos(twoPiDouble * 2.0 * x); 124 125 // Window the sinc() function and store at the correct offset. 126 m_kernelStorage[i + offsetIndex * m_kernelSize] = sinc * window; 127 } 128 } 129 } 130 131 void SincResampler::consumeSource(float* buffer, unsigned numberOfSourceFrames) 132 { 133 ASSERT(m_sourceProvider); 134 if (!m_sourceProvider) 135 return; 136 137 // Wrap the provided buffer by an AudioBus for use by the source provider. 138 RefPtr<AudioBus> bus = AudioBus::create(1, numberOfSourceFrames, false); 139 140 // FIXME: Find a way to make the following const-correct: 141 bus->setChannelMemory(0, buffer, numberOfSourceFrames); 142 143 m_sourceProvider->provideInput(bus.get(), numberOfSourceFrames); 144 } 145 146 namespace { 147 148 // BufferSourceProvider is an AudioSourceProvider wrapping an in-memory buffer. 149 150 class BufferSourceProvider FINAL : public AudioSourceProvider { 151 public: 152 BufferSourceProvider(const float* source, size_t numberOfSourceFrames) 153 : m_source(source) 154 , m_sourceFramesAvailable(numberOfSourceFrames) 155 { 156 } 157 158 // Consumes samples from the in-memory buffer. 159 virtual void provideInput(AudioBus* bus, size_t framesToProcess) OVERRIDE 160 { 161 ASSERT(m_source && bus); 162 if (!m_source || !bus) 163 return; 164 165 float* buffer = bus->channel(0)->mutableData(); 166 167 // Clamp to number of frames available and zero-pad. 168 size_t framesToCopy = min(m_sourceFramesAvailable, framesToProcess); 169 memcpy(buffer, m_source, sizeof(float) * framesToCopy); 170 171 // Zero-pad if necessary. 172 if (framesToCopy < framesToProcess) 173 memset(buffer + framesToCopy, 0, sizeof(float) * (framesToProcess - framesToCopy)); 174 175 m_sourceFramesAvailable -= framesToCopy; 176 m_source += framesToCopy; 177 } 178 179 private: 180 const float* m_source; 181 size_t m_sourceFramesAvailable; 182 }; 183 184 } // namespace 185 186 void SincResampler::process(const float* source, float* destination, unsigned numberOfSourceFrames) 187 { 188 // Resample an in-memory buffer using an AudioSourceProvider. 189 BufferSourceProvider sourceProvider(source, numberOfSourceFrames); 190 191 unsigned numberOfDestinationFrames = static_cast<unsigned>(numberOfSourceFrames / m_scaleFactor); 192 unsigned remaining = numberOfDestinationFrames; 193 194 while (remaining) { 195 unsigned framesThisTime = min(remaining, m_blockSize); 196 process(&sourceProvider, destination, framesThisTime); 197 198 destination += framesThisTime; 199 remaining -= framesThisTime; 200 } 201 } 202 203 void SincResampler::process(AudioSourceProvider* sourceProvider, float* destination, size_t framesToProcess) 204 { 205 bool isGood = sourceProvider && m_blockSize > m_kernelSize && m_inputBuffer.size() >= m_blockSize + m_kernelSize && !(m_kernelSize % 2); 206 ASSERT(isGood); 207 if (!isGood) 208 return; 209 210 m_sourceProvider = sourceProvider; 211 212 unsigned numberOfDestinationFrames = framesToProcess; 213 214 // Setup various region pointers in the buffer (see diagram above). 215 float* r0 = m_inputBuffer.data() + m_kernelSize / 2; 216 float* r1 = m_inputBuffer.data(); 217 float* r2 = r0; 218 float* r3 = r0 + m_blockSize - m_kernelSize / 2; 219 float* r4 = r0 + m_blockSize; 220 float* r5 = r0 + m_kernelSize / 2; 221 222 // Step (1) 223 // Prime the input buffer at the start of the input stream. 224 if (!m_isBufferPrimed) { 225 consumeSource(r0, m_blockSize + m_kernelSize / 2); 226 m_isBufferPrimed = true; 227 } 228 229 // Step (2) 230 231 while (numberOfDestinationFrames) { 232 while (m_virtualSourceIndex < m_blockSize) { 233 // m_virtualSourceIndex lies in between two kernel offsets so figure out what they are. 234 int sourceIndexI = static_cast<int>(m_virtualSourceIndex); 235 double subsampleRemainder = m_virtualSourceIndex - sourceIndexI; 236 237 double virtualOffsetIndex = subsampleRemainder * m_numberOfKernelOffsets; 238 int offsetIndex = static_cast<int>(virtualOffsetIndex); 239 240 float* k1 = m_kernelStorage.data() + offsetIndex * m_kernelSize; 241 float* k2 = k1 + m_kernelSize; 242 243 // Initialize input pointer based on quantized m_virtualSourceIndex. 244 float* inputP = r1 + sourceIndexI; 245 246 // We'll compute "convolutions" for the two kernels which straddle m_virtualSourceIndex 247 float sum1 = 0; 248 float sum2 = 0; 249 250 // Figure out how much to weight each kernel's "convolution". 251 double kernelInterpolationFactor = virtualOffsetIndex - offsetIndex; 252 253 // Generate a single output sample. 254 int n = m_kernelSize; 255 256 #define CONVOLVE_ONE_SAMPLE \ 257 input = *inputP++; \ 258 sum1 += input * *k1; \ 259 sum2 += input * *k2; \ 260 ++k1; \ 261 ++k2; 262 263 { 264 float input; 265 266 #if CPU(X86) || CPU(X86_64) 267 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed seperately. 268 while ((reinterpret_cast<uintptr_t>(inputP) & 0x0F) && n) { 269 CONVOLVE_ONE_SAMPLE 270 n--; 271 } 272 273 // Now the inputP is aligned and start to apply SSE. 274 float* endP = inputP + n - n % 4; 275 __m128 mInput; 276 __m128 mK1; 277 __m128 mK2; 278 __m128 mul1; 279 __m128 mul2; 280 281 __m128 sums1 = _mm_setzero_ps(); 282 __m128 sums2 = _mm_setzero_ps(); 283 bool k1Aligned = !(reinterpret_cast<uintptr_t>(k1) & 0x0F); 284 bool k2Aligned = !(reinterpret_cast<uintptr_t>(k2) & 0x0F); 285 286 #define LOAD_DATA(l1, l2) \ 287 mInput = _mm_load_ps(inputP); \ 288 mK1 = _mm_##l1##_ps(k1); \ 289 mK2 = _mm_##l2##_ps(k2); 290 291 #define CONVOLVE_4_SAMPLES \ 292 mul1 = _mm_mul_ps(mInput, mK1); \ 293 mul2 = _mm_mul_ps(mInput, mK2); \ 294 sums1 = _mm_add_ps(sums1, mul1); \ 295 sums2 = _mm_add_ps(sums2, mul2); \ 296 inputP += 4; \ 297 k1 += 4; \ 298 k2 += 4; 299 300 if (k1Aligned && k2Aligned) { // both aligned 301 while (inputP < endP) { 302 LOAD_DATA(load, load) 303 CONVOLVE_4_SAMPLES 304 } 305 } else if (!k1Aligned && k2Aligned) { // only k2 aligned 306 while (inputP < endP) { 307 LOAD_DATA(loadu, load) 308 CONVOLVE_4_SAMPLES 309 } 310 } else if (k1Aligned && !k2Aligned) { // only k1 aligned 311 while (inputP < endP) { 312 LOAD_DATA(load, loadu) 313 CONVOLVE_4_SAMPLES 314 } 315 } else { // both non-aligned 316 while (inputP < endP) { 317 LOAD_DATA(loadu, loadu) 318 CONVOLVE_4_SAMPLES 319 } 320 } 321 322 // Summarize the SSE results to sum1 and sum2. 323 float* groupSumP = reinterpret_cast<float*>(&sums1); 324 sum1 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; 325 groupSumP = reinterpret_cast<float*>(&sums2); 326 sum2 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; 327 328 n %= 4; 329 while (n) { 330 CONVOLVE_ONE_SAMPLE 331 n--; 332 } 333 #else 334 // FIXME: add ARM NEON optimizations for the following. The scalar code-path can probably also be optimized better. 335 336 // Optimize size 32 and size 64 kernels by unrolling the while loop. 337 // A 20 - 30% speed improvement was measured in some cases by using this approach. 338 339 if (n == 32) { 340 CONVOLVE_ONE_SAMPLE // 1 341 CONVOLVE_ONE_SAMPLE // 2 342 CONVOLVE_ONE_SAMPLE // 3 343 CONVOLVE_ONE_SAMPLE // 4 344 CONVOLVE_ONE_SAMPLE // 5 345 CONVOLVE_ONE_SAMPLE // 6 346 CONVOLVE_ONE_SAMPLE // 7 347 CONVOLVE_ONE_SAMPLE // 8 348 CONVOLVE_ONE_SAMPLE // 9 349 CONVOLVE_ONE_SAMPLE // 10 350 CONVOLVE_ONE_SAMPLE // 11 351 CONVOLVE_ONE_SAMPLE // 12 352 CONVOLVE_ONE_SAMPLE // 13 353 CONVOLVE_ONE_SAMPLE // 14 354 CONVOLVE_ONE_SAMPLE // 15 355 CONVOLVE_ONE_SAMPLE // 16 356 CONVOLVE_ONE_SAMPLE // 17 357 CONVOLVE_ONE_SAMPLE // 18 358 CONVOLVE_ONE_SAMPLE // 19 359 CONVOLVE_ONE_SAMPLE // 20 360 CONVOLVE_ONE_SAMPLE // 21 361 CONVOLVE_ONE_SAMPLE // 22 362 CONVOLVE_ONE_SAMPLE // 23 363 CONVOLVE_ONE_SAMPLE // 24 364 CONVOLVE_ONE_SAMPLE // 25 365 CONVOLVE_ONE_SAMPLE // 26 366 CONVOLVE_ONE_SAMPLE // 27 367 CONVOLVE_ONE_SAMPLE // 28 368 CONVOLVE_ONE_SAMPLE // 29 369 CONVOLVE_ONE_SAMPLE // 30 370 CONVOLVE_ONE_SAMPLE // 31 371 CONVOLVE_ONE_SAMPLE // 32 372 } else if (n == 64) { 373 CONVOLVE_ONE_SAMPLE // 1 374 CONVOLVE_ONE_SAMPLE // 2 375 CONVOLVE_ONE_SAMPLE // 3 376 CONVOLVE_ONE_SAMPLE // 4 377 CONVOLVE_ONE_SAMPLE // 5 378 CONVOLVE_ONE_SAMPLE // 6 379 CONVOLVE_ONE_SAMPLE // 7 380 CONVOLVE_ONE_SAMPLE // 8 381 CONVOLVE_ONE_SAMPLE // 9 382 CONVOLVE_ONE_SAMPLE // 10 383 CONVOLVE_ONE_SAMPLE // 11 384 CONVOLVE_ONE_SAMPLE // 12 385 CONVOLVE_ONE_SAMPLE // 13 386 CONVOLVE_ONE_SAMPLE // 14 387 CONVOLVE_ONE_SAMPLE // 15 388 CONVOLVE_ONE_SAMPLE // 16 389 CONVOLVE_ONE_SAMPLE // 17 390 CONVOLVE_ONE_SAMPLE // 18 391 CONVOLVE_ONE_SAMPLE // 19 392 CONVOLVE_ONE_SAMPLE // 20 393 CONVOLVE_ONE_SAMPLE // 21 394 CONVOLVE_ONE_SAMPLE // 22 395 CONVOLVE_ONE_SAMPLE // 23 396 CONVOLVE_ONE_SAMPLE // 24 397 CONVOLVE_ONE_SAMPLE // 25 398 CONVOLVE_ONE_SAMPLE // 26 399 CONVOLVE_ONE_SAMPLE // 27 400 CONVOLVE_ONE_SAMPLE // 28 401 CONVOLVE_ONE_SAMPLE // 29 402 CONVOLVE_ONE_SAMPLE // 30 403 CONVOLVE_ONE_SAMPLE // 31 404 CONVOLVE_ONE_SAMPLE // 32 405 CONVOLVE_ONE_SAMPLE // 33 406 CONVOLVE_ONE_SAMPLE // 34 407 CONVOLVE_ONE_SAMPLE // 35 408 CONVOLVE_ONE_SAMPLE // 36 409 CONVOLVE_ONE_SAMPLE // 37 410 CONVOLVE_ONE_SAMPLE // 38 411 CONVOLVE_ONE_SAMPLE // 39 412 CONVOLVE_ONE_SAMPLE // 40 413 CONVOLVE_ONE_SAMPLE // 41 414 CONVOLVE_ONE_SAMPLE // 42 415 CONVOLVE_ONE_SAMPLE // 43 416 CONVOLVE_ONE_SAMPLE // 44 417 CONVOLVE_ONE_SAMPLE // 45 418 CONVOLVE_ONE_SAMPLE // 46 419 CONVOLVE_ONE_SAMPLE // 47 420 CONVOLVE_ONE_SAMPLE // 48 421 CONVOLVE_ONE_SAMPLE // 49 422 CONVOLVE_ONE_SAMPLE // 50 423 CONVOLVE_ONE_SAMPLE // 51 424 CONVOLVE_ONE_SAMPLE // 52 425 CONVOLVE_ONE_SAMPLE // 53 426 CONVOLVE_ONE_SAMPLE // 54 427 CONVOLVE_ONE_SAMPLE // 55 428 CONVOLVE_ONE_SAMPLE // 56 429 CONVOLVE_ONE_SAMPLE // 57 430 CONVOLVE_ONE_SAMPLE // 58 431 CONVOLVE_ONE_SAMPLE // 59 432 CONVOLVE_ONE_SAMPLE // 60 433 CONVOLVE_ONE_SAMPLE // 61 434 CONVOLVE_ONE_SAMPLE // 62 435 CONVOLVE_ONE_SAMPLE // 63 436 CONVOLVE_ONE_SAMPLE // 64 437 } else { 438 while (n--) { 439 // Non-optimized using actual while loop. 440 CONVOLVE_ONE_SAMPLE 441 } 442 } 443 #endif 444 } 445 446 // Linearly interpolate the two "convolutions". 447 double result = (1.0 - kernelInterpolationFactor) * sum1 + kernelInterpolationFactor * sum2; 448 449 *destination++ = result; 450 451 // Advance the virtual index. 452 m_virtualSourceIndex += m_scaleFactor; 453 454 --numberOfDestinationFrames; 455 if (!numberOfDestinationFrames) 456 return; 457 } 458 459 // Wrap back around to the start. 460 m_virtualSourceIndex -= m_blockSize; 461 462 // Step (3) Copy r3 to r1 and r4 to r2. 463 // This wraps the last input frames back to the start of the buffer. 464 memcpy(r1, r3, sizeof(float) * (m_kernelSize / 2)); 465 memcpy(r2, r4, sizeof(float) * (m_kernelSize / 2)); 466 467 // Step (4) 468 // Refresh the buffer with more input. 469 consumeSource(r5, m_blockSize); 470 } 471 } 472 473 } // namespace WebCore 474 475 #endif // ENABLE(WEB_AUDIO) 476