1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "media/base/sinc_resampler.h" 6 7 #include <xmmintrin.h> 8 9 namespace media { 10 11 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, 12 const float* k2, 13 double kernel_interpolation_factor) { 14 __m128 m_input; 15 __m128 m_sums1 = _mm_setzero_ps(); 16 __m128 m_sums2 = _mm_setzero_ps(); 17 18 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling 19 // these loops hurt performance in local testing. 20 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { 21 for (int i = 0; i < kKernelSize; i += 4) { 22 m_input = _mm_loadu_ps(input_ptr + i); 23 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); 24 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); 25 } 26 } else { 27 for (int i = 0; i < kKernelSize; i += 4) { 28 m_input = _mm_load_ps(input_ptr + i); 29 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); 30 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); 31 } 32 } 33 34 // Linearly interpolate the two "convolutions". 35 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor)); 36 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor)); 37 m_sums1 = _mm_add_ps(m_sums1, m_sums2); 38 39 // Sum components together. 40 float result; 41 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); 42 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( 43 m_sums2, m_sums2, 1))); 44 45 return result; 46 } 47 48 } // namespace media 49