Home | History | Annotate | Download | only in base
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "media/base/vector_math.h"
      6 #include "media/base/vector_math_testing.h"
      7 
      8 #include <algorithm>
      9 
     10 #include "base/logging.h"
     11 #include "build/build_config.h"
     12 
     13 // NaCl does not allow intrinsics.
     14 #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL)
     15 #include <xmmintrin.h>
     16 #define FMAC_FUNC FMAC_SSE
     17 #define FMUL_FUNC FMUL_SSE
     18 #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_SSE
     19 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
     20 #include <arm_neon.h>
     21 #define FMAC_FUNC FMAC_NEON
     22 #define FMUL_FUNC FMUL_NEON
     23 #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_NEON
     24 #else
     25 #define FMAC_FUNC FMAC_C
     26 #define FMUL_FUNC FMUL_C
     27 #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_C
     28 #endif
     29 
     30 namespace media {
     31 namespace vector_math {
     32 
     33 void FMAC(const float src[], float scale, int len, float dest[]) {
     34   // Ensure |src| and |dest| are 16-byte aligned.
     35   DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1));
     36   DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & (kRequiredAlignment - 1));
     37   return FMAC_FUNC(src, scale, len, dest);
     38 }
     39 
     40 void FMAC_C(const float src[], float scale, int len, float dest[]) {
     41   for (int i = 0; i < len; ++i)
     42     dest[i] += src[i] * scale;
     43 }
     44 
     45 void FMUL(const float src[], float scale, int len, float dest[]) {
     46   // Ensure |src| and |dest| are 16-byte aligned.
     47   DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1));
     48   DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & (kRequiredAlignment - 1));
     49   return FMUL_FUNC(src, scale, len, dest);
     50 }
     51 
     52 void FMUL_C(const float src[], float scale, int len, float dest[]) {
     53   for (int i = 0; i < len; ++i)
     54     dest[i] = src[i] * scale;
     55 }
     56 
     57 void Crossfade(const float src[], int len, float dest[]) {
     58   float cf_ratio = 0;
     59   const float cf_increment = 1.0f / len;
     60   for (int i = 0; i < len; ++i, cf_ratio += cf_increment)
     61     dest[i] = (1.0f - cf_ratio) * src[i] + cf_ratio * dest[i];
     62 }
     63 
     64 std::pair<float, float> EWMAAndMaxPower(
     65     float initial_value, const float src[], int len, float smoothing_factor) {
     66   // Ensure |src| is 16-byte aligned.
     67   DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1));
     68   return EWMAAndMaxPower_FUNC(initial_value, src, len, smoothing_factor);
     69 }
     70 
     71 std::pair<float, float> EWMAAndMaxPower_C(
     72     float initial_value, const float src[], int len, float smoothing_factor) {
     73   std::pair<float, float> result(initial_value, 0.0f);
     74   const float weight_prev = 1.0f - smoothing_factor;
     75   for (int i = 0; i < len; ++i) {
     76     result.first *= weight_prev;
     77     const float sample = src[i];
     78     const float sample_squared = sample * sample;
     79     result.first += sample_squared * smoothing_factor;
     80     result.second = std::max(result.second, sample_squared);
     81   }
     82   return result;
     83 }
     84 
     85 #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL)
     86 void FMUL_SSE(const float src[], float scale, int len, float dest[]) {
     87   const int rem = len % 4;
     88   const int last_index = len - rem;
     89   __m128 m_scale = _mm_set_ps1(scale);
     90   for (int i = 0; i < last_index; i += 4)
     91     _mm_store_ps(dest + i, _mm_mul_ps(_mm_load_ps(src + i), m_scale));
     92 
     93   // Handle any remaining values that wouldn't fit in an SSE pass.
     94   for (int i = last_index; i < len; ++i)
     95     dest[i] = src[i] * scale;
     96 }
     97 
     98 void FMAC_SSE(const float src[], float scale, int len, float dest[]) {
     99   const int rem = len % 4;
    100   const int last_index = len - rem;
    101   __m128 m_scale = _mm_set_ps1(scale);
    102   for (int i = 0; i < last_index; i += 4) {
    103     _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i),
    104                  _mm_mul_ps(_mm_load_ps(src + i), m_scale)));
    105   }
    106 
    107   // Handle any remaining values that wouldn't fit in an SSE pass.
    108   for (int i = last_index; i < len; ++i)
    109     dest[i] += src[i] * scale;
    110 }
    111 
    112 // Convenience macro to extract float 0 through 3 from the vector |a|.  This is
    113 // needed because compilers other than clang don't support access via
    114 // operator[]().
    115 #define EXTRACT_FLOAT(a, i) \
    116     (i == 0 ? \
    117          _mm_cvtss_f32(a) : \
    118          _mm_cvtss_f32(_mm_shuffle_ps(a, a, i)))
    119 
    120 std::pair<float, float> EWMAAndMaxPower_SSE(
    121     float initial_value, const float src[], int len, float smoothing_factor) {
    122   // When the recurrence is unrolled, we see that we can split it into 4
    123   // separate lanes of evaluation:
    124   //
    125   // y[n] = a(S[n]^2) + (1-a)(y[n-1])
    126   //      = a(S[n]^2) + (1-a)^1(aS[n-1]^2) + (1-a)^2(aS[n-2]^2) + ...
    127   //      = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
    128   //
    129   // where z[n] = a(S[n]^2) + (1-a)^4(z[n-4]) + (1-a)^8(z[n-8]) + ...
    130   //
    131   // Thus, the strategy here is to compute z[n], z[n-1], z[n-2], and z[n-3] in
    132   // each of the 4 lanes, and then combine them to give y[n].
    133 
    134   const int rem = len % 4;
    135   const int last_index = len - rem;
    136 
    137   const __m128 smoothing_factor_x4 = _mm_set_ps1(smoothing_factor);
    138   const float weight_prev = 1.0f - smoothing_factor;
    139   const __m128 weight_prev_x4 = _mm_set_ps1(weight_prev);
    140   const __m128 weight_prev_squared_x4 =
    141       _mm_mul_ps(weight_prev_x4, weight_prev_x4);
    142   const __m128 weight_prev_4th_x4 =
    143       _mm_mul_ps(weight_prev_squared_x4, weight_prev_squared_x4);
    144 
    145   // Compute z[n], z[n-1], z[n-2], and z[n-3] in parallel in lanes 3, 2, 1 and
    146   // 0, respectively.
    147   __m128 max_x4 = _mm_setzero_ps();
    148   __m128 ewma_x4 = _mm_setr_ps(0.0f, 0.0f, 0.0f, initial_value);
    149   int i;
    150   for (i = 0; i < last_index; i += 4) {
    151     ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_4th_x4);
    152     const __m128 sample_x4 = _mm_load_ps(src + i);
    153     const __m128 sample_squared_x4 = _mm_mul_ps(sample_x4, sample_x4);
    154     max_x4 = _mm_max_ps(max_x4, sample_squared_x4);
    155     // Note: The compiler optimizes this to a single multiply-and-accumulate
    156     // instruction:
    157     ewma_x4 = _mm_add_ps(ewma_x4,
    158                          _mm_mul_ps(sample_squared_x4, smoothing_factor_x4));
    159   }
    160 
    161   // y[n] = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
    162   float ewma = EXTRACT_FLOAT(ewma_x4, 3);
    163   ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_x4);
    164   ewma += EXTRACT_FLOAT(ewma_x4, 2);
    165   ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_x4);
    166   ewma += EXTRACT_FLOAT(ewma_x4, 1);
    167   ewma_x4 = _mm_mul_ss(ewma_x4, weight_prev_x4);
    168   ewma += EXTRACT_FLOAT(ewma_x4, 0);
    169 
    170   // Fold the maximums together to get the overall maximum.
    171   max_x4 = _mm_max_ps(max_x4,
    172                       _mm_shuffle_ps(max_x4, max_x4, _MM_SHUFFLE(3, 3, 1, 1)));
    173   max_x4 = _mm_max_ss(max_x4, _mm_shuffle_ps(max_x4, max_x4, 2));
    174 
    175   std::pair<float, float> result(ewma, EXTRACT_FLOAT(max_x4, 0));
    176 
    177   // Handle remaining values at the end of |src|.
    178   for (; i < len; ++i) {
    179     result.first *= weight_prev;
    180     const float sample = src[i];
    181     const float sample_squared = sample * sample;
    182     result.first += sample_squared * smoothing_factor;
    183     result.second = std::max(result.second, sample_squared);
    184   }
    185 
    186   return result;
    187 }
    188 #endif
    189 
    190 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
    191 void FMAC_NEON(const float src[], float scale, int len, float dest[]) {
    192   const int rem = len % 4;
    193   const int last_index = len - rem;
    194   float32x4_t m_scale = vmovq_n_f32(scale);
    195   for (int i = 0; i < last_index; i += 4) {
    196     vst1q_f32(dest + i, vmlaq_f32(
    197         vld1q_f32(dest + i), vld1q_f32(src + i), m_scale));
    198   }
    199 
    200   // Handle any remaining values that wouldn't fit in an NEON pass.
    201   for (int i = last_index; i < len; ++i)
    202     dest[i] += src[i] * scale;
    203 }
    204 
    205 void FMUL_NEON(const float src[], float scale, int len, float dest[]) {
    206   const int rem = len % 4;
    207   const int last_index = len - rem;
    208   float32x4_t m_scale = vmovq_n_f32(scale);
    209   for (int i = 0; i < last_index; i += 4)
    210     vst1q_f32(dest + i, vmulq_f32(vld1q_f32(src + i), m_scale));
    211 
    212   // Handle any remaining values that wouldn't fit in an NEON pass.
    213   for (int i = last_index; i < len; ++i)
    214     dest[i] = src[i] * scale;
    215 }
    216 
    217 std::pair<float, float> EWMAAndMaxPower_NEON(
    218     float initial_value, const float src[], int len, float smoothing_factor) {
    219   // When the recurrence is unrolled, we see that we can split it into 4
    220   // separate lanes of evaluation:
    221   //
    222   // y[n] = a(S[n]^2) + (1-a)(y[n-1])
    223   //      = a(S[n]^2) + (1-a)^1(aS[n-1]^2) + (1-a)^2(aS[n-2]^2) + ...
    224   //      = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
    225   //
    226   // where z[n] = a(S[n]^2) + (1-a)^4(z[n-4]) + (1-a)^8(z[n-8]) + ...
    227   //
    228   // Thus, the strategy here is to compute z[n], z[n-1], z[n-2], and z[n-3] in
    229   // each of the 4 lanes, and then combine them to give y[n].
    230 
    231   const int rem = len % 4;
    232   const int last_index = len - rem;
    233 
    234   const float32x4_t smoothing_factor_x4 = vdupq_n_f32(smoothing_factor);
    235   const float weight_prev = 1.0f - smoothing_factor;
    236   const float32x4_t weight_prev_x4 = vdupq_n_f32(weight_prev);
    237   const float32x4_t weight_prev_squared_x4 =
    238       vmulq_f32(weight_prev_x4, weight_prev_x4);
    239   const float32x4_t weight_prev_4th_x4 =
    240       vmulq_f32(weight_prev_squared_x4, weight_prev_squared_x4);
    241 
    242   // Compute z[n], z[n-1], z[n-2], and z[n-3] in parallel in lanes 3, 2, 1 and
    243   // 0, respectively.
    244   float32x4_t max_x4 = vdupq_n_f32(0.0f);
    245   float32x4_t ewma_x4 = vsetq_lane_f32(initial_value, vdupq_n_f32(0.0f), 3);
    246   int i;
    247   for (i = 0; i < last_index; i += 4) {
    248     ewma_x4 = vmulq_f32(ewma_x4, weight_prev_4th_x4);
    249     const float32x4_t sample_x4 = vld1q_f32(src + i);
    250     const float32x4_t sample_squared_x4 = vmulq_f32(sample_x4, sample_x4);
    251     max_x4 = vmaxq_f32(max_x4, sample_squared_x4);
    252     ewma_x4 = vmlaq_f32(ewma_x4, sample_squared_x4, smoothing_factor_x4);
    253   }
    254 
    255   // y[n] = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
    256   float ewma = vgetq_lane_f32(ewma_x4, 3);
    257   ewma_x4 = vmulq_f32(ewma_x4, weight_prev_x4);
    258   ewma += vgetq_lane_f32(ewma_x4, 2);
    259   ewma_x4 = vmulq_f32(ewma_x4, weight_prev_x4);
    260   ewma += vgetq_lane_f32(ewma_x4, 1);
    261   ewma_x4 = vmulq_f32(ewma_x4, weight_prev_x4);
    262   ewma += vgetq_lane_f32(ewma_x4, 0);
    263 
    264   // Fold the maximums together to get the overall maximum.
    265   float32x2_t max_x2 = vpmax_f32(vget_low_f32(max_x4), vget_high_f32(max_x4));
    266   max_x2 = vpmax_f32(max_x2, max_x2);
    267 
    268   std::pair<float, float> result(ewma, vget_lane_f32(max_x2, 0));
    269 
    270   // Handle remaining values at the end of |src|.
    271   for (; i < len; ++i) {
    272     result.first *= weight_prev;
    273     const float sample = src[i];
    274     const float sample_squared = sample * sample;
    275     result.first += sample_squared * smoothing_factor;
    276     result.second = std::max(result.second, sample_squared);
    277   }
    278 
    279   return result;
    280 }
    281 #endif
    282 
    283 }  // namespace vector_math
    284 }  // namespace media
    285