// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "media/base/vector_math.h"
#include "media/base/vector_math_testing.h"

#include <algorithm>

#include "base/logging.h"
#include "build/build_config.h"

// NaCl does not allow intrinsics.
#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL)
#include <xmmintrin.h>
#define FMAC_FUNC FMAC_SSE
#define FMUL_FUNC FMUL_SSE
#define EWMAAndMaxPower_FUNC EWMAAndMaxPower_SSE
#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
#include <arm_neon.h>
#define FMAC_FUNC FMAC_NEON
#define FMUL_FUNC FMUL_NEON
#define EWMAAndMaxPower_FUNC EWMAAndMaxPower_NEON
#else
#define FMAC_FUNC FMAC_C
#define FMUL_FUNC FMUL_C
#define EWMAAndMaxPower_FUNC EWMAAndMaxPower_C
#endif

namespace media {
namespace vector_math {

void FMAC(const float src[], float scale, int len, float dest[]) {
  // Ensure |src| and |dest| are 16-byte aligned.
  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1));
  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & (kRequiredAlignment - 1));
  return FMAC_FUNC(src, scale, len, dest);
}

void FMAC_C(const float src[], float scale, int len, float dest[]) {
  for (int i = 0; i < len; ++i)
    dest[i] += src[i] * scale;
}

void FMUL(const float src[], float scale, int len, float dest[]) {
  // Ensure |src| and |dest| are 16-byte aligned.
  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1));
  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & (kRequiredAlignment - 1));
  return FMUL_FUNC(src, scale, len, dest);
}

void FMUL_C(const float src[], float scale, int len, float dest[]) {
  for (int i = 0; i < len; ++i)
    dest[i] = src[i] * scale;
}

void Crossfade(const float src[], int len, float dest[]) {
  float cf_ratio = 0;
  const float cf_increment = 1.0f / len;
  for (int i = 0; i < len; ++i, cf_ratio += cf_increment)
    dest[i] = (1.0f - cf_ratio) * src[i] + cf_ratio * dest[i];
}

std::pair<float, float> EWMAAndMaxPower(
    float initial_value, const float src[], int len, float smoothing_factor) {
  // Ensure |src| is 16-byte aligned.
  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1));
  return EWMAAndMaxPower_FUNC(initial_value, src, len, smoothing_factor);
}

std::pair<float, float> EWMAAndMaxPower_C(
    float initial_value, const float src[], int len, float smoothing_factor) {
  std::pair<float, float> result(initial_value, 0.0f);
  const float weight_prev = 1.0f - smoothing_factor;
  for (int i = 0; i < len; ++i) {
    result.first *= weight_prev;
    const float sample = src[i];
    const float sample_squared = sample * sample;
    result.first += sample_squared * smoothing_factor;
    result.second = std::max(result.second, sample_squared);
  }
  return result;
}
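
// Illustrative usage sketch (not part of the implementation). FMAC(), FMUL()
// and EWMAAndMaxPower() DCHECK that their buffers meet kRequiredAlignment, so
// callers are expected to supply 16-byte-aligned memory, e.g. allocated with
// base::AlignedAlloc() from base/memory/aligned_memory.h. |samples|, |mixed|
// and |kLen| below are hypothetical names; |samples| must be aligned as well:
//
//   float* mixed = static_cast<float*>(base::AlignedAlloc(
//       sizeof(float) * kLen, vector_math::kRequiredAlignment));
//   vector_math::FMUL(samples, 0.5f, kLen, mixed);   // mixed[i]  = samples[i] * 0.5f
//   vector_math::FMAC(samples, 0.25f, kLen, mixed);  // mixed[i] += samples[i] * 0.25f
//   std::pair<float, float> ewma_and_max =
//       vector_math::EWMAAndMaxPower(0.0f, mixed, kLen, 0.01f);
//   base::AlignedFree(mixed);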

#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL)
void FMUL_SSE(const float src[], float scale, int len, float dest[]) {
  const int rem = len % 4;
  const int last_index = len - rem;
  __m128 m_scale = _mm_set_ps1(scale);
  for (int i = 0; i < last_index; i += 4)
    _mm_store_ps(dest + i, _mm_mul_ps(_mm_load_ps(src + i), m_scale));

  // Handle any remaining values that wouldn't fit in an SSE pass.
  for (int i = last_index; i < len; ++i)
    dest[i] = src[i] * scale;
}

void FMAC_SSE(const float src[], float scale, int len, float dest[]) {
  const int rem = len % 4;
  const int last_index = len - rem;
  __m128 m_scale = _mm_set_ps1(scale);
  for (int i = 0; i < last_index; i += 4) {
    _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i),
                 _mm_mul_ps(_mm_load_ps(src + i), m_scale)));
  }

  // Handle any remaining values that wouldn't fit in an SSE pass.
  for (int i = last_index; i < len; ++i)
    dest[i] += src[i] * scale;
}

// Convenience macro to extract float 0 through 3 from the vector |a|. This is
// needed because compilers other than clang don't support access via
// operator[]().
#define EXTRACT_FLOAT(a, i) \
  (i == 0 ? \
       _mm_cvtss_f32(a) : \
       _mm_cvtss_f32(_mm_shuffle_ps(a, a, i)))

std::pair<float, float> EWMAAndMaxPower_SSE(
    float initial_value, const float src[], int len, float smoothing_factor) {
  // When the recurrence is unrolled, we see that we can split it into 4
  // separate lanes of evaluation:
  //
  // y[n] = a(S[n]^2) + (1-a)(y[n-1])
  //      = a(S[n]^2) + (1-a)^1(aS[n-1]^2) + (1-a)^2(aS[n-2]^2) + ...
  //      = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
  //
  // where z[n] = a(S[n]^2) + (1-a)^4(z[n-4]) + (1-a)^8(z[n-8]) + ...
  //
  // Thus, the strategy here is to compute z[n], z[n-1], z[n-2], and z[n-3] in
  // each of the 4 lanes, and then combine them to give y[n].
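  //
  // For example, with len == 4 and y[-1] == initial_value, a single pass of
  // the loop below leaves:
  //   lane 3: (1-a)^4(y[-1]) + a(S[3]^2)  == z[3]
  //   lane 2: a(S[2]^2)                   == z[2]
  //   lane 1: a(S[1]^2)                   == z[1]
  //   lane 0: a(S[0]^2)                   == z[0]
  // and the recombination step after the loop computes
  //   y[3] = z[3] + (1-a)^1(z[2]) + (1-a)^2(z[1]) + (1-a)^3(z[0]),
  // which matches unrolling the scalar recurrence in EWMAAndMaxPower_C.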

  const int rem = len % 4;
  const int last_index = len - rem;

  const __m128 smoothing_factor_x4 = _mm_set_ps1(smoothing_factor);
  const float weight_prev = 1.0f - smoothing_factor;
  const __m128 weight_prev_x4 = _mm_set_ps1(weight_prev);
  const __m128 weight_prev_squared_x4 =
      _mm_mul_ps(weight_prev_x4, weight_prev_x4);
  const __m128 weight_prev_4th_x4 =
      _mm_mul_ps(weight_prev_squared_x4, weight_prev_squared_x4);

  // Compute z[n], z[n-1], z[n-2], and z[n-3] in parallel in lanes 3, 2, 1 and
  // 0, respectively.
  __m128 max_x4 = _mm_setzero_ps();
  __m128 ewma_x4 = _mm_setr_ps(0.0f, 0.0f, 0.0f, initial_value);
  int i;
  for (i = 0; i < last_index; i += 4) {
    ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_4th_x4);
    const __m128 sample_x4 = _mm_load_ps(src + i);
    const __m128 sample_squared_x4 = _mm_mul_ps(sample_x4, sample_x4);
    max_x4 = _mm_max_ps(max_x4, sample_squared_x4);
    // Note: The compiler optimizes this to a single multiply-and-accumulate
    // instruction:
    ewma_x4 = _mm_add_ps(ewma_x4,
                         _mm_mul_ps(sample_squared_x4, smoothing_factor_x4));
  }

  // y[n] = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
  float ewma = EXTRACT_FLOAT(ewma_x4, 3);
  ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_x4);
  ewma += EXTRACT_FLOAT(ewma_x4, 2);
  ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_x4);
  ewma += EXTRACT_FLOAT(ewma_x4, 1);
  ewma_x4 = _mm_mul_ss(ewma_x4, weight_prev_x4);
  ewma += EXTRACT_FLOAT(ewma_x4, 0);

  // Fold the maximums together to get the overall maximum.
  max_x4 = _mm_max_ps(max_x4,
                      _mm_shuffle_ps(max_x4, max_x4, _MM_SHUFFLE(3, 3, 1, 1)));
  max_x4 = _mm_max_ss(max_x4, _mm_shuffle_ps(max_x4, max_x4, 2));

  std::pair<float, float> result(ewma, EXTRACT_FLOAT(max_x4, 0));

  // Handle remaining values at the end of |src|.
  for (; i < len; ++i) {
    result.first *= weight_prev;
    const float sample = src[i];
    const float sample_squared = sample * sample;
    result.first += sample_squared * smoothing_factor;
    result.second = std::max(result.second, sample_squared);
  }

  return result;
}
#endif

#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
void FMAC_NEON(const float src[], float scale, int len, float dest[]) {
  const int rem = len % 4;
  const int last_index = len - rem;
  float32x4_t m_scale = vmovq_n_f32(scale);
  for (int i = 0; i < last_index; i += 4) {
    vst1q_f32(dest + i, vmlaq_f32(
        vld1q_f32(dest + i), vld1q_f32(src + i), m_scale));
  }

  // Handle any remaining values that wouldn't fit in a NEON pass.
  for (int i = last_index; i < len; ++i)
    dest[i] += src[i] * scale;
}

void FMUL_NEON(const float src[], float scale, int len, float dest[]) {
  const int rem = len % 4;
  const int last_index = len - rem;
  float32x4_t m_scale = vmovq_n_f32(scale);
  for (int i = 0; i < last_index; i += 4)
    vst1q_f32(dest + i, vmulq_f32(vld1q_f32(src + i), m_scale));

  // Handle any remaining values that wouldn't fit in a NEON pass.
  for (int i = last_index; i < len; ++i)
    dest[i] = src[i] * scale;
}

std::pair<float, float> EWMAAndMaxPower_NEON(
    float initial_value, const float src[], int len, float smoothing_factor) {
  // When the recurrence is unrolled, we see that we can split it into 4
  // separate lanes of evaluation:
  //
  // y[n] = a(S[n]^2) + (1-a)(y[n-1])
  //      = a(S[n]^2) + (1-a)^1(aS[n-1]^2) + (1-a)^2(aS[n-2]^2) + ...
  //      = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
  //
  // where z[n] = a(S[n]^2) + (1-a)^4(z[n-4]) + (1-a)^8(z[n-8]) + ...
  //
  // Thus, the strategy here is to compute z[n], z[n-1], z[n-2], and z[n-3] in
  // each of the 4 lanes, and then combine them to give y[n].

  const int rem = len % 4;
  const int last_index = len - rem;

  const float32x4_t smoothing_factor_x4 = vdupq_n_f32(smoothing_factor);
  const float weight_prev = 1.0f - smoothing_factor;
  const float32x4_t weight_prev_x4 = vdupq_n_f32(weight_prev);
  const float32x4_t weight_prev_squared_x4 =
      vmulq_f32(weight_prev_x4, weight_prev_x4);
  const float32x4_t weight_prev_4th_x4 =
      vmulq_f32(weight_prev_squared_x4, weight_prev_squared_x4);

  // Compute z[n], z[n-1], z[n-2], and z[n-3] in parallel in lanes 3, 2, 1 and
  // 0, respectively.
  float32x4_t max_x4 = vdupq_n_f32(0.0f);
  float32x4_t ewma_x4 = vsetq_lane_f32(initial_value, vdupq_n_f32(0.0f), 3);
  int i;
  for (i = 0; i < last_index; i += 4) {
    ewma_x4 = vmulq_f32(ewma_x4, weight_prev_4th_x4);
    const float32x4_t sample_x4 = vld1q_f32(src + i);
    const float32x4_t sample_squared_x4 = vmulq_f32(sample_x4, sample_x4);
    max_x4 = vmaxq_f32(max_x4, sample_squared_x4);
    ewma_x4 = vmlaq_f32(ewma_x4, sample_squared_x4, smoothing_factor_x4);
  }

  // y[n] = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
  float ewma = vgetq_lane_f32(ewma_x4, 3);
  ewma_x4 = vmulq_f32(ewma_x4, weight_prev_x4);
  ewma += vgetq_lane_f32(ewma_x4, 2);
  ewma_x4 = vmulq_f32(ewma_x4, weight_prev_x4);
  ewma += vgetq_lane_f32(ewma_x4, 1);
  ewma_x4 = vmulq_f32(ewma_x4, weight_prev_x4);
  ewma += vgetq_lane_f32(ewma_x4, 0);

  // Fold the maximums together to get the overall maximum.
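  // vpmax_f32() takes pairwise maxima of adjacent lanes, so the first fold
  // yields {max(m0, m1), max(m2, m3)} and the second leaves the overall
  // maximum in lane 0.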
  float32x2_t max_x2 = vpmax_f32(vget_low_f32(max_x4), vget_high_f32(max_x4));
  max_x2 = vpmax_f32(max_x2, max_x2);

  std::pair<float, float> result(ewma, vget_lane_f32(max_x2, 0));

  // Handle remaining values at the end of |src|.
  for (; i < len; ++i) {
    result.first *= weight_prev;
    const float sample = src[i];
    const float sample_squared = sample * sample;
    result.first += sample_squared * smoothing_factor;
    result.second = std::max(result.second, sample_squared);
  }

  return result;
}
#endif

}  // namespace vector_math
}  // namespace media