1 /* 2 * 3 * Bluetooth low-complexity, subband codec (SBC) library 4 * 5 * Copyright (C) 2004-2009 Marcel Holtmann <marcel (at) holtmann.org> 6 * Copyright (C) 2004-2005 Henryk Ploetz <henryk (at) ploetzli.ch> 7 * Copyright (C) 2005-2006 Brad Midgley <bmidgley (at) xmission.com> 8 * 9 * 10 * This library is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU Lesser General Public 12 * License as published by the Free Software Foundation; either 13 * version 2.1 of the License, or (at your option) any later version. 14 * 15 * This library is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * Lesser General Public License for more details. 19 * 20 * You should have received a copy of the GNU Lesser General Public 21 * License along with this library; if not, write to the Free Software 22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 23 * 24 */ 25 26 #include <stdint.h> 27 #include <limits.h> 28 #include <string.h> 29 #include "sbc.h" 30 #include "sbc_math.h" 31 #include "sbc_tables.h" 32 33 #include "sbc_primitives.h" 34 #include "sbc_primitives_mmx.h" 35 #include "sbc_primitives_neon.h" 36 37 /* 38 * A reference C code of analysis filter with SIMD-friendly tables 39 * reordering and code layout. This code can be used to develop platform 40 * specific SIMD optimizations. Also it may be used as some kind of test 41 * for compiler autovectorization capabilities (who knows, if the compiler 42 * is very good at this stuff, hand optimized assembly may be not strictly 43 * needed for some platform). 44 * 45 * Note: It is also possible to make a simple variant of analysis filter, 46 * which needs only a single constants table without taking care about 47 * even/odd cases. This simple variant of filter can be implemented without 48 * input data permutation. The only thing that would be lost is the 49 * possibility to use pairwise SIMD multiplications. But for some simple 50 * CPU cores without SIMD extensions it can be useful. If anybody is 51 * interested in implementing such variant of a filter, sourcecode from 52 * bluez versions 4.26/4.27 can be used as a reference and the history of 53 * the changes in git repository done around that time may be worth checking. 54 */ 55 56 static inline void sbc_analyze_four_simd(const int16_t *in, int32_t *out, 57 const FIXED_T *consts) 58 { 59 FIXED_A t1[4]; 60 FIXED_T t2[4]; 61 int hop = 0; 62 63 /* rounding coefficient */ 64 t1[0] = t1[1] = t1[2] = t1[3] = 65 (FIXED_A) 1 << (SBC_PROTO_FIXED4_SCALE - 1); 66 67 /* low pass polyphase filter */ 68 for (hop = 0; hop < 40; hop += 8) { 69 t1[0] += (FIXED_A) in[hop] * consts[hop]; 70 t1[0] += (FIXED_A) in[hop + 1] * consts[hop + 1]; 71 t1[1] += (FIXED_A) in[hop + 2] * consts[hop + 2]; 72 t1[1] += (FIXED_A) in[hop + 3] * consts[hop + 3]; 73 t1[2] += (FIXED_A) in[hop + 4] * consts[hop + 4]; 74 t1[2] += (FIXED_A) in[hop + 5] * consts[hop + 5]; 75 t1[3] += (FIXED_A) in[hop + 6] * consts[hop + 6]; 76 t1[3] += (FIXED_A) in[hop + 7] * consts[hop + 7]; 77 } 78 79 /* scaling */ 80 t2[0] = t1[0] >> SBC_PROTO_FIXED4_SCALE; 81 t2[1] = t1[1] >> SBC_PROTO_FIXED4_SCALE; 82 t2[2] = t1[2] >> SBC_PROTO_FIXED4_SCALE; 83 t2[3] = t1[3] >> SBC_PROTO_FIXED4_SCALE; 84 85 /* do the cos transform */ 86 t1[0] = (FIXED_A) t2[0] * consts[40 + 0]; 87 t1[0] += (FIXED_A) t2[1] * consts[40 + 1]; 88 t1[1] = (FIXED_A) t2[0] * consts[40 + 2]; 89 t1[1] += (FIXED_A) t2[1] * consts[40 + 3]; 90 t1[2] = (FIXED_A) t2[0] * consts[40 + 4]; 91 t1[2] += (FIXED_A) t2[1] * consts[40 + 5]; 92 t1[3] = (FIXED_A) t2[0] * consts[40 + 6]; 93 t1[3] += (FIXED_A) t2[1] * consts[40 + 7]; 94 95 t1[0] += (FIXED_A) t2[2] * consts[40 + 8]; 96 t1[0] += (FIXED_A) t2[3] * consts[40 + 9]; 97 t1[1] += (FIXED_A) t2[2] * consts[40 + 10]; 98 t1[1] += (FIXED_A) t2[3] * consts[40 + 11]; 99 t1[2] += (FIXED_A) t2[2] * consts[40 + 12]; 100 t1[2] += (FIXED_A) t2[3] * consts[40 + 13]; 101 t1[3] += (FIXED_A) t2[2] * consts[40 + 14]; 102 t1[3] += (FIXED_A) t2[3] * consts[40 + 15]; 103 104 out[0] = t1[0] >> 105 (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS); 106 out[1] = t1[1] >> 107 (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS); 108 out[2] = t1[2] >> 109 (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS); 110 out[3] = t1[3] >> 111 (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS); 112 } 113 114 static inline void sbc_analyze_eight_simd(const int16_t *in, int32_t *out, 115 const FIXED_T *consts) 116 { 117 FIXED_A t1[8]; 118 FIXED_T t2[8]; 119 int i, hop; 120 121 /* rounding coefficient */ 122 t1[0] = t1[1] = t1[2] = t1[3] = t1[4] = t1[5] = t1[6] = t1[7] = 123 (FIXED_A) 1 << (SBC_PROTO_FIXED8_SCALE-1); 124 125 /* low pass polyphase filter */ 126 for (hop = 0; hop < 80; hop += 16) { 127 t1[0] += (FIXED_A) in[hop] * consts[hop]; 128 t1[0] += (FIXED_A) in[hop + 1] * consts[hop + 1]; 129 t1[1] += (FIXED_A) in[hop + 2] * consts[hop + 2]; 130 t1[1] += (FIXED_A) in[hop + 3] * consts[hop + 3]; 131 t1[2] += (FIXED_A) in[hop + 4] * consts[hop + 4]; 132 t1[2] += (FIXED_A) in[hop + 5] * consts[hop + 5]; 133 t1[3] += (FIXED_A) in[hop + 6] * consts[hop + 6]; 134 t1[3] += (FIXED_A) in[hop + 7] * consts[hop + 7]; 135 t1[4] += (FIXED_A) in[hop + 8] * consts[hop + 8]; 136 t1[4] += (FIXED_A) in[hop + 9] * consts[hop + 9]; 137 t1[5] += (FIXED_A) in[hop + 10] * consts[hop + 10]; 138 t1[5] += (FIXED_A) in[hop + 11] * consts[hop + 11]; 139 t1[6] += (FIXED_A) in[hop + 12] * consts[hop + 12]; 140 t1[6] += (FIXED_A) in[hop + 13] * consts[hop + 13]; 141 t1[7] += (FIXED_A) in[hop + 14] * consts[hop + 14]; 142 t1[7] += (FIXED_A) in[hop + 15] * consts[hop + 15]; 143 } 144 145 /* scaling */ 146 t2[0] = t1[0] >> SBC_PROTO_FIXED8_SCALE; 147 t2[1] = t1[1] >> SBC_PROTO_FIXED8_SCALE; 148 t2[2] = t1[2] >> SBC_PROTO_FIXED8_SCALE; 149 t2[3] = t1[3] >> SBC_PROTO_FIXED8_SCALE; 150 t2[4] = t1[4] >> SBC_PROTO_FIXED8_SCALE; 151 t2[5] = t1[5] >> SBC_PROTO_FIXED8_SCALE; 152 t2[6] = t1[6] >> SBC_PROTO_FIXED8_SCALE; 153 t2[7] = t1[7] >> SBC_PROTO_FIXED8_SCALE; 154 155 156 /* do the cos transform */ 157 t1[0] = t1[1] = t1[2] = t1[3] = t1[4] = t1[5] = t1[6] = t1[7] = 0; 158 159 for (i = 0; i < 4; i++) { 160 t1[0] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 0]; 161 t1[0] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 1]; 162 t1[1] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 2]; 163 t1[1] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 3]; 164 t1[2] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 4]; 165 t1[2] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 5]; 166 t1[3] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 6]; 167 t1[3] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 7]; 168 t1[4] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 8]; 169 t1[4] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 9]; 170 t1[5] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 10]; 171 t1[5] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 11]; 172 t1[6] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 12]; 173 t1[6] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 13]; 174 t1[7] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 14]; 175 t1[7] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 15]; 176 } 177 178 for (i = 0; i < 8; i++) 179 out[i] = t1[i] >> 180 (SBC_COS_TABLE_FIXED8_SCALE - SCALE_OUT_BITS); 181 } 182 183 static inline void sbc_analyze_4b_4s_simd(int16_t *x, 184 int32_t *out, int out_stride) 185 { 186 /* Analyze blocks */ 187 sbc_analyze_four_simd(x + 12, out, analysis_consts_fixed4_simd_odd); 188 out += out_stride; 189 sbc_analyze_four_simd(x + 8, out, analysis_consts_fixed4_simd_even); 190 out += out_stride; 191 sbc_analyze_four_simd(x + 4, out, analysis_consts_fixed4_simd_odd); 192 out += out_stride; 193 sbc_analyze_four_simd(x + 0, out, analysis_consts_fixed4_simd_even); 194 } 195 196 static inline void sbc_analyze_4b_8s_simd(int16_t *x, 197 int32_t *out, int out_stride) 198 { 199 /* Analyze blocks */ 200 sbc_analyze_eight_simd(x + 24, out, analysis_consts_fixed8_simd_odd); 201 out += out_stride; 202 sbc_analyze_eight_simd(x + 16, out, analysis_consts_fixed8_simd_even); 203 out += out_stride; 204 sbc_analyze_eight_simd(x + 8, out, analysis_consts_fixed8_simd_odd); 205 out += out_stride; 206 sbc_analyze_eight_simd(x + 0, out, analysis_consts_fixed8_simd_even); 207 } 208 209 static inline int16_t unaligned16_be(const uint8_t *ptr) 210 { 211 return (int16_t) ((ptr[0] << 8) | ptr[1]); 212 } 213 214 static inline int16_t unaligned16_le(const uint8_t *ptr) 215 { 216 return (int16_t) (ptr[0] | (ptr[1] << 8)); 217 } 218 219 /* 220 * Internal helper functions for input data processing. In order to get 221 * optimal performance, it is important to have "nsamples", "nchannels" 222 * and "big_endian" arguments used with this inline function as compile 223 * time constants. 224 */ 225 226 static SBC_ALWAYS_INLINE int sbc_encoder_process_input_s4_internal( 227 int position, 228 const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE], 229 int nsamples, int nchannels, int big_endian) 230 { 231 /* handle X buffer wraparound */ 232 if (position < nsamples) { 233 if (nchannels > 0) 234 memcpy(&X[0][SBC_X_BUFFER_SIZE - 40], &X[0][position], 235 36 * sizeof(int16_t)); 236 if (nchannels > 1) 237 memcpy(&X[1][SBC_X_BUFFER_SIZE - 40], &X[1][position], 238 36 * sizeof(int16_t)); 239 position = SBC_X_BUFFER_SIZE - 40; 240 } 241 242 #define PCM(i) (big_endian ? \ 243 unaligned16_be(pcm + (i) * 2) : unaligned16_le(pcm + (i) * 2)) 244 245 /* copy/permutate audio samples */ 246 while ((nsamples -= 8) >= 0) { 247 position -= 8; 248 if (nchannels > 0) { 249 int16_t *x = &X[0][position]; 250 x[0] = PCM(0 + 7 * nchannels); 251 x[1] = PCM(0 + 3 * nchannels); 252 x[2] = PCM(0 + 6 * nchannels); 253 x[3] = PCM(0 + 4 * nchannels); 254 x[4] = PCM(0 + 0 * nchannels); 255 x[5] = PCM(0 + 2 * nchannels); 256 x[6] = PCM(0 + 1 * nchannels); 257 x[7] = PCM(0 + 5 * nchannels); 258 } 259 if (nchannels > 1) { 260 int16_t *x = &X[1][position]; 261 x[0] = PCM(1 + 7 * nchannels); 262 x[1] = PCM(1 + 3 * nchannels); 263 x[2] = PCM(1 + 6 * nchannels); 264 x[3] = PCM(1 + 4 * nchannels); 265 x[4] = PCM(1 + 0 * nchannels); 266 x[5] = PCM(1 + 2 * nchannels); 267 x[6] = PCM(1 + 1 * nchannels); 268 x[7] = PCM(1 + 5 * nchannels); 269 } 270 pcm += 16 * nchannels; 271 } 272 #undef PCM 273 274 return position; 275 } 276 277 static SBC_ALWAYS_INLINE int sbc_encoder_process_input_s8_internal( 278 int position, 279 const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE], 280 int nsamples, int nchannels, int big_endian) 281 { 282 /* handle X buffer wraparound */ 283 if (position < nsamples) { 284 if (nchannels > 0) 285 memcpy(&X[0][SBC_X_BUFFER_SIZE - 72], &X[0][position], 286 72 * sizeof(int16_t)); 287 if (nchannels > 1) 288 memcpy(&X[1][SBC_X_BUFFER_SIZE - 72], &X[1][position], 289 72 * sizeof(int16_t)); 290 position = SBC_X_BUFFER_SIZE - 72; 291 } 292 293 #define PCM(i) (big_endian ? \ 294 unaligned16_be(pcm + (i) * 2) : unaligned16_le(pcm + (i) * 2)) 295 296 /* copy/permutate audio samples */ 297 while ((nsamples -= 16) >= 0) { 298 position -= 16; 299 if (nchannels > 0) { 300 int16_t *x = &X[0][position]; 301 x[0] = PCM(0 + 15 * nchannels); 302 x[1] = PCM(0 + 7 * nchannels); 303 x[2] = PCM(0 + 14 * nchannels); 304 x[3] = PCM(0 + 8 * nchannels); 305 x[4] = PCM(0 + 13 * nchannels); 306 x[5] = PCM(0 + 9 * nchannels); 307 x[6] = PCM(0 + 12 * nchannels); 308 x[7] = PCM(0 + 10 * nchannels); 309 x[8] = PCM(0 + 11 * nchannels); 310 x[9] = PCM(0 + 3 * nchannels); 311 x[10] = PCM(0 + 6 * nchannels); 312 x[11] = PCM(0 + 0 * nchannels); 313 x[12] = PCM(0 + 5 * nchannels); 314 x[13] = PCM(0 + 1 * nchannels); 315 x[14] = PCM(0 + 4 * nchannels); 316 x[15] = PCM(0 + 2 * nchannels); 317 } 318 if (nchannels > 1) { 319 int16_t *x = &X[1][position]; 320 x[0] = PCM(1 + 15 * nchannels); 321 x[1] = PCM(1 + 7 * nchannels); 322 x[2] = PCM(1 + 14 * nchannels); 323 x[3] = PCM(1 + 8 * nchannels); 324 x[4] = PCM(1 + 13 * nchannels); 325 x[5] = PCM(1 + 9 * nchannels); 326 x[6] = PCM(1 + 12 * nchannels); 327 x[7] = PCM(1 + 10 * nchannels); 328 x[8] = PCM(1 + 11 * nchannels); 329 x[9] = PCM(1 + 3 * nchannels); 330 x[10] = PCM(1 + 6 * nchannels); 331 x[11] = PCM(1 + 0 * nchannels); 332 x[12] = PCM(1 + 5 * nchannels); 333 x[13] = PCM(1 + 1 * nchannels); 334 x[14] = PCM(1 + 4 * nchannels); 335 x[15] = PCM(1 + 2 * nchannels); 336 } 337 pcm += 32 * nchannels; 338 } 339 #undef PCM 340 341 return position; 342 } 343 344 /* 345 * Input data processing functions. The data is endian converted if needed, 346 * channels are deintrleaved and audio samples are reordered for use in 347 * SIMD-friendly analysis filter function. The results are put into "X" 348 * array, getting appended to the previous data (or it is better to say 349 * prepended, as the buffer is filled from top to bottom). Old data is 350 * discarded when neededed, but availability of (10 * nrof_subbands) 351 * contiguous samples is always guaranteed for the input to the analysis 352 * filter. This is achieved by copying a sufficient part of old data 353 * to the top of the buffer on buffer wraparound. 354 */ 355 356 static int sbc_enc_process_input_4s_le(int position, 357 const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE], 358 int nsamples, int nchannels) 359 { 360 if (nchannels > 1) 361 return sbc_encoder_process_input_s4_internal( 362 position, pcm, X, nsamples, 2, 0); 363 else 364 return sbc_encoder_process_input_s4_internal( 365 position, pcm, X, nsamples, 1, 0); 366 } 367 368 static int sbc_enc_process_input_4s_be(int position, 369 const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE], 370 int nsamples, int nchannels) 371 { 372 if (nchannels > 1) 373 return sbc_encoder_process_input_s4_internal( 374 position, pcm, X, nsamples, 2, 1); 375 else 376 return sbc_encoder_process_input_s4_internal( 377 position, pcm, X, nsamples, 1, 1); 378 } 379 380 static int sbc_enc_process_input_8s_le(int position, 381 const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE], 382 int nsamples, int nchannels) 383 { 384 if (nchannels > 1) 385 return sbc_encoder_process_input_s8_internal( 386 position, pcm, X, nsamples, 2, 0); 387 else 388 return sbc_encoder_process_input_s8_internal( 389 position, pcm, X, nsamples, 1, 0); 390 } 391 392 static int sbc_enc_process_input_8s_be(int position, 393 const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE], 394 int nsamples, int nchannels) 395 { 396 if (nchannels > 1) 397 return sbc_encoder_process_input_s8_internal( 398 position, pcm, X, nsamples, 2, 1); 399 else 400 return sbc_encoder_process_input_s8_internal( 401 position, pcm, X, nsamples, 1, 1); 402 } 403 404 /* Supplementary function to count the number of leading zeros */ 405 406 static inline int sbc_clz(uint32_t x) 407 { 408 #ifdef __GNUC__ 409 return __builtin_clz(x); 410 #else 411 /* TODO: this should be replaced with something better if good 412 * performance is wanted when using compilers other than gcc */ 413 int cnt = 0; 414 while (x) { 415 cnt++; 416 x >>= 1; 417 } 418 return 32 - cnt; 419 #endif 420 } 421 422 static void sbc_calc_scalefactors( 423 int32_t sb_sample_f[16][2][8], 424 uint32_t scale_factor[2][8], 425 int blocks, int channels, int subbands) 426 { 427 int ch, sb, blk; 428 for (ch = 0; ch < channels; ch++) { 429 for (sb = 0; sb < subbands; sb++) { 430 uint32_t x = 1 << SCALE_OUT_BITS; 431 for (blk = 0; blk < blocks; blk++) { 432 int32_t tmp = fabs(sb_sample_f[blk][ch][sb]); 433 if (tmp != 0) 434 x |= tmp - 1; 435 } 436 scale_factor[ch][sb] = (31 - SCALE_OUT_BITS) - 437 sbc_clz(x); 438 } 439 } 440 } 441 442 /* 443 * Detect CPU features and setup function pointers 444 */ 445 void sbc_init_primitives(struct sbc_encoder_state *state) 446 { 447 /* Default implementation for analyze functions */ 448 state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_simd; 449 state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_simd; 450 451 /* Default implementation for input reordering / deinterleaving */ 452 state->sbc_enc_process_input_4s_le = sbc_enc_process_input_4s_le; 453 state->sbc_enc_process_input_4s_be = sbc_enc_process_input_4s_be; 454 state->sbc_enc_process_input_8s_le = sbc_enc_process_input_8s_le; 455 state->sbc_enc_process_input_8s_be = sbc_enc_process_input_8s_be; 456 457 /* Default implementation for scale factors calculation */ 458 state->sbc_calc_scalefactors = sbc_calc_scalefactors; 459 state->implementation_info = "Generic C"; 460 461 /* X86/AMD64 optimizations */ 462 #ifdef SBC_BUILD_WITH_MMX_SUPPORT 463 sbc_init_primitives_mmx(state); 464 #endif 465 466 /* ARM optimizations */ 467 #ifdef SBC_BUILD_WITH_NEON_SUPPORT 468 sbc_init_primitives_neon(state); 469 #endif 470 } 471