Home | History | Annotate | Download | only in sbc
      1 /*
      2  *
      3  *  Bluetooth low-complexity, subband codec (SBC) library
      4  *
      5  *  Copyright (C) 2008-2010  Nokia Corporation
      6  *  Copyright (C) 2004-2010  Marcel Holtmann <marcel (at) holtmann.org>
      7  *  Copyright (C) 2004-2005  Henryk Ploetz <henryk (at) ploetzli.ch>
      8  *  Copyright (C) 2005-2006  Brad Midgley <bmidgley (at) xmission.com>
      9  *
     10  *
     11  *  This library is free software; you can redistribute it and/or
     12  *  modify it under the terms of the GNU Lesser General Public
     13  *  License as published by the Free Software Foundation; either
     14  *  version 2.1 of the License, or (at your option) any later version.
     15  *
     16  *  This library is distributed in the hope that it will be useful,
     17  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
     18  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     19  *  Lesser General Public License for more details.
     20  *
     21  *  You should have received a copy of the GNU Lesser General Public
     22  *  License along with this library; if not, write to the Free Software
     23  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
     24  *
     25  */
     26 
     27 #include <stdint.h>
     28 #include <limits.h>
     29 #include <string.h>
     30 #include "sbc.h"
     31 #include "sbc_math.h"
     32 #include "sbc_tables.h"
     33 
     34 #include "sbc_primitives.h"
     35 #include "sbc_primitives_mmx.h"
     36 #include "sbc_primitives_iwmmxt.h"
     37 #include "sbc_primitives_neon.h"
     38 #include "sbc_primitives_armv6.h"
     39 
/*
 * A reference C implementation of the analysis filter with SIMD-friendly
 * table reordering and code layout. This code can be used to develop
 * platform-specific SIMD optimizations. It may also serve as a kind of test
 * of compiler autovectorization capabilities (who knows, if the compiler
 * is very good at this stuff, hand-optimized assembly may not be strictly
 * needed for some platforms).
 *
 * Note: It is also possible to make a simple variant of the analysis filter
 * which needs only a single constants table, without taking care of the
 * even/odd cases. This simple variant of the filter can be implemented
 * without input data permutation. The only thing that would be lost is the
 * possibility to use pairwise SIMD multiplications. But for some simple
 * CPU cores without SIMD extensions it can be useful. If anybody is
 * interested in implementing such a variant of the filter, the source code
 * from bluez versions 4.26/4.27 can be used as a reference, and the history
 * of the changes made in the git repository around that time may be worth
 * checking.
 */
     58 
     59 static inline void sbc_analyze_four_simd(const int16_t *in, int32_t *out,
     60 							const FIXED_T *consts)
     61 {
     62 	FIXED_A t1[4];
     63 	FIXED_T t2[4];
     64 	int hop = 0;
     65 
     66 	/* rounding coefficient */
     67 	t1[0] = t1[1] = t1[2] = t1[3] =
     68 		(FIXED_A) 1 << (SBC_PROTO_FIXED4_SCALE - 1);
     69 
     70 	/* low pass polyphase filter */
     71 	for (hop = 0; hop < 40; hop += 8) {
     72 		t1[0] += (FIXED_A) in[hop] * consts[hop];
     73 		t1[0] += (FIXED_A) in[hop + 1] * consts[hop + 1];
     74 		t1[1] += (FIXED_A) in[hop + 2] * consts[hop + 2];
     75 		t1[1] += (FIXED_A) in[hop + 3] * consts[hop + 3];
     76 		t1[2] += (FIXED_A) in[hop + 4] * consts[hop + 4];
     77 		t1[2] += (FIXED_A) in[hop + 5] * consts[hop + 5];
     78 		t1[3] += (FIXED_A) in[hop + 6] * consts[hop + 6];
     79 		t1[3] += (FIXED_A) in[hop + 7] * consts[hop + 7];
     80 	}
     81 
     82 	/* scaling */
     83 	t2[0] = t1[0] >> SBC_PROTO_FIXED4_SCALE;
     84 	t2[1] = t1[1] >> SBC_PROTO_FIXED4_SCALE;
     85 	t2[2] = t1[2] >> SBC_PROTO_FIXED4_SCALE;
     86 	t2[3] = t1[3] >> SBC_PROTO_FIXED4_SCALE;
     87 
     88 	/* do the cos transform */
     89 	t1[0]  = (FIXED_A) t2[0] * consts[40 + 0];
     90 	t1[0] += (FIXED_A) t2[1] * consts[40 + 1];
     91 	t1[1]  = (FIXED_A) t2[0] * consts[40 + 2];
     92 	t1[1] += (FIXED_A) t2[1] * consts[40 + 3];
     93 	t1[2]  = (FIXED_A) t2[0] * consts[40 + 4];
     94 	t1[2] += (FIXED_A) t2[1] * consts[40 + 5];
     95 	t1[3]  = (FIXED_A) t2[0] * consts[40 + 6];
     96 	t1[3] += (FIXED_A) t2[1] * consts[40 + 7];
     97 
     98 	t1[0] += (FIXED_A) t2[2] * consts[40 + 8];
     99 	t1[0] += (FIXED_A) t2[3] * consts[40 + 9];
    100 	t1[1] += (FIXED_A) t2[2] * consts[40 + 10];
    101 	t1[1] += (FIXED_A) t2[3] * consts[40 + 11];
    102 	t1[2] += (FIXED_A) t2[2] * consts[40 + 12];
    103 	t1[2] += (FIXED_A) t2[3] * consts[40 + 13];
    104 	t1[3] += (FIXED_A) t2[2] * consts[40 + 14];
    105 	t1[3] += (FIXED_A) t2[3] * consts[40 + 15];
    106 
    107 	out[0] = t1[0] >>
    108 		(SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
    109 	out[1] = t1[1] >>
    110 		(SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
    111 	out[2] = t1[2] >>
    112 		(SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
    113 	out[3] = t1[3] >>
    114 		(SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
    115 }
    116 
    117 static inline void sbc_analyze_eight_simd(const int16_t *in, int32_t *out,
    118 							const FIXED_T *consts)
    119 {
    120 	FIXED_A t1[8];
    121 	FIXED_T t2[8];
    122 	int i, hop;
    123 
    124 	/* rounding coefficient */
    125 	t1[0] = t1[1] = t1[2] = t1[3] = t1[4] = t1[5] = t1[6] = t1[7] =
    126 		(FIXED_A) 1 << (SBC_PROTO_FIXED8_SCALE-1);
    127 
    128 	/* low pass polyphase filter */
    129 	for (hop = 0; hop < 80; hop += 16) {
    130 		t1[0] += (FIXED_A) in[hop] * consts[hop];
    131 		t1[0] += (FIXED_A) in[hop + 1] * consts[hop + 1];
    132 		t1[1] += (FIXED_A) in[hop + 2] * consts[hop + 2];
    133 		t1[1] += (FIXED_A) in[hop + 3] * consts[hop + 3];
    134 		t1[2] += (FIXED_A) in[hop + 4] * consts[hop + 4];
    135 		t1[2] += (FIXED_A) in[hop + 5] * consts[hop + 5];
    136 		t1[3] += (FIXED_A) in[hop + 6] * consts[hop + 6];
    137 		t1[3] += (FIXED_A) in[hop + 7] * consts[hop + 7];
    138 		t1[4] += (FIXED_A) in[hop + 8] * consts[hop + 8];
    139 		t1[4] += (FIXED_A) in[hop + 9] * consts[hop + 9];
    140 		t1[5] += (FIXED_A) in[hop + 10] * consts[hop + 10];
    141 		t1[5] += (FIXED_A) in[hop + 11] * consts[hop + 11];
    142 		t1[6] += (FIXED_A) in[hop + 12] * consts[hop + 12];
    143 		t1[6] += (FIXED_A) in[hop + 13] * consts[hop + 13];
    144 		t1[7] += (FIXED_A) in[hop + 14] * consts[hop + 14];
    145 		t1[7] += (FIXED_A) in[hop + 15] * consts[hop + 15];
    146 	}
    147 
    148 	/* scaling */
    149 	t2[0] = t1[0] >> SBC_PROTO_FIXED8_SCALE;
    150 	t2[1] = t1[1] >> SBC_PROTO_FIXED8_SCALE;
    151 	t2[2] = t1[2] >> SBC_PROTO_FIXED8_SCALE;
    152 	t2[3] = t1[3] >> SBC_PROTO_FIXED8_SCALE;
    153 	t2[4] = t1[4] >> SBC_PROTO_FIXED8_SCALE;
    154 	t2[5] = t1[5] >> SBC_PROTO_FIXED8_SCALE;
    155 	t2[6] = t1[6] >> SBC_PROTO_FIXED8_SCALE;
    156 	t2[7] = t1[7] >> SBC_PROTO_FIXED8_SCALE;
    157 
    158 
    159 	/* do the cos transform */
    160 	t1[0] = t1[1] = t1[2] = t1[3] = t1[4] = t1[5] = t1[6] = t1[7] = 0;
    161 
    162 	for (i = 0; i < 4; i++) {
    163 		t1[0] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 0];
    164 		t1[0] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 1];
    165 		t1[1] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 2];
    166 		t1[1] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 3];
    167 		t1[2] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 4];
    168 		t1[2] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 5];
    169 		t1[3] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 6];
    170 		t1[3] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 7];
    171 		t1[4] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 8];
    172 		t1[4] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 9];
    173 		t1[5] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 10];
    174 		t1[5] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 11];
    175 		t1[6] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 12];
    176 		t1[6] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 13];
    177 		t1[7] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 14];
    178 		t1[7] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 15];
    179 	}
    180 
    181 	for (i = 0; i < 8; i++)
    182 		out[i] = t1[i] >>
    183 			(SBC_COS_TABLE_FIXED8_SCALE - SCALE_OUT_BITS);
    184 }
    185 
    186 static inline void sbc_analyze_4b_4s_simd(int16_t *x,
    187 						int32_t *out, int out_stride)
    188 {
    189 	/* Analyze blocks */
    190 	sbc_analyze_four_simd(x + 12, out, analysis_consts_fixed4_simd_odd);
    191 	out += out_stride;
    192 	sbc_analyze_four_simd(x + 8, out, analysis_consts_fixed4_simd_even);
    193 	out += out_stride;
    194 	sbc_analyze_four_simd(x + 4, out, analysis_consts_fixed4_simd_odd);
    195 	out += out_stride;
    196 	sbc_analyze_four_simd(x + 0, out, analysis_consts_fixed4_simd_even);
    197 }
    198 
    199 static inline void sbc_analyze_4b_8s_simd(int16_t *x,
    200 					  int32_t *out, int out_stride)
    201 {
    202 	/* Analyze blocks */
    203 	sbc_analyze_eight_simd(x + 24, out, analysis_consts_fixed8_simd_odd);
    204 	out += out_stride;
    205 	sbc_analyze_eight_simd(x + 16, out, analysis_consts_fixed8_simd_even);
    206 	out += out_stride;
    207 	sbc_analyze_eight_simd(x + 8, out, analysis_consts_fixed8_simd_odd);
    208 	out += out_stride;
    209 	sbc_analyze_eight_simd(x + 0, out, analysis_consts_fixed8_simd_even);
    210 }
    211 
    212 static inline int16_t unaligned16_be(const uint8_t *ptr)
    213 {
    214 	return (int16_t) ((ptr[0] << 8) | ptr[1]);
    215 }
    216 
    217 static inline int16_t unaligned16_le(const uint8_t *ptr)
    218 {
    219 	return (int16_t) (ptr[0] | (ptr[1] << 8));
    220 }
    221 
    222 /*
    223  * Internal helper functions for input data processing. In order to get
    224  * optimal performance, it is important to have "nsamples", "nchannels"
    225  * and "big_endian" arguments used with this inline function as compile
    226  * time constants.
    227  */
    228 
    229 static SBC_ALWAYS_INLINE int sbc_encoder_process_input_s4_internal(
    230 	int position,
    231 	const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE],
    232 	int nsamples, int nchannels, int big_endian)
    233 {
    234 	/* handle X buffer wraparound */
    235 	if (position < nsamples) {
    236 		if (nchannels > 0)
    237 			memcpy(&X[0][SBC_X_BUFFER_SIZE - 40], &X[0][position],
    238 							36 * sizeof(int16_t));
    239 		if (nchannels > 1)
    240 			memcpy(&X[1][SBC_X_BUFFER_SIZE - 40], &X[1][position],
    241 							36 * sizeof(int16_t));
    242 		position = SBC_X_BUFFER_SIZE - 40;
    243 	}
    244 
    245 	#define PCM(i) (big_endian ? \
    246 		unaligned16_be(pcm + (i) * 2) : unaligned16_le(pcm + (i) * 2))
    247 
    248 	/* copy/permutate audio samples */
    249 	while ((nsamples -= 8) >= 0) {
    250 		position -= 8;
    251 		if (nchannels > 0) {
    252 			int16_t *x = &X[0][position];
    253 			x[0]  = PCM(0 + 7 * nchannels);
    254 			x[1]  = PCM(0 + 3 * nchannels);
    255 			x[2]  = PCM(0 + 6 * nchannels);
    256 			x[3]  = PCM(0 + 4 * nchannels);
    257 			x[4]  = PCM(0 + 0 * nchannels);
    258 			x[5]  = PCM(0 + 2 * nchannels);
    259 			x[6]  = PCM(0 + 1 * nchannels);
    260 			x[7]  = PCM(0 + 5 * nchannels);
    261 		}
    262 		if (nchannels > 1) {
    263 			int16_t *x = &X[1][position];
    264 			x[0]  = PCM(1 + 7 * nchannels);
    265 			x[1]  = PCM(1 + 3 * nchannels);
    266 			x[2]  = PCM(1 + 6 * nchannels);
    267 			x[3]  = PCM(1 + 4 * nchannels);
    268 			x[4]  = PCM(1 + 0 * nchannels);
    269 			x[5]  = PCM(1 + 2 * nchannels);
    270 			x[6]  = PCM(1 + 1 * nchannels);
    271 			x[7]  = PCM(1 + 5 * nchannels);
    272 		}
    273 		pcm += 16 * nchannels;
    274 	}
    275 	#undef PCM
    276 
    277 	return position;
    278 }
    279 
    280 static SBC_ALWAYS_INLINE int sbc_encoder_process_input_s8_internal(
    281 	int position,
    282 	const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE],
    283 	int nsamples, int nchannels, int big_endian)
    284 {
    285 	/* handle X buffer wraparound */
    286 	if (position < nsamples) {
    287 		if (nchannels > 0)
    288 			memcpy(&X[0][SBC_X_BUFFER_SIZE - 72], &X[0][position],
    289 							72 * sizeof(int16_t));
    290 		if (nchannels > 1)
    291 			memcpy(&X[1][SBC_X_BUFFER_SIZE - 72], &X[1][position],
    292 							72 * sizeof(int16_t));
    293 		position = SBC_X_BUFFER_SIZE - 72;
    294 	}
    295 
    296 	#define PCM(i) (big_endian ? \
    297 		unaligned16_be(pcm + (i) * 2) : unaligned16_le(pcm + (i) * 2))
    298 
    299 	/* copy/permutate audio samples */
    300 	while ((nsamples -= 16) >= 0) {
    301 		position -= 16;
    302 		if (nchannels > 0) {
    303 			int16_t *x = &X[0][position];
    304 			x[0]  = PCM(0 + 15 * nchannels);
    305 			x[1]  = PCM(0 + 7 * nchannels);
    306 			x[2]  = PCM(0 + 14 * nchannels);
    307 			x[3]  = PCM(0 + 8 * nchannels);
    308 			x[4]  = PCM(0 + 13 * nchannels);
    309 			x[5]  = PCM(0 + 9 * nchannels);
    310 			x[6]  = PCM(0 + 12 * nchannels);
    311 			x[7]  = PCM(0 + 10 * nchannels);
    312 			x[8]  = PCM(0 + 11 * nchannels);
    313 			x[9]  = PCM(0 + 3 * nchannels);
    314 			x[10] = PCM(0 + 6 * nchannels);
    315 			x[11] = PCM(0 + 0 * nchannels);
    316 			x[12] = PCM(0 + 5 * nchannels);
    317 			x[13] = PCM(0 + 1 * nchannels);
    318 			x[14] = PCM(0 + 4 * nchannels);
    319 			x[15] = PCM(0 + 2 * nchannels);
    320 		}
    321 		if (nchannels > 1) {
    322 			int16_t *x = &X[1][position];
    323 			x[0]  = PCM(1 + 15 * nchannels);
    324 			x[1]  = PCM(1 + 7 * nchannels);
    325 			x[2]  = PCM(1 + 14 * nchannels);
    326 			x[3]  = PCM(1 + 8 * nchannels);
    327 			x[4]  = PCM(1 + 13 * nchannels);
    328 			x[5]  = PCM(1 + 9 * nchannels);
    329 			x[6]  = PCM(1 + 12 * nchannels);
    330 			x[7]  = PCM(1 + 10 * nchannels);
    331 			x[8]  = PCM(1 + 11 * nchannels);
    332 			x[9]  = PCM(1 + 3 * nchannels);
    333 			x[10] = PCM(1 + 6 * nchannels);
    334 			x[11] = PCM(1 + 0 * nchannels);
    335 			x[12] = PCM(1 + 5 * nchannels);
    336 			x[13] = PCM(1 + 1 * nchannels);
    337 			x[14] = PCM(1 + 4 * nchannels);
    338 			x[15] = PCM(1 + 2 * nchannels);
    339 		}
    340 		pcm += 32 * nchannels;
    341 	}
    342 	#undef PCM
    343 
    344 	return position;
    345 }
    346 
/*
 * Input data processing functions. The data is endian converted if needed,
 * channels are deinterleaved and audio samples are reordered for use in
 * SIMD-friendly analysis filter function. The results are put into "X"
 * array, getting appended to the previous data (or it is better to say
 * prepended, as the buffer is filled from top to bottom). Old data is
 * discarded when needed, but availability of (10 * nrof_subbands)
 * contiguous samples is always guaranteed for the input to the analysis
 * filter. This is achieved by copying a sufficient part of old data
 * to the top of the buffer on buffer wraparound.
 */
    357  */
    358 
    359 static int sbc_enc_process_input_4s_le(int position,
    360 		const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE],
    361 		int nsamples, int nchannels)
    362 {
    363 	if (nchannels > 1)
    364 		return sbc_encoder_process_input_s4_internal(
    365 			position, pcm, X, nsamples, 2, 0);
    366 	else
    367 		return sbc_encoder_process_input_s4_internal(
    368 			position, pcm, X, nsamples, 1, 0);
    369 }
    370 
    371 static int sbc_enc_process_input_4s_be(int position,
    372 		const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE],
    373 		int nsamples, int nchannels)
    374 {
    375 	if (nchannels > 1)
    376 		return sbc_encoder_process_input_s4_internal(
    377 			position, pcm, X, nsamples, 2, 1);
    378 	else
    379 		return sbc_encoder_process_input_s4_internal(
    380 			position, pcm, X, nsamples, 1, 1);
    381 }
    382 
    383 static int sbc_enc_process_input_8s_le(int position,
    384 		const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE],
    385 		int nsamples, int nchannels)
    386 {
    387 	if (nchannels > 1)
    388 		return sbc_encoder_process_input_s8_internal(
    389 			position, pcm, X, nsamples, 2, 0);
    390 	else
    391 		return sbc_encoder_process_input_s8_internal(
    392 			position, pcm, X, nsamples, 1, 0);
    393 }
    394 
    395 static int sbc_enc_process_input_8s_be(int position,
    396 		const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE],
    397 		int nsamples, int nchannels)
    398 {
    399 	if (nchannels > 1)
    400 		return sbc_encoder_process_input_s8_internal(
    401 			position, pcm, X, nsamples, 2, 1);
    402 	else
    403 		return sbc_encoder_process_input_s8_internal(
    404 			position, pcm, X, nsamples, 1, 1);
    405 }
    406 
    407 /* Supplementary function to count the number of leading zeros */
    408 
    409 static inline int sbc_clz(uint32_t x)
    410 {
    411 #ifdef __GNUC__
    412 	return __builtin_clz(x);
    413 #else
    414 	/* TODO: this should be replaced with something better if good
    415 	 * performance is wanted when using compilers other than gcc */
    416 	int cnt = 0;
    417 	while (x) {
    418 		cnt++;
    419 		x >>= 1;
    420 	}
    421 	return 32 - cnt;
    422 #endif
    423 }
    424 
    425 static void sbc_calc_scalefactors(
    426 	int32_t sb_sample_f[16][2][8],
    427 	uint32_t scale_factor[2][8],
    428 	int blocks, int channels, int subbands)
    429 {
    430 	int ch, sb, blk;
    431 	for (ch = 0; ch < channels; ch++) {
    432 		for (sb = 0; sb < subbands; sb++) {
    433 			uint32_t x = 1 << SCALE_OUT_BITS;
    434 			for (blk = 0; blk < blocks; blk++) {
    435 				int32_t tmp = fabs(sb_sample_f[blk][ch][sb]);
    436 				if (tmp != 0)
    437 					x |= tmp - 1;
    438 			}
    439 			scale_factor[ch][sb] = (31 - SCALE_OUT_BITS) -
    440 				sbc_clz(x);
    441 		}
    442 	}
    443 }
    444 
    445 static int sbc_calc_scalefactors_j(
    446 	int32_t sb_sample_f[16][2][8],
    447 	uint32_t scale_factor[2][8],
    448 	int blocks, int subbands)
    449 {
    450 	int blk, joint = 0;
    451 	int32_t tmp0, tmp1;
    452 	uint32_t x, y;
    453 
    454 	/* last subband does not use joint stereo */
    455 	int sb = subbands - 1;
    456 	x = 1 << SCALE_OUT_BITS;
    457 	y = 1 << SCALE_OUT_BITS;
    458 	for (blk = 0; blk < blocks; blk++) {
    459 		tmp0 = fabs(sb_sample_f[blk][0][sb]);
    460 		tmp1 = fabs(sb_sample_f[blk][1][sb]);
    461 		if (tmp0 != 0)
    462 			x |= tmp0 - 1;
    463 		if (tmp1 != 0)
    464 			y |= tmp1 - 1;
    465 	}
    466 	scale_factor[0][sb] = (31 - SCALE_OUT_BITS) - sbc_clz(x);
    467 	scale_factor[1][sb] = (31 - SCALE_OUT_BITS) - sbc_clz(y);
    468 
    469 	/* the rest of subbands can use joint stereo */
    470 	while (--sb >= 0) {
    471 		int32_t sb_sample_j[16][2];
    472 		x = 1 << SCALE_OUT_BITS;
    473 		y = 1 << SCALE_OUT_BITS;
    474 		for (blk = 0; blk < blocks; blk++) {
    475 			tmp0 = sb_sample_f[blk][0][sb];
    476 			tmp1 = sb_sample_f[blk][1][sb];
    477 			sb_sample_j[blk][0] = ASR(tmp0, 1) + ASR(tmp1, 1);
    478 			sb_sample_j[blk][1] = ASR(tmp0, 1) - ASR(tmp1, 1);
    479 			tmp0 = fabs(tmp0);
    480 			tmp1 = fabs(tmp1);
    481 			if (tmp0 != 0)
    482 				x |= tmp0 - 1;
    483 			if (tmp1 != 0)
    484 				y |= tmp1 - 1;
    485 		}
    486 		scale_factor[0][sb] = (31 - SCALE_OUT_BITS) -
    487 			sbc_clz(x);
    488 		scale_factor[1][sb] = (31 - SCALE_OUT_BITS) -
    489 			sbc_clz(y);
    490 		x = 1 << SCALE_OUT_BITS;
    491 		y = 1 << SCALE_OUT_BITS;
    492 		for (blk = 0; blk < blocks; blk++) {
    493 			tmp0 = fabs(sb_sample_j[blk][0]);
    494 			tmp1 = fabs(sb_sample_j[blk][1]);
    495 			if (tmp0 != 0)
    496 				x |= tmp0 - 1;
    497 			if (tmp1 != 0)
    498 				y |= tmp1 - 1;
    499 		}
    500 		x = (31 - SCALE_OUT_BITS) - sbc_clz(x);
    501 		y = (31 - SCALE_OUT_BITS) - sbc_clz(y);
    502 
    503 		/* decide whether to use joint stereo for this subband */
    504 		if ((scale_factor[0][sb] + scale_factor[1][sb]) > x + y) {
    505 			joint |= 1 << (subbands - 1 - sb);
    506 			scale_factor[0][sb] = x;
    507 			scale_factor[1][sb] = y;
    508 			for (blk = 0; blk < blocks; blk++) {
    509 				sb_sample_f[blk][0][sb] = sb_sample_j[blk][0];
    510 				sb_sample_f[blk][1][sb] = sb_sample_j[blk][1];
    511 			}
    512 		}
    513 	}
    514 
    515 	/* bitmask with the information about subbands using joint stereo */
    516 	return joint;
    517 }
    518 
    519 /*
    520  * Detect CPU features and setup function pointers
    521  */
    522 void sbc_init_primitives(struct sbc_encoder_state *state)
    523 {
    524 	/* Default implementation for analyze functions */
    525 	state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_simd;
    526 	state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_simd;
    527 
    528 	/* Default implementation for input reordering / deinterleaving */
    529 	state->sbc_enc_process_input_4s_le = sbc_enc_process_input_4s_le;
    530 	state->sbc_enc_process_input_4s_be = sbc_enc_process_input_4s_be;
    531 	state->sbc_enc_process_input_8s_le = sbc_enc_process_input_8s_le;
    532 	state->sbc_enc_process_input_8s_be = sbc_enc_process_input_8s_be;
    533 
    534 	/* Default implementation for scale factors calculation */
    535 	state->sbc_calc_scalefactors = sbc_calc_scalefactors;
    536 	state->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j;
    537 	state->implementation_info = "Generic C";
    538 
    539 	/* X86/AMD64 optimizations */
    540 #ifdef SBC_BUILD_WITH_MMX_SUPPORT
    541 	sbc_init_primitives_mmx(state);
    542 #endif
    543 
    544 	/* ARM optimizations */
    545 #ifdef SBC_BUILD_WITH_ARMV6_SUPPORT
    546 	sbc_init_primitives_armv6(state);
    547 #endif
    548 #ifdef SBC_BUILD_WITH_IWMMXT_SUPPORT
    549 	sbc_init_primitives_iwmmxt(state);
    550 #endif
    551 #ifdef SBC_BUILD_WITH_NEON_SUPPORT
    552 	sbc_init_primitives_neon(state);
    553 #endif
    554 }
    555