Home | History | Annotate | Download | only in sbc
      1 /*
      2  *
      3  *  Bluetooth low-complexity, subband codec (SBC) library
      4  *
      5  *  Copyright (C) 2008-2010  Nokia Corporation
      6  *  Copyright (C) 2004-2010  Marcel Holtmann <marcel (at) holtmann.org>
      7  *  Copyright (C) 2004-2005  Henryk Ploetz <henryk (at) ploetzli.ch>
      8  *  Copyright (C) 2005-2006  Brad Midgley <bmidgley (at) xmission.com>
      9  *
     10  *
     11  *  This library is free software; you can redistribute it and/or
     12  *  modify it under the terms of the GNU Lesser General Public
     13  *  License as published by the Free Software Foundation; either
     14  *  version 2.1 of the License, or (at your option) any later version.
     15  *
     16  *  This library is distributed in the hope that it will be useful,
     17  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
     18  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     19  *  Lesser General Public License for more details.
     20  *
     21  *  You should have received a copy of the GNU Lesser General Public
     22  *  License along with this library; if not, write to the Free Software
     23  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
     24  *
     25  */
     26 
     27 #include <stdint.h>
     28 #include <limits.h>
     29 #include "sbc.h"
     30 #include "sbc_math.h"
     31 #include "sbc_tables.h"
     32 
     33 #include "sbc_primitives_neon.h"
     34 
     35 /*
     36  * ARM NEON optimizations
     37  */
     38 
     39 #ifdef SBC_BUILD_WITH_NEON_SUPPORT
     40 
/*
 * One 4-subband analysis step, written in ARM NEON inline assembly.
 *
 * in     - 40 consecutive 16-bit input samples, fetched with five 8-sample
 *          loads that each carry a :64 alignment qualifier
 * out    - receives four 32-bit results through a single store that carries
 *          a :128 alignment qualifier
 * consts - coefficient table, read as 16-bit elements: 40 values for the
 *          first multiply-accumulate stage followed by 16 values for the
 *          second stage (every load carries a :128 alignment qualifier)
 *
 * Stage 1 multiplies samples by coefficients element-wise, accumulates the
 * 32-bit products in q0/q1, reduces them pairwise to four sums and narrows
 * those to 16 bits with a rounding right shift by SBC_PROTO_FIXED4_SCALE.
 * Stage 2 multiplies the narrowed values by the remaining 16 coefficients
 * and reduces pairwise into the four values written to 'out'.
 *
 * 'in' and 'consts' are advanced by the post-incrementing loads; they are
 * by-value parameters, so the caller's pointers are not affected.
 */
static inline void _sbc_analyze_four_neon(const int16_t *in, int32_t *out,
							const FIXED_T *consts)
{
	/* TODO: merge even and odd cases (or even merge all four calls to this
	 * function) in order to have only aligned reads from 'in' array
	 * and reduce number of load instructions */
	asm volatile (
		/* stage 1: samples (%0) times coefficients (%1), summed in q0/q1;
		 * loads are interleaved with the multiplies */
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmull.s16  q0, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmull.s16  q1, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"

		"vmlal.s16  q0, d6, d10\n"
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vmlal.s16  q1, d7, d11\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmlal.s16  q0, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmlal.s16  q1, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"

		"vmlal.s16  q0, d6, d10\n"
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vmlal.s16  q1, d7, d11\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmlal.s16  q0, d4, d8\n"
		"vmlal.s16  q1, d5, d9\n"

		/* pairwise-add the accumulators down to four 32-bit sums in q0 */
		"vpadd.s32  d0, d0, d1\n"
		"vpadd.s32  d1, d2, d3\n"

		/* rounding shift right by %3 and narrow to four 16-bit values */
		"vrshrn.s32 d0, q0, %3\n"

		/* stage 2: fetch the remaining 16 coefficients */
		"vld1.16    {d2, d3, d4, d5}, [%1, :128]!\n"

		/* replicate each 32-bit lane (a pair of 16-bit values) of d0 */
		"vdup.i32   d1, d0[1]\n"  /* TODO: can be eliminated */
		"vdup.i32   d0, d0[0]\n"  /* TODO: can be eliminated */

		"vmull.s16  q3, d2, d0\n"
		"vmull.s16  q4, d3, d0\n"
		"vmlal.s16  q3, d4, d1\n"
		"vmlal.s16  q4, d5, d1\n"

		/* pairwise-add into the four 32-bit outputs */
		"vpadd.s32  d0, d6, d7\n" /* TODO: can be eliminated */
		"vpadd.s32  d1, d8, d9\n" /* TODO: can be eliminated */

		"vst1.32    {d0, d1}, [%2, :128]\n"
		/* %0 = in, %1 = consts: read-write because the loads post-increment */
		: "+r" (in), "+r" (consts)
		/* %2 = out, %3 = immediate shift count */
		: "r" (out),
			"i" (SBC_PROTO_FIXED4_SCALE)
		: "memory",
			"d0", "d1", "d2", "d3", "d4", "d5",
			"d6", "d7", "d8", "d9", "d10", "d11");
}
    100 
/*
 * One 8-subband analysis step, written in ARM NEON inline assembly.
 *
 * in     - 80 consecutive 16-bit input samples, fetched with ten 8-sample
 *          loads that each carry a :64 alignment qualifier
 * out    - receives eight 32-bit results through a single store that
 *          carries a :128 alignment qualifier
 * consts - coefficient table, read as 16-bit elements: 80 values for the
 *          first multiply-accumulate stage followed by 64 values for the
 *          second stage (every load carries a :128 alignment qualifier)
 *
 * Stage 1 multiplies samples by coefficients element-wise, accumulates the
 * 32-bit products in q6..q9, reduces them pairwise to eight sums, applies a
 * rounding right shift by SBC_PROTO_FIXED8_SCALE and narrows to 16 bits.
 * Stage 2 multiplies the narrowed values by the remaining 64 coefficients
 * and reduces pairwise into the eight values written to 'out'.
 *
 * 'in' and 'consts' are advanced by the post-incrementing loads; they are
 * by-value parameters, so the caller's pointers are not affected.
 */
static inline void _sbc_analyze_eight_neon(const int16_t *in, int32_t *out,
							const FIXED_T *consts)
{
	/* TODO: merge even and odd cases (or even merge all four calls to this
	 * function) in order to have only aligned reads from 'in' array
	 * and reduce number of load instructions */
	asm volatile (
		/* stage 1: samples (%0) times coefficients (%1), summed in q6..q9;
		 * loads are interleaved with the multiplies */
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmull.s16  q6, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmull.s16  q7, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"
		"vmull.s16  q8, d6, d10\n"
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vmull.s16  q9, d7, d11\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmlal.s16  q6, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmlal.s16  q7, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"
		"vmlal.s16  q8, d6, d10\n"
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vmlal.s16  q9, d7, d11\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmlal.s16  q6, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmlal.s16  q7, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"
		"vmlal.s16  q8, d6, d10\n"
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vmlal.s16  q9, d7, d11\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmlal.s16  q6, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmlal.s16  q7, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"
		"vmlal.s16  q8, d6, d10\n"
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vmlal.s16  q9, d7, d11\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmlal.s16  q6, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmlal.s16  q7, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"

		"vmlal.s16  q8, d6, d10\n"
		"vmlal.s16  q9, d7, d11\n"

		/* pairwise-add the accumulators down to eight 32-bit sums in q0, q1 */
		"vpadd.s32  d0, d12, d13\n"
		"vpadd.s32  d1, d14, d15\n"
		"vpadd.s32  d2, d16, d17\n"
		"vpadd.s32  d3, d18, d19\n"

		/* rounding shift right by %3, then narrow to eight 16-bit values */
		"vrshr.s32 q0, q0, %3\n"
		"vrshr.s32 q1, q1, %3\n"
		"vmovn.s32 d0, q0\n"
		"vmovn.s32 d1, q1\n"

		/* replicate each 32-bit lane (a pair of 16-bit values) of d0, d1 */
		"vdup.i32   d3, d1[1]\n"  /* TODO: can be eliminated */
		"vdup.i32   d2, d1[0]\n"  /* TODO: can be eliminated */
		"vdup.i32   d1, d0[1]\n"  /* TODO: can be eliminated */
		"vdup.i32   d0, d0[0]\n"  /* TODO: can be eliminated */

		/* stage 2: multiply by the remaining 64 coefficients */
		"vld1.16    {d4, d5}, [%1, :128]!\n"
		"vmull.s16  q6, d4, d0\n"
		"vld1.16    {d6, d7}, [%1, :128]!\n"
		"vmull.s16  q7, d5, d0\n"
		"vmull.s16  q8, d6, d0\n"
		"vmull.s16  q9, d7, d0\n"

		"vld1.16    {d4, d5}, [%1, :128]!\n"
		"vmlal.s16  q6, d4, d1\n"
		"vld1.16    {d6, d7}, [%1, :128]!\n"
		"vmlal.s16  q7, d5, d1\n"
		"vmlal.s16  q8, d6, d1\n"
		"vmlal.s16  q9, d7, d1\n"

		"vld1.16    {d4, d5}, [%1, :128]!\n"
		"vmlal.s16  q6, d4, d2\n"
		"vld1.16    {d6, d7}, [%1, :128]!\n"
		"vmlal.s16  q7, d5, d2\n"
		"vmlal.s16  q8, d6, d2\n"
		"vmlal.s16  q9, d7, d2\n"

		"vld1.16    {d4, d5}, [%1, :128]!\n"
		"vmlal.s16  q6, d4, d3\n"
		"vld1.16    {d6, d7}, [%1, :128]!\n"
		"vmlal.s16  q7, d5, d3\n"
		"vmlal.s16  q8, d6, d3\n"
		"vmlal.s16  q9, d7, d3\n"

		/* pairwise-add into the eight 32-bit outputs */
		"vpadd.s32  d0, d12, d13\n" /* TODO: can be eliminated */
		"vpadd.s32  d1, d14, d15\n" /* TODO: can be eliminated */
		"vpadd.s32  d2, d16, d17\n" /* TODO: can be eliminated */
		"vpadd.s32  d3, d18, d19\n" /* TODO: can be eliminated */

		"vst1.32    {d0, d1, d2, d3}, [%2, :128]\n"
		/* %0 = in, %1 = consts: read-write because the loads post-increment */
		: "+r" (in), "+r" (consts)
		/* %2 = out, %3 = immediate shift count */
		: "r" (out),
			"i" (SBC_PROTO_FIXED8_SCALE)
		: "memory",
			"d0", "d1", "d2", "d3", "d4", "d5",
			"d6", "d7", "d8", "d9", "d10", "d11",
			"d12", "d13", "d14", "d15", "d16", "d17",
			"d18", "d19");
}
    213 
    214 static inline void sbc_analyze_4b_4s_neon(int16_t *x,
    215 						int32_t *out, int out_stride)
    216 {
    217 	/* Analyze blocks */
    218 	_sbc_analyze_four_neon(x + 12, out, analysis_consts_fixed4_simd_odd);
    219 	out += out_stride;
    220 	_sbc_analyze_four_neon(x + 8, out, analysis_consts_fixed4_simd_even);
    221 	out += out_stride;
    222 	_sbc_analyze_four_neon(x + 4, out, analysis_consts_fixed4_simd_odd);
    223 	out += out_stride;
    224 	_sbc_analyze_four_neon(x + 0, out, analysis_consts_fixed4_simd_even);
    225 }
    226 
    227 static inline void sbc_analyze_4b_8s_neon(int16_t *x,
    228 						int32_t *out, int out_stride)
    229 {
    230 	/* Analyze blocks */
    231 	_sbc_analyze_eight_neon(x + 24, out, analysis_consts_fixed8_simd_odd);
    232 	out += out_stride;
    233 	_sbc_analyze_eight_neon(x + 16, out, analysis_consts_fixed8_simd_even);
    234 	out += out_stride;
    235 	_sbc_analyze_eight_neon(x + 8, out, analysis_consts_fixed8_simd_odd);
    236 	out += out_stride;
    237 	_sbc_analyze_eight_neon(x + 0, out, analysis_consts_fixed8_simd_even);
    238 }
    239 
/*
 * Compute the scale factor of every subband of every channel (NEON version).
 *
 * sb_sample_f  - subband samples, indexed [block][channel][subband]
 * scale_factor - output, indexed [channel][subband]
 * blocks       - number of blocks to scan; the asm loop consumes four blocks
 *                per iteration and always runs at least once
 *                (NOTE(review): presumably 'blocks' is a positive multiple
 *                of 4 - confirm against callers)
 * channels     - number of channels to process
 * subbands     - number of subbands; they are handled four at a time
 *
 * For each group of four subbands the code takes the maximum of
 * (1 << SCALE_OUT_BITS) + 1 and the absolute sample values over the scanned
 * blocks, subtracts 1, counts the leading zero bits of the result and stores
 * (31 - SCALE_OUT_BITS) minus that count.
 */
static void sbc_calc_scalefactors_neon(
	int32_t sb_sample_f[16][2][8],
	uint32_t scale_factor[2][8],
	int blocks, int channels, int subbands)
{
	int ch, sb;
	for (ch = 0; ch < channels; ch++) {
		for (sb = 0; sb < subbands; sb += 4) {
			int blk = blocks;
			int32_t *in = &sb_sample_f[0][ch][sb];
			asm volatile (
				/* q0 = 0 and q1 = (1 << SCALE_OUT_BITS) + 1 are the
				 * two running maxima; q14 = 1, q15 = 31 - SCALE_OUT_BITS */
				"vmov.s32  q0, #0\n"
				"vmov.s32  q1, %[c1]\n"
				"vmov.s32  q14, #1\n"
				"vmov.s32  q15, %[c2]\n"
				"vadd.s32  q1, q1, q14\n"
			"1:\n"
				/* four blocks per iteration; %[inc] is the byte
				 * distance from one block to the next */
				"vld1.32   {d16, d17}, [%[in], :128], %[inc]\n"
				"vabs.s32  q8,  q8\n"
				"vld1.32   {d18, d19}, [%[in], :128], %[inc]\n"
				"vabs.s32  q9,  q9\n"
				"vld1.32   {d20, d21}, [%[in], :128], %[inc]\n"
				"vabs.s32  q10, q10\n"
				"vld1.32   {d22, d23}, [%[in], :128], %[inc]\n"
				"vabs.s32  q11, q11\n"
				"vmax.s32  q0,  q0,  q8\n"
				"vmax.s32  q1,  q1,  q9\n"
				"vmax.s32  q0,  q0,  q10\n"
				"vmax.s32  q1,  q1,  q11\n"
				"subs      %[blk], %[blk], #4\n"
				"bgt       1b\n"
				/* merge both maxima, then convert to a scale factor:
				 * (31 - SCALE_OUT_BITS) - clz(max - 1) */
				"vmax.s32  q0,  q0,  q1\n"
				"vsub.s32  q0,  q0,  q14\n"
				"vclz.s32  q0,  q0\n"
				"vsub.s32  q0,  q15, q0\n"
				"vst1.32   {d0, d1}, [%[out], :128]\n"
			:
			  [blk]    "+r" (blk),
			  [in]     "+r" (in)
			:
			  [inc]     "r" ((char *) &sb_sample_f[1][0][0] -
					 (char *) &sb_sample_f[0][0][0]),
			  [out]     "r" (&scale_factor[ch][sb]),
			  [c1]      "i" (1 << SCALE_OUT_BITS),
			  [c2]      "i" (31 - SCALE_OUT_BITS)
			: "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19",
			  "d20", "d21", "d22", "d23", "d24", "d25", "d26",
			  "d27", "d28", "d29", "d30", "d31", "cc", "memory");
		}
	}
}
    291 
/*
 * Compute the scale factors of a two-channel frame and decide, per subband,
 * whether joint stereo coding is used (NEON version).
 *
 * sb_sample_f  - subband samples, indexed [block][channel][subband]; in the
 *                subbands selected for joint stereo the samples are rewritten
 *                in place (channel 0 receives the halved sum, channel 1 the
 *                halved difference, with bit 0 of the channel 1 sample
 *                cleared beforehand)
 * scale_factor - output scale factors, indexed [channel][subband]
 * blocks       - number of blocks to process (NOTE(review): the sample
 *                update walks the blocks backwards in pairs and its loop
 *                body runs at least once - presumably 'blocks' is even and
 *                at least 4; confirm against callers)
 * subbands     - 4 selects the 4-subband path; any other value takes the
 *                8-subband path
 *
 * Returns the 'joint' bitmask assembled from joint_bits_mask: one bit per
 * subband for which joint stereo was selected. The last subband is never
 * selected.
 */
int sbc_calc_scalefactors_j_neon(
	int32_t sb_sample_f[16][2][8],
	uint32_t scale_factor[2][8],
	int blocks, int subbands)
{
	/* Bit contributed to 'joint' by each lane: the first four entries are
	 * consumed by the first group of subbands that is processed, the last
	 * four by the second group (8-subband path only). */
	static SBC_ALIGNED int32_t joint_bits_mask[8] = {
		8,   4,  2,  1, 128, 64, 32, 16
	};
	int joint, i;
	int32_t  *in0, *in1;
	int32_t  *in = &sb_sample_f[0][0][0];
	uint32_t *out0, *out1;
	uint32_t *out = &scale_factor[0][0];
	int32_t  *consts = joint_bits_mask;

	i = subbands;

	asm volatile (
		/*
		 * constants: q13 = (31 - SCALE_OUT_BITS), q14 = 1
		 * input:     q0  = ((1 << SCALE_OUT_BITS) + 1)
		 *            %[in0] - samples for channel 0
		 *            %[in1] - samples for channel 1
		 * output:    q0, q1 - scale factors without joint stereo
		 *            q2, q3 - scale factors with joint stereo
		 *            q15    - joint stereo selection mask
		 */
		".macro calc_scalefactors\n"
			"vmov.s32  q1, q0\n"
			"vmov.s32  q2, q0\n"
			"vmov.s32  q3, q0\n"
			"mov       %[i], %[blocks]\n"
		"1:\n"
			/* one block per iteration: track the maxima of |ch0|, |ch1|,
			 * |halved sum| and |halved difference| */
			"vld1.32   {d18, d19}, [%[in1], :128], %[inc]\n"
			"vbic.s32  q11, q9,  q14\n"
			"vld1.32   {d16, d17}, [%[in0], :128], %[inc]\n"
			"vhadd.s32 q10, q8,  q11\n"
			"vhsub.s32 q11, q8,  q11\n"
			"vabs.s32  q8,  q8\n"
			"vabs.s32  q9,  q9\n"
			"vabs.s32  q10, q10\n"
			"vabs.s32  q11, q11\n"
			"vmax.s32  q0,  q0,  q8\n"
			"vmax.s32  q1,  q1,  q9\n"
			"vmax.s32  q2,  q2,  q10\n"
			"vmax.s32  q3,  q3,  q11\n"
			"subs      %[i], %[i], #1\n"
			"bgt       1b\n"
			/* scale factor = (31 - SCALE_OUT_BITS) - clz(max - 1) */
			"vsub.s32  q0,  q0,  q14\n"
			"vsub.s32  q1,  q1,  q14\n"
			"vsub.s32  q2,  q2,  q14\n"
			"vsub.s32  q3,  q3,  q14\n"
			"vclz.s32  q0,  q0\n"
			"vclz.s32  q1,  q1\n"
			"vclz.s32  q2,  q2\n"
			"vclz.s32  q3,  q3\n"
			"vsub.s32  q0,  q13, q0\n"
			"vsub.s32  q1,  q13, q1\n"
			"vsub.s32  q2,  q13, q2\n"
			"vsub.s32  q3,  q13, q3\n"
		".endm\n"
		/*
		 * constants: q14 = 1
		 * input: q15    - joint stereo selection mask
		 *        %[in0] - value set by calc_scalefactors macro
		 *        %[in1] - value set by calc_scalefactors macro
		 */
		/*
		 * NOTE(review): %[inc] is declared as an input-only operand, yet
		 * this macro replaces it with -2 * inc for the backwards walk and
		 * restores the original value just before it ends.
		 */
		".macro update_joint_stereo_samples\n"
			"sub       %[out1], %[in1], %[inc]\n"
			"sub       %[out0], %[in0], %[inc]\n"
			"sub       %[in1], %[in1], %[inc], asl #1\n"
			"sub       %[in0], %[in0], %[inc], asl #1\n"
			"vld1.32   {d18, d19}, [%[in1], :128]\n"
			"vbic.s32  q11, q9,  q14\n"
			"vld1.32   {d16, d17}, [%[in0], :128]\n"
			"vld1.32   {d2, d3}, [%[out1], :128]\n"
			"vbic.s32  q3,  q1,  q14\n"
			"vld1.32   {d0, d1}, [%[out0], :128]\n"
			"vhsub.s32 q10, q8,  q11\n"
			"vhadd.s32 q11, q8,  q11\n"
			"vhsub.s32 q2,  q0,  q3\n"
			"vhadd.s32 q3,  q0,  q3\n"
			"vbif.s32  q10, q9,  q15\n"
			"vbif.s32  d22, d16, d30\n"
			"sub       %[inc], %[zero], %[inc], asl #1\n"
			"sub       %[i], %[blocks], #2\n"
		"2:\n"
			/* store the pair of blocks just computed, step back two
			 * blocks and start on the next pair */
			"vbif.s32  d23, d17, d31\n"
			"vst1.32   {d20, d21}, [%[in1], :128], %[inc]\n"
			"vbif.s32  d4,  d2,  d30\n"
			"vld1.32   {d18, d19}, [%[in1], :128]\n"
			"vbif.s32  d5,  d3,  d31\n"
			"vst1.32   {d22, d23}, [%[in0], :128], %[inc]\n"
			"vbif.s32  d6,  d0,  d30\n"
			"vld1.32   {d16, d17}, [%[in0], :128]\n"
			"vbif.s32  d7,  d1,  d31\n"
			"vst1.32   {d4, d5}, [%[out1], :128], %[inc]\n"
			"vbic.s32  q11, q9,  q14\n"
			"vld1.32   {d2, d3}, [%[out1], :128]\n"
			"vst1.32   {d6, d7}, [%[out0], :128], %[inc]\n"
			"vbic.s32  q3,  q1,  q14\n"
			"vld1.32   {d0, d1}, [%[out0], :128]\n"
			"vhsub.s32 q10, q8,  q11\n"
			"vhadd.s32 q11, q8,  q11\n"
			"vhsub.s32 q2,  q0,  q3\n"
			"vhadd.s32 q3,  q0,  q3\n"
			"vbif.s32  q10, q9,  q15\n"
			"vbif.s32  d22, d16, d30\n"
			"subs      %[i], %[i], #2\n"
			"bgt       2b\n"
			/* restore %[inc] and flush the final pair of blocks */
			"sub       %[inc], %[zero], %[inc], asr #1\n"
			"vbif.s32  d23, d17, d31\n"
			"vst1.32   {d20, d21}, [%[in1], :128]\n"
			"vbif.s32  q2,  q1,  q15\n"
			"vst1.32   {d22, d23}, [%[in0], :128]\n"
			"vbif.s32  q3,  q0,  q15\n"
			"vst1.32   {d4, d5}, [%[out1], :128]\n"
			"vst1.32   {d6, d7}, [%[out0], :128]\n"
		".endm\n"

		"vmov.s32  q14, #1\n"
		"vmov.s32  q13, %[c2]\n"

		/* %[i] still holds 'subbands' at this point */
		"cmp   %[i], #4\n"
		"bne   8f\n"

	"4:\n" /* 4 subbands */
		/* byte offset 32 = 8 elements = start of channel 1 */
		"add   %[in0], %[in], #0\n"
		"add   %[in1], %[in], #32\n"
		"add   %[out0], %[out], #0\n"
		"add   %[out1], %[out], #32\n"
		"vmov.s32  q0, %[c1]\n"
		"vadd.s32  q0, q0, q14\n"

		"calc_scalefactors\n"

		/* check whether to use joint stereo for subbands 0, 1, 2 */
		"vadd.s32  q15, q0,  q1\n"
		"vadd.s32  q9,  q2,  q3\n"
		"vmov.s32  d31[1], %[zero]\n" /* last subband -> no joint */
		"vld1.32   {d16, d17}, [%[consts], :128]!\n"
		"vcgt.s32  q15, q15, q9\n"

		/* calculate and save to memory 'joint' variable */
		/* update and save scale factors to memory */
		"  vand.s32  q8, q8, q15\n"
		"vbit.s32  q0,  q2,  q15\n"
		"  vpadd.s32 d16, d16, d17\n"
		"vbit.s32  q1,  q3,  q15\n"
		"  vpadd.s32 d16, d16, d16\n"
		"vst1.32   {d0, d1}, [%[out0], :128]\n"
		"vst1.32   {d2, d3}, [%[out1], :128]\n"
		"  vst1.32   {d16[0]}, [%[joint]]\n"

		"update_joint_stereo_samples\n"
		"b     9f\n"

	"8:\n" /* 8 subbands */
		/* first pass: byte offset 16 = 4 elements, i.e. subbands 4..7 */
		"add   %[in0], %[in], #16\n\n"
		"add   %[in1], %[in], #48\n"
		"add   %[out0], %[out], #16\n\n"
		"add   %[out1], %[out], #48\n"
		"vmov.s32  q0, %[c1]\n"
		"vadd.s32  q0, q0, q14\n"

		"calc_scalefactors\n"

		/* check whether to use joint stereo for subbands 4, 5, 6 */
		"vadd.s32  q15, q0,  q1\n"
		"vadd.s32  q9,  q2,  q3\n"
		"vmov.s32  d31[1], %[zero]\n"  /* last subband -> no joint */
		"vld1.32   {d16, d17}, [%[consts], :128]!\n"
		"vcgt.s32  q15, q15, q9\n"

		/* calculate part of 'joint' variable and save it to d24 */
		/* update and save scale factors to memory */
		"  vand.s32  q8, q8, q15\n"
		"vbit.s32  q0,  q2,  q15\n"
		"  vpadd.s32 d16, d16, d17\n"
		"vbit.s32  q1,  q3,  q15\n"
		"vst1.32   {d0, d1}, [%[out0], :128]\n"
		"vst1.32   {d2, d3}, [%[out1], :128]\n"
		"  vpadd.s32 d24, d16, d16\n"

		"update_joint_stereo_samples\n"

		/* second pass: subbands 0..3 */
		"add   %[in0], %[in], #0\n"
		"add   %[in1], %[in], #32\n"
		"add   %[out0], %[out], #0\n\n"
		"add   %[out1], %[out], #32\n"
		"vmov.s32  q0, %[c1]\n"
		"vadd.s32  q0, q0, q14\n"

		"calc_scalefactors\n"

		/* check whether to use joint stereo for subbands 0, 1, 2, 3 */
		"vadd.s32  q15, q0,  q1\n"
		"vadd.s32  q9,  q2,  q3\n"
		"vld1.32   {d16, d17}, [%[consts], :128]!\n"
		"vcgt.s32  q15, q15, q9\n"

		/* combine last part of 'joint' with d24 and save to memory */
		/* update and save scale factors to memory */
		"  vand.s32  q8, q8, q15\n"
		"vbit.s32  q0,  q2,  q15\n"
		"  vpadd.s32 d16, d16, d17\n"
		"vbit.s32  q1,  q3,  q15\n"
		"  vpadd.s32 d16, d16, d16\n"
		"vst1.32   {d0, d1}, [%[out0], :128]\n"
		"  vadd.s32  d16, d16, d24\n"
		"vst1.32   {d2, d3}, [%[out1], :128]\n"
		"  vst1.32   {d16[0]}, [%[joint]]\n"

		"update_joint_stereo_samples\n"
	"9:\n"
		/* drop the assembler macros again once this asm block is done */
		".purgem calc_scalefactors\n"
		".purgem update_joint_stereo_samples\n"
		:
		  [i]      "+&r" (i),
		  [in]     "+&r" (in),
		  [in0]    "=&r" (in0),
		  [in1]    "=&r" (in1),
		  [out]    "+&r" (out),
		  [out0]   "=&r" (out0),
		  [out1]   "=&r" (out1),
		  [consts] "+&r" (consts)
		:
		  /* byte distance from one block of samples to the next */
		  [inc]      "r" ((char *) &sb_sample_f[1][0][0] -
				 (char *) &sb_sample_f[0][0][0]),
		  [blocks]   "r" (blocks),
		  /* 'joint' is written through this pointer by the asm code */
		  [joint]    "r" (&joint),
		  [c1]       "i" (1 << SCALE_OUT_BITS),
		  [c2]       "i" (31 - SCALE_OUT_BITS),
		  [zero]     "r" (0)
		: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
		  "d16", "d17", "d18", "d19", "d20", "d21", "d22",
		  "d23", "d24", "d25", "d26", "d27", "d28", "d29",
		  "d30", "d31", "cc", "memory");

	return joint;
}
    533 
    534 #define PERM_BE(a, b, c, d) {             \
    535 		(a * 2) + 1, (a * 2) + 0, \
    536 		(b * 2) + 1, (b * 2) + 0, \
    537 		(c * 2) + 1, (c * 2) + 0, \
    538 		(d * 2) + 1, (d * 2) + 0  \
    539 	}
    540 #define PERM_LE(a, b, c, d) {             \
    541 		(a * 2) + 0, (a * 2) + 1, \
    542 		(b * 2) + 0, (b * 2) + 1, \
    543 		(c * 2) + 0, (c * 2) + 1, \
    544 		(d * 2) + 0, (d * 2) + 1  \
    545 	}
    546 
/*
 * Move PCM input into the X buffer(s) for the 4-subband encoder (NEON).
 *
 * position   - current index into X[ch][]; new samples are stored below it
 * pcm        - raw PCM bytes; with two channels the 16-bit samples of both
 *              channels are interleaved
 * X          - per-channel sample buffers
 * nsamples   - samples per channel to consume; every loop iteration handles
 *              8 samples per channel and the loop body runs at least once
 * nchannels  - values greater than 1 take the two-channel paths, anything
 *              else takes the single-channel path
 * big_endian - non-zero selects the perm_be table, zero selects perm_le
 *
 * Returns the updated position: the start position (after the optional
 * wraparound) reduced by 8 for every loop iteration.
 */
static SBC_ALWAYS_INLINE int sbc_enc_process_input_4s_neon_internal(
	int position,
	const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE],
	int nsamples, int nchannels, int big_endian)
{
	/* vtbl.8 byte-index tables: they reorder the 8 samples of one
	 * iteration; perm_be additionally swaps the two bytes of each sample */
	static SBC_ALIGNED uint8_t perm_be[2][8] = {
		PERM_BE(7, 3, 6, 4),
		PERM_BE(0, 2, 1, 5)
	};
	static SBC_ALIGNED uint8_t perm_le[2][8] = {
		PERM_LE(7, 3, 6, 4),
		PERM_LE(0, 2, 1, 5)
	};
	/* handle X buffer wraparound */
	if (position < nsamples) {
		/* copy 36 samples (16 + 16 + 4) starting at 'position' to
		 * index SBC_X_BUFFER_SIZE - 40 and continue from there */
		int16_t *dst = &X[0][SBC_X_BUFFER_SIZE - 40];
		int16_t *src = &X[0][position];
		asm volatile (
			"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
			"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
			"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
			"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
			"vld1.16 {d0}, [%[src], :64]!\n"
			"vst1.16 {d0}, [%[dst], :64]!\n"
			:
			  [dst] "+r" (dst),
			  [src] "+r" (src)
			: : "memory", "d0", "d1", "d2", "d3");
		if (nchannels > 1) {
			/* same copy for the second channel */
			dst = &X[1][SBC_X_BUFFER_SIZE - 40];
			src = &X[1][position];
			asm volatile (
				"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
				"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
				"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
				"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
				"vld1.16 {d0}, [%[src], :64]!\n"
				"vst1.16 {d0}, [%[dst], :64]!\n"
				:
				  [dst] "+r" (dst),
				  [src] "+r" (src)
				: : "memory", "d0", "d1", "d2", "d3");
		}
		position = SBC_X_BUFFER_SIZE - 40;
	}

	if ((nchannels > 1) && ((uintptr_t)pcm & 1)) {
		/* poor 'pcm' alignment */
		/* odd address: load plain bytes, then split the channels with
		 * vuzp.16 */
		int16_t *x = &X[0][position];
		int16_t *y = &X[1][position];
		asm volatile (
			"vld1.8  {d0, d1}, [%[perm], :128]\n"
		"1:\n"
			/* step both destinations back by 16 bytes (8 samples) */
			"sub     %[x], %[x], #16\n"
			"sub     %[y], %[y], #16\n"
			"sub     %[position], %[position], #8\n"
			"vld1.8  {d4, d5}, [%[pcm]]!\n"
			"vuzp.16 d4,  d5\n"
			"vld1.8  {d20, d21}, [%[pcm]]!\n"
			"vuzp.16 d20, d21\n"
			/* afterwards d4, d5 hold 8 samples of the first channel
			 * and d20, d21 hold 8 samples of the second */
			"vswp    d5,  d20\n"
			"vtbl.8  d16, {d4, d5}, d0\n"
			"vtbl.8  d17, {d4, d5}, d1\n"
			"vtbl.8  d18, {d20, d21}, d0\n"
			"vtbl.8  d19, {d20, d21}, d1\n"
			"vst1.16 {d16, d17}, [%[x], :128]\n"
			"vst1.16 {d18, d19}, [%[y], :128]\n"
			"subs    %[nsamples], %[nsamples], #8\n"
			"bgt     1b\n"
			:
			  [x]        "+r" (x),
			  [y]        "+r" (y),
			  [pcm]      "+r" (pcm),
			  [nsamples] "+r" (nsamples),
			  [position] "+r" (position)
			:
			  [perm]      "r" (big_endian ? perm_be : perm_le)
			: "cc", "memory", "d0", "d1", "d2", "d3", "d4",
			  "d5", "d6", "d7", "d16", "d17", "d18", "d19",
			  "d20", "d21", "d22", "d23");
	} else if (nchannels > 1) {
		/* proper 'pcm' alignment */
		/* even address: vld2.16 splits the channels while loading */
		int16_t *x = &X[0][position];
		int16_t *y = &X[1][position];
		asm volatile (
			"vld1.8  {d0, d1}, [%[perm], :128]\n"
		"1:\n"
			"sub     %[x], %[x], #16\n"
			"sub     %[y], %[y], #16\n"
			"sub     %[position], %[position], #8\n"
			"vld2.16 {d4, d5}, [%[pcm]]!\n"
			"vld2.16 {d20, d21}, [%[pcm]]!\n"
			"vswp    d5, d20\n"
			"vtbl.8  d16, {d4, d5}, d0\n"
			"vtbl.8  d17, {d4, d5}, d1\n"
			"vtbl.8  d18, {d20, d21}, d0\n"
			"vtbl.8  d19, {d20, d21}, d1\n"
			"vst1.16 {d16, d17}, [%[x], :128]\n"
			"vst1.16 {d18, d19}, [%[y], :128]\n"
			"subs    %[nsamples], %[nsamples], #8\n"
			"bgt     1b\n"
			:
			  [x]        "+r" (x),
			  [y]        "+r" (y),
			  [pcm]      "+r" (pcm),
			  [nsamples] "+r" (nsamples),
			  [position] "+r" (position)
			:
			  [perm]      "r" (big_endian ? perm_be : perm_le)
			: "cc", "memory", "d0", "d1", "d2", "d3", "d4",
			  "d5", "d6", "d7", "d16", "d17", "d18", "d19",
			  "d20", "d21", "d22", "d23");
	} else {
		/* single channel: 16 bytes (8 samples) per iteration, no
		 * channel split needed */
		int16_t *x = &X[0][position];
		asm volatile (
			"vld1.8  {d0, d1}, [%[perm], :128]\n"
		"1:\n"
			"sub     %[x], %[x], #16\n"
			"sub     %[position], %[position], #8\n"
			"vld1.8  {d4, d5}, [%[pcm]]!\n"
			"vtbl.8  d16, {d4, d5}, d0\n"
			"vtbl.8  d17, {d4, d5}, d1\n"
			"vst1.16 {d16, d17}, [%[x], :128]\n"
			"subs    %[nsamples], %[nsamples], #8\n"
			"bgt     1b\n"
			:
			  [x]        "+r" (x),
			  [pcm]      "+r" (pcm),
			  [nsamples] "+r" (nsamples),
			  [position] "+r" (position)
			:
			  [perm]      "r" (big_endian ? perm_be : perm_le)
			: "cc", "memory", "d0", "d1", "d2", "d3", "d4",
			  "d5", "d6", "d7", "d16", "d17", "d18", "d19");
	}
	return position;
}
    684 
/*
 * Move PCM input into the X buffer(s) for the 8-subband encoder (NEON).
 *
 * position   - current index into X[ch][]; new samples are stored below it
 * pcm        - raw PCM bytes; with two channels the 16-bit samples of both
 *              channels are interleaved
 * X          - per-channel sample buffers
 * nsamples   - samples per channel to consume; every loop iteration handles
 *              16 samples per channel and the loop body runs at least once
 * nchannels  - values greater than 1 take the two-channel paths, anything
 *              else takes the single-channel path
 * big_endian - non-zero selects the perm_be table, zero selects perm_le
 *
 * Returns the updated position: the start position (after the optional
 * wraparound) reduced by 16 for every loop iteration.
 */
static SBC_ALWAYS_INLINE int sbc_enc_process_input_8s_neon_internal(
	int position,
	const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE],
	int nsamples, int nchannels, int big_endian)
{
	/* vtbl.8 byte-index tables: they reorder the 16 samples of one
	 * iteration; perm_be additionally swaps the two bytes of each sample */
	static SBC_ALIGNED uint8_t perm_be[4][8] = {
		PERM_BE(15, 7, 14, 8),
		PERM_BE(13, 9, 12, 10),
		PERM_BE(11, 3, 6,  0),
		PERM_BE(5,  1, 4,  2)
	};
	static SBC_ALIGNED uint8_t perm_le[4][8] = {
		PERM_LE(15, 7, 14, 8),
		PERM_LE(13, 9, 12, 10),
		PERM_LE(11, 3, 6,  0),
		PERM_LE(5,  1, 4,  2)
	};
	/* handle X buffer wraparound */
	if (position < nsamples) {
		/* copy 72 samples (4 * 16 + 8) starting at 'position' to
		 * index SBC_X_BUFFER_SIZE - 72 and continue from there */
		int16_t *dst = &X[0][SBC_X_BUFFER_SIZE - 72];
		int16_t *src = &X[0][position];
		asm volatile (
			"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
			"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
			"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
			"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
			"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
			"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
			"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
			"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
			"vld1.16 {d0, d1}, [%[src], :128]!\n"
			"vst1.16 {d0, d1}, [%[dst], :128]!\n"
			:
			  [dst] "+r" (dst),
			  [src] "+r" (src)
			: : "memory", "d0", "d1", "d2", "d3");
		if (nchannels > 1) {
			/* same copy for the second channel */
			dst = &X[1][SBC_X_BUFFER_SIZE - 72];
			src = &X[1][position];
			asm volatile (
				"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
				"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
				"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
				"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
				"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
				"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
				"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
				"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
				"vld1.16 {d0, d1}, [%[src], :128]!\n"
				"vst1.16 {d0, d1}, [%[dst], :128]!\n"
				:
				  [dst] "+r" (dst),
				  [src] "+r" (src)
				: : "memory", "d0", "d1", "d2", "d3");
		}
		position = SBC_X_BUFFER_SIZE - 72;
	}

	if ((nchannels > 1) && ((uintptr_t)pcm & 1)) {
		/* poor 'pcm' alignment */
		/* odd address: load plain bytes, then split the channels with
		 * vuzp.16 */
		int16_t *x = &X[0][position];
		int16_t *y = &X[1][position];
		asm volatile (
			"vld1.8  {d0, d1, d2, d3}, [%[perm], :128]\n"
		"1:\n"
			/* step both destinations back by 32 bytes (16 samples) */
			"sub     %[x], %[x], #32\n"
			"sub     %[y], %[y], #32\n"
			"sub     %[position], %[position], #16\n"
			"vld1.8  {d4, d5, d6, d7}, [%[pcm]]!\n"
			"vuzp.16 q2,  q3\n"
			"vld1.8  {d20, d21, d22, d23}, [%[pcm]]!\n"
			"vuzp.16 q10, q11\n"
			/* afterwards q2, q3 hold 16 samples of the first channel
			 * and q10, q11 hold 16 samples of the second */
			"vswp    q3,  q10\n"
			"vtbl.8  d16, {d4, d5, d6, d7}, d0\n"
			"vtbl.8  d17, {d4, d5, d6, d7}, d1\n"
			"vtbl.8  d18, {d4, d5, d6, d7}, d2\n"
			"vtbl.8  d19, {d4, d5, d6, d7}, d3\n"
			"vst1.16 {d16, d17, d18, d19}, [%[x], :128]\n"
			"vtbl.8  d16, {d20, d21, d22, d23}, d0\n"
			"vtbl.8  d17, {d20, d21, d22, d23}, d1\n"
			"vtbl.8  d18, {d20, d21, d22, d23}, d2\n"
			"vtbl.8  d19, {d20, d21, d22, d23}, d3\n"
			"vst1.16 {d16, d17, d18, d19}, [%[y], :128]\n"
			"subs    %[nsamples], %[nsamples], #16\n"
			"bgt     1b\n"
			:
			  [x]        "+r" (x),
			  [y]        "+r" (y),
			  [pcm]      "+r" (pcm),
			  [nsamples] "+r" (nsamples),
			  [position] "+r" (position)
			:
			  [perm]      "r" (big_endian ? perm_be : perm_le)
			: "cc", "memory", "d0", "d1", "d2", "d3", "d4",
			  "d5", "d6", "d7", "d16", "d17", "d18", "d19",
			  "d20", "d21", "d22", "d23");
	} else if (nchannels > 1) {
		/* proper 'pcm' alignment */
		/* even address: vld2.16 splits the channels while loading */
		int16_t *x = &X[0][position];
		int16_t *y = &X[1][position];
		asm volatile (
			"vld1.8  {d0, d1, d2, d3}, [%[perm], :128]\n"
		"1:\n"
			"sub     %[x], %[x], #32\n"
			"sub     %[y], %[y], #32\n"
			"sub     %[position], %[position], #16\n"
			"vld2.16  {d4, d5, d6, d7}, [%[pcm]]!\n"
			"vld2.16  {d20, d21, d22, d23}, [%[pcm]]!\n"
			"vswp    q3, q10\n"
			"vtbl.8  d16, {d4, d5, d6, d7}, d0\n"
			"vtbl.8  d17, {d4, d5, d6, d7}, d1\n"
			"vtbl.8  d18, {d4, d5, d6, d7}, d2\n"
			"vtbl.8  d19, {d4, d5, d6, d7}, d3\n"
			"vst1.16 {d16, d17, d18, d19}, [%[x], :128]\n"
			"vtbl.8  d16, {d20, d21, d22, d23}, d0\n"
			"vtbl.8  d17, {d20, d21, d22, d23}, d1\n"
			"vtbl.8  d18, {d20, d21, d22, d23}, d2\n"
			"vtbl.8  d19, {d20, d21, d22, d23}, d3\n"
			"vst1.16 {d16, d17, d18, d19}, [%[y], :128]\n"
			"subs    %[nsamples], %[nsamples], #16\n"
			"bgt     1b\n"
			:
			  [x]        "+r" (x),
			  [y]        "+r" (y),
			  [pcm]      "+r" (pcm),
			  [nsamples] "+r" (nsamples),
			  [position] "+r" (position)
			:
			  [perm]      "r" (big_endian ? perm_be : perm_le)
			: "cc", "memory", "d0", "d1", "d2", "d3", "d4",
			  "d5", "d6", "d7", "d16", "d17", "d18", "d19",
			  "d20", "d21", "d22", "d23");
	} else {
		/* single channel: 32 bytes (16 samples) per iteration, no
		 * channel split needed */
		int16_t *x = &X[0][position];
		asm volatile (
			"vld1.8  {d0, d1, d2, d3}, [%[perm], :128]\n"
		"1:\n"
			"sub     %[x], %[x], #32\n"
			"sub     %[position], %[position], #16\n"
			"vld1.8  {d4, d5, d6, d7}, [%[pcm]]!\n"
			"vtbl.8  d16, {d4, d5, d6, d7}, d0\n"
			"vtbl.8  d17, {d4, d5, d6, d7}, d1\n"
			"vtbl.8  d18, {d4, d5, d6, d7}, d2\n"
			"vtbl.8  d19, {d4, d5, d6, d7}, d3\n"
			"vst1.16 {d16, d17, d18, d19}, [%[x], :128]\n"
			"subs    %[nsamples], %[nsamples], #16\n"
			"bgt     1b\n"
			:
			  [x]        "+r" (x),
			  [pcm]      "+r" (pcm),
			  [nsamples] "+r" (nsamples),
			  [position] "+r" (position)
			:
			  [perm]      "r" (big_endian ? perm_be : perm_le)
			: "cc", "memory", "d0", "d1", "d2", "d3", "d4",
			  "d5", "d6", "d7", "d16", "d17", "d18", "d19");
	}
	return position;
}
    844 
/* The table-initializer macros are only used by the two input-processing
 * functions above; drop them again. */
#undef PERM_BE
#undef PERM_LE
    847 
    848 static int sbc_enc_process_input_4s_be_neon(int position, const uint8_t *pcm,
    849 					int16_t X[2][SBC_X_BUFFER_SIZE],
    850 					int nsamples, int nchannels)
    851 {
    852 	return sbc_enc_process_input_4s_neon_internal(
    853 		position, pcm, X, nsamples, nchannels, 1);
    854 }
    855 
    856 static int sbc_enc_process_input_4s_le_neon(int position, const uint8_t *pcm,
    857 					int16_t X[2][SBC_X_BUFFER_SIZE],
    858 					int nsamples, int nchannels)
    859 {
    860 	return sbc_enc_process_input_4s_neon_internal(
    861 		position, pcm, X, nsamples, nchannels, 0);
    862 }
    863 
    864 static int sbc_enc_process_input_8s_be_neon(int position, const uint8_t *pcm,
    865 					int16_t X[2][SBC_X_BUFFER_SIZE],
    866 					int nsamples, int nchannels)
    867 {
    868 	return sbc_enc_process_input_8s_neon_internal(
    869 		position, pcm, X, nsamples, nchannels, 1);
    870 }
    871 
    872 static int sbc_enc_process_input_8s_le_neon(int position, const uint8_t *pcm,
    873 					int16_t X[2][SBC_X_BUFFER_SIZE],
    874 					int nsamples, int nchannels)
    875 {
    876 	return sbc_enc_process_input_8s_neon_internal(
    877 		position, pcm, X, nsamples, nchannels, 0);
    878 }
    879 
    880 void sbc_init_primitives_neon(struct sbc_encoder_state *state)
    881 {
    882 	state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_neon;
    883 	state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_neon;
    884 	state->sbc_calc_scalefactors = sbc_calc_scalefactors_neon;
    885 	state->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j_neon;
    886 	state->sbc_enc_process_input_4s_le = sbc_enc_process_input_4s_le_neon;
    887 	state->sbc_enc_process_input_4s_be = sbc_enc_process_input_4s_be_neon;
    888 	state->sbc_enc_process_input_8s_le = sbc_enc_process_input_8s_le_neon;
    889 	state->sbc_enc_process_input_8s_be = sbc_enc_process_input_8s_be_neon;
    890 	state->implementation_info = "NEON";
    891 }
    892 
#endif /* SBC_BUILD_WITH_NEON_SUPPORT */
    894