Home | History | Annotate | Download | only in dsp
      1 /* Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
      2  * Use of this source code is governed by a BSD-style license that can be
      3  * found in the LICENSE file.
      4  */
      5 
      6 #include <string.h>
      7 #include "crossover2.h"
      8 #include "biquad.h"
      9 
     10 static void lr42_set(struct lr42 *lr42, enum biquad_type type, float freq)
     11 {
     12 	struct biquad q;
     13 	biquad_set(&q, type, freq, 0, 0);
     14 	memset(lr42, 0, sizeof(*lr42));
     15 	lr42->b0 = q.b0;
     16 	lr42->b1 = q.b1;
     17 	lr42->b2 = q.b2;
     18 	lr42->a1 = q.a1;
     19 	lr42->a2 = q.a2;
     20 }
     21 
     22 /* Split input data using two LR4 filters, put the result into the input array
     23  * and another array.
     24  *
     25  * data0 --+-- lp --> data0
     26  *         |
     27  *         \-- hp --> data1
     28  */
     29 #if defined(__ARM_NEON__)
     30 #include <arm_neon.h>
     31 static void lr42_split(struct lr42 *lp, struct lr42 *hp, int count,
     32 		       float *data0L, float *data0R,
     33 		       float *data1L, float *data1R)
     34 {
     35 	float32x4_t x1 = {lp->x1L, hp->x1L, lp->x1R, hp->x1R};
     36 	float32x4_t x2 = {lp->x2L, hp->x2L, lp->x2R, hp->x2R};
     37 	float32x4_t y1 = {lp->y1L, hp->y1L, lp->y1R, hp->y1R};
     38 	float32x4_t y2 = {lp->y2L, hp->y2L, lp->y2R, hp->y2R};
     39 	float32x4_t z1 = {lp->z1L, hp->z1L, lp->z1R, hp->z1R};
     40 	float32x4_t z2 = {lp->z2L, hp->z2L, lp->z2R, hp->z2R};
     41 	float32x4_t b0 = {lp->b0, hp->b0, lp->b0, hp->b0};
     42 	float32x4_t b1 = {lp->b1, hp->b1, lp->b1, hp->b1};
     43 	float32x4_t b2 = {lp->b2, hp->b2, lp->b2, hp->b2};
     44 	float32x4_t a1 = {lp->a1, hp->a1, lp->a1, hp->a1};
     45 	float32x4_t a2 = {lp->a2, hp->a2, lp->a2, hp->a2};
     46 
     47 	__asm__ __volatile__(
     48 		/* q0 = x, q1 = y, q2 = z */
     49 		"1:                                     \n"
     50 		"vmul.f32 q1, %q[b1], %q[x1]            \n"
     51 		"vld1.32 d0[], [%[data0L]]              \n"
     52 		"vld1.32 d1[], [%[data0R]]              \n"
     53 		"subs %[count], #1                      \n"
     54 		"vmul.f32 q2, %q[b1], %q[y1]            \n"
     55 		"vmla.f32 q1, %q[b0], q0                \n"
     56 		"vmla.f32 q1, %q[b2], %q[x2]            \n"
     57 		"vmov.f32 %q[x2], %q[x1]                \n"
     58 		"vmov.f32 %q[x1], q0                    \n"
     59 		"vmls.f32 q1, %q[a1], %q[y1]            \n"
     60 		"vmls.f32 q1, %q[a2], %q[y2]            \n"
     61 		"vmla.f32 q2, %q[b0], q1                \n"
     62 		"vmla.f32 q2, %q[b2], %q[y2]            \n"
     63 		"vmov.f32 %q[y2], %q[y1]                \n"
     64 		"vmov.f32 %q[y1], q1                    \n"
     65 		"vmls.f32 q2, %q[a1], %q[z1]            \n"
     66 		"vmls.f32 q2, %q[a2], %q[z2]            \n"
     67 		"vmov.f32 %q[z2], %q[z1]                \n"
     68 		"vmov.f32 %q[z1], q2                    \n"
     69 		"vst1.f32 d4[0], [%[data0L]]!           \n"
     70 		"vst1.f32 d4[1], [%[data1L]]!           \n"
     71 		"vst1.f32 d5[0], [%[data0R]]!           \n"
     72 		"vst1.f32 d5[1], [%[data1R]]!           \n"
     73 		"bne 1b                                 \n"
     74 		: /* output */
     75 		  "=r"(data0L),
     76 		  "=r"(data0R),
     77 		  "=r"(data1L),
     78 		  "=r"(data1R),
     79 		  "=r"(count),
     80 		  [x1]"+w"(x1),
     81 		  [x2]"+w"(x2),
     82 		  [y1]"+w"(y1),
     83 		  [y2]"+w"(y2),
     84 		  [z1]"+w"(z1),
     85 		  [z2]"+w"(z2)
     86 		: /* input */
     87 		  [data0L]"0"(data0L),
     88 		  [data0R]"1"(data0R),
     89 		  [data1L]"2"(data1L),
     90 		  [data1R]"3"(data1R),
     91 		  [count]"4"(count),
     92 		  [b0]"w"(b0),
     93 		  [b1]"w"(b1),
     94 		  [b2]"w"(b2),
     95 		  [a1]"w"(a1),
     96 		  [a2]"w"(a2)
     97 		: /* clobber */
     98 		  "q0", "q1", "q2", "memory", "cc"
     99 		);
    100 
    101 	lp->x1L = x1[0]; lp->x1R = x1[2];
    102 	lp->x2L = x2[0]; lp->x2R = x2[2];
    103 	lp->y1L = y1[0]; lp->y1R = y1[2];
    104 	lp->y2L = y2[0]; lp->y2R = y2[2];
    105 	lp->z1L = z1[0]; lp->z1R = z1[2];
    106 	lp->z2L = z2[0]; lp->z2R = z2[2];
    107 
    108 	hp->x1L = x1[1]; hp->x1R = x1[3];
    109 	hp->x2L = x2[1]; hp->x2R = x2[3];
    110 	hp->y1L = y1[1]; hp->y1R = y1[3];
    111 	hp->y2L = y2[1]; hp->y2R = y2[3];
    112 	hp->z1L = z1[1]; hp->z1R = z1[3];
    113 	hp->z2L = z2[1]; hp->z2R = z2[3];
    114 }
    115 #elif defined(__SSE3__) && defined(__x86_64__)
    116 #include <emmintrin.h>
    117 static void lr42_split(struct lr42 *lp, struct lr42 *hp, int count,
    118 		       float *data0L, float *data0R,
    119 		       float *data1L, float *data1R)
    120 {
    121 	__m128 x1 = {lp->x1L, hp->x1L, lp->x1R, hp->x1R};
    122 	__m128 x2 = {lp->x2L, hp->x2L, lp->x2R, hp->x2R};
    123 	__m128 y1 = {lp->y1L, hp->y1L, lp->y1R, hp->y1R};
    124 	__m128 y2 = {lp->y2L, hp->y2L, lp->y2R, hp->y2R};
    125 	__m128 z1 = {lp->z1L, hp->z1L, lp->z1R, hp->z1R};
    126 	__m128 z2 = {lp->z2L, hp->z2L, lp->z2R, hp->z2R};
    127 	__m128 b0 = {lp->b0, hp->b0, lp->b0, hp->b0};
    128 	__m128 b1 = {lp->b1, hp->b1, lp->b1, hp->b1};
    129 	__m128 b2 = {lp->b2, hp->b2, lp->b2, hp->b2};
    130 	__m128 a1 = {lp->a1, hp->a1, lp->a1, hp->a1};
    131 	__m128 a2 = {lp->a2, hp->a2, lp->a2, hp->a2};
    132 
    133 	__asm__ __volatile__(
    134 		"1:                                     \n"
    135 		"movss (%[data0L]), %%xmm2              \n"
    136 		"movss (%[data0R]), %%xmm1              \n"
    137 		"shufps $0, %%xmm1, %%xmm2              \n"
    138 		"mulps %[b2],%[x2]                      \n"
    139 		"movaps %[b0], %%xmm0                   \n"
    140 		"mulps %[a2],%[z2]                      \n"
    141 		"movaps %[b1], %%xmm1                   \n"
    142 		"mulps %%xmm2,%%xmm0                    \n"
    143 		"mulps %[x1],%%xmm1                     \n"
    144 		"addps %%xmm1,%%xmm0                    \n"
    145 		"movaps %[a1],%%xmm1                    \n"
    146 		"mulps %[y1],%%xmm1                     \n"
    147 		"addps %[x2],%%xmm0                     \n"
    148 		"movaps %[b1],%[x2]                     \n"
    149 		"mulps %[y1],%[x2]                      \n"
    150 		"subps %%xmm1,%%xmm0                    \n"
    151 		"movaps %[a2],%%xmm1                    \n"
    152 		"mulps %[y2],%%xmm1                     \n"
    153 		"mulps %[b2],%[y2]                      \n"
    154 		"subps %%xmm1,%%xmm0                    \n"
    155 		"movaps %[b0],%%xmm1                    \n"
    156 		"mulps %%xmm0,%%xmm1                    \n"
    157 		"addps %[x2],%%xmm1                     \n"
    158 		"movaps %[x1],%[x2]                     \n"
    159 		"movaps %%xmm2,%[x1]                    \n"
    160 		"addps %[y2],%%xmm1                     \n"
    161 		"movaps %[a1],%[y2]                     \n"
    162 		"mulps %[z1],%[y2]                      \n"
    163 		"subps %[y2],%%xmm1                     \n"
    164 		"movaps %[y1],%[y2]                     \n"
    165 		"movaps %%xmm0,%[y1]                    \n"
    166 		"subps %[z2],%%xmm1                     \n"
    167 		"movaps %[z1],%[z2]                     \n"
    168 		"movaps %%xmm1,%[z1]                    \n"
    169 		"movss %%xmm1, (%[data0L])              \n"
    170 		"shufps $0x39, %%xmm1, %%xmm1           \n"
    171 		"movss %%xmm1, (%[data1L])              \n"
    172 		"shufps $0x39, %%xmm1, %%xmm1           \n"
    173 		"movss %%xmm1, (%[data0R])              \n"
    174 		"shufps $0x39, %%xmm1, %%xmm1           \n"
    175 		"movss %%xmm1, (%[data1R])              \n"
    176 		"add $4, %[data0L]                      \n"
    177 		"add $4, %[data1L]                      \n"
    178 		"add $4, %[data0R]                      \n"
    179 		"add $4, %[data1R]                      \n"
    180 		"sub $1, %[count]                       \n"
    181 		"jnz 1b                                 \n"
    182 		: /* output */
    183 		  [data0L]"+r"(data0L),
    184 		  [data0R]"+r"(data0R),
    185 		  [data1L]"+r"(data1L),
    186 		  [data1R]"+r"(data1R),
    187 		  [count]"+r"(count),
    188 		  [x1]"+x"(x1),
    189 		  [x2]"+x"(x2),
    190 		  [y1]"+x"(y1),
    191 		  [y2]"+x"(y2),
    192 		  [z1]"+x"(z1),
    193 		  [z2]"+x"(z2)
    194 		: /* input */
    195 		  [b0]"x"(b0),
    196 		  [b1]"x"(b1),
    197 		  [b2]"x"(b2),
    198 		  [a1]"x"(a1),
    199 		  [a2]"x"(a2)
    200 		: /* clobber */
    201 		  "xmm0", "xmm1", "xmm2", "memory", "cc"
    202 		);
    203 
    204 	lp->x1L = x1[0]; lp->x1R = x1[2];
    205 	lp->x2L = x2[0]; lp->x2R = x2[2];
    206 	lp->y1L = y1[0]; lp->y1R = y1[2];
    207 	lp->y2L = y2[0]; lp->y2R = y2[2];
    208 	lp->z1L = z1[0]; lp->z1R = z1[2];
    209 	lp->z2L = z2[0]; lp->z2R = z2[2];
    210 
    211 	hp->x1L = x1[1]; hp->x1R = x1[3];
    212 	hp->x2L = x2[1]; hp->x2R = x2[3];
    213 	hp->y1L = y1[1]; hp->y1R = y1[3];
    214 	hp->y2L = y2[1]; hp->y2R = y2[3];
    215 	hp->z1L = z1[1]; hp->z1R = z1[3];
    216 	hp->z2L = z2[1]; hp->z2R = z2[3];
    217 }
    218 #else
    219 static void lr42_split(struct lr42 *lp, struct lr42 *hp, int count,
    220 		       float *data0L, float *data0R,
    221 		       float *data1L, float *data1R)
    222 {
    223 	float lx1L = lp->x1L, lx1R = lp->x1R;
    224 	float lx2L = lp->x2L, lx2R = lp->x2R;
    225 	float ly1L = lp->y1L, ly1R = lp->y1R;
    226 	float ly2L = lp->y2L, ly2R = lp->y2R;
    227 	float lz1L = lp->z1L, lz1R = lp->z1R;
    228 	float lz2L = lp->z2L, lz2R = lp->z2R;
    229 	float lb0 = lp->b0;
    230 	float lb1 = lp->b1;
    231 	float lb2 = lp->b2;
    232 	float la1 = lp->a1;
    233 	float la2 = lp->a2;
    234 
    235 	float hx1L = hp->x1L, hx1R = hp->x1R;
    236 	float hx2L = hp->x2L, hx2R = hp->x2R;
    237 	float hy1L = hp->y1L, hy1R = hp->y1R;
    238 	float hy2L = hp->y2L, hy2R = hp->y2R;
    239 	float hz1L = hp->z1L, hz1R = hp->z1R;
    240 	float hz2L = hp->z2L, hz2R = hp->z2R;
    241 	float hb0 = hp->b0;
    242 	float hb1 = hp->b1;
    243 	float hb2 = hp->b2;
    244 	float ha1 = hp->a1;
    245 	float ha2 = hp->a2;
    246 
    247 	int i;
    248 	for (i = 0; i < count; i++) {
    249 		float xL, yL, zL, xR, yR, zR;
    250 		xL = data0L[i];
    251 		xR = data0R[i];
    252 		yL = lb0*xL + lb1*lx1L + lb2*lx2L - la1*ly1L - la2*ly2L;
    253 		yR = lb0*xR + lb1*lx1R + lb2*lx2R - la1*ly1R - la2*ly2R;
    254 		zL = lb0*yL + lb1*ly1L + lb2*ly2L - la1*lz1L - la2*lz2L;
    255 		zR = lb0*yR + lb1*ly1R + lb2*ly2R - la1*lz1R - la2*lz2R;
    256 		lx2L = lx1L;
    257 		lx2R = lx1R;
    258 		lx1L = xL;
    259 		lx1R = xR;
    260 		ly2L = ly1L;
    261 		ly2R = ly1R;
    262 		ly1L = yL;
    263 		ly1R = yR;
    264 		lz2L = lz1L;
    265 		lz2R = lz1R;
    266 		lz1L = zL;
    267 		lz1R = zR;
    268 		data0L[i] = zL;
    269 		data0R[i] = zR;
    270 
    271 		yL = hb0*xL + hb1*hx1L + hb2*hx2L - ha1*hy1L - ha2*hy2L;
    272 		yR = hb0*xR + hb1*hx1R + hb2*hx2R - ha1*hy1R - ha2*hy2R;
    273 		zL = hb0*yL + hb1*hy1L + hb2*hy2L - ha1*hz1L - ha2*hz2L;
    274 		zR = hb0*yR + hb1*hy1R + hb2*hy2R - ha1*hz1R - ha2*hz2R;
    275 		hx2L = hx1L;
    276 		hx2R = hx1R;
    277 		hx1L = xL;
    278 		hx1R = xR;
    279 		hy2L = hy1L;
    280 		hy2R = hy1R;
    281 		hy1L = yL;
    282 		hy1R = yR;
    283 		hz2L = hz1L;
    284 		hz2R = hz1R;
    285 		hz1L = zL;
    286 		hz1R = zR;
    287 		data1L[i] = zL;
    288 		data1R[i] = zR;
    289 	}
    290 
    291 	lp->x1L = lx1L; lp->x1R = lx1R;
    292 	lp->x2L = lx2L;	lp->x2R = lx2R;
    293 	lp->y1L = ly1L;	lp->y1R = ly1R;
    294 	lp->y2L = ly2L;	lp->y2R = ly2R;
    295 	lp->z1L = lz1L;	lp->z1R = lz1R;
    296 	lp->z2L = lz2L;	lp->z2R = lz2R;
    297 
    298 	hp->x1L = hx1L; hp->x1R = hx1R;
    299 	hp->x2L = hx2L;	hp->x2R = hx2R;
    300 	hp->y1L = hy1L;	hp->y1R = hy1R;
    301 	hp->y2L = hy2L;	hp->y2R = hy2R;
    302 	hp->z1L = hz1L;	hp->z1R = hz1R;
    303 	hp->z2L = hz2L;	hp->z2R = hz2R;
    304 }
    305 #endif
    306 
    307 /* Split input data using two LR4 filters and sum them back to the original
    308  * data array.
    309  *
    310  * data --+-- lp --+--> data
    311  *        |        |
    312  *        \-- hp --/
    313  */
    314 #if defined(__ARM_NEON__)
    315 #include <arm_neon.h>
    316 static void lr42_merge(struct lr42 *lp, struct lr42 *hp, int count,
    317 		       float *dataL, float *dataR)
    318 {
    319 	float32x4_t x1 = {lp->x1L, hp->x1L, lp->x1R, hp->x1R};
    320 	float32x4_t x2 = {lp->x2L, hp->x2L, lp->x2R, hp->x2R};
    321 	float32x4_t y1 = {lp->y1L, hp->y1L, lp->y1R, hp->y1R};
    322 	float32x4_t y2 = {lp->y2L, hp->y2L, lp->y2R, hp->y2R};
    323 	float32x4_t z1 = {lp->z1L, hp->z1L, lp->z1R, hp->z1R};
    324 	float32x4_t z2 = {lp->z2L, hp->z2L, lp->z2R, hp->z2R};
    325 	float32x4_t b0 = {lp->b0, hp->b0, lp->b0, hp->b0};
    326 	float32x4_t b1 = {lp->b1, hp->b1, lp->b1, hp->b1};
    327 	float32x4_t b2 = {lp->b2, hp->b2, lp->b2, hp->b2};
    328 	float32x4_t a1 = {lp->a1, hp->a1, lp->a1, hp->a1};
    329 	float32x4_t a2 = {lp->a2, hp->a2, lp->a2, hp->a2};
    330 
    331 	__asm__ __volatile__(
    332 		/* q0 = x, q1 = y, q2 = z */
    333 		"1:                                     \n"
    334 		"vmul.f32 q1, %q[b1], %q[x1]            \n"
    335 		"vld1.32 d0[], [%[dataL]]               \n"
    336 		"vld1.32 d1[], [%[dataR]]               \n"
    337 		"subs %[count], #1                      \n"
    338 		"vmul.f32 q2, %q[b1], %q[y1]            \n"
    339 		"vmla.f32 q1, %q[b0], q0                \n"
    340 		"vmla.f32 q1, %q[b2], %q[x2]            \n"
    341 		"vmov.f32 %q[x2], %q[x1]                \n"
    342 		"vmov.f32 %q[x1], q0                    \n"
    343 		"vmls.f32 q1, %q[a1], %q[y1]            \n"
    344 		"vmls.f32 q1, %q[a2], %q[y2]            \n"
    345 		"vmla.f32 q2, %q[b0], q1                \n"
    346 		"vmla.f32 q2, %q[b2], %q[y2]            \n"
    347 		"vmov.f32 %q[y2], %q[y1]                \n"
    348 		"vmov.f32 %q[y1], q1                    \n"
    349 		"vmls.f32 q2, %q[a1], %q[z1]            \n"
    350 		"vmls.f32 q2, %q[a2], %q[z2]            \n"
    351 		"vmov.f32 %q[z2], %q[z1]                \n"
    352 		"vmov.f32 %q[z1], q2                    \n"
    353 		"vpadd.f32 d4, d4, d5                   \n"
    354 		"vst1.f32 d4[0], [%[dataL]]!            \n"
    355 		"vst1.f32 d4[1], [%[dataR]]!            \n"
    356 		"bne 1b                                 \n"
    357 		: /* output */
    358 		  "=r"(dataL),
    359 		  "=r"(dataR),
    360 		  "=r"(count),
    361 		  [x1]"+w"(x1),
    362 		  [x2]"+w"(x2),
    363 		  [y1]"+w"(y1),
    364 		  [y2]"+w"(y2),
    365 		  [z1]"+w"(z1),
    366 		  [z2]"+w"(z2)
    367 		: /* input */
    368 		  [dataL]"0"(dataL),
    369 		  [dataR]"1"(dataR),
    370 		  [count]"2"(count),
    371 		  [b0]"w"(b0),
    372 		  [b1]"w"(b1),
    373 		  [b2]"w"(b2),
    374 		  [a1]"w"(a1),
    375 		  [a2]"w"(a2)
    376 		: /* clobber */
    377 		  "q0", "q1", "q2", "memory", "cc"
    378 		);
    379 
    380 	lp->x1L = x1[0]; lp->x1R = x1[2];
    381 	lp->x2L = x2[0]; lp->x2R = x2[2];
    382 	lp->y1L = y1[0]; lp->y1R = y1[2];
    383 	lp->y2L = y2[0]; lp->y2R = y2[2];
    384 	lp->z1L = z1[0]; lp->z1R = z1[2];
    385 	lp->z2L = z2[0]; lp->z2R = z2[2];
    386 
    387 	hp->x1L = x1[1]; hp->x1R = x1[3];
    388 	hp->x2L = x2[1]; hp->x2R = x2[3];
    389 	hp->y1L = y1[1]; hp->y1R = y1[3];
    390 	hp->y2L = y2[1]; hp->y2R = y2[3];
    391 	hp->z1L = z1[1]; hp->z1R = z1[3];
    392 	hp->z2L = z2[1]; hp->z2R = z2[3];
    393 }
    394 #elif defined(__SSE3__) && defined(__x86_64__)
    395 #include <emmintrin.h>
    396 static void lr42_merge(struct lr42 *lp, struct lr42 *hp, int count,
    397 		       float *dataL, float *dataR)
    398 {
    399 	__m128 x1 = {lp->x1L, hp->x1L, lp->x1R, hp->x1R};
    400 	__m128 x2 = {lp->x2L, hp->x2L, lp->x2R, hp->x2R};
    401 	__m128 y1 = {lp->y1L, hp->y1L, lp->y1R, hp->y1R};
    402 	__m128 y2 = {lp->y2L, hp->y2L, lp->y2R, hp->y2R};
    403 	__m128 z1 = {lp->z1L, hp->z1L, lp->z1R, hp->z1R};
    404 	__m128 z2 = {lp->z2L, hp->z2L, lp->z2R, hp->z2R};
    405 	__m128 b0 = {lp->b0, hp->b0, lp->b0, hp->b0};
    406 	__m128 b1 = {lp->b1, hp->b1, lp->b1, hp->b1};
    407 	__m128 b2 = {lp->b2, hp->b2, lp->b2, hp->b2};
    408 	__m128 a1 = {lp->a1, hp->a1, lp->a1, hp->a1};
    409 	__m128 a2 = {lp->a2, hp->a2, lp->a2, hp->a2};
    410 
    411 	__asm__ __volatile__(
    412 		"1:                                     \n"
    413 		"movss (%[dataL]), %%xmm2               \n"
    414 		"movss (%[dataR]), %%xmm1               \n"
    415 		"shufps $0, %%xmm1, %%xmm2              \n"
    416 		"mulps %[b2],%[x2]                      \n"
    417 		"movaps %[b0], %%xmm0                   \n"
    418 		"mulps %[a2],%[z2]                      \n"
    419 		"movaps %[b1], %%xmm1                   \n"
    420 		"mulps %%xmm2,%%xmm0                    \n"
    421 		"mulps %[x1],%%xmm1                     \n"
    422 		"addps %%xmm1,%%xmm0                    \n"
    423 		"movaps %[a1],%%xmm1                    \n"
    424 		"mulps %[y1],%%xmm1                     \n"
    425 		"addps %[x2],%%xmm0                     \n"
    426 		"movaps %[b1],%[x2]                     \n"
    427 		"mulps %[y1],%[x2]                      \n"
    428 		"subps %%xmm1,%%xmm0                    \n"
    429 		"movaps %[a2],%%xmm1                    \n"
    430 		"mulps %[y2],%%xmm1                     \n"
    431 		"mulps %[b2],%[y2]                      \n"
    432 		"subps %%xmm1,%%xmm0                    \n"
    433 		"movaps %[b0],%%xmm1                    \n"
    434 		"mulps %%xmm0,%%xmm1                    \n"
    435 		"addps %[x2],%%xmm1                     \n"
    436 		"movaps %[x1],%[x2]                     \n"
    437 		"movaps %%xmm2,%[x1]                    \n"
    438 		"addps %[y2],%%xmm1                     \n"
    439 		"movaps %[a1],%[y2]                     \n"
    440 		"mulps %[z1],%[y2]                      \n"
    441 		"subps %[y2],%%xmm1                     \n"
    442 		"movaps %[y1],%[y2]                     \n"
    443 		"movaps %%xmm0,%[y1]                    \n"
    444 		"subps %[z2],%%xmm1                     \n"
    445 		"movaps %[z1],%[z2]                     \n"
    446 		"movaps %%xmm1,%[z1]                    \n"
    447 		"haddps %%xmm1, %%xmm1                  \n"
    448 		"movss %%xmm1, (%[dataL])               \n"
    449 		"shufps $0x39, %%xmm1, %%xmm1           \n"
    450 		"movss %%xmm1, (%[dataR])               \n"
    451 		"add $4, %[dataL]                       \n"
    452 		"add $4, %[dataR]                       \n"
    453 		"sub $1, %[count]                       \n"
    454 		"jnz 1b                                 \n"
    455 		: /* output */
    456 		  [dataL]"+r"(dataL),
    457 		  [dataR]"+r"(dataR),
    458 		  [count]"+r"(count),
    459 		  [x1]"+x"(x1),
    460 		  [x2]"+x"(x2),
    461 		  [y1]"+x"(y1),
    462 		  [y2]"+x"(y2),
    463 		  [z1]"+x"(z1),
    464 		  [z2]"+x"(z2)
    465 		: /* input */
    466 		  [b0]"x"(b0),
    467 		  [b1]"x"(b1),
    468 		  [b2]"x"(b2),
    469 		  [a1]"x"(a1),
    470 		  [a2]"x"(a2)
    471 		: /* clobber */
    472 		  "xmm0", "xmm1", "xmm2", "memory", "cc"
    473 		);
    474 
    475 	lp->x1L = x1[0]; lp->x1R = x1[2];
    476 	lp->x2L = x2[0]; lp->x2R = x2[2];
    477 	lp->y1L = y1[0]; lp->y1R = y1[2];
    478 	lp->y2L = y2[0]; lp->y2R = y2[2];
    479 	lp->z1L = z1[0]; lp->z1R = z1[2];
    480 	lp->z2L = z2[0]; lp->z2R = z2[2];
    481 
    482 	hp->x1L = x1[1]; hp->x1R = x1[3];
    483 	hp->x2L = x2[1]; hp->x2R = x2[3];
    484 	hp->y1L = y1[1]; hp->y1R = y1[3];
    485 	hp->y2L = y2[1]; hp->y2R = y2[3];
    486 	hp->z1L = z1[1]; hp->z1R = z1[3];
    487 	hp->z2L = z2[1]; hp->z2R = z2[3];
    488 }
    489 #else
    490 static void lr42_merge(struct lr42 *lp, struct lr42 *hp, int count,
    491 		       float *dataL, float *dataR)
    492 {
    493 	float lx1L = lp->x1L, lx1R = lp->x1R;
    494 	float lx2L = lp->x2L, lx2R = lp->x2R;
    495 	float ly1L = lp->y1L, ly1R = lp->y1R;
    496 	float ly2L = lp->y2L, ly2R = lp->y2R;
    497 	float lz1L = lp->z1L, lz1R = lp->z1R;
    498 	float lz2L = lp->z2L, lz2R = lp->z2R;
    499 	float lb0 = lp->b0;
    500 	float lb1 = lp->b1;
    501 	float lb2 = lp->b2;
    502 	float la1 = lp->a1;
    503 	float la2 = lp->a2;
    504 
    505 	float hx1L = hp->x1L, hx1R = hp->x1R;
    506 	float hx2L = hp->x2L, hx2R = hp->x2R;
    507 	float hy1L = hp->y1L, hy1R = hp->y1R;
    508 	float hy2L = hp->y2L, hy2R = hp->y2R;
    509 	float hz1L = hp->z1L, hz1R = hp->z1R;
    510 	float hz2L = hp->z2L, hz2R = hp->z2R;
    511 	float hb0 = hp->b0;
    512 	float hb1 = hp->b1;
    513 	float hb2 = hp->b2;
    514 	float ha1 = hp->a1;
    515 	float ha2 = hp->a2;
    516 
    517 	int i;
    518 	for (i = 0; i < count; i++) {
    519 		float xL, yL, zL, xR, yR, zR;
    520 		xL = dataL[i];
    521 		xR = dataR[i];
    522 		yL = lb0*xL + lb1*lx1L + lb2*lx2L - la1*ly1L - la2*ly2L;
    523 		yR = lb0*xR + lb1*lx1R + lb2*lx2R - la1*ly1R - la2*ly2R;
    524 		zL = lb0*yL + lb1*ly1L + lb2*ly2L - la1*lz1L - la2*lz2L;
    525 		zR = lb0*yR + lb1*ly1R + lb2*ly2R - la1*lz1R - la2*lz2R;
    526 		lx2L = lx1L;
    527 		lx2R = lx1R;
    528 		lx1L = xL;
    529 		lx1R = xR;
    530 		ly2L = ly1L;
    531 		ly2R = ly1R;
    532 		ly1L = yL;
    533 		ly1R = yR;
    534 		lz2L = lz1L;
    535 		lz2R = lz1R;
    536 		lz1L = zL;
    537 		lz1R = zR;
    538 
    539 		yL = hb0*xL + hb1*hx1L + hb2*hx2L - ha1*hy1L - ha2*hy2L;
    540 		yR = hb0*xR + hb1*hx1R + hb2*hx2R - ha1*hy1R - ha2*hy2R;
    541 		zL = hb0*yL + hb1*hy1L + hb2*hy2L - ha1*hz1L - ha2*hz2L;
    542 		zR = hb0*yR + hb1*hy1R + hb2*hy2R - ha1*hz1R - ha2*hz2R;
    543 		hx2L = hx1L;
    544 		hx2R = hx1R;
    545 		hx1L = xL;
    546 		hx1R = xR;
    547 		hy2L = hy1L;
    548 		hy2R = hy1R;
    549 		hy1L = yL;
    550 		hy1R = yR;
    551 		hz2L = hz1L;
    552 		hz2R = hz1R;
    553 		hz1L = zL;
    554 		hz1R = zR;
    555 		dataL[i] = zL + lz1L;
    556 		dataR[i] = zR + lz1R;
    557 	}
    558 
    559 	lp->x1L = lx1L; lp->x1R = lx1R;
    560 	lp->x2L = lx2L;	lp->x2R = lx2R;
    561 	lp->y1L = ly1L;	lp->y1R = ly1R;
    562 	lp->y2L = ly2L;	lp->y2R = ly2R;
    563 	lp->z1L = lz1L;	lp->z1R = lz1R;
    564 	lp->z2L = lz2L;	lp->z2R = lz2R;
    565 
    566 	hp->x1L = hx1L; hp->x1R = hx1R;
    567 	hp->x2L = hx2L;	hp->x2R = hx2R;
    568 	hp->y1L = hy1L;	hp->y1R = hy1R;
    569 	hp->y2L = hy2L;	hp->y2R = hy2R;
    570 	hp->z1L = hz1L;	hp->z1R = hz1R;
    571 	hp->z2L = hz2L;	hp->z2R = hz2R;
    572 }
    573 #endif
    574 
    575 void crossover2_init(struct crossover2 *xo2, float freq1, float freq2)
    576 {
    577 	int i;
    578 	for (i = 0; i < 3; i++) {
    579 		float f = (i == 0) ? freq1 : freq2;
    580 		lr42_set(&xo2->lp[i], BQ_LOWPASS, f);
    581 		lr42_set(&xo2->hp[i], BQ_HIGHPASS, f);
    582 	}
    583 }
    584 
    585 void crossover2_process(struct crossover2 *xo2, int count,
    586 			float *data0L, float *data0R,
    587 			float *data1L, float *data1R,
    588 			float *data2L, float *data2R)
    589 {
    590 	if (!count)
    591 		return;
    592 
    593 	lr42_split(&xo2->lp[0], &xo2->hp[0], count, data0L, data0R,
    594 		   data1L, data1R);
    595 	lr42_merge(&xo2->lp[1], &xo2->hp[1], count, data0L, data0R);
    596 	lr42_split(&xo2->lp[2], &xo2->hp[2], count, data1L, data1R,
    597 		   data2L, data2R);
    598 }
    599