/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2016  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#  include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__AVX2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <immintrin.h> /* AVX2 */

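/*
 * Each routine in this file computes the LPC residual
 *   residual[i] = data[i] - ((sum of qlp_coeff[j] * data[i-j-1], j = 0..order-1) >> lp_quantization)
 * with the prediction sum vectorized via AVX2; a scalar fall-through switch
 * finishes whatever the vector loop leaves over. This first variant handles
 * eight samples per iteration and, as the _16 suffix suggests, is intended
 * for signals whose samples and coefficients fit in 16 bits.
 */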
FLAC__SSE_TARGET("avx2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
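	/*
	 * 16-bit trick: each qN broadcasts (0xffff & qlp_coeff[N]) into every
	 * 32-bit lane, so the coefficient occupies the low 16 bits with a zero
	 * high half. _mm256_madd_epi16 then computes c*lo16(sample) + 0*hi16(sample)
	 * per lane, which is the correct signed 16x16->32 product whenever the
	 * samples themselves fit in 16 bits.
	 */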

	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
					q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
					q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);
					q11 = _mm256_set1_epi32(0xffff & qlp_coeff[11]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
						mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
					q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
					q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
						mull = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
					q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10)));
						mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 )));
						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 )));
						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m256i q0, q1, q2, q3, q4, q5, q6;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 )));
						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m256i q0, q1, q2, q3, q4, q5;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 )));
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m256i q0, q1, q2, q3, q4;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 )));
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m256i q0, q1, q2, q3;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 )));
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m256i q0, q1, q2;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 )));
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m256i q0, q1;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 )));
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m256i q0;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ;
						summ = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 )));
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
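		/*
		 * Scalar tail: the vector loop stops up to 7 samples short of
		 * data_len, so finish the rest here. The case labels fall through
		 * intentionally, accumulating one coefficient tap per level.
		 */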
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12];
				case 11: sum += qlp_coeff[10] * data[i-11];
				case 10: sum += qlp_coeff[ 9] * data[i-10];
				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	_mm256_zeroupper();
}

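/*
 * General 32-bit variant: same structure as above, but the products are
 * formed with _mm256_mullo_epi32 (the low 32 bits of each 32x32 product),
 * matching the FLAC__int32 accumulator of the scalar code.
 */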
FLAC__SSE_TARGET("avx2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
					q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
					q10 = _mm256_set1_epi32(qlp_coeff[10]);
					q11 = _mm256_set1_epi32(qlp_coeff[11]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
						mull = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
					q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
					q10 = _mm256_set1_epi32(qlp_coeff[10]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
						mull = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
					q9  = _mm256_set1_epi32(qlp_coeff[9 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10)));
						mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));
						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));
						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m256i q0, q1, q2, q3, q4, q5, q6;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));
						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m256i q0, q1, q2, q3, q4, q5;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m256i q0, q1, q2, q3, q4;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m256i q0, q1, q2, q3;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m256i q0, q1, q2;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m256i q0, q1;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m256i q0;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ;
						summ = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
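		/* Scalar tail, as in the 16-bit variant above; fall-through is intentional. */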
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12];
				case 11: sum += qlp_coeff[10] * data[i-11];
				case 10: sum += qlp_coeff[ 9] * data[i-10];
				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	_mm256_zeroupper();
}

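/*
 * Index vector for _mm256_permutevar8x32_epi32: selects the even 32-bit
 * elements (0, 2, 4, 6), i.e. the low halves of the four 64-bit sums, into
 * the lower 128 bits of the result.
 */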
static FLAC__int32 pack_arr[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };

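/*
 * Wide variant: the prediction sum is accumulated in 64 bits. Each group of
 * four samples is widened into 64-bit lanes via _mm256_cvtepu32_epi64 so that
 * _mm256_mul_epi32 (which multiplies the signed low 32 bits of each lane) can
 * form full 64-bit products, so only four samples are produced per iteration.
 */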
FLAC__SSE_TARGET("avx2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int64 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
	__m256i pack = _mm256_loadu_si256((const __m256i *)pack_arr);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm256_sra_epi64() so we have to use _mm256_srl_epi64() */
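	/*
	 * The logical shift is safe here: for shift counts <= 32, bits 0..31 of
	 * a logical and an arithmetic 64-bit right shift are identical, and the
	 * pack permutation keeps only those low 32 bits of each sum.
	 */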
    773 
    774 	if(order <= 12) {
    775 		if(order > 8) {
    776 			if(order > 10) {
    777 				if(order == 12) {
    778 					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
    779 					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
    780 					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
    781 					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
    782 					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
    783 					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
    784 					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
    785 					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
    786 					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
    787 					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
    788 					q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
    789 					q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
    790 					q11 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[11]));
    791 
    792 					for(i = 0; i < (int)data_len-3; i+=4) {
    793 						__m256i summ, mull;
    794 						summ = _mm256_mul_epi32(q11, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-12))));
    795 						mull = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); summ = _mm256_add_epi64(summ, mull);
    796 						mull = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
    797 						mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
    798 						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
    799 						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
    800 						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
    801 						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
    802 						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
    803 						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
    804 						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
    805 						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
    806 						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
    807 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
    808 					}
    809 				}
    810 				else { /* order == 11 */
    811 					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
    812 					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
    813 					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
    814 					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
    815 					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
    816 					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
    817 					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
    818 					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
    819 					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
    820 					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
    821 					q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
    822 					q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
    823 
    824 					for(i = 0; i < (int)data_len-3; i+=4) {
    825 						__m256i summ, mull;
    826 						summ = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11))));
    827 						mull = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
    828 						mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
    829 						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
    830 						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
    831 						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
    832 						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
    833 						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
    834 						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
    835 						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
    836 						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
    837 						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
    838 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
    839 					}
    840 				}
    841 			}
    842 			else {
    843 				if(order == 10) {
    844 					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
    845 					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
    846 					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
    847 					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
    848 					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
    849 					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
    850 					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
    851 					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
    852 					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
    853 					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
    854 					q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
    855 
    856 					for(i = 0; i < (int)data_len-3; i+=4) {
    857 						__m256i summ, mull;
    858 						summ = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10))));
    859 						mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
    860 						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
    861 						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
    862 						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
    863 						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
    864 						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
    865 						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
    866 						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
    867 						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 9 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 ))));
						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
		else if(order > 4) {
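			/* Orders 8 down to 1 repeat the same 4-residuals-per-iteration
			 * scheme as above, with correspondingly fewer multiply-accumulate
			 * steps in each loop body. */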
			if(order > 6) {
				if(order == 8) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 ))));
						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 7 */
					__m256i q0, q1, q2, q3, q4, q5, q6;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 ))));
						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
			else {
				if(order == 6) {
					__m256i q0, q1, q2, q3, q4, q5;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 ))));
						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 5 */
					__m256i q0, q1, q2, q3, q4;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 ))));
						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m256i q0, q1, q2, q3;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 ))));
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 3 */
					__m256i q0, q1, q2;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 ))));
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
			else {
				if(order == 2) {
					__m256i q0, q1;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 ))));
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 1 */
					__m256i q0;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ;
						summ = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 ))));
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
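		/* Finish the last data_len % 4 residuals with scalar code.  The
		 * switch intentionally falls through, summing one tap per case from
		 * qlp_coeff[order-1] down to qlp_coeff[0]. */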
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				case 11: sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				case 10: sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				case 9:  sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
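			/* The predicted sample is the accumulated sum scaled down by the
			 * quantization shift; the residual is the prediction error. */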
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
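		/* Orders above 12 are handled entirely in scalar code; as above, the
		 * case labels fall through so that exactly 'order' taps are
		 * accumulated for each sample. */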
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
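	/* Clear the upper halves of the YMM registers to avoid AVX -> SSE
	 * transition penalties in subsequent non-VEX code. */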
	_mm256_zeroupper();
}

#endif /* FLAC__AVX2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */