1 /* libFLAC - Free Lossless Audio Codec library 2 * Copyright (C) 2000-2009 Josh Coalson 3 * Copyright (C) 2011-2016 Xiph.Org Foundation 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 9 * - Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 12 * - Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * - Neither the name of the Xiph.org Foundation nor the names of its 17 * contributors may be used to endorse or promote products derived from 18 * this software without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 24 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 25 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 26 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 27 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 28 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 29 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 #ifdef HAVE_CONFIG_H 34 # include <config.h> 35 #endif 36 37 #include "private/cpu.h" 38 39 #ifndef FLAC__INTEGER_ONLY_LIBRARY 40 #ifndef FLAC__NO_ASM 41 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN 42 #include "private/lpc.h" 43 #ifdef FLAC__AVX2_SUPPORTED 44 45 #include "FLAC/assert.h" 46 #include "FLAC/format.h" 47 48 #include <immintrin.h> /* AVX2 */ 49 50 FLAC__SSE_TARGET("avx2") 51 void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) 52 { 53 int i; 54 FLAC__int32 sum; 55 __m128i cnt = _mm_cvtsi32_si128(lp_quantization); 56 57 FLAC__ASSERT(order > 0); 58 FLAC__ASSERT(order <= 32); 59 60 if(order <= 12) { 61 if(order > 8) { 62 if(order > 10) { 63 if(order == 12) { 64 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; 65 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 66 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 67 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 68 q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); 69 q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); 70 q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); 71 q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]); 72 q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]); 73 q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]); 74 q9 = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]); 75 q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]); 76 q11 = _mm256_set1_epi32(0xffff & qlp_coeff[11]); 77 78 for(i = 0; i < (int)data_len-7; i+=8) { 79 __m256i summ, mull; 80 summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(data+i-12))); 81 mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull); 82 mull = _mm256_madd_epi16(q9, 
_mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull); 83 mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull); 84 mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull); 85 mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull); 86 mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull); 87 mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); 88 mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); 89 mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); 90 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 91 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 92 summ = _mm256_sra_epi32(summ, cnt); 93 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 94 } 95 } 96 else { /* order == 11 */ 97 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10; 98 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 99 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 100 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 101 q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); 102 q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); 103 q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); 104 q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]); 105 q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]); 106 q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]); 107 q9 = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]); 108 q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]); 109 110 for(i = 0; i < (int)data_len-7; i+=8) { 111 __m256i summ, mull; 112 summ = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); 113 mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull); 114 mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull); 115 mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull); 116 mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull); 117 mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull); 118 mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); 119 mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); 120 mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); 121 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 122 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 123 summ = _mm256_sra_epi32(summ, cnt); 124 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 125 } 126 } 127 } 128 else { 129 if(order == 10) { 130 
__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9; 131 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 132 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 133 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 134 q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); 135 q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); 136 q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); 137 q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]); 138 q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]); 139 q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]); 140 q9 = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]); 141 142 for(i = 0; i < (int)data_len-7; i+=8) { 143 __m256i summ, mull; 144 summ = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); 145 mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull); 146 mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull); 147 mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull); 148 mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull); 149 mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); 150 mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); 151 mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); 152 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 153 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 154 summ = _mm256_sra_epi32(summ, cnt); 155 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 156 } 157 } 158 else { /* order == 9 */ 159 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8; 160 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 161 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 162 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 163 q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); 164 q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); 165 q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); 166 q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]); 167 q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]); 168 q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]); 169 170 for(i = 0; i < (int)data_len-7; i+=8) { 171 __m256i summ, mull; 172 summ = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); 173 mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull); 174 mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull); 175 mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull); 176 mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); 177 mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); 178 mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); 179 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 180 mull = _mm256_madd_epi16(q0, 
_mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 181 summ = _mm256_sra_epi32(summ, cnt); 182 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 183 } 184 } 185 } 186 } 187 else if(order > 4) { 188 if(order > 6) { 189 if(order == 8) { 190 __m256i q0, q1, q2, q3, q4, q5, q6, q7; 191 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 192 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 193 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 194 q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); 195 q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); 196 q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); 197 q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]); 198 q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]); 199 200 for(i = 0; i < (int)data_len-7; i+=8) { 201 __m256i summ, mull; 202 summ = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); 203 mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull); 204 mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull); 205 mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); 206 mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); 207 mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); 208 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 209 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 210 summ = _mm256_sra_epi32(summ, cnt); 211 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 212 } 213 } 214 else { /* order == 7 */ 215 __m256i q0, q1, q2, q3, q4, q5, q6; 216 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 217 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 218 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 219 q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); 220 q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); 221 q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); 222 q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]); 223 224 for(i = 0; i < (int)data_len-7; i+=8) { 225 __m256i summ, mull; 226 summ = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); 227 mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull); 228 mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); 229 mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); 230 mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); 231 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 232 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 233 summ = _mm256_sra_epi32(summ, cnt); 234 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 235 } 236 } 237 } 238 else { 239 if(order == 6) { 240 __m256i q0, q1, q2, q3, q4, q5; 241 q0 = _mm256_set1_epi32(0xffff & 
qlp_coeff[0 ]); 242 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 243 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 244 q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); 245 q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); 246 q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); 247 248 for(i = 0; i < (int)data_len-7; i+=8) { 249 __m256i summ, mull; 250 summ = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); 251 mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); 252 mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); 253 mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); 254 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 255 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 256 summ = _mm256_sra_epi32(summ, cnt); 257 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 258 } 259 } 260 else { /* order == 5 */ 261 __m256i q0, q1, q2, q3, q4; 262 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 263 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 264 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 265 q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); 266 q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); 267 268 for(i = 0; i < (int)data_len-7; i+=8) { 269 __m256i summ, mull; 270 summ = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); 271 mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); 272 mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); 273 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 274 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 275 summ = _mm256_sra_epi32(summ, cnt); 276 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 277 } 278 } 279 } 280 } 281 else { 282 if(order > 2) { 283 if(order == 4) { 284 __m256i q0, q1, q2, q3; 285 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 286 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 287 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 288 q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); 289 290 for(i = 0; i < (int)data_len-7; i+=8) { 291 __m256i summ, mull; 292 summ = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); 293 mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); 294 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 295 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 296 summ = _mm256_sra_epi32(summ, cnt); 297 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 298 } 299 } 300 else { /* order == 3 */ 301 __m256i q0, q1, q2; 302 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 303 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 304 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 305 306 for(i = 0; i < 
(int)data_len-7; i+=8) { 307 __m256i summ, mull; 308 summ = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); 309 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 310 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 311 summ = _mm256_sra_epi32(summ, cnt); 312 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 313 } 314 } 315 } 316 else { 317 if(order == 2) { 318 __m256i q0, q1; 319 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 320 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 321 322 for(i = 0; i < (int)data_len-7; i+=8) { 323 __m256i summ, mull; 324 summ = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); 325 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 326 summ = _mm256_sra_epi32(summ, cnt); 327 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 328 } 329 } 330 else { /* order == 1 */ 331 __m256i q0; 332 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 333 334 for(i = 0; i < (int)data_len-7; i+=8) { 335 __m256i summ; 336 summ = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); 337 summ = _mm256_sra_epi32(summ, cnt); 338 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 339 } 340 } 341 } 342 } 343 for(; i < (int)data_len; i++) { 344 sum = 0; 345 switch(order) { 346 case 12: sum += qlp_coeff[11] * data[i-12]; 347 case 11: sum += qlp_coeff[10] * data[i-11]; 348 case 10: sum += qlp_coeff[ 9] * data[i-10]; 349 case 9: sum += qlp_coeff[ 8] * data[i- 9]; 350 case 8: sum += qlp_coeff[ 7] * data[i- 8]; 351 case 7: sum += qlp_coeff[ 6] * data[i- 7]; 352 case 6: sum += qlp_coeff[ 5] * data[i- 6]; 353 case 5: sum += qlp_coeff[ 4] * data[i- 5]; 354 case 4: sum += qlp_coeff[ 3] * data[i- 4]; 355 case 3: sum += qlp_coeff[ 2] * data[i- 3]; 356 case 2: sum += qlp_coeff[ 1] * data[i- 2]; 357 case 1: sum += qlp_coeff[ 0] * data[i- 1]; 358 } 359 residual[i] = data[i] - (sum >> lp_quantization); 360 } 361 } 362 else { /* order > 12 */ 363 for(i = 0; i < (int)data_len; i++) { 364 sum = 0; 365 switch(order) { 366 case 32: sum += qlp_coeff[31] * data[i-32]; 367 case 31: sum += qlp_coeff[30] * data[i-31]; 368 case 30: sum += qlp_coeff[29] * data[i-30]; 369 case 29: sum += qlp_coeff[28] * data[i-29]; 370 case 28: sum += qlp_coeff[27] * data[i-28]; 371 case 27: sum += qlp_coeff[26] * data[i-27]; 372 case 26: sum += qlp_coeff[25] * data[i-26]; 373 case 25: sum += qlp_coeff[24] * data[i-25]; 374 case 24: sum += qlp_coeff[23] * data[i-24]; 375 case 23: sum += qlp_coeff[22] * data[i-23]; 376 case 22: sum += qlp_coeff[21] * data[i-22]; 377 case 21: sum += qlp_coeff[20] * data[i-21]; 378 case 20: sum += qlp_coeff[19] * data[i-20]; 379 case 19: sum += qlp_coeff[18] * data[i-19]; 380 case 18: sum += qlp_coeff[17] * data[i-18]; 381 case 17: sum += qlp_coeff[16] * data[i-17]; 382 case 16: sum += qlp_coeff[15] * data[i-16]; 383 case 15: sum += qlp_coeff[14] * data[i-15]; 384 case 14: sum += qlp_coeff[13] * data[i-14]; 385 case 13: sum += qlp_coeff[12] * data[i-13]; 386 sum += qlp_coeff[11] * data[i-12]; 387 sum += qlp_coeff[10] * data[i-11]; 388 sum += qlp_coeff[ 9] * data[i-10]; 389 sum += qlp_coeff[ 8] * data[i- 9]; 390 sum += qlp_coeff[ 7] * data[i- 8]; 391 sum += 
qlp_coeff[ 6] * data[i- 7]; 392 sum += qlp_coeff[ 5] * data[i- 6]; 393 sum += qlp_coeff[ 4] * data[i- 5]; 394 sum += qlp_coeff[ 3] * data[i- 4]; 395 sum += qlp_coeff[ 2] * data[i- 3]; 396 sum += qlp_coeff[ 1] * data[i- 2]; 397 sum += qlp_coeff[ 0] * data[i- 1]; 398 } 399 residual[i] = data[i] - (sum >> lp_quantization); 400 } 401 } 402 _mm256_zeroupper(); 403 } 404 405 FLAC__SSE_TARGET("avx2") 406 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) 407 { 408 int i; 409 FLAC__int32 sum; 410 __m128i cnt = _mm_cvtsi32_si128(lp_quantization); 411 412 FLAC__ASSERT(order > 0); 413 FLAC__ASSERT(order <= 32); 414 415 if(order <= 12) { 416 if(order > 8) { 417 if(order > 10) { 418 if(order == 12) { 419 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; 420 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 421 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 422 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 423 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); 424 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); 425 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); 426 q6 = _mm256_set1_epi32(qlp_coeff[6 ]); 427 q7 = _mm256_set1_epi32(qlp_coeff[7 ]); 428 q8 = _mm256_set1_epi32(qlp_coeff[8 ]); 429 q9 = _mm256_set1_epi32(qlp_coeff[9 ]); 430 q10 = _mm256_set1_epi32(qlp_coeff[10]); 431 q11 = _mm256_set1_epi32(qlp_coeff[11]); 432 433 for(i = 0; i < (int)data_len-7; i+=8) { 434 __m256i summ, mull; 435 summ = _mm256_mullo_epi32(q11, _mm256_loadu_si256((const __m256i*)(data+i-12))); 436 mull = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull); 437 mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull); 438 mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull); 439 mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull); 440 mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull); 441 mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull); 442 mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); 443 mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); 444 mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); 445 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 446 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 447 summ = _mm256_sra_epi32(summ, cnt); 448 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 449 } 450 } 451 else { /* order == 11 */ 452 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10; 453 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 454 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 455 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 456 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); 457 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); 458 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); 459 q6 = _mm256_set1_epi32(qlp_coeff[6 ]); 460 q7 = _mm256_set1_epi32(qlp_coeff[7 ]); 461 q8 = _mm256_set1_epi32(qlp_coeff[8 ]); 
462 q9 = _mm256_set1_epi32(qlp_coeff[9 ]); 463 q10 = _mm256_set1_epi32(qlp_coeff[10]); 464 465 for(i = 0; i < (int)data_len-7; i+=8) { 466 __m256i summ, mull; 467 summ = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); 468 mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull); 469 mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull); 470 mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull); 471 mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull); 472 mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull); 473 mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); 474 mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); 475 mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); 476 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 477 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 478 summ = _mm256_sra_epi32(summ, cnt); 479 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 480 } 481 } 482 } 483 else { 484 if(order == 10) { 485 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9; 486 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 487 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 488 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 489 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); 490 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); 491 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); 492 q6 = _mm256_set1_epi32(qlp_coeff[6 ]); 493 q7 = _mm256_set1_epi32(qlp_coeff[7 ]); 494 q8 = _mm256_set1_epi32(qlp_coeff[8 ]); 495 q9 = _mm256_set1_epi32(qlp_coeff[9 ]); 496 497 for(i = 0; i < (int)data_len-7; i+=8) { 498 __m256i summ, mull; 499 summ = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); 500 mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull); 501 mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull); 502 mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull); 503 mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull); 504 mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); 505 mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); 506 mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); 507 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 508 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 509 summ = _mm256_sra_epi32(summ, cnt); 510 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 511 } 512 } 513 else { 
/* order == 9 */ 514 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8; 515 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 516 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 517 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 518 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); 519 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); 520 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); 521 q6 = _mm256_set1_epi32(qlp_coeff[6 ]); 522 q7 = _mm256_set1_epi32(qlp_coeff[7 ]); 523 q8 = _mm256_set1_epi32(qlp_coeff[8 ]); 524 525 for(i = 0; i < (int)data_len-7; i+=8) { 526 __m256i summ, mull; 527 summ = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); 528 mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull); 529 mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull); 530 mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull); 531 mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); 532 mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); 533 mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); 534 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 535 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 536 summ = _mm256_sra_epi32(summ, cnt); 537 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 538 } 539 } 540 } 541 } 542 else if(order > 4) { 543 if(order > 6) { 544 if(order == 8) { 545 __m256i q0, q1, q2, q3, q4, q5, q6, q7; 546 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 547 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 548 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 549 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); 550 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); 551 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); 552 q6 = _mm256_set1_epi32(qlp_coeff[6 ]); 553 q7 = _mm256_set1_epi32(qlp_coeff[7 ]); 554 555 for(i = 0; i < (int)data_len-7; i+=8) { 556 __m256i summ, mull; 557 summ = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); 558 mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull); 559 mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull); 560 mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); 561 mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); 562 mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); 563 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 564 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 565 summ = _mm256_sra_epi32(summ, cnt); 566 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 567 } 568 } 569 else { /* order == 7 */ 570 __m256i q0, q1, q2, q3, q4, q5, q6; 571 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 572 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 573 q2 = 
_mm256_set1_epi32(qlp_coeff[2 ]); 574 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); 575 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); 576 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); 577 q6 = _mm256_set1_epi32(qlp_coeff[6 ]); 578 579 for(i = 0; i < (int)data_len-7; i+=8) { 580 __m256i summ, mull; 581 summ = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); 582 mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull); 583 mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); 584 mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); 585 mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); 586 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 587 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 588 summ = _mm256_sra_epi32(summ, cnt); 589 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 590 } 591 } 592 } 593 else { 594 if(order == 6) { 595 __m256i q0, q1, q2, q3, q4, q5; 596 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 597 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 598 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 599 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); 600 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); 601 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); 602 603 for(i = 0; i < (int)data_len-7; i+=8) { 604 __m256i summ, mull; 605 summ = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); 606 mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); 607 mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); 608 mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); 609 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 610 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 611 summ = _mm256_sra_epi32(summ, cnt); 612 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 613 } 614 } 615 else { /* order == 5 */ 616 __m256i q0, q1, q2, q3, q4; 617 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 618 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 619 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 620 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); 621 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); 622 623 for(i = 0; i < (int)data_len-7; i+=8) { 624 __m256i summ, mull; 625 summ = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); 626 mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); 627 mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); 628 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 629 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 630 summ = _mm256_sra_epi32(summ, cnt); 631 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const 
__m256i*)(data+i)), summ)); 632 } 633 } 634 } 635 } 636 else { 637 if(order > 2) { 638 if(order == 4) { 639 __m256i q0, q1, q2, q3; 640 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 641 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 642 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 643 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); 644 645 for(i = 0; i < (int)data_len-7; i+=8) { 646 __m256i summ, mull; 647 summ = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); 648 mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); 649 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 650 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 651 summ = _mm256_sra_epi32(summ, cnt); 652 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 653 } 654 } 655 else { /* order == 3 */ 656 __m256i q0, q1, q2; 657 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 658 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 659 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 660 661 for(i = 0; i < (int)data_len-7; i+=8) { 662 __m256i summ, mull; 663 summ = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); 664 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 665 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 666 summ = _mm256_sra_epi32(summ, cnt); 667 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 668 } 669 } 670 } 671 else { 672 if(order == 2) { 673 __m256i q0, q1; 674 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 675 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 676 677 for(i = 0; i < (int)data_len-7; i+=8) { 678 __m256i summ, mull; 679 summ = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); 680 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 681 summ = _mm256_sra_epi32(summ, cnt); 682 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 683 } 684 } 685 else { /* order == 1 */ 686 __m256i q0; 687 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 688 689 for(i = 0; i < (int)data_len-7; i+=8) { 690 __m256i summ; 691 summ = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); 692 summ = _mm256_sra_epi32(summ, cnt); 693 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 694 } 695 } 696 } 697 } 698 for(; i < (int)data_len; i++) { 699 sum = 0; 700 switch(order) { 701 case 12: sum += qlp_coeff[11] * data[i-12]; 702 case 11: sum += qlp_coeff[10] * data[i-11]; 703 case 10: sum += qlp_coeff[ 9] * data[i-10]; 704 case 9: sum += qlp_coeff[ 8] * data[i- 9]; 705 case 8: sum += qlp_coeff[ 7] * data[i- 8]; 706 case 7: sum += qlp_coeff[ 6] * data[i- 7]; 707 case 6: sum += qlp_coeff[ 5] * data[i- 6]; 708 case 5: sum += qlp_coeff[ 4] * data[i- 5]; 709 case 4: sum += qlp_coeff[ 3] * data[i- 4]; 710 case 3: sum += qlp_coeff[ 2] * data[i- 3]; 711 case 2: sum += qlp_coeff[ 1] * data[i- 2]; 712 case 1: sum += qlp_coeff[ 0] * data[i- 1]; 713 } 714 residual[i] = data[i] - (sum >> lp_quantization); 715 } 716 } 717 else { /* order > 12 */ 718 for(i = 0; i < (int)data_len; i++) { 719 sum = 0; 720 switch(order) 
{ 721 case 32: sum += qlp_coeff[31] * data[i-32]; 722 case 31: sum += qlp_coeff[30] * data[i-31]; 723 case 30: sum += qlp_coeff[29] * data[i-30]; 724 case 29: sum += qlp_coeff[28] * data[i-29]; 725 case 28: sum += qlp_coeff[27] * data[i-28]; 726 case 27: sum += qlp_coeff[26] * data[i-27]; 727 case 26: sum += qlp_coeff[25] * data[i-26]; 728 case 25: sum += qlp_coeff[24] * data[i-25]; 729 case 24: sum += qlp_coeff[23] * data[i-24]; 730 case 23: sum += qlp_coeff[22] * data[i-23]; 731 case 22: sum += qlp_coeff[21] * data[i-22]; 732 case 21: sum += qlp_coeff[20] * data[i-21]; 733 case 20: sum += qlp_coeff[19] * data[i-20]; 734 case 19: sum += qlp_coeff[18] * data[i-19]; 735 case 18: sum += qlp_coeff[17] * data[i-18]; 736 case 17: sum += qlp_coeff[16] * data[i-17]; 737 case 16: sum += qlp_coeff[15] * data[i-16]; 738 case 15: sum += qlp_coeff[14] * data[i-15]; 739 case 14: sum += qlp_coeff[13] * data[i-14]; 740 case 13: sum += qlp_coeff[12] * data[i-13]; 741 sum += qlp_coeff[11] * data[i-12]; 742 sum += qlp_coeff[10] * data[i-11]; 743 sum += qlp_coeff[ 9] * data[i-10]; 744 sum += qlp_coeff[ 8] * data[i- 9]; 745 sum += qlp_coeff[ 7] * data[i- 8]; 746 sum += qlp_coeff[ 6] * data[i- 7]; 747 sum += qlp_coeff[ 5] * data[i- 6]; 748 sum += qlp_coeff[ 4] * data[i- 5]; 749 sum += qlp_coeff[ 3] * data[i- 4]; 750 sum += qlp_coeff[ 2] * data[i- 3]; 751 sum += qlp_coeff[ 1] * data[i- 2]; 752 sum += qlp_coeff[ 0] * data[i- 1]; 753 } 754 residual[i] = data[i] - (sum >> lp_quantization); 755 } 756 } 757 _mm256_zeroupper(); 758 } 759 760 static FLAC__int32 pack_arr[8] = { 0, 2, 4, 6, 1, 3, 5, 7 }; 761 762 FLAC__SSE_TARGET("avx2") 763 void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) 764 { 765 int i; 766 FLAC__int64 sum; 767 __m128i cnt = _mm_cvtsi32_si128(lp_quantization); 768 __m256i pack = _mm256_loadu_si256((const __m256i *)pack_arr); 769 770 FLAC__ASSERT(order > 0); 771 FLAC__ASSERT(order <= 32); 772 FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm256_sra_epi64() so we have to use _mm256_srl_epi64() */ 773 774 if(order <= 12) { 775 if(order > 8) { 776 if(order > 10) { 777 if(order == 12) { 778 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; 779 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); 780 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); 781 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); 782 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); 783 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); 784 q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); 785 q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); 786 q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ])); 787 q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ])); 788 q9 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ])); 789 q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10])); 790 q11 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[11])); 791 792 for(i = 0; i < (int)data_len-3; i+=4) { 793 __m256i summ, mull; 794 summ = _mm256_mul_epi32(q11, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-12)))); 795 mull = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); summ = _mm256_add_epi64(summ, mull); 796 mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull); 797 mull = 
_mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull); 798 mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull); 799 mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); 800 mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); 801 mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); 802 mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); 803 mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); 804 mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); 805 mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); 806 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); 807 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); 808 } 809 } 810 else { /* order == 11 */ 811 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10; 812 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); 813 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); 814 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); 815 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); 816 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); 817 q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); 818 q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); 819 q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ])); 820 q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ])); 821 q9 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ])); 822 q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10])); 823 824 for(i = 0; i < (int)data_len-3; i+=4) { 825 __m256i summ, mull; 826 summ = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); 827 mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull); 828 mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull); 829 mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull); 830 mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); 831 mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); 832 mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); 833 mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); 834 mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); 835 mull = 
_mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); 836 mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); 837 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); 838 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); 839 } 840 } 841 } 842 else { 843 if(order == 10) { 844 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9; 845 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); 846 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); 847 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); 848 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); 849 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); 850 q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); 851 q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); 852 q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ])); 853 q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ])); 854 q9 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ])); 855 856 for(i = 0; i < (int)data_len-3; i+=4) { 857 __m256i summ, mull; 858 summ = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); 859 mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull); 860 mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull); 861 mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); 862 mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); 863 mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); 864 mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); 865 mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); 866 mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); 867 mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); 868 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); 869 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); 870 } 871 } 872 else { /* order == 9 */ 873 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8; 874 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); 875 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); 876 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); 877 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); 878 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); 879 q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); 880 q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); 881 q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ])); 882 q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ])); 883 884 for(i = 0; i < (int)data_len-3; i+=4) { 885 
__m256i summ, mull; 886 summ = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); 887 mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull); 888 mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); 889 mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); 890 mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); 891 mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); 892 mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); 893 mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); 894 mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); 895 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); 896 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); 897 } 898 } 899 } 900 } 901 else if(order > 4) { 902 if(order > 6) { 903 if(order == 8) { 904 __m256i q0, q1, q2, q3, q4, q5, q6, q7; 905 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); 906 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); 907 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); 908 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); 909 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); 910 q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); 911 q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); 912 q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ])); 913 914 for(i = 0; i < (int)data_len-3; i+=4) { 915 __m256i summ, mull; 916 summ = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); 917 mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); 918 mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); 919 mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); 920 mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); 921 mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); 922 mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); 923 mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); 924 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); 925 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); 926 } 927 } 928 else { /* order == 7 */ 929 __m256i q0, q1, q2, q3, q4, q5, q6; 930 q0 = 
_mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 ))));
						mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						/* shift each 64-bit prediction right by lp_quantization, then gather the low 32 bits of each lane into the low 128 bits via the pack permute set up earlier in this function */
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
			else {
				if(order == 6) {
					__m256i q0, q1, q2, q3, q4, q5;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 ))));
						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 5 */
					__m256i q0, q1, q2, q3, q4;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 ))));
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m256i q0, q1, q2, q3;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 ))));
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 3 */
					__m256i q0, q1, q2;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 ))));
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
			else {
				if(order == 2) {
					__m256i q0, q1;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 ))));
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 1 */
					__m256i q0;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ;
						summ = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 ))));
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
		/* scalar tail: finish any samples left over by the vectorized loops above (which compute four residuals per iteration); the switch cases intentionally fall through, adding one coefficient tap per case */
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				case 11: sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				case 10: sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				case 9: sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				case 8: sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				case 7: sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				case 6: sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				case 5: sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				case 4: sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				case 3: sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				case 2: sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				case 1: sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			/* the cases fall through from the requested order down to 13; taps 11..0 are then added unconditionally */
			switch(order) {
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
	_mm256_zeroupper();
}

#endif /* FLAC__AVX2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */