1 /* 2 * 3 * Bluetooth low-complexity, subband codec (SBC) library 4 * 5 * Copyright (C) 2008-2010 Nokia Corporation 6 * Copyright (C) 2004-2010 Marcel Holtmann <marcel (at) holtmann.org> 7 * Copyright (C) 2004-2005 Henryk Ploetz <henryk (at) ploetzli.ch> 8 * Copyright (C) 2005-2006 Brad Midgley <bmidgley (at) xmission.com> 9 * 10 * 11 * This library is free software; you can redistribute it and/or 12 * modify it under the terms of the GNU Lesser General Public 13 * License as published by the Free Software Foundation; either 14 * version 2.1 of the License, or (at your option) any later version. 15 * 16 * This library is distributed in the hope that it will be useful, 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 * Lesser General Public License for more details. 20 * 21 * You should have received a copy of the GNU Lesser General Public 22 * License along with this library; if not, write to the Free Software 23 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 24 * 25 */ 26 27 #include <stdint.h> 28 #include <limits.h> 29 #include "sbc.h" 30 #include "sbc_math.h" 31 #include "sbc_tables.h" 32 33 #include "sbc_primitives_mmx.h" 34 35 /* 36 * MMX optimizations 37 */ 38 39 #ifdef SBC_BUILD_WITH_MMX_SUPPORT 40 41 static inline void sbc_analyze_four_mmx(const int16_t *in, int32_t *out, 42 const FIXED_T *consts) 43 { 44 static const SBC_ALIGNED int32_t round_c[2] = { 45 1 << (SBC_PROTO_FIXED4_SCALE - 1), 46 1 << (SBC_PROTO_FIXED4_SCALE - 1), 47 }; 48 asm volatile ( 49 "movq (%0), %%mm0\n" 50 "movq 8(%0), %%mm1\n" 51 "pmaddwd (%1), %%mm0\n" 52 "pmaddwd 8(%1), %%mm1\n" 53 "paddd (%2), %%mm0\n" 54 "paddd (%2), %%mm1\n" 55 "\n" 56 "movq 16(%0), %%mm2\n" 57 "movq 24(%0), %%mm3\n" 58 "pmaddwd 16(%1), %%mm2\n" 59 "pmaddwd 24(%1), %%mm3\n" 60 "paddd %%mm2, %%mm0\n" 61 "paddd %%mm3, %%mm1\n" 62 "\n" 63 "movq 32(%0), %%mm2\n" 64 "movq 40(%0), %%mm3\n" 65 "pmaddwd 32(%1), %%mm2\n" 66 "pmaddwd 40(%1), %%mm3\n" 67 "paddd %%mm2, %%mm0\n" 68 "paddd %%mm3, %%mm1\n" 69 "\n" 70 "movq 48(%0), %%mm2\n" 71 "movq 56(%0), %%mm3\n" 72 "pmaddwd 48(%1), %%mm2\n" 73 "pmaddwd 56(%1), %%mm3\n" 74 "paddd %%mm2, %%mm0\n" 75 "paddd %%mm3, %%mm1\n" 76 "\n" 77 "movq 64(%0), %%mm2\n" 78 "movq 72(%0), %%mm3\n" 79 "pmaddwd 64(%1), %%mm2\n" 80 "pmaddwd 72(%1), %%mm3\n" 81 "paddd %%mm2, %%mm0\n" 82 "paddd %%mm3, %%mm1\n" 83 "\n" 84 "psrad %4, %%mm0\n" 85 "psrad %4, %%mm1\n" 86 "packssdw %%mm0, %%mm0\n" 87 "packssdw %%mm1, %%mm1\n" 88 "\n" 89 "movq %%mm0, %%mm2\n" 90 "pmaddwd 80(%1), %%mm0\n" 91 "pmaddwd 88(%1), %%mm2\n" 92 "\n" 93 "movq %%mm1, %%mm3\n" 94 "pmaddwd 96(%1), %%mm1\n" 95 "pmaddwd 104(%1), %%mm3\n" 96 "paddd %%mm1, %%mm0\n" 97 "paddd %%mm3, %%mm2\n" 98 "\n" 99 "movq %%mm0, (%3)\n" 100 "movq %%mm2, 8(%3)\n" 101 : 102 : "r" (in), "r" (consts), "r" (&round_c), "r" (out), 103 "i" (SBC_PROTO_FIXED4_SCALE) 104 : "cc", "memory"); 105 } 106 107 static inline void sbc_analyze_eight_mmx(const int16_t *in, int32_t *out, 108 const FIXED_T *consts) 109 { 110 static const SBC_ALIGNED int32_t round_c[2] = { 111 1 << (SBC_PROTO_FIXED8_SCALE - 1), 112 1 << (SBC_PROTO_FIXED8_SCALE - 1), 113 }; 114 asm volatile ( 115 "movq (%0), %%mm0\n" 116 "movq 8(%0), %%mm1\n" 117 "movq 16(%0), %%mm2\n" 118 "movq 24(%0), %%mm3\n" 119 "pmaddwd (%1), %%mm0\n" 120 "pmaddwd 8(%1), %%mm1\n" 121 "pmaddwd 16(%1), %%mm2\n" 122 "pmaddwd 24(%1), %%mm3\n" 123 "paddd (%2), %%mm0\n" 124 "paddd (%2), %%mm1\n" 125 "paddd (%2), %%mm2\n" 126 "paddd (%2), %%mm3\n" 127 "\n" 128 "movq 32(%0), %%mm4\n" 129 "movq 40(%0), %%mm5\n" 130 "movq 48(%0), %%mm6\n" 131 "movq 56(%0), %%mm7\n" 132 "pmaddwd 32(%1), %%mm4\n" 133 "pmaddwd 40(%1), %%mm5\n" 134 "pmaddwd 48(%1), %%mm6\n" 135 "pmaddwd 56(%1), %%mm7\n" 136 "paddd %%mm4, %%mm0\n" 137 "paddd %%mm5, %%mm1\n" 138 "paddd %%mm6, %%mm2\n" 139 "paddd %%mm7, %%mm3\n" 140 "\n" 141 "movq 64(%0), %%mm4\n" 142 "movq 72(%0), %%mm5\n" 143 "movq 80(%0), %%mm6\n" 144 "movq 88(%0), %%mm7\n" 145 "pmaddwd 64(%1), %%mm4\n" 146 "pmaddwd 72(%1), %%mm5\n" 147 "pmaddwd 80(%1), %%mm6\n" 148 "pmaddwd 88(%1), %%mm7\n" 149 "paddd %%mm4, %%mm0\n" 150 "paddd %%mm5, %%mm1\n" 151 "paddd %%mm6, %%mm2\n" 152 "paddd %%mm7, %%mm3\n" 153 "\n" 154 "movq 96(%0), %%mm4\n" 155 "movq 104(%0), %%mm5\n" 156 "movq 112(%0), %%mm6\n" 157 "movq 120(%0), %%mm7\n" 158 "pmaddwd 96(%1), %%mm4\n" 159 "pmaddwd 104(%1), %%mm5\n" 160 "pmaddwd 112(%1), %%mm6\n" 161 "pmaddwd 120(%1), %%mm7\n" 162 "paddd %%mm4, %%mm0\n" 163 "paddd %%mm5, %%mm1\n" 164 "paddd %%mm6, %%mm2\n" 165 "paddd %%mm7, %%mm3\n" 166 "\n" 167 "movq 128(%0), %%mm4\n" 168 "movq 136(%0), %%mm5\n" 169 "movq 144(%0), %%mm6\n" 170 "movq 152(%0), %%mm7\n" 171 "pmaddwd 128(%1), %%mm4\n" 172 "pmaddwd 136(%1), %%mm5\n" 173 "pmaddwd 144(%1), %%mm6\n" 174 "pmaddwd 152(%1), %%mm7\n" 175 "paddd %%mm4, %%mm0\n" 176 "paddd %%mm5, %%mm1\n" 177 "paddd %%mm6, %%mm2\n" 178 "paddd %%mm7, %%mm3\n" 179 "\n" 180 "psrad %4, %%mm0\n" 181 "psrad %4, %%mm1\n" 182 "psrad %4, %%mm2\n" 183 "psrad %4, %%mm3\n" 184 "\n" 185 "packssdw %%mm0, %%mm0\n" 186 "packssdw %%mm1, %%mm1\n" 187 "packssdw %%mm2, %%mm2\n" 188 "packssdw %%mm3, %%mm3\n" 189 "\n" 190 "movq %%mm0, %%mm4\n" 191 "movq %%mm0, %%mm5\n" 192 "pmaddwd 160(%1), %%mm4\n" 193 "pmaddwd 168(%1), %%mm5\n" 194 "\n" 195 "movq %%mm1, %%mm6\n" 196 "movq %%mm1, %%mm7\n" 197 "pmaddwd 192(%1), %%mm6\n" 198 "pmaddwd 200(%1), %%mm7\n" 199 "paddd %%mm6, %%mm4\n" 200 "paddd %%mm7, %%mm5\n" 201 "\n" 202 "movq %%mm2, %%mm6\n" 203 "movq %%mm2, %%mm7\n" 204 "pmaddwd 224(%1), %%mm6\n" 205 "pmaddwd 232(%1), %%mm7\n" 206 "paddd %%mm6, %%mm4\n" 207 "paddd %%mm7, %%mm5\n" 208 "\n" 209 "movq %%mm3, %%mm6\n" 210 "movq %%mm3, %%mm7\n" 211 "pmaddwd 256(%1), %%mm6\n" 212 "pmaddwd 264(%1), %%mm7\n" 213 "paddd %%mm6, %%mm4\n" 214 "paddd %%mm7, %%mm5\n" 215 "\n" 216 "movq %%mm4, (%3)\n" 217 "movq %%mm5, 8(%3)\n" 218 "\n" 219 "movq %%mm0, %%mm5\n" 220 "pmaddwd 176(%1), %%mm0\n" 221 "pmaddwd 184(%1), %%mm5\n" 222 "\n" 223 "movq %%mm1, %%mm7\n" 224 "pmaddwd 208(%1), %%mm1\n" 225 "pmaddwd 216(%1), %%mm7\n" 226 "paddd %%mm1, %%mm0\n" 227 "paddd %%mm7, %%mm5\n" 228 "\n" 229 "movq %%mm2, %%mm7\n" 230 "pmaddwd 240(%1), %%mm2\n" 231 "pmaddwd 248(%1), %%mm7\n" 232 "paddd %%mm2, %%mm0\n" 233 "paddd %%mm7, %%mm5\n" 234 "\n" 235 "movq %%mm3, %%mm7\n" 236 "pmaddwd 272(%1), %%mm3\n" 237 "pmaddwd 280(%1), %%mm7\n" 238 "paddd %%mm3, %%mm0\n" 239 "paddd %%mm7, %%mm5\n" 240 "\n" 241 "movq %%mm0, 16(%3)\n" 242 "movq %%mm5, 24(%3)\n" 243 : 244 : "r" (in), "r" (consts), "r" (&round_c), "r" (out), 245 "i" (SBC_PROTO_FIXED8_SCALE) 246 : "cc", "memory"); 247 } 248 249 static inline void sbc_analyze_4b_4s_mmx(int16_t *x, int32_t *out, 250 int out_stride) 251 { 252 /* Analyze blocks */ 253 sbc_analyze_four_mmx(x + 12, out, analysis_consts_fixed4_simd_odd); 254 out += out_stride; 255 sbc_analyze_four_mmx(x + 8, out, analysis_consts_fixed4_simd_even); 256 out += out_stride; 257 sbc_analyze_four_mmx(x + 4, out, analysis_consts_fixed4_simd_odd); 258 out += out_stride; 259 sbc_analyze_four_mmx(x + 0, out, analysis_consts_fixed4_simd_even); 260 261 asm volatile ("emms\n"); 262 } 263 264 static inline void sbc_analyze_4b_8s_mmx(int16_t *x, int32_t *out, 265 int out_stride) 266 { 267 /* Analyze blocks */ 268 sbc_analyze_eight_mmx(x + 24, out, analysis_consts_fixed8_simd_odd); 269 out += out_stride; 270 sbc_analyze_eight_mmx(x + 16, out, analysis_consts_fixed8_simd_even); 271 out += out_stride; 272 sbc_analyze_eight_mmx(x + 8, out, analysis_consts_fixed8_simd_odd); 273 out += out_stride; 274 sbc_analyze_eight_mmx(x + 0, out, analysis_consts_fixed8_simd_even); 275 276 asm volatile ("emms\n"); 277 } 278 279 static void sbc_calc_scalefactors_mmx( 280 int32_t sb_sample_f[16][2][8], 281 uint32_t scale_factor[2][8], 282 int blocks, int channels, int subbands) 283 { 284 static const SBC_ALIGNED int32_t consts[2] = { 285 1 << SCALE_OUT_BITS, 286 1 << SCALE_OUT_BITS, 287 }; 288 int ch, sb; 289 intptr_t blk; 290 for (ch = 0; ch < channels; ch++) { 291 for (sb = 0; sb < subbands; sb += 2) { 292 blk = (blocks - 1) * (((char *) &sb_sample_f[1][0][0] - 293 (char *) &sb_sample_f[0][0][0])); 294 asm volatile ( 295 "movq (%4), %%mm0\n" 296 "1:\n" 297 "movq (%1, %0), %%mm1\n" 298 "pxor %%mm2, %%mm2\n" 299 "pcmpgtd %%mm2, %%mm1\n" 300 "paddd (%1, %0), %%mm1\n" 301 "pcmpgtd %%mm1, %%mm2\n" 302 "pxor %%mm2, %%mm1\n" 303 304 "por %%mm1, %%mm0\n" 305 306 "sub %2, %0\n" 307 "jns 1b\n" 308 309 "movd %%mm0, %k0\n" 310 "psrlq $32, %%mm0\n" 311 "bsrl %k0, %k0\n" 312 "subl %5, %k0\n" 313 "movl %k0, (%3)\n" 314 315 "movd %%mm0, %k0\n" 316 "bsrl %k0, %k0\n" 317 "subl %5, %k0\n" 318 "movl %k0, 4(%3)\n" 319 : "+r" (blk) 320 : "r" (&sb_sample_f[0][ch][sb]), 321 "i" ((char *) &sb_sample_f[1][0][0] - 322 (char *) &sb_sample_f[0][0][0]), 323 "r" (&scale_factor[ch][sb]), 324 "r" (&consts), 325 "i" (SCALE_OUT_BITS) 326 : "cc", "memory"); 327 } 328 } 329 asm volatile ("emms\n"); 330 } 331 332 static int check_mmx_support(void) 333 { 334 #ifdef __amd64__ 335 return 1; /* We assume that all 64-bit processors have MMX support */ 336 #else 337 int cpuid_feature_information; 338 asm volatile ( 339 /* According to Intel manual, CPUID instruction is supported 340 * if the value of ID bit (bit 21) in EFLAGS can be modified */ 341 "pushf\n" 342 "movl (%%esp), %0\n" 343 "xorl $0x200000, (%%esp)\n" /* try to modify ID bit */ 344 "popf\n" 345 "pushf\n" 346 "xorl (%%esp), %0\n" /* check if ID bit changed */ 347 "jz 1f\n" 348 "push %%eax\n" 349 "push %%ebx\n" 350 "push %%ecx\n" 351 "mov $1, %%eax\n" 352 "cpuid\n" 353 "pop %%ecx\n" 354 "pop %%ebx\n" 355 "pop %%eax\n" 356 "1:\n" 357 "popf\n" 358 : "=d" (cpuid_feature_information) 359 : 360 : "cc"); 361 return cpuid_feature_information & (1 << 23); 362 #endif 363 } 364 365 void sbc_init_primitives_mmx(struct sbc_encoder_state *state) 366 { 367 if (check_mmx_support()) { 368 state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_mmx; 369 state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_mmx; 370 state->sbc_calc_scalefactors = sbc_calc_scalefactors_mmx; 371 state->implementation_info = "MMX"; 372 } 373 } 374 375 #endif 376