/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include "vpx_ports/config.h"
#include "vpx_ports/x86.h"
#include "variance.h"
#include "onyx_int.h"


#if HAVE_MMX
/* Build the 8x4 transform from two 4x4 transforms. */
void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
{
    vp8_short_fdct4x4_c(input,     output,      pitch);
    vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
}


int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
                                 short *qcoeff_ptr, short *dequant_ptr,
                                 short *scan_mask, short *round_ptr,
                                 short *quant_ptr, short *dqcoeff_ptr);

/* Thin wrapper: gather the pointers from BLOCK/BLOCKD and call the
 * assembly implementation.
 */
void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
{
    short *scan_mask   = vp8_default_zig_zag_mask; /* d->scan_order_mask_ptr; */
    short *coeff_ptr   = b->coeff;
    short *zbin_ptr    = b->zbin;
    short *round_ptr   = b->round;
    short *quant_ptr   = b->quant;
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = d->dequant;

    d->eob = vp8_fast_quantize_b_impl_mmx(
                 coeff_ptr,
                 zbin_ptr,
                 qcoeff_ptr,
                 dequant_ptr,
                 scan_mask,
                 round_ptr,
                 quant_ptr,
                 dqcoeff_ptr
             );
}

int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc)
{
    short *coeff_ptr = mb->block[0].coeff;
    short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
    return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
}

int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
int vp8_mbuverror_mmx(MACROBLOCK *mb)
{
    /* The U and V coefficients start at offset 256, after the 16 Y blocks. */
    short *s_ptr = &mb->coeff[256];
    short *d_ptr = &mb->e_mbd.dqcoeff[256];
    return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
}

void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
                             short *diff, unsigned char *predictor,
                             int pitch);
void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
{
    unsigned char *z = *(be->base_src) + be->src;
    int src_stride = be->src_stride;
    short *diff = &be->src_diff[0];
    unsigned char *predictor = &bd->predictor[0];
    vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
}

#endif

#if HAVE_SSE2
/* Build the 8x4 transform from two 4x4 transforms. */
void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
{
    vp8_short_fdct4x4_sse2(input,     output,      pitch);
    vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
}

int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
                                  short *qcoeff_ptr, short *dequant_ptr,
                                  short *scan_mask, short *round_ptr,
                                  short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
    short *scan_mask   = vp8_default_zig_zag_mask; /* d->scan_order_mask_ptr; */
    short *coeff_ptr   = b->coeff;
    short *round_ptr   = b->round;
    short *quant_ptr   = b->quant;
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = d->dequant;

    d->eob = vp8_fast_quantize_b_impl_sse2(
                 coeff_ptr,
                 qcoeff_ptr,
                 dequant_ptr,
                 scan_mask,
                 round_ptr,
                 quant_ptr,
                 dqcoeff_ptr
             );
}
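
/* For orientation, a scalar sketch of the computation the fast-quantize
 * _impl routines above carry out: quantize each coefficient in zig-zag
 * order, reconstruct the dequantized value, and track the end-of-block
 * position. This is modeled on the C reference quantizer; the
 * authoritative version lives in vp8/encoder/quantize.c, so treat this
 * as illustration, not as the project's implementation.
 */
#if 0
static void fast_quantize_b_sketch(BLOCK *b, BLOCKD *d)
{
    int i, rc, z, x, y, sz;
    int eob = -1;

    for (i = 0; i < 16; i++)
    {
        rc = vp8_default_zig_zag1d[i];                  /* scan position */
        z  = b->coeff[rc];

        sz = (z >> 31);                                 /* sign mask of z */
        x  = (z ^ sz) - sz;                             /* abs(z) */

        y  = ((x + b->round[rc]) * b->quant[rc]) >> 16; /* quantize */
        x  = (y ^ sz) - sz;                             /* restore the sign */

        d->qcoeff[rc]  = x;
        d->dqcoeff[rc] = x * d->dequant[rc];            /* dequantized value */

        if (y)
            eob = i;                                    /* last nonzero coeff */
    }

    d->eob = eob + 1;
}
#endif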

int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
                                     short *qcoeff_ptr, short *dequant_ptr,
                                     const int *default_zig_zag, short *round_ptr,
                                     short *quant_ptr, short *dqcoeff_ptr,
                                     unsigned short zbin_oq_value,
                                     short *zbin_boost_ptr);

void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
    short *zbin_boost_ptr = b->zrun_zbin_boost;
    short *coeff_ptr      = b->coeff;
    short *zbin_ptr       = b->zbin;
    short *round_ptr      = b->round;
    short *quant_ptr      = b->quant;
    short *qcoeff_ptr     = d->qcoeff;
    short *dqcoeff_ptr    = d->dqcoeff;
    short *dequant_ptr    = d->dequant;
    short zbin_oq_value   = b->zbin_extra;

    d->eob = vp8_regular_quantize_b_impl_sse2(
                 coeff_ptr,
                 zbin_ptr,
                 qcoeff_ptr,
                 dequant_ptr,
                 vp8_default_zig_zag1d,
                 round_ptr,
                 quant_ptr,
                 dqcoeff_ptr,
                 zbin_oq_value,
                 zbin_boost_ptr
             );
}

int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)
{
    short *coeff_ptr = mb->block[0].coeff;
    short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
    return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
}

int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
int vp8_mbuverror_xmm(MACROBLOCK *mb)
{
    short *s_ptr = &mb->coeff[256];
    short *d_ptr = &mb->e_mbd.dqcoeff[256];
    return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
}

void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
                              short *diff, unsigned char *predictor,
                              int pitch);
void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
{
    unsigned char *z = *(be->base_src) + be->src;
    int src_stride = be->src_stride;
    short *diff = &be->src_diff[0];
    unsigned char *predictor = &bd->predictor[0];
    vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
}

#endif

#if HAVE_SSSE3
int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr,
                                   short *qcoeff_ptr, short *dequant_ptr,
                                   short *round_ptr,
                                   short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
{
    d->eob = vp8_fast_quantize_b_impl_ssse3(
                 b->coeff,
                 d->qcoeff,
                 d->dequant,
                 b->round,
                 b->quant,
                 d->dqcoeff
             );
}
#endif
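
/* Usage sketch (a hypothetical caller, for orientation only): with
 * CONFIG_RUNTIME_CPU_DETECT enabled, encoder code reaches the wrappers
 * above through the cpi->rtcd function table rather than by direct call,
 * so the assignments made in vp8_arch_x86_encoder_init() below select
 * the SIMD versions at run time.
 */
#if 0
static void subtract_via_rtcd(VP8_COMP *cpi, BLOCK *be, BLOCKD *bd)
{
    /* Resolves to vp8_subtract_b_mmx/_sse2 when one of those was
     * installed below, and to the C reference version otherwise.
     */
    ENCODEMB_INVOKE(&cpi->rtcd.encodemb, subb)(be, bd, 16);
}
#endif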


void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
{
#if CONFIG_RUNTIME_CPU_DETECT
    int flags = x86_simd_caps();
    int mmx_enabled = flags & HAS_MMX;
    int xmm_enabled = flags & HAS_SSE;
    int wmt_enabled = flags & HAS_SSE2; /* "wmt" (Willamette) == SSE2 */
    int SSE3Enabled = flags & HAS_SSE3;
    int SSSE3Enabled = flags & HAS_SSSE3;
    int SSE4_1Enabled = flags & HAS_SSE4_1;

    /* Note:
     *
     * This platform can be built without runtime CPU detection as well. If
     * you modify any of the function mappings present in this file, be sure
     * to also update them in the static mappings (<arch>/filename_<arch>.h).
     */

    /* Override the default functions with the fastest ones for this CPU. */
#if HAVE_MMX
    if (mmx_enabled)
    {
        cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx;
        cpi->rtcd.variance.sad16x8 = vp8_sad16x8_mmx;
        cpi->rtcd.variance.sad8x16 = vp8_sad8x16_mmx;
        cpi->rtcd.variance.sad8x8 = vp8_sad8x8_mmx;
        cpi->rtcd.variance.sad4x4 = vp8_sad4x4_mmx;

        cpi->rtcd.variance.var4x4 = vp8_variance4x4_mmx;
        cpi->rtcd.variance.var8x8 = vp8_variance8x8_mmx;
        cpi->rtcd.variance.var8x16 = vp8_variance8x16_mmx;
        cpi->rtcd.variance.var16x8 = vp8_variance16x8_mmx;
        cpi->rtcd.variance.var16x16 = vp8_variance16x16_mmx;

        cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_mmx;
        cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_mmx;
        cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_mmx;
        cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_mmx;
        cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_mmx;
        cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_mmx;
        cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_mmx;
        cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_mmx;
        cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_mmx;

        cpi->rtcd.variance.mse16x16 = vp8_mse16x16_mmx;
        cpi->rtcd.variance.getmbss = vp8_get_mb_ss_mmx;

        cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_mmx;
        cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx;
        cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx;
        cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx;

#if 0 /* new fdct */
        cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx;
        cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx;
        cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_mmx;
        cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_mmx;
#else
        cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
        cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
        cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c;
        cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c;
#endif

        cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;

        cpi->rtcd.encodemb.berr = vp8_block_error_mmx;
        cpi->rtcd.encodemb.mberr = vp8_mbblock_error_mmx;
        cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_mmx;
        cpi->rtcd.encodemb.subb = vp8_subtract_b_mmx;
        cpi->rtcd.encodemb.submby = vp8_subtract_mby_mmx;
        cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_mmx;

        /*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx;*/
    }
#endif

#if HAVE_SSE2
    if (wmt_enabled)
    {
        cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt;
        cpi->rtcd.variance.sad16x8 = vp8_sad16x8_wmt;
        cpi->rtcd.variance.sad8x16 = vp8_sad8x16_wmt;
        cpi->rtcd.variance.sad8x8 = vp8_sad8x8_wmt;
        cpi->rtcd.variance.sad4x4 = vp8_sad4x4_wmt;

        cpi->rtcd.variance.var4x4 = vp8_variance4x4_wmt;
        cpi->rtcd.variance.var8x8 = vp8_variance8x8_wmt;
        cpi->rtcd.variance.var8x16 = vp8_variance8x16_wmt;
        cpi->rtcd.variance.var16x8 = vp8_variance16x8_wmt;
        cpi->rtcd.variance.var16x16 = vp8_variance16x16_wmt;

        cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_wmt;
        cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_wmt;
        cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_wmt;
        cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_wmt;
        cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_wmt;
        cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_wmt;
        cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_wmt;
        cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_wmt;
        cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_wmt;

        cpi->rtcd.variance.mse16x16 = vp8_mse16x16_wmt;
        cpi->rtcd.variance.getmbss = vp8_get_mb_ss_sse2;

        cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2;
        cpi->rtcd.variance.get8x8var = vp8_get8x8var_sse2;
        cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2;
        /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */

        cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_sse2;
        cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_sse2;
        cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_sse2;
        cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_sse2;

        cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2;

        cpi->rtcd.encodemb.berr = vp8_block_error_xmm;
        cpi->rtcd.encodemb.mberr = vp8_mbblock_error_xmm;
        cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm;
        cpi->rtcd.encodemb.subb = vp8_subtract_b_sse2;
        cpi->rtcd.encodemb.submby = vp8_subtract_mby_sse2;
        cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2;

        /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
        cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2;
    }
#endif

#if HAVE_SSE3
    if (SSE3Enabled)
    {
        cpi->rtcd.variance.sad16x16 = vp8_sad16x16_sse3;
        cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_sse3;
        cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_sse3;
        cpi->rtcd.variance.sad8x16x3 = vp8_sad8x16x3_sse3;
        cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_sse3;
        cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_sse3;
        cpi->rtcd.search.full_search = vp8_full_search_sadx3;

        cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_sse3;
        cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_sse3;
        cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_sse3;
        cpi->rtcd.variance.sad8x8x4d = vp8_sad8x8x4d_sse3;
        cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3;
        cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4;
    }
#endif

#if HAVE_SSSE3
    if (SSSE3Enabled)
    {
        cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
        cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;

        cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3;
    }
#endif

#if HAVE_SSE4_1
    if (SSE4_1Enabled)
    {
        cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_sse4;
        cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_sse4;
        cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_sse4;
        cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_sse4;
        cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_sse4;
        cpi->rtcd.search.full_search = vp8_full_search_sadx8;
    }
#endif

#endif
}
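
/* Call-site sketch (based on the generic encoder setup; see
 * vp8/encoder/generic/csystemdependent.c for the authoritative version):
 * the machine-specific config first fills cpi->rtcd with the C reference
 * functions, then lets this file override entries with SIMD versions.
 */
#if 0
void vp8_cmachine_specific_config(VP8_COMP *cpi)
{
    /* ... install the C reference functions in cpi->rtcd ... */

#if ARCH_X86 || ARCH_X86_64
    vp8_arch_x86_encoder_init(cpi);
#endif
}
#endif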