/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include "vpx_ports/config.h"
#include "vpx_ports/x86.h"
#include "vp8/encoder/variance.h"
#include "vp8/encoder/onyx_int.h"


#if HAVE_MMX
static void short_fdct8x4_mmx(short *input, short *output, int pitch)
{
    vp8_short_fdct4x4_mmx(input,     output,      pitch);
    vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
}

int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
                                 short *qcoeff_ptr, short *dequant_ptr,
                                 short *scan_mask, short *round_ptr,
                                 short *quant_ptr, short *dqcoeff_ptr);
static void fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
{
    short *scan_mask   = vp8_default_zig_zag_mask; //d->scan_order_mask_ptr;
    short *coeff_ptr   = b->coeff;
    short *zbin_ptr    = b->zbin;
    short *round_ptr   = b->round;
    short *quant_ptr   = b->quant_fast;
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = d->dequant;

    d->eob = vp8_fast_quantize_b_impl_mmx(
                 coeff_ptr,
                 zbin_ptr,
                 qcoeff_ptr,
                 dequant_ptr,
                 scan_mask,
                 round_ptr,
                 quant_ptr,
                 dqcoeff_ptr
             );
}

int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
static int mbblock_error_mmx(MACROBLOCK *mb, int dc)
{
    short *coeff_ptr = mb->block[0].coeff;
    short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
    return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
}

int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
static int mbuverror_mmx(MACROBLOCK *mb)
{
    short *s_ptr = &mb->coeff[256];
    short *d_ptr = &mb->e_mbd.dqcoeff[256];
    return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
}

void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
                             short *diff, unsigned char *predictor,
                             int pitch);
static void subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
{
    unsigned char *z = *(be->base_src) + be->src;
    unsigned int src_stride = be->src_stride;
    short *diff = &be->src_diff[0];
    unsigned char *predictor = &bd->predictor[0];
    vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
}

#endif
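
/* The SSE2 and SSSE3 sections below follow the same pattern as the MMX
 * wrappers above: thin C shims that unpack BLOCK/BLOCKD/MACROBLOCK fields
 * into the flat argument lists expected by the assembly *_impl routines
 * declared alongside them.
 */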

#if HAVE_SSE2
int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
                                  short *qcoeff_ptr, short *dequant_ptr,
                                  const short *inv_scan_order, short *round_ptr,
                                  short *quant_ptr, short *dqcoeff_ptr);
static void fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
    short *scan_mask   = vp8_default_zig_zag_mask; //d->scan_order_mask_ptr;
    short *coeff_ptr   = b->coeff;
    short *round_ptr   = b->round;
    short *quant_ptr   = b->quant_fast;
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = d->dequant;

    d->eob = vp8_fast_quantize_b_impl_sse2(
                 coeff_ptr,
                 qcoeff_ptr,
                 dequant_ptr,
                 vp8_default_inv_zig_zag,
                 round_ptr,
                 quant_ptr,
                 dqcoeff_ptr
             );
}

int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
static int mbblock_error_xmm(MACROBLOCK *mb, int dc)
{
    short *coeff_ptr = mb->block[0].coeff;
    short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
    return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
}

int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
static int mbuverror_xmm(MACROBLOCK *mb)
{
    short *s_ptr = &mb->coeff[256];
    short *d_ptr = &mb->e_mbd.dqcoeff[256];
    return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
}

void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
                              short *diff, unsigned char *predictor,
                              int pitch);
static void subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
{
    unsigned char *z = *(be->base_src) + be->src;
    unsigned int src_stride = be->src_stride;
    short *diff = &be->src_diff[0];
    unsigned char *predictor = &bd->predictor[0];
    vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
}

#endif

#if HAVE_SSSE3
int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr,
                                   short *qcoeff_ptr, short *dequant_ptr,
                                   short *round_ptr,
                                   short *quant_ptr, short *dqcoeff_ptr);
static void fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
{
    d->eob = vp8_fast_quantize_b_impl_ssse3(
                 b->coeff,
                 d->qcoeff,
                 d->dequant,
                 b->round,
                 b->quant_fast,
                 d->dqcoeff
             );
}

#if CONFIG_PSNR
#if ARCH_X86_64
typedef void ssimpf
(
    unsigned char *s,
    int sp,
    unsigned char *r,
    int rp,
    unsigned long *sum_s,
    unsigned long *sum_r,
    unsigned long *sum_sq_s,
    unsigned long *sum_sq_r,
    unsigned long *sum_sxr
);

extern ssimpf vp8_ssim_parms_16x16_sse3;
extern ssimpf vp8_ssim_parms_8x8_sse3;
#endif
#endif
#endif

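/* Runtime dispatch: each extension block below may overwrite pointers set by
 * an earlier, less capable block, so on a CPU supporting several extensions
 * the assignments from the most capable supported block take effect.
 */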
void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
{
#if CONFIG_RUNTIME_CPU_DETECT
    int flags = x86_simd_caps();
    int mmx_enabled = flags & HAS_MMX;
    int xmm_enabled = flags & HAS_SSE;
    int wmt_enabled = flags & HAS_SSE2;
    int SSE3Enabled = flags & HAS_SSE3;
    int SSSE3Enabled = flags & HAS_SSSE3;
    int SSE4_1Enabled = flags & HAS_SSE4_1;

    /* Note:
     *
     * This platform can be built without runtime CPU detection as well. If
     * you modify any of the function mappings present in this file, be sure
     * to also update them in the static mappings (<arch>/filename_<arch>.h).
     */
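
    /* For example (illustrative only; the actual entries live in the
     * per-architecture headers referenced above), a static mapping
     * typically takes the form:
     *
     *   #undef  vp8_variance_sad16x16
     *   #define vp8_variance_sad16x16 vp8_sad16x16_mmx
     */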

    /* Override default functions with fastest ones for this CPU. */
#if HAVE_MMX
    if (mmx_enabled)
    {
        cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx;
        cpi->rtcd.variance.sad16x8  = vp8_sad16x8_mmx;
        cpi->rtcd.variance.sad8x16  = vp8_sad8x16_mmx;
        cpi->rtcd.variance.sad8x8   = vp8_sad8x8_mmx;
        cpi->rtcd.variance.sad4x4   = vp8_sad4x4_mmx;

        cpi->rtcd.variance.var4x4   = vp8_variance4x4_mmx;
        cpi->rtcd.variance.var8x8   = vp8_variance8x8_mmx;
        cpi->rtcd.variance.var8x16  = vp8_variance8x16_mmx;
        cpi->rtcd.variance.var16x8  = vp8_variance16x8_mmx;
        cpi->rtcd.variance.var16x16 = vp8_variance16x16_mmx;

        cpi->rtcd.variance.subpixvar4x4   = vp8_sub_pixel_variance4x4_mmx;
        cpi->rtcd.variance.subpixvar8x8   = vp8_sub_pixel_variance8x8_mmx;
        cpi->rtcd.variance.subpixvar8x16  = vp8_sub_pixel_variance8x16_mmx;
        cpi->rtcd.variance.subpixvar16x8  = vp8_sub_pixel_variance16x8_mmx;
        cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_mmx;
        cpi->rtcd.variance.halfpixvar16x16_h  = vp8_variance_halfpixvar16x16_h_mmx;
        cpi->rtcd.variance.halfpixvar16x16_v  = vp8_variance_halfpixvar16x16_v_mmx;
        cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_mmx;
        cpi->rtcd.variance.subpixmse16x16     = vp8_sub_pixel_mse16x16_mmx;

        cpi->rtcd.variance.mse16x16 = vp8_mse16x16_mmx;
        cpi->rtcd.variance.getmbss  = vp8_get_mb_ss_mmx;

        cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_mmx;
        cpi->rtcd.variance.get8x8var    = vp8_get8x8var_mmx;
        cpi->rtcd.variance.get16x16var  = vp8_get16x16var_mmx;
        cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx;

        cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx;
        cpi->rtcd.fdct.short8x4 = short_fdct8x4_mmx;
        cpi->rtcd.fdct.fast4x4  = vp8_short_fdct4x4_mmx;
        cpi->rtcd.fdct.fast8x4  = short_fdct8x4_mmx;

        cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;

        cpi->rtcd.encodemb.berr    = vp8_block_error_mmx;
        cpi->rtcd.encodemb.mberr   = mbblock_error_mmx;
        cpi->rtcd.encodemb.mbuverr = mbuverror_mmx;
        cpi->rtcd.encodemb.subb    = subtract_b_mmx;
        cpi->rtcd.encodemb.submby  = vp8_subtract_mby_mmx;
        cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_mmx;

        /*cpi->rtcd.quantize.fastquantb = fast_quantize_b_mmx;*/
    }
#endif

#if HAVE_SSE2
    if (wmt_enabled)
    {
        cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt;
        cpi->rtcd.variance.sad16x8  = vp8_sad16x8_wmt;
        cpi->rtcd.variance.sad8x16  = vp8_sad8x16_wmt;
        cpi->rtcd.variance.sad8x8   = vp8_sad8x8_wmt;
        cpi->rtcd.variance.sad4x4   = vp8_sad4x4_wmt;

        cpi->rtcd.variance.var4x4   = vp8_variance4x4_wmt;
        cpi->rtcd.variance.var8x8   = vp8_variance8x8_wmt;
        cpi->rtcd.variance.var8x16  = vp8_variance8x16_wmt;
        cpi->rtcd.variance.var16x8  = vp8_variance16x8_wmt;
        cpi->rtcd.variance.var16x16 = vp8_variance16x16_wmt;

        cpi->rtcd.variance.subpixvar4x4   = vp8_sub_pixel_variance4x4_wmt;
        cpi->rtcd.variance.subpixvar8x8   = vp8_sub_pixel_variance8x8_wmt;
        cpi->rtcd.variance.subpixvar8x16  = vp8_sub_pixel_variance8x16_wmt;
        cpi->rtcd.variance.subpixvar16x8  = vp8_sub_pixel_variance16x8_wmt;
        cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_wmt;
        cpi->rtcd.variance.halfpixvar16x16_h  = vp8_variance_halfpixvar16x16_h_wmt;
        cpi->rtcd.variance.halfpixvar16x16_v  = vp8_variance_halfpixvar16x16_v_wmt;
        cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_wmt;
        cpi->rtcd.variance.subpixmse16x16     = vp8_sub_pixel_mse16x16_wmt;

        cpi->rtcd.variance.mse16x16 = vp8_mse16x16_wmt;
        cpi->rtcd.variance.getmbss  = vp8_get_mb_ss_sse2;

        cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2;
        cpi->rtcd.variance.get8x8var   = vp8_get8x8var_sse2;
        cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2;

        /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */

        cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_sse2;
        cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_sse2;
        cpi->rtcd.fdct.fast4x4  = vp8_short_fdct4x4_sse2;
        cpi->rtcd.fdct.fast8x4  = vp8_short_fdct8x4_sse2;

        cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2;

        cpi->rtcd.encodemb.berr    = vp8_block_error_xmm;
        cpi->rtcd.encodemb.mberr   = mbblock_error_xmm;
        cpi->rtcd.encodemb.mbuverr = mbuverror_xmm;
        cpi->rtcd.encodemb.subb    = subtract_b_sse2;
        cpi->rtcd.encodemb.submby  = vp8_subtract_mby_sse2;
        cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2;

        cpi->rtcd.quantize.quantb     = vp8_regular_quantize_b_sse2;
        cpi->rtcd.quantize.fastquantb = fast_quantize_b_sse2;

#if !(CONFIG_REALTIME_ONLY)
        cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_sse2;
#endif
    }
#endif

#if HAVE_SSE3
    if (SSE3Enabled)
    {
        cpi->rtcd.variance.sad16x16   = vp8_sad16x16_sse3;
        cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_sse3;
        cpi->rtcd.variance.sad16x8x3  = vp8_sad16x8x3_sse3;
        cpi->rtcd.variance.sad8x16x3  = vp8_sad8x16x3_sse3;
        cpi->rtcd.variance.sad8x8x3   = vp8_sad8x8x3_sse3;
        cpi->rtcd.variance.sad4x4x3   = vp8_sad4x4x3_sse3;
#if !(CONFIG_REALTIME_ONLY)
        cpi->rtcd.search.full_search = vp8_full_search_sadx3;
#endif
        cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_sse3;
        cpi->rtcd.variance.sad16x8x4d  = vp8_sad16x8x4d_sse3;
        cpi->rtcd.variance.sad8x16x4d  = vp8_sad8x16x4d_sse3;
        cpi->rtcd.variance.sad8x8x4d   = vp8_sad8x8x4d_sse3;
        cpi->rtcd.variance.sad4x4x4d   = vp8_sad4x4x4d_sse3;
        cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4;
    }
#endif

#if HAVE_SSSE3
    if (SSSE3Enabled)
    {
        cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
        cpi->rtcd.variance.sad16x8x3  = vp8_sad16x8x3_ssse3;

        cpi->rtcd.variance.subpixvar16x8  = vp8_sub_pixel_variance16x8_ssse3;
        cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3;

        cpi->rtcd.quantize.fastquantb = fast_quantize_b_ssse3;

#if CONFIG_PSNR
#if ARCH_X86_64
        cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_sse3;
        cpi->rtcd.variance.ssimpf     = vp8_ssim_parms_16x16_sse3;
#endif
#endif
    }
#endif

#if HAVE_SSE4_1
    if (SSE4_1Enabled)
    {
        cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_sse4;
        cpi->rtcd.variance.sad16x8x8  = vp8_sad16x8x8_sse4;
        cpi->rtcd.variance.sad8x16x8  = vp8_sad8x16x8_sse4;
        cpi->rtcd.variance.sad8x8x8   = vp8_sad8x8x8_sse4;
        cpi->rtcd.variance.sad4x4x8   = vp8_sad4x4x8_sse4;
#if !(CONFIG_REALTIME_ONLY)
        cpi->rtcd.search.full_search = vp8_full_search_sadx8;
#endif
    }
#endif

#endif
}