/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include "vpx_ports/config.h"
#include "vpx_ports/x86.h"
#include "variance.h"
#include "onyx_int.h"


#if HAVE_MMX

/* The *_impl routines declared in this file are implemented in assembly; the
 * C wrappers below unpack the BLOCK / BLOCKD / MACROBLOCK fields into the
 * flat pointer arguments those routines expect, so the RTCD tables can point
 * directly at the wrappers.
 */

void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
{
    vp8_short_fdct4x4_c(input,     output,      pitch);
    vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
}


int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
                                 short *qcoeff_ptr, short *dequant_ptr,
                                 short *scan_mask, short *round_ptr,
                                 short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
{
    short *scan_mask   = vp8_default_zig_zag_mask; /* d->scan_order_mask_ptr */
    short *coeff_ptr   = &b->coeff[0];
    short *zbin_ptr    = &b->zbin[0][0];
    short *round_ptr   = &b->round[0][0];
    short *quant_ptr   = &b->quant[0][0];
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = &d->dequant[0][0];

    d->eob = vp8_fast_quantize_b_impl_mmx(
                 coeff_ptr,
                 zbin_ptr,
                 qcoeff_ptr,
                 dequant_ptr,
                 scan_mask,
                 round_ptr,
                 quant_ptr,
                 dqcoeff_ptr
             );
}

int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc)
{
    short *coeff_ptr = mb->block[0].coeff;
    short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
    return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
}

int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
int vp8_mbuverror_mmx(MACROBLOCK *mb)
{
    short *s_ptr = &mb->coeff[256];
    short *d_ptr = &mb->e_mbd.dqcoeff[256];
    return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
}

void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
                             short *diff, unsigned char *predictor,
                             int pitch);
void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
{
    unsigned char *z = *(be->base_src) + be->src;
    unsigned int src_stride = be->src_stride;
    short *diff = &be->src_diff[0];
    unsigned char *predictor = &bd->predictor[0];
    vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
}

#endif

#if HAVE_SSE2
void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
{
    vp8_short_fdct4x4_sse2(input,     output,      pitch);
    vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
}

int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
                                 short *qcoeff_ptr, short *dequant_ptr,
                                 short *scan_mask, short *round_ptr,
                                 short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
{
    short *scan_mask   = vp8_default_zig_zag_mask; /* d->scan_order_mask_ptr */
    short *coeff_ptr   = &b->coeff[0];
    short *zbin_ptr    = &b->zbin[0][0];
    short *round_ptr   = &b->round[0][0];
    short *quant_ptr   = &b->quant[0][0];
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = &d->dequant[0][0];

    d->eob = vp8_fast_quantize_b_impl_sse(
                 coeff_ptr,
                 zbin_ptr,
                 qcoeff_ptr,
                 dequant_ptr,
                 scan_mask,
                 round_ptr,
                 quant_ptr,
                 dqcoeff_ptr
             );
}
int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
                                     short *qcoeff_ptr, short *dequant_ptr,
                                     const int *default_zig_zag, short *round_ptr,
                                     short *quant_ptr, short *dqcoeff_ptr,
                                     unsigned short zbin_oq_value,
                                     short *zbin_boost_ptr);

void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
    short *zbin_boost_ptr = &b->zrun_zbin_boost[0];
    short *coeff_ptr      = &b->coeff[0];
    short *zbin_ptr       = &b->zbin[0][0];
    short *round_ptr      = &b->round[0][0];
    short *quant_ptr      = &b->quant[0][0];
    short *qcoeff_ptr     = d->qcoeff;
    short *dqcoeff_ptr    = d->dqcoeff;
    short *dequant_ptr    = &d->dequant[0][0];
    short zbin_oq_value   = b->zbin_extra;

    d->eob = vp8_regular_quantize_b_impl_sse2(
                 coeff_ptr,
                 zbin_ptr,
                 qcoeff_ptr,
                 dequant_ptr,
                 vp8_default_zig_zag1d,
                 round_ptr,
                 quant_ptr,
                 dqcoeff_ptr,
                 zbin_oq_value,
                 zbin_boost_ptr
             );
}

int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)
{
    short *coeff_ptr = mb->block[0].coeff;
    short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
    return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
}

int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
int vp8_mbuverror_xmm(MACROBLOCK *mb)
{
    short *s_ptr = &mb->coeff[256];
    short *d_ptr = &mb->e_mbd.dqcoeff[256];
    return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
}

#endif

void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
{
#if CONFIG_RUNTIME_CPU_DETECT
    int flags = x86_simd_caps();
    int mmx_enabled  = flags & HAS_MMX;
    int xmm_enabled  = flags & HAS_SSE;
    int wmt_enabled  = flags & HAS_SSE2;
    int SSE3Enabled  = flags & HAS_SSE3;
    int SSSE3Enabled = flags & HAS_SSSE3;

    /* Note:
     *
     * This platform can be built without runtime CPU detection as well. If
     * you modify any of the function mappings present in this file, be sure
     * to also update them in the static mappings (<arch>/filename_<arch>.h).
     */

    /* Override the default functions with the fastest ones for this CPU. */
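
    /*
     * For reference, a sketch (not the literal contents of those headers) of
     * how the static mappings mentioned above typically select an
     * implementation at compile time with preprocessor defines:
     *
     *     #undef  vp8_variance_sad16x16
     *     #define vp8_variance_sad16x16 vp8_sad16x16_mmx
     *
     * Any pointer changed below should therefore have a matching define in
     * the corresponding per-arch header.
     */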
#if HAVE_MMX

    if (mmx_enabled)
    {
        cpi->rtcd.variance.sad16x16          = vp8_sad16x16_mmx;
        cpi->rtcd.variance.sad16x8           = vp8_sad16x8_mmx;
        cpi->rtcd.variance.sad8x16           = vp8_sad8x16_mmx;
        cpi->rtcd.variance.sad8x8            = vp8_sad8x8_mmx;
        cpi->rtcd.variance.sad4x4            = vp8_sad4x4_mmx;

        cpi->rtcd.variance.var4x4            = vp8_variance4x4_mmx;
        cpi->rtcd.variance.var8x8            = vp8_variance8x8_mmx;
        cpi->rtcd.variance.var8x16           = vp8_variance8x16_mmx;
        cpi->rtcd.variance.var16x8           = vp8_variance16x8_mmx;
        cpi->rtcd.variance.var16x16          = vp8_variance16x16_mmx;

        cpi->rtcd.variance.subpixvar4x4      = vp8_sub_pixel_variance4x4_mmx;
        cpi->rtcd.variance.subpixvar8x8      = vp8_sub_pixel_variance8x8_mmx;
        cpi->rtcd.variance.subpixvar8x16     = vp8_sub_pixel_variance8x16_mmx;
        cpi->rtcd.variance.subpixvar16x8     = vp8_sub_pixel_variance16x8_mmx;
        cpi->rtcd.variance.subpixvar16x16    = vp8_sub_pixel_variance16x16_mmx;
        cpi->rtcd.variance.subpixmse16x16    = vp8_sub_pixel_mse16x16_mmx;

        cpi->rtcd.variance.mse16x16          = vp8_mse16x16_mmx;
        cpi->rtcd.variance.getmbss           = vp8_get_mb_ss_mmx;

        cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_mmx;
        cpi->rtcd.variance.get8x8var         = vp8_get8x8var_mmx;
        cpi->rtcd.variance.get16x16var       = vp8_get16x16var_mmx;
        cpi->rtcd.variance.get4x4sse_cs      = vp8_get4x4sse_cs_mmx;

#if 0 /* new fdct: MMX versions disabled for now, fall back to C */
        cpi->rtcd.fdct.short4x4              = vp8_short_fdct4x4_mmx;
        cpi->rtcd.fdct.short8x4              = vp8_short_fdct8x4_mmx;
        cpi->rtcd.fdct.fast4x4               = vp8_short_fdct4x4_mmx;
        cpi->rtcd.fdct.fast8x4               = vp8_short_fdct8x4_mmx;
#else
        cpi->rtcd.fdct.short4x4              = vp8_short_fdct4x4_c;
        cpi->rtcd.fdct.short8x4              = vp8_short_fdct8x4_c;
        cpi->rtcd.fdct.fast4x4               = vp8_short_fdct4x4_c;
        cpi->rtcd.fdct.fast8x4               = vp8_short_fdct8x4_c;
#endif

        cpi->rtcd.fdct.walsh_short4x4        = vp8_short_walsh4x4_c;

        cpi->rtcd.encodemb.berr              = vp8_block_error_mmx;
        cpi->rtcd.encodemb.mberr             = vp8_mbblock_error_mmx;
        cpi->rtcd.encodemb.mbuverr           = vp8_mbuverror_mmx;
        cpi->rtcd.encodemb.subb              = vp8_subtract_b_mmx;
        cpi->rtcd.encodemb.submby            = vp8_subtract_mby_mmx;
        cpi->rtcd.encodemb.submbuv           = vp8_subtract_mbuv_mmx;

        /*cpi->rtcd.quantize.fastquantb      = vp8_fast_quantize_b_mmx;*/
    }

#endif
#if HAVE_SSE2

    if (wmt_enabled)
    {
        cpi->rtcd.variance.sad16x16          = vp8_sad16x16_wmt;
        cpi->rtcd.variance.sad16x8           = vp8_sad16x8_wmt;
        cpi->rtcd.variance.sad8x16           = vp8_sad8x16_wmt;
        cpi->rtcd.variance.sad8x8            = vp8_sad8x8_wmt;
        cpi->rtcd.variance.sad4x4            = vp8_sad4x4_wmt;

        cpi->rtcd.variance.var4x4            = vp8_variance4x4_wmt;
        cpi->rtcd.variance.var8x8            = vp8_variance8x8_wmt;
        cpi->rtcd.variance.var8x16           = vp8_variance8x16_wmt;
        cpi->rtcd.variance.var16x8           = vp8_variance16x8_wmt;
        cpi->rtcd.variance.var16x16          = vp8_variance16x16_wmt;

        cpi->rtcd.variance.subpixvar4x4      = vp8_sub_pixel_variance4x4_wmt;
        cpi->rtcd.variance.subpixvar8x8      = vp8_sub_pixel_variance8x8_wmt;
        cpi->rtcd.variance.subpixvar8x16     = vp8_sub_pixel_variance8x16_wmt;
        cpi->rtcd.variance.subpixvar16x8     = vp8_sub_pixel_variance16x8_wmt;
        cpi->rtcd.variance.subpixvar16x16    = vp8_sub_pixel_variance16x16_wmt;
        cpi->rtcd.variance.subpixmse16x16    = vp8_sub_pixel_mse16x16_wmt;

        cpi->rtcd.variance.mse16x16          = vp8_mse16x16_wmt;
        cpi->rtcd.variance.getmbss           = vp8_get_mb_ss_sse2;

        cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2;
        cpi->rtcd.variance.get8x8var         = vp8_get8x8var_sse2;
        cpi->rtcd.variance.get16x16var       = vp8_get16x16var_sse2;
        /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */

        cpi->rtcd.fdct.short4x4              = vp8_short_fdct4x4_sse2;
        cpi->rtcd.fdct.short8x4              = vp8_short_fdct8x4_sse2;
        cpi->rtcd.fdct.fast4x4               = vp8_short_fdct4x4_sse2;
        cpi->rtcd.fdct.fast8x4               = vp8_short_fdct8x4_sse2;

        cpi->rtcd.fdct.walsh_short4x4        = vp8_short_walsh4x4_c;

        cpi->rtcd.encodemb.berr              = vp8_block_error_xmm;
        cpi->rtcd.encodemb.mberr             = vp8_mbblock_error_xmm;
        cpi->rtcd.encodemb.mbuverr           = vp8_mbuverror_xmm;
        /* cpi->rtcd.encodemb.sub* not implemented for wmt */

        /*cpi->rtcd.quantize.fastquantb      = vp8_fast_quantize_b_sse;
        cpi->rtcd.quantize.quantb            = vp8_regular_quantize_b_sse2;*/
    }

#endif
#if HAVE_SSE3

    if (SSE3Enabled)
    {
        cpi->rtcd.variance.sad16x16          = vp8_sad16x16_sse3;
        cpi->rtcd.variance.sad16x16x3        = vp8_sad16x16x3_sse3;
        cpi->rtcd.variance.sad16x8x3         = vp8_sad16x8x3_sse3;
        cpi->rtcd.variance.sad8x16x3         = vp8_sad8x16x3_sse3;
        cpi->rtcd.variance.sad8x8x3          = vp8_sad8x8x3_sse3;
        cpi->rtcd.variance.sad4x4x3          = vp8_sad4x4x3_sse3;
        cpi->rtcd.search.full_search         = vp8_full_search_sadx3;

        cpi->rtcd.variance.sad16x16x4d       = vp8_sad16x16x4d_sse3;
        cpi->rtcd.variance.sad16x8x4d        = vp8_sad16x8x4d_sse3;
        cpi->rtcd.variance.sad8x16x4d        = vp8_sad8x16x4d_sse3;
        cpi->rtcd.variance.sad8x8x4d         = vp8_sad8x8x4d_sse3;
        cpi->rtcd.variance.sad4x4x4d         = vp8_sad4x4x4d_sse3;
        cpi->rtcd.search.diamond_search      = vp8_diamond_search_sadx4;
    }

#endif
#if HAVE_SSSE3

    if (SSSE3Enabled)
    {
        cpi->rtcd.variance.sad16x16x3        = vp8_sad16x16x3_ssse3;
        cpi->rtcd.variance.sad16x8x3         = vp8_sad16x8x3_ssse3;
    }

#endif
#endif
}
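
/*
 * Usage note, as a sketch: with CONFIG_RUNTIME_CPU_DETECT enabled, the rest
 * of the encoder reaches the pointers installed above through the RTCD
 * invocation macros (e.g. VARIANCE_INVOKE / ENCODEMB_INVOKE), roughly along
 * the lines of (argument names here are illustrative only):
 *
 *     best_sad = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16)
 *                    (src_ptr, src_stride, ref_ptr, ref_stride, max_sad);
 *
 * When runtime detection is disabled, the same macros collapse to direct
 * calls selected by the static per-arch mappings noted at the top of
 * vp8_arch_x86_encoder_init.
 */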