1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 12 #include "vp8/encoder/variance.h" 13 #include "vp8/common/pragmas.h" 14 #include "vpx_ports/mem.h" 15 16 extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); 17 extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); 18 extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); 19 extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); 20 21 extern void vp8_filter_block2d_bil4x4_var_mmx 22 ( 23 const unsigned char *ref_ptr, 24 int ref_pixels_per_line, 25 const unsigned char *src_ptr, 26 int src_pixels_per_line, 27 const short *HFilter, 28 const short *VFilter, 29 int *sum, 30 unsigned int *sumsquared 31 ); 32 33 extern unsigned int vp8_get4x4var_mmx 34 ( 35 const unsigned char *src_ptr, 36 int source_stride, 37 const unsigned char *ref_ptr, 38 int recon_stride, 39 unsigned int *SSE, 40 int *Sum 41 ); 42 43 unsigned int vp8_get_mb_ss_sse2 44 ( 45 const short *src_ptr 46 ); 47 unsigned int vp8_get16x16var_sse2 48 ( 49 const unsigned char *src_ptr, 50 int source_stride, 51 const unsigned char *ref_ptr, 52 int recon_stride, 53 unsigned int *SSE, 54 int *Sum 55 ); 56 unsigned int vp8_get16x16pred_error_sse2 57 ( 58 const unsigned char *src_ptr, 59 int src_stride, 60 const unsigned char *ref_ptr, 61 int ref_stride 62 ); 63 unsigned int vp8_get8x8var_sse2 64 ( 65 const unsigned char *src_ptr, 66 int source_stride, 67 const unsigned char *ref_ptr, 68 int recon_stride, 69 unsigned int *SSE, 70 int *Sum 71 ); 72 void vp8_filter_block2d_bil_var_sse2 73 ( 74 const unsigned char *ref_ptr, 75 int ref_pixels_per_line, 76 const unsigned char *src_ptr, 77 int src_pixels_per_line, 78 unsigned int Height, 79 int xoffset, 80 int yoffset, 81 int *sum, 82 unsigned int *sumsquared 83 ); 84 void vp8_half_horiz_vert_variance8x_h_sse2 85 ( 86 const unsigned char *ref_ptr, 87 int ref_pixels_per_line, 88 const unsigned char *src_ptr, 89 int src_pixels_per_line, 90 unsigned int Height, 91 int *sum, 92 unsigned int *sumsquared 93 ); 94 void vp8_half_horiz_vert_variance16x_h_sse2 95 ( 96 const unsigned char *ref_ptr, 97 int ref_pixels_per_line, 98 const unsigned char *src_ptr, 99 int src_pixels_per_line, 100 unsigned int Height, 101 int *sum, 102 unsigned int *sumsquared 103 ); 104 void vp8_half_horiz_variance8x_h_sse2 105 ( 106 const unsigned char *ref_ptr, 107 int ref_pixels_per_line, 108 const unsigned char *src_ptr, 109 int src_pixels_per_line, 110 unsigned int Height, 111 int *sum, 112 unsigned int *sumsquared 113 ); 114 void vp8_half_horiz_variance16x_h_sse2 115 ( 116 const unsigned char *ref_ptr, 117 int ref_pixels_per_line, 118 const unsigned char *src_ptr, 119 int src_pixels_per_line, 120 unsigned int Height, 121 int *sum, 122 unsigned int *sumsquared 123 ); 124 void vp8_half_vert_variance8x_h_sse2 125 ( 126 const unsigned char *ref_ptr, 127 int ref_pixels_per_line, 128 const unsigned char *src_ptr, 129 int src_pixels_per_line, 130 unsigned int Height, 131 int *sum, 132 unsigned int *sumsquared 133 ); 134 void vp8_half_vert_variance16x_h_sse2 135 ( 136 const unsigned char *ref_ptr, 137 int ref_pixels_per_line, 138 const unsigned char *src_ptr, 139 int src_pixels_per_line, 140 unsigned int Height, 141 int *sum, 142 unsigned int *sumsquared 143 ); 144 145 DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]); 146 147 unsigned int vp8_variance4x4_wmt( 148 const unsigned char *src_ptr, 149 int source_stride, 150 const unsigned char *ref_ptr, 151 int recon_stride) 152 { 153 unsigned int var; 154 int avg; 155 156 vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; 157 return (var - ((avg * avg) >> 4)); 158 159 } 160 161 162 163 unsigned int vp8_variance8x8_wmt 164 ( 165 const unsigned char *src_ptr, 166 int source_stride, 167 const unsigned char *ref_ptr, 168 int recon_stride) 169 { 170 unsigned int var; 171 int avg; 172 173 vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; 174 175 return (var - ((avg * avg) >> 6)); 176 177 } 178 179 180 unsigned int vp8_variance16x16_wmt 181 ( 182 const unsigned char *src_ptr, 183 int source_stride, 184 const unsigned char *ref_ptr, 185 int recon_stride, 186 unsigned int *sse) 187 { 188 unsigned int sse0; 189 int sum0; 190 191 192 vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 193 *sse = sse0; 194 return (sse0 - ((sum0 * sum0) >> 8)); 195 } 196 unsigned int vp8_mse16x16_wmt( 197 const unsigned char *src_ptr, 198 int source_stride, 199 const unsigned char *ref_ptr, 200 int recon_stride, 201 unsigned int *sse) 202 { 203 204 unsigned int sse0; 205 int sum0; 206 vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 207 *sse = sse0; 208 return sse0; 209 210 } 211 212 213 unsigned int vp8_variance16x8_wmt 214 ( 215 const unsigned char *src_ptr, 216 int source_stride, 217 const unsigned char *ref_ptr, 218 int recon_stride, 219 unsigned int *sse) 220 { 221 unsigned int sse0, sse1, var; 222 int sum0, sum1, avg; 223 224 vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 225 vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); 226 227 var = sse0 + sse1; 228 avg = sum0 + sum1; 229 *sse = var; 230 return (var - ((avg * avg) >> 7)); 231 232 } 233 234 unsigned int vp8_variance8x16_wmt 235 ( 236 const unsigned char *src_ptr, 237 int source_stride, 238 const unsigned char *ref_ptr, 239 int recon_stride, 240 unsigned int *sse) 241 { 242 unsigned int sse0, sse1, var; 243 int sum0, sum1, avg; 244 245 vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 246 vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ; 247 248 var = sse0 + sse1; 249 avg = sum0 + sum1; 250 *sse = var; 251 return (var - ((avg * avg) >> 7)); 252 253 } 254 255 unsigned int vp8_sub_pixel_variance4x4_wmt 256 ( 257 const unsigned char *src_ptr, 258 int src_pixels_per_line, 259 int xoffset, 260 int yoffset, 261 const unsigned char *dst_ptr, 262 int dst_pixels_per_line, 263 unsigned int *sse 264 ) 265 { 266 int xsum; 267 unsigned int xxsum; 268 vp8_filter_block2d_bil4x4_var_mmx( 269 src_ptr, src_pixels_per_line, 270 dst_ptr, dst_pixels_per_line, 271 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], 272 &xsum, &xxsum 273 ); 274 *sse = xxsum; 275 return (xxsum - ((xsum * xsum) >> 4)); 276 } 277 278 279 unsigned int vp8_sub_pixel_variance8x8_wmt 280 ( 281 const unsigned char *src_ptr, 282 int src_pixels_per_line, 283 int xoffset, 284 int yoffset, 285 const unsigned char *dst_ptr, 286 int dst_pixels_per_line, 287 unsigned int *sse 288 ) 289 { 290 int xsum; 291 unsigned int xxsum; 292 293 if (xoffset == 4 && yoffset == 0) 294 { 295 vp8_half_horiz_variance8x_h_sse2( 296 src_ptr, src_pixels_per_line, 297 dst_ptr, dst_pixels_per_line, 8, 298 &xsum, &xxsum); 299 } 300 else if (xoffset == 0 && yoffset == 4) 301 { 302 vp8_half_vert_variance8x_h_sse2( 303 src_ptr, src_pixels_per_line, 304 dst_ptr, dst_pixels_per_line, 8, 305 &xsum, &xxsum); 306 } 307 else if (xoffset == 4 && yoffset == 4) 308 { 309 vp8_half_horiz_vert_variance8x_h_sse2( 310 src_ptr, src_pixels_per_line, 311 dst_ptr, dst_pixels_per_line, 8, 312 &xsum, &xxsum); 313 } 314 else 315 { 316 vp8_filter_block2d_bil_var_sse2( 317 src_ptr, src_pixels_per_line, 318 dst_ptr, dst_pixels_per_line, 8, 319 xoffset, yoffset, 320 &xsum, &xxsum); 321 } 322 323 *sse = xxsum; 324 return (xxsum - ((xsum * xsum) >> 6)); 325 } 326 327 unsigned int vp8_sub_pixel_variance16x16_wmt 328 ( 329 const unsigned char *src_ptr, 330 int src_pixels_per_line, 331 int xoffset, 332 int yoffset, 333 const unsigned char *dst_ptr, 334 int dst_pixels_per_line, 335 unsigned int *sse 336 ) 337 { 338 int xsum0, xsum1; 339 unsigned int xxsum0, xxsum1; 340 341 342 // note we could avoid these if statements if the calling function 343 // just called the appropriate functions inside. 344 if (xoffset == 4 && yoffset == 0) 345 { 346 vp8_half_horiz_variance16x_h_sse2( 347 src_ptr, src_pixels_per_line, 348 dst_ptr, dst_pixels_per_line, 16, 349 &xsum0, &xxsum0); 350 } 351 else if (xoffset == 0 && yoffset == 4) 352 { 353 vp8_half_vert_variance16x_h_sse2( 354 src_ptr, src_pixels_per_line, 355 dst_ptr, dst_pixels_per_line, 16, 356 &xsum0, &xxsum0); 357 } 358 else if (xoffset == 4 && yoffset == 4) 359 { 360 vp8_half_horiz_vert_variance16x_h_sse2( 361 src_ptr, src_pixels_per_line, 362 dst_ptr, dst_pixels_per_line, 16, 363 &xsum0, &xxsum0); 364 } 365 else 366 { 367 vp8_filter_block2d_bil_var_sse2( 368 src_ptr, src_pixels_per_line, 369 dst_ptr, dst_pixels_per_line, 16, 370 xoffset, yoffset, 371 &xsum0, &xxsum0 372 ); 373 374 vp8_filter_block2d_bil_var_sse2( 375 src_ptr + 8, src_pixels_per_line, 376 dst_ptr + 8, dst_pixels_per_line, 16, 377 xoffset, yoffset, 378 &xsum1, &xxsum1 379 ); 380 xsum0 += xsum1; 381 xxsum0 += xxsum1; 382 } 383 384 *sse = xxsum0; 385 return (xxsum0 - ((xsum0 * xsum0) >> 8)); 386 } 387 388 unsigned int vp8_sub_pixel_mse16x16_wmt( 389 const unsigned char *src_ptr, 390 int src_pixels_per_line, 391 int xoffset, 392 int yoffset, 393 const unsigned char *dst_ptr, 394 int dst_pixels_per_line, 395 unsigned int *sse 396 ) 397 { 398 vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); 399 return *sse; 400 } 401 402 unsigned int vp8_sub_pixel_variance16x8_wmt 403 ( 404 const unsigned char *src_ptr, 405 int src_pixels_per_line, 406 int xoffset, 407 int yoffset, 408 const unsigned char *dst_ptr, 409 int dst_pixels_per_line, 410 unsigned int *sse 411 412 ) 413 { 414 int xsum0, xsum1; 415 unsigned int xxsum0, xxsum1; 416 417 if (xoffset == 4 && yoffset == 0) 418 { 419 vp8_half_horiz_variance16x_h_sse2( 420 src_ptr, src_pixels_per_line, 421 dst_ptr, dst_pixels_per_line, 8, 422 &xsum0, &xxsum0); 423 } 424 else if (xoffset == 0 && yoffset == 4) 425 { 426 vp8_half_vert_variance16x_h_sse2( 427 src_ptr, src_pixels_per_line, 428 dst_ptr, dst_pixels_per_line, 8, 429 &xsum0, &xxsum0); 430 } 431 else if (xoffset == 4 && yoffset == 4) 432 { 433 vp8_half_horiz_vert_variance16x_h_sse2( 434 src_ptr, src_pixels_per_line, 435 dst_ptr, dst_pixels_per_line, 8, 436 &xsum0, &xxsum0); 437 } 438 else 439 { 440 vp8_filter_block2d_bil_var_sse2( 441 src_ptr, src_pixels_per_line, 442 dst_ptr, dst_pixels_per_line, 8, 443 xoffset, yoffset, 444 &xsum0, &xxsum0); 445 446 vp8_filter_block2d_bil_var_sse2( 447 src_ptr + 8, src_pixels_per_line, 448 dst_ptr + 8, dst_pixels_per_line, 8, 449 xoffset, yoffset, 450 &xsum1, &xxsum1); 451 xsum0 += xsum1; 452 xxsum0 += xxsum1; 453 } 454 455 *sse = xxsum0; 456 return (xxsum0 - ((xsum0 * xsum0) >> 7)); 457 } 458 459 unsigned int vp8_sub_pixel_variance8x16_wmt 460 ( 461 const unsigned char *src_ptr, 462 int src_pixels_per_line, 463 int xoffset, 464 int yoffset, 465 const unsigned char *dst_ptr, 466 int dst_pixels_per_line, 467 unsigned int *sse 468 ) 469 { 470 int xsum; 471 unsigned int xxsum; 472 473 if (xoffset == 4 && yoffset == 0) 474 { 475 vp8_half_horiz_variance8x_h_sse2( 476 src_ptr, src_pixels_per_line, 477 dst_ptr, dst_pixels_per_line, 16, 478 &xsum, &xxsum); 479 } 480 else if (xoffset == 0 && yoffset == 4) 481 { 482 vp8_half_vert_variance8x_h_sse2( 483 src_ptr, src_pixels_per_line, 484 dst_ptr, dst_pixels_per_line, 16, 485 &xsum, &xxsum); 486 } 487 else if (xoffset == 4 && yoffset == 4) 488 { 489 vp8_half_horiz_vert_variance8x_h_sse2( 490 src_ptr, src_pixels_per_line, 491 dst_ptr, dst_pixels_per_line, 16, 492 &xsum, &xxsum); 493 } 494 else 495 { 496 vp8_filter_block2d_bil_var_sse2( 497 src_ptr, src_pixels_per_line, 498 dst_ptr, dst_pixels_per_line, 16, 499 xoffset, yoffset, 500 &xsum, &xxsum); 501 } 502 503 *sse = xxsum; 504 return (xxsum - ((xsum * xsum) >> 7)); 505 } 506 507 508 unsigned int vp8_variance_halfpixvar16x16_h_wmt( 509 const unsigned char *src_ptr, 510 int src_pixels_per_line, 511 const unsigned char *dst_ptr, 512 int dst_pixels_per_line, 513 unsigned int *sse) 514 { 515 int xsum0; 516 unsigned int xxsum0; 517 518 vp8_half_horiz_variance16x_h_sse2( 519 src_ptr, src_pixels_per_line, 520 dst_ptr, dst_pixels_per_line, 16, 521 &xsum0, &xxsum0); 522 523 *sse = xxsum0; 524 return (xxsum0 - ((xsum0 * xsum0) >> 8)); 525 } 526 527 528 unsigned int vp8_variance_halfpixvar16x16_v_wmt( 529 const unsigned char *src_ptr, 530 int src_pixels_per_line, 531 const unsigned char *dst_ptr, 532 int dst_pixels_per_line, 533 unsigned int *sse) 534 { 535 int xsum0; 536 unsigned int xxsum0; 537 vp8_half_vert_variance16x_h_sse2( 538 src_ptr, src_pixels_per_line, 539 dst_ptr, dst_pixels_per_line, 16, 540 &xsum0, &xxsum0); 541 542 *sse = xxsum0; 543 return (xxsum0 - ((xsum0 * xsum0) >> 8)); 544 } 545 546 547 unsigned int vp8_variance_halfpixvar16x16_hv_wmt( 548 const unsigned char *src_ptr, 549 int src_pixels_per_line, 550 const unsigned char *dst_ptr, 551 int dst_pixels_per_line, 552 unsigned int *sse) 553 { 554 int xsum0; 555 unsigned int xxsum0; 556 557 vp8_half_horiz_vert_variance16x_h_sse2( 558 src_ptr, src_pixels_per_line, 559 dst_ptr, dst_pixels_per_line, 16, 560 &xsum0, &xxsum0); 561 562 *sse = xxsum0; 563 return (xxsum0 - ((xsum0 * xsum0) >> 8)); 564 } 565