1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "vpx_config.h" 12 #include "vp8/common/variance.h" 13 #include "vp8/common/pragmas.h" 14 #include "vpx_ports/mem.h" 15 #include "vp8/common/x86/filter_x86.h" 16 17 extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); 18 extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); 19 extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); 20 extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); 21 22 extern void vp8_filter_block2d_bil4x4_var_mmx 23 ( 24 const unsigned char *ref_ptr, 25 int ref_pixels_per_line, 26 const unsigned char *src_ptr, 27 int src_pixels_per_line, 28 const short *HFilter, 29 const short *VFilter, 30 int *sum, 31 unsigned int *sumsquared 32 ); 33 34 extern unsigned int vp8_get4x4var_mmx 35 ( 36 const unsigned char *src_ptr, 37 int source_stride, 38 const unsigned char *ref_ptr, 39 int recon_stride, 40 unsigned int *SSE, 41 int *Sum 42 ); 43 44 unsigned int vp8_get_mb_ss_sse2 45 ( 46 const short *src_ptr 47 ); 48 unsigned int vp8_get16x16var_sse2 49 ( 50 const unsigned char *src_ptr, 51 int source_stride, 52 const unsigned char *ref_ptr, 53 int recon_stride, 54 unsigned int *SSE, 55 int *Sum 56 ); 57 unsigned int vp8_get8x8var_sse2 58 ( 59 const unsigned char *src_ptr, 60 int source_stride, 61 const unsigned char *ref_ptr, 62 int recon_stride, 63 unsigned int *SSE, 64 int *Sum 65 ); 66 void vp8_filter_block2d_bil_var_sse2 67 ( 68 const unsigned char *ref_ptr, 69 int ref_pixels_per_line, 70 const unsigned char *src_ptr, 71 int src_pixels_per_line, 72 unsigned int Height, 73 int xoffset, 74 int yoffset, 75 int *sum, 76 unsigned int *sumsquared 77 ); 78 void vp8_half_horiz_vert_variance8x_h_sse2 79 ( 80 const unsigned char *ref_ptr, 81 int ref_pixels_per_line, 82 const unsigned char *src_ptr, 83 int src_pixels_per_line, 84 unsigned int Height, 85 int *sum, 86 unsigned int *sumsquared 87 ); 88 void vp8_half_horiz_vert_variance16x_h_sse2 89 ( 90 const unsigned char *ref_ptr, 91 int ref_pixels_per_line, 92 const unsigned char *src_ptr, 93 int src_pixels_per_line, 94 unsigned int Height, 95 int *sum, 96 unsigned int *sumsquared 97 ); 98 void vp8_half_horiz_variance8x_h_sse2 99 ( 100 const unsigned char *ref_ptr, 101 int ref_pixels_per_line, 102 const unsigned char *src_ptr, 103 int src_pixels_per_line, 104 unsigned int Height, 105 int *sum, 106 unsigned int *sumsquared 107 ); 108 void vp8_half_horiz_variance16x_h_sse2 109 ( 110 const unsigned char *ref_ptr, 111 int ref_pixels_per_line, 112 const unsigned char *src_ptr, 113 int src_pixels_per_line, 114 unsigned int Height, 115 int *sum, 116 unsigned int *sumsquared 117 ); 118 void vp8_half_vert_variance8x_h_sse2 119 ( 120 const unsigned char *ref_ptr, 121 int ref_pixels_per_line, 122 const unsigned char *src_ptr, 123 int src_pixels_per_line, 124 unsigned int Height, 125 int *sum, 126 unsigned int *sumsquared 127 ); 128 void vp8_half_vert_variance16x_h_sse2 129 ( 130 const unsigned char *ref_ptr, 131 int ref_pixels_per_line, 132 const unsigned char *src_ptr, 133 int src_pixels_per_line, 134 unsigned int Height, 135 int *sum, 136 unsigned int *sumsquared 137 ); 138 139 unsigned int vp8_variance4x4_wmt( 140 const unsigned char *src_ptr, 141 int source_stride, 142 const unsigned char *ref_ptr, 143 int recon_stride, 144 unsigned int *sse) 145 { 146 unsigned int var; 147 int avg; 148 149 vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; 150 *sse = var; 151 return (var - (((unsigned int)avg * avg) >> 4)); 152 153 } 154 155 unsigned int vp8_variance8x8_wmt 156 ( 157 const unsigned char *src_ptr, 158 int source_stride, 159 const unsigned char *ref_ptr, 160 int recon_stride, 161 unsigned int *sse) 162 { 163 unsigned int var; 164 int avg; 165 166 vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; 167 *sse = var; 168 return (var - (((unsigned int)avg * avg) >> 6)); 169 170 } 171 172 173 unsigned int vp8_variance16x16_wmt 174 ( 175 const unsigned char *src_ptr, 176 int source_stride, 177 const unsigned char *ref_ptr, 178 int recon_stride, 179 unsigned int *sse) 180 { 181 unsigned int sse0; 182 int sum0; 183 184 185 vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 186 *sse = sse0; 187 return (sse0 - (((unsigned int)sum0 * sum0) >> 8)); 188 } 189 unsigned int vp8_mse16x16_wmt( 190 const unsigned char *src_ptr, 191 int source_stride, 192 const unsigned char *ref_ptr, 193 int recon_stride, 194 unsigned int *sse) 195 { 196 197 unsigned int sse0; 198 int sum0; 199 vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 200 *sse = sse0; 201 return sse0; 202 203 } 204 205 206 unsigned int vp8_variance16x8_wmt 207 ( 208 const unsigned char *src_ptr, 209 int source_stride, 210 const unsigned char *ref_ptr, 211 int recon_stride, 212 unsigned int *sse) 213 { 214 unsigned int sse0, sse1, var; 215 int sum0, sum1, avg; 216 217 vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 218 vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); 219 220 var = sse0 + sse1; 221 avg = sum0 + sum1; 222 *sse = var; 223 return (var - (((unsigned int)avg * avg) >> 7)); 224 225 } 226 227 unsigned int vp8_variance8x16_wmt 228 ( 229 const unsigned char *src_ptr, 230 int source_stride, 231 const unsigned char *ref_ptr, 232 int recon_stride, 233 unsigned int *sse) 234 { 235 unsigned int sse0, sse1, var; 236 int sum0, sum1, avg; 237 238 vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 239 vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ; 240 241 var = sse0 + sse1; 242 avg = sum0 + sum1; 243 *sse = var; 244 return (var - (((unsigned int)avg * avg) >> 7)); 245 246 } 247 248 unsigned int vp8_sub_pixel_variance4x4_wmt 249 ( 250 const unsigned char *src_ptr, 251 int src_pixels_per_line, 252 int xoffset, 253 int yoffset, 254 const unsigned char *dst_ptr, 255 int dst_pixels_per_line, 256 unsigned int *sse 257 ) 258 { 259 int xsum; 260 unsigned int xxsum; 261 vp8_filter_block2d_bil4x4_var_mmx( 262 src_ptr, src_pixels_per_line, 263 dst_ptr, dst_pixels_per_line, 264 vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], 265 &xsum, &xxsum 266 ); 267 *sse = xxsum; 268 return (xxsum - (((unsigned int)xsum * xsum) >> 4)); 269 } 270 271 272 unsigned int vp8_sub_pixel_variance8x8_wmt 273 ( 274 const unsigned char *src_ptr, 275 int src_pixels_per_line, 276 int xoffset, 277 int yoffset, 278 const unsigned char *dst_ptr, 279 int dst_pixels_per_line, 280 unsigned int *sse 281 ) 282 { 283 int xsum; 284 unsigned int xxsum; 285 286 if (xoffset == 4 && yoffset == 0) 287 { 288 vp8_half_horiz_variance8x_h_sse2( 289 src_ptr, src_pixels_per_line, 290 dst_ptr, dst_pixels_per_line, 8, 291 &xsum, &xxsum); 292 } 293 else if (xoffset == 0 && yoffset == 4) 294 { 295 vp8_half_vert_variance8x_h_sse2( 296 src_ptr, src_pixels_per_line, 297 dst_ptr, dst_pixels_per_line, 8, 298 &xsum, &xxsum); 299 } 300 else if (xoffset == 4 && yoffset == 4) 301 { 302 vp8_half_horiz_vert_variance8x_h_sse2( 303 src_ptr, src_pixels_per_line, 304 dst_ptr, dst_pixels_per_line, 8, 305 &xsum, &xxsum); 306 } 307 else 308 { 309 vp8_filter_block2d_bil_var_sse2( 310 src_ptr, src_pixels_per_line, 311 dst_ptr, dst_pixels_per_line, 8, 312 xoffset, yoffset, 313 &xsum, &xxsum); 314 } 315 316 *sse = xxsum; 317 return (xxsum - (((unsigned int)xsum * xsum) >> 6)); 318 } 319 320 unsigned int vp8_sub_pixel_variance16x16_wmt 321 ( 322 const unsigned char *src_ptr, 323 int src_pixels_per_line, 324 int xoffset, 325 int yoffset, 326 const unsigned char *dst_ptr, 327 int dst_pixels_per_line, 328 unsigned int *sse 329 ) 330 { 331 int xsum0, xsum1; 332 unsigned int xxsum0, xxsum1; 333 334 335 /* note we could avoid these if statements if the calling function 336 * just called the appropriate functions inside. 337 */ 338 if (xoffset == 4 && yoffset == 0) 339 { 340 vp8_half_horiz_variance16x_h_sse2( 341 src_ptr, src_pixels_per_line, 342 dst_ptr, dst_pixels_per_line, 16, 343 &xsum0, &xxsum0); 344 } 345 else if (xoffset == 0 && yoffset == 4) 346 { 347 vp8_half_vert_variance16x_h_sse2( 348 src_ptr, src_pixels_per_line, 349 dst_ptr, dst_pixels_per_line, 16, 350 &xsum0, &xxsum0); 351 } 352 else if (xoffset == 4 && yoffset == 4) 353 { 354 vp8_half_horiz_vert_variance16x_h_sse2( 355 src_ptr, src_pixels_per_line, 356 dst_ptr, dst_pixels_per_line, 16, 357 &xsum0, &xxsum0); 358 } 359 else 360 { 361 vp8_filter_block2d_bil_var_sse2( 362 src_ptr, src_pixels_per_line, 363 dst_ptr, dst_pixels_per_line, 16, 364 xoffset, yoffset, 365 &xsum0, &xxsum0 366 ); 367 368 vp8_filter_block2d_bil_var_sse2( 369 src_ptr + 8, src_pixels_per_line, 370 dst_ptr + 8, dst_pixels_per_line, 16, 371 xoffset, yoffset, 372 &xsum1, &xxsum1 373 ); 374 xsum0 += xsum1; 375 xxsum0 += xxsum1; 376 } 377 378 *sse = xxsum0; 379 return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); 380 } 381 382 unsigned int vp8_sub_pixel_mse16x16_wmt( 383 const unsigned char *src_ptr, 384 int src_pixels_per_line, 385 int xoffset, 386 int yoffset, 387 const unsigned char *dst_ptr, 388 int dst_pixels_per_line, 389 unsigned int *sse 390 ) 391 { 392 vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); 393 return *sse; 394 } 395 396 unsigned int vp8_sub_pixel_variance16x8_wmt 397 ( 398 const unsigned char *src_ptr, 399 int src_pixels_per_line, 400 int xoffset, 401 int yoffset, 402 const unsigned char *dst_ptr, 403 int dst_pixels_per_line, 404 unsigned int *sse 405 406 ) 407 { 408 int xsum0, xsum1; 409 unsigned int xxsum0, xxsum1; 410 411 if (xoffset == 4 && yoffset == 0) 412 { 413 vp8_half_horiz_variance16x_h_sse2( 414 src_ptr, src_pixels_per_line, 415 dst_ptr, dst_pixels_per_line, 8, 416 &xsum0, &xxsum0); 417 } 418 else if (xoffset == 0 && yoffset == 4) 419 { 420 vp8_half_vert_variance16x_h_sse2( 421 src_ptr, src_pixels_per_line, 422 dst_ptr, dst_pixels_per_line, 8, 423 &xsum0, &xxsum0); 424 } 425 else if (xoffset == 4 && yoffset == 4) 426 { 427 vp8_half_horiz_vert_variance16x_h_sse2( 428 src_ptr, src_pixels_per_line, 429 dst_ptr, dst_pixels_per_line, 8, 430 &xsum0, &xxsum0); 431 } 432 else 433 { 434 vp8_filter_block2d_bil_var_sse2( 435 src_ptr, src_pixels_per_line, 436 dst_ptr, dst_pixels_per_line, 8, 437 xoffset, yoffset, 438 &xsum0, &xxsum0); 439 440 vp8_filter_block2d_bil_var_sse2( 441 src_ptr + 8, src_pixels_per_line, 442 dst_ptr + 8, dst_pixels_per_line, 8, 443 xoffset, yoffset, 444 &xsum1, &xxsum1); 445 xsum0 += xsum1; 446 xxsum0 += xxsum1; 447 } 448 449 *sse = xxsum0; 450 return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); 451 } 452 453 unsigned int vp8_sub_pixel_variance8x16_wmt 454 ( 455 const unsigned char *src_ptr, 456 int src_pixels_per_line, 457 int xoffset, 458 int yoffset, 459 const unsigned char *dst_ptr, 460 int dst_pixels_per_line, 461 unsigned int *sse 462 ) 463 { 464 int xsum; 465 unsigned int xxsum; 466 467 if (xoffset == 4 && yoffset == 0) 468 { 469 vp8_half_horiz_variance8x_h_sse2( 470 src_ptr, src_pixels_per_line, 471 dst_ptr, dst_pixels_per_line, 16, 472 &xsum, &xxsum); 473 } 474 else if (xoffset == 0 && yoffset == 4) 475 { 476 vp8_half_vert_variance8x_h_sse2( 477 src_ptr, src_pixels_per_line, 478 dst_ptr, dst_pixels_per_line, 16, 479 &xsum, &xxsum); 480 } 481 else if (xoffset == 4 && yoffset == 4) 482 { 483 vp8_half_horiz_vert_variance8x_h_sse2( 484 src_ptr, src_pixels_per_line, 485 dst_ptr, dst_pixels_per_line, 16, 486 &xsum, &xxsum); 487 } 488 else 489 { 490 vp8_filter_block2d_bil_var_sse2( 491 src_ptr, src_pixels_per_line, 492 dst_ptr, dst_pixels_per_line, 16, 493 xoffset, yoffset, 494 &xsum, &xxsum); 495 } 496 497 *sse = xxsum; 498 return (xxsum - (((unsigned int)xsum * xsum) >> 7)); 499 } 500 501 502 unsigned int vp8_variance_halfpixvar16x16_h_wmt( 503 const unsigned char *src_ptr, 504 int src_pixels_per_line, 505 const unsigned char *dst_ptr, 506 int dst_pixels_per_line, 507 unsigned int *sse) 508 { 509 int xsum0; 510 unsigned int xxsum0; 511 512 vp8_half_horiz_variance16x_h_sse2( 513 src_ptr, src_pixels_per_line, 514 dst_ptr, dst_pixels_per_line, 16, 515 &xsum0, &xxsum0); 516 517 *sse = xxsum0; 518 return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); 519 } 520 521 522 unsigned int vp8_variance_halfpixvar16x16_v_wmt( 523 const unsigned char *src_ptr, 524 int src_pixels_per_line, 525 const unsigned char *dst_ptr, 526 int dst_pixels_per_line, 527 unsigned int *sse) 528 { 529 int xsum0; 530 unsigned int xxsum0; 531 vp8_half_vert_variance16x_h_sse2( 532 src_ptr, src_pixels_per_line, 533 dst_ptr, dst_pixels_per_line, 16, 534 &xsum0, &xxsum0); 535 536 *sse = xxsum0; 537 return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); 538 } 539 540 541 unsigned int vp8_variance_halfpixvar16x16_hv_wmt( 542 const unsigned char *src_ptr, 543 int src_pixels_per_line, 544 const unsigned char *dst_ptr, 545 int dst_pixels_per_line, 546 unsigned int *sse) 547 { 548 int xsum0; 549 unsigned int xxsum0; 550 551 vp8_half_horiz_vert_variance16x_h_sse2( 552 src_ptr, src_pixels_per_line, 553 dst_ptr, dst_pixels_per_line, 16, 554 &xsum0, &xxsum0); 555 556 *sse = xxsum0; 557 return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); 558 } 559