1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "vpx_config.h" 12 13 #include "vp9/encoder/vp9_variance.h" 14 #include "vp9/common/vp9_pragmas.h" 15 #include "vpx_ports/mem.h" 16 17 extern unsigned int vp9_get4x4var_mmx 18 ( 19 const unsigned char *src_ptr, 20 int source_stride, 21 const unsigned char *ref_ptr, 22 int recon_stride, 23 unsigned int *SSE, 24 int *Sum 25 ); 26 27 unsigned int vp9_get_mb_ss_sse2 28 ( 29 const short *src_ptr 30 ); 31 unsigned int vp9_get16x16var_sse2 32 ( 33 const unsigned char *src_ptr, 34 int source_stride, 35 const unsigned char *ref_ptr, 36 int recon_stride, 37 unsigned int *SSE, 38 int *Sum 39 ); 40 unsigned int vp9_get8x8var_sse2 41 ( 42 const unsigned char *src_ptr, 43 int source_stride, 44 const unsigned char *ref_ptr, 45 int recon_stride, 46 unsigned int *SSE, 47 int *Sum 48 ); 49 void vp9_half_horiz_vert_variance8x_h_sse2 50 ( 51 const unsigned char *ref_ptr, 52 int ref_pixels_per_line, 53 const unsigned char *src_ptr, 54 int src_pixels_per_line, 55 unsigned int Height, 56 int *sum, 57 unsigned int *sumsquared 58 ); 59 void vp9_half_horiz_vert_variance16x_h_sse2 60 ( 61 const unsigned char *ref_ptr, 62 int ref_pixels_per_line, 63 const unsigned char *src_ptr, 64 int src_pixels_per_line, 65 unsigned int Height, 66 int *sum, 67 unsigned int *sumsquared 68 ); 69 void vp9_half_horiz_variance8x_h_sse2 70 ( 71 const unsigned char *ref_ptr, 72 int ref_pixels_per_line, 73 const unsigned char *src_ptr, 74 int src_pixels_per_line, 75 unsigned int Height, 76 int *sum, 77 unsigned int *sumsquared 78 ); 79 void vp9_half_horiz_variance16x_h_sse2 80 ( 81 const 
unsigned char *ref_ptr, 82 int ref_pixels_per_line, 83 const unsigned char *src_ptr, 84 int src_pixels_per_line, 85 unsigned int Height, 86 int *sum, 87 unsigned int *sumsquared 88 ); 89 void vp9_half_vert_variance8x_h_sse2 90 ( 91 const unsigned char *ref_ptr, 92 int ref_pixels_per_line, 93 const unsigned char *src_ptr, 94 int src_pixels_per_line, 95 unsigned int Height, 96 int *sum, 97 unsigned int *sumsquared 98 ); 99 void vp9_half_vert_variance16x_h_sse2 100 ( 101 const unsigned char *ref_ptr, 102 int ref_pixels_per_line, 103 const unsigned char *src_ptr, 104 int src_pixels_per_line, 105 unsigned int Height, 106 int *sum, 107 unsigned int *sumsquared 108 ); 109 110 typedef unsigned int (*get_var_sse2) ( 111 const unsigned char *src_ptr, 112 int source_stride, 113 const unsigned char *ref_ptr, 114 int recon_stride, 115 unsigned int *SSE, 116 int *Sum 117 ); 118 119 static void variance_sse2(const unsigned char *src_ptr, int source_stride, 120 const unsigned char *ref_ptr, int recon_stride, 121 int w, int h, unsigned int *sse, int *sum, 122 get_var_sse2 var_fn, int block_size) { 123 unsigned int sse0; 124 int sum0; 125 int i, j; 126 127 *sse = 0; 128 *sum = 0; 129 130 for (i = 0; i < h; i += block_size) { 131 for (j = 0; j < w; j += block_size) { 132 var_fn(src_ptr + source_stride * i + j, source_stride, 133 ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0); 134 *sse += sse0; 135 *sum += sum0; 136 } 137 } 138 } 139 140 unsigned int vp9_variance4x4_sse2( 141 const unsigned char *src_ptr, 142 int source_stride, 143 const unsigned char *ref_ptr, 144 int recon_stride, 145 unsigned int *sse) { 146 unsigned int var; 147 int avg; 148 149 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, 150 &var, &avg, vp9_get4x4var_mmx, 4); 151 *sse = var; 152 return (var - (((unsigned int)avg * avg) >> 4)); 153 } 154 155 unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, 156 int source_stride, 157 const uint8_t *ref_ptr, 158 int recon_stride, 159 
unsigned int *sse) { 160 unsigned int var; 161 int avg; 162 163 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4, 164 &var, &avg, vp9_get4x4var_mmx, 4); 165 *sse = var; 166 return (var - (((unsigned int)avg * avg) >> 5)); 167 } 168 169 unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, 170 int source_stride, 171 const uint8_t *ref_ptr, 172 int recon_stride, 173 unsigned int *sse) { 174 unsigned int var; 175 int avg; 176 177 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8, 178 &var, &avg, vp9_get4x4var_mmx, 4); 179 *sse = var; 180 return (var - (((unsigned int)avg * avg) >> 5)); 181 } 182 183 unsigned int vp9_variance8x8_sse2 184 ( 185 const unsigned char *src_ptr, 186 int source_stride, 187 const unsigned char *ref_ptr, 188 int recon_stride, 189 unsigned int *sse) { 190 unsigned int var; 191 int avg; 192 193 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, 194 &var, &avg, vp9_get8x8var_sse2, 8); 195 *sse = var; 196 return (var - (((unsigned int)avg * avg) >> 6)); 197 } 198 199 unsigned int vp9_variance16x8_sse2 200 ( 201 const unsigned char *src_ptr, 202 int source_stride, 203 const unsigned char *ref_ptr, 204 int recon_stride, 205 unsigned int *sse) { 206 unsigned int var; 207 int avg; 208 209 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, 210 &var, &avg, vp9_get8x8var_sse2, 8); 211 *sse = var; 212 return (var - (((unsigned int)avg * avg) >> 7)); 213 } 214 215 unsigned int vp9_variance8x16_sse2 216 ( 217 const unsigned char *src_ptr, 218 int source_stride, 219 const unsigned char *ref_ptr, 220 int recon_stride, 221 unsigned int *sse) { 222 unsigned int var; 223 int avg; 224 225 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, 226 &var, &avg, vp9_get8x8var_sse2, 8); 227 *sse = var; 228 return (var - (((unsigned int)avg * avg) >> 7)); 229 } 230 231 unsigned int vp9_variance16x16_sse2 232 ( 233 const unsigned char *src_ptr, 234 int source_stride, 235 const unsigned 
char *ref_ptr, 236 int recon_stride, 237 unsigned int *sse) { 238 unsigned int var; 239 int avg; 240 241 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, 242 &var, &avg, vp9_get16x16var_sse2, 16); 243 *sse = var; 244 return (var - (((unsigned int)avg * avg) >> 8)); 245 } 246 247 unsigned int vp9_mse16x16_sse2( 248 const unsigned char *src_ptr, 249 int source_stride, 250 const unsigned char *ref_ptr, 251 int recon_stride, 252 unsigned int *sse) { 253 254 unsigned int sse0; 255 int sum0; 256 vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, 257 &sum0); 258 *sse = sse0; 259 return sse0; 260 } 261 262 unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, 263 int source_stride, 264 const uint8_t *ref_ptr, 265 int recon_stride, 266 unsigned int *sse) { 267 unsigned int var; 268 int avg; 269 270 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, 271 &var, &avg, vp9_get16x16var_sse2, 16); 272 *sse = var; 273 return (var - (((int64_t)avg * avg) >> 10)); 274 } 275 276 unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, 277 int source_stride, 278 const uint8_t *ref_ptr, 279 int recon_stride, 280 unsigned int *sse) { 281 unsigned int var; 282 int avg; 283 284 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16, 285 &var, &avg, vp9_get16x16var_sse2, 16); 286 *sse = var; 287 return (var - (((int64_t)avg * avg) >> 9)); 288 } 289 290 unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, 291 int source_stride, 292 const uint8_t *ref_ptr, 293 int recon_stride, 294 unsigned int *sse) { 295 unsigned int var; 296 int avg; 297 298 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32, 299 &var, &avg, vp9_get16x16var_sse2, 16); 300 *sse = var; 301 return (var - (((int64_t)avg * avg) >> 9)); 302 } 303 304 unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, 305 int source_stride, 306 const uint8_t *ref_ptr, 307 int recon_stride, 308 unsigned int *sse) { 309 unsigned int 
var; 310 int avg; 311 312 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64, 313 &var, &avg, vp9_get16x16var_sse2, 16); 314 *sse = var; 315 return (var - (((int64_t)avg * avg) >> 12)); 316 } 317 318 unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, 319 int source_stride, 320 const uint8_t *ref_ptr, 321 int recon_stride, 322 unsigned int *sse) { 323 unsigned int var; 324 int avg; 325 326 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32, 327 &var, &avg, vp9_get16x16var_sse2, 16); 328 *sse = var; 329 return (var - (((int64_t)avg * avg) >> 11)); 330 } 331 332 unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, 333 int source_stride, 334 const uint8_t *ref_ptr, 335 int recon_stride, 336 unsigned int *sse) { 337 unsigned int var; 338 int avg; 339 340 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64, 341 &var, &avg, vp9_get16x16var_sse2, 16); 342 *sse = var; 343 return (var - (((int64_t)avg * avg) >> 11)); 344 } 345 346 #define DECL(w, opt) \ 347 int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \ 348 ptrdiff_t src_stride, \ 349 int x_offset, int y_offset, \ 350 const uint8_t *dst, \ 351 ptrdiff_t dst_stride, \ 352 int height, unsigned int *sse) 353 #define DECLS(opt1, opt2) \ 354 DECL(4, opt2); \ 355 DECL(8, opt1); \ 356 DECL(16, opt1) 357 358 DECLS(sse2, sse); 359 DECLS(ssse3, ssse3); 360 #undef DECLS 361 #undef DECL 362 363 #define FN(w, h, wf, wlog2, hlog2, opt, cast) \ 364 unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \ 365 int src_stride, \ 366 int x_offset, \ 367 int y_offset, \ 368 const uint8_t *dst, \ 369 int dst_stride, \ 370 unsigned int *sse_ptr) { \ 371 unsigned int sse; \ 372 int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ 373 y_offset, dst, dst_stride, \ 374 h, &sse); \ 375 if (w > wf) { \ 376 unsigned int sse2; \ 377 int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \ 378 x_offset, y_offset, \ 379 dst 
+ 16, dst_stride, \ 380 h, &sse2); \ 381 se += se2; \ 382 sse += sse2; \ 383 if (w > wf * 2) { \ 384 se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ 385 x_offset, y_offset, \ 386 dst + 32, dst_stride, \ 387 h, &sse2); \ 388 se += se2; \ 389 sse += sse2; \ 390 se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ 391 x_offset, y_offset, \ 392 dst + 48, dst_stride, \ 393 h, &sse2); \ 394 se += se2; \ 395 sse += sse2; \ 396 } \ 397 } \ 398 *sse_ptr = sse; \ 399 return sse - ((cast se * se) >> (wlog2 + hlog2)); \ 400 } 401 402 #define FNS(opt1, opt2) \ 403 FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ 404 FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ 405 FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ 406 FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ 407 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ 408 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ 409 FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \ 410 FN(16, 8, 16, 4, 3, opt1,); \ 411 FN(8, 16, 8, 3, 4, opt1,); \ 412 FN(8, 8, 8, 3, 3, opt1,); \ 413 FN(8, 4, 8, 3, 2, opt1,); \ 414 FN(4, 8, 4, 2, 3, opt2,); \ 415 FN(4, 4, 4, 2, 2, opt2,) 416 417 FNS(sse2, sse); 418 FNS(ssse3, ssse3); 419 420 #undef FNS 421 #undef FN 422 423 #define DECL(w, opt) \ 424 int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \ 425 ptrdiff_t src_stride, \ 426 int x_offset, int y_offset, \ 427 const uint8_t *dst, \ 428 ptrdiff_t dst_stride, \ 429 const uint8_t *sec, \ 430 ptrdiff_t sec_stride, \ 431 int height, unsigned int *sse) 432 #define DECLS(opt1, opt2) \ 433 DECL(4, opt2); \ 434 DECL(8, opt1); \ 435 DECL(16, opt1) 436 437 DECLS(sse2, sse); 438 DECLS(ssse3, ssse3); 439 #undef DECL 440 #undef DECLS 441 442 #define FN(w, h, wf, wlog2, hlog2, opt, cast) \ 443 unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \ 444 int src_stride, \ 445 int x_offset, \ 446 int y_offset, \ 447 const uint8_t *dst, \ 448 int dst_stride, \ 449 unsigned int *sseptr, \ 450 const uint8_t *sec) { \ 451 unsigned int sse; \ 452 int 
se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \ 453 y_offset, dst, dst_stride, \ 454 sec, w, h, &sse); \ 455 if (w > wf) { \ 456 unsigned int sse2; \ 457 int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \ 458 x_offset, y_offset, \ 459 dst + 16, dst_stride, \ 460 sec + 16, w, h, &sse2); \ 461 se += se2; \ 462 sse += sse2; \ 463 if (w > wf * 2) { \ 464 se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \ 465 x_offset, y_offset, \ 466 dst + 32, dst_stride, \ 467 sec + 32, w, h, &sse2); \ 468 se += se2; \ 469 sse += sse2; \ 470 se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \ 471 x_offset, y_offset, \ 472 dst + 48, dst_stride, \ 473 sec + 48, w, h, &sse2); \ 474 se += se2; \ 475 sse += sse2; \ 476 } \ 477 } \ 478 *sseptr = sse; \ 479 return sse - ((cast se * se) >> (wlog2 + hlog2)); \ 480 } 481 482 #define FNS(opt1, opt2) \ 483 FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ 484 FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ 485 FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ 486 FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ 487 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ 488 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ 489 FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \ 490 FN(16, 8, 16, 4, 3, opt1,); \ 491 FN(8, 16, 8, 3, 4, opt1,); \ 492 FN(8, 8, 8, 3, 3, opt1,); \ 493 FN(8, 4, 8, 3, 2, opt1,); \ 494 FN(4, 8, 4, 2, 3, opt2,); \ 495 FN(4, 4, 4, 2, 2, opt2,) 496 497 FNS(sse2, sse); 498 FNS(ssse3, ssse3); 499 500 #undef FNS 501 #undef FN 502 503 unsigned int vp9_variance_halfpixvar16x16_h_sse2( 504 const unsigned char *src_ptr, 505 int src_pixels_per_line, 506 const unsigned char *dst_ptr, 507 int dst_pixels_per_line, 508 unsigned int *sse) { 509 int xsum0; 510 unsigned int xxsum0; 511 512 vp9_half_horiz_variance16x_h_sse2( 513 src_ptr, src_pixels_per_line, 514 dst_ptr, dst_pixels_per_line, 16, 515 &xsum0, &xxsum0); 516 517 *sse = xxsum0; 518 return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); 519 } 520 521 
/* 16x16 variance with the reference shifted half a pixel vertically. */
unsigned int vp9_variance_halfpixvar16x16_v_sse2(const unsigned char *src_ptr,
                                                 int src_pixels_per_line,
                                                 const unsigned char *dst_ptr,
                                                 int dst_pixels_per_line,
                                                 unsigned int *sse) {
  int diff_sum;
  unsigned int diff_sse;

  vp9_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                   dst_ptr, dst_pixels_per_line, 16,
                                   &diff_sum, &diff_sse);

  *sse = diff_sse;
  return diff_sse - (((unsigned int)diff_sum * diff_sum) >> 8);
}

/* 16x16 variance with the reference shifted half a pixel both ways. */
unsigned int vp9_variance_halfpixvar16x16_hv_sse2(const unsigned char *src_ptr,
                                                  int src_pixels_per_line,
                                                  const unsigned char *dst_ptr,
                                                  int dst_pixels_per_line,
                                                  unsigned int *sse) {
  int diff_sum;
  unsigned int diff_sse;

  vp9_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line, 16,
                                         &diff_sum, &diff_sse);

  *sse = diff_sse;
  return diff_sse - (((unsigned int)diff_sum * diff_sum) >> 8);
}