1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "./vpx_config.h" 12 13 #include "vp9/encoder/vp9_variance.h" 14 #include "vp9/common/vp9_pragmas.h" 15 #include "vpx_ports/mem.h" 16 17 extern unsigned int vp9_get4x4var_mmx 18 ( 19 const unsigned char *src_ptr, 20 int source_stride, 21 const unsigned char *ref_ptr, 22 int recon_stride, 23 unsigned int *SSE, 24 int *Sum 25 ); 26 27 unsigned int vp9_get_mb_ss_sse2 28 ( 29 const int16_t *src_ptr 30 ); 31 unsigned int vp9_get16x16var_sse2 32 ( 33 const unsigned char *src_ptr, 34 int source_stride, 35 const unsigned char *ref_ptr, 36 int recon_stride, 37 unsigned int *SSE, 38 int *Sum 39 ); 40 unsigned int vp9_get8x8var_sse2 41 ( 42 const unsigned char *src_ptr, 43 int source_stride, 44 const unsigned char *ref_ptr, 45 int recon_stride, 46 unsigned int *SSE, 47 int *Sum 48 ); 49 void vp9_half_horiz_vert_variance8x_h_sse2 50 ( 51 const unsigned char *ref_ptr, 52 int ref_pixels_per_line, 53 const unsigned char *src_ptr, 54 int src_pixels_per_line, 55 unsigned int Height, 56 int *sum, 57 unsigned int *sumsquared 58 ); 59 void vp9_half_horiz_vert_variance16x_h_sse2 60 ( 61 const unsigned char *ref_ptr, 62 int ref_pixels_per_line, 63 const unsigned char *src_ptr, 64 int src_pixels_per_line, 65 unsigned int Height, 66 int *sum, 67 unsigned int *sumsquared 68 ); 69 void vp9_half_horiz_variance8x_h_sse2 70 ( 71 const unsigned char *ref_ptr, 72 int ref_pixels_per_line, 73 const unsigned char *src_ptr, 74 int src_pixels_per_line, 75 unsigned int Height, 76 int *sum, 77 unsigned int *sumsquared 78 ); 79 void vp9_half_horiz_variance16x_h_sse2 80 ( 81 const unsigned char *ref_ptr, 82 int ref_pixels_per_line, 83 const unsigned char *src_ptr, 84 int src_pixels_per_line, 85 unsigned int Height, 86 int *sum, 87 unsigned int *sumsquared 88 ); 89 void vp9_half_vert_variance8x_h_sse2 90 ( 91 const unsigned char *ref_ptr, 92 int ref_pixels_per_line, 93 const unsigned char *src_ptr, 94 int src_pixels_per_line, 95 unsigned int Height, 96 int *sum, 97 unsigned int *sumsquared 98 ); 99 void vp9_half_vert_variance16x_h_sse2 100 ( 101 const unsigned char *ref_ptr, 102 int ref_pixels_per_line, 103 const unsigned char *src_ptr, 104 int src_pixels_per_line, 105 unsigned int Height, 106 int *sum, 107 unsigned int *sumsquared 108 ); 109 110 typedef unsigned int (*get_var_sse2) ( 111 const unsigned char *src_ptr, 112 int source_stride, 113 const unsigned char *ref_ptr, 114 int recon_stride, 115 unsigned int *SSE, 116 int *Sum 117 ); 118 119 static void variance_sse2(const unsigned char *src_ptr, int source_stride, 120 const unsigned char *ref_ptr, int recon_stride, 121 int w, int h, unsigned int *sse, int *sum, 122 get_var_sse2 var_fn, int block_size) { 123 unsigned int sse0; 124 int sum0; 125 int i, j; 126 127 *sse = 0; 128 *sum = 0; 129 130 for (i = 0; i < h; i += block_size) { 131 for (j = 0; j < w; j += block_size) { 132 var_fn(src_ptr + source_stride * i + j, source_stride, 133 ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0); 134 *sse += sse0; 135 *sum += sum0; 136 } 137 } 138 } 139 140 unsigned int vp9_variance4x4_sse2( 141 const unsigned char *src_ptr, 142 int source_stride, 143 const unsigned char *ref_ptr, 144 int recon_stride, 145 unsigned int *sse) { 146 unsigned int var; 147 int avg; 148 149 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, 150 &var, &avg, vp9_get4x4var_mmx, 4); 151 *sse = var; 152 return (var - (((unsigned int)avg * avg) >> 4)); 153 } 154 155 unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, 156 int source_stride, 157 const uint8_t *ref_ptr, 158 int recon_stride, 159 unsigned int *sse) { 160 unsigned int var; 161 int avg; 162 163 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4, 164 &var, &avg, vp9_get4x4var_mmx, 4); 165 *sse = var; 166 return (var - (((unsigned int)avg * avg) >> 5)); 167 } 168 169 unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, 170 int source_stride, 171 const uint8_t *ref_ptr, 172 int recon_stride, 173 unsigned int *sse) { 174 unsigned int var; 175 int avg; 176 177 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8, 178 &var, &avg, vp9_get4x4var_mmx, 4); 179 *sse = var; 180 return (var - (((unsigned int)avg * avg) >> 5)); 181 } 182 183 unsigned int vp9_variance8x8_sse2 184 ( 185 const unsigned char *src_ptr, 186 int source_stride, 187 const unsigned char *ref_ptr, 188 int recon_stride, 189 unsigned int *sse) { 190 unsigned int var; 191 int avg; 192 193 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, 194 &var, &avg, vp9_get8x8var_sse2, 8); 195 *sse = var; 196 return (var - (((unsigned int)avg * avg) >> 6)); 197 } 198 199 unsigned int vp9_variance16x8_sse2 200 ( 201 const unsigned char *src_ptr, 202 int source_stride, 203 const unsigned char *ref_ptr, 204 int recon_stride, 205 unsigned int *sse) { 206 unsigned int var; 207 int avg; 208 209 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, 210 &var, &avg, vp9_get8x8var_sse2, 8); 211 *sse = var; 212 return (var - (((unsigned int)avg * avg) >> 7)); 213 } 214 215 unsigned int vp9_variance8x16_sse2 216 ( 217 const unsigned char *src_ptr, 218 int source_stride, 219 const unsigned char *ref_ptr, 220 int recon_stride, 221 unsigned int *sse) { 222 unsigned int var; 223 int avg; 224 225 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, 226 &var, &avg, vp9_get8x8var_sse2, 8); 227 *sse = var; 228 return (var - (((unsigned int)avg * avg) >> 7)); 229 } 230 231 unsigned int vp9_variance16x16_sse2 232 ( 233 const unsigned char *src_ptr, 234 int source_stride, 235 const unsigned char *ref_ptr, 236 int recon_stride, 237 unsigned int *sse) { 238 unsigned int var; 239 int avg; 240 241 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, 242 &var, &avg, vp9_get16x16var_sse2, 16); 243 *sse = var; 244 return (var - (((unsigned int)avg * avg) >> 8)); 245 } 246 247 unsigned int vp9_mse16x16_sse2( 248 const unsigned char *src_ptr, 249 int source_stride, 250 const unsigned char *ref_ptr, 251 int recon_stride, 252 unsigned int *sse) { 253 unsigned int sse0; 254 int sum0; 255 vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, 256 &sum0); 257 *sse = sse0; 258 return sse0; 259 } 260 261 unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, 262 int source_stride, 263 const uint8_t *ref_ptr, 264 int recon_stride, 265 unsigned int *sse) { 266 unsigned int var; 267 int avg; 268 269 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, 270 &var, &avg, vp9_get16x16var_sse2, 16); 271 *sse = var; 272 return (var - (((int64_t)avg * avg) >> 10)); 273 } 274 275 unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, 276 int source_stride, 277 const uint8_t *ref_ptr, 278 int recon_stride, 279 unsigned int *sse) { 280 unsigned int var; 281 int avg; 282 283 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16, 284 &var, &avg, vp9_get16x16var_sse2, 16); 285 *sse = var; 286 return (var - (((int64_t)avg * avg) >> 9)); 287 } 288 289 unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, 290 int source_stride, 291 const uint8_t *ref_ptr, 292 int recon_stride, 293 unsigned int *sse) { 294 unsigned int var; 295 int avg; 296 297 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32, 298 &var, &avg, vp9_get16x16var_sse2, 16); 299 *sse = var; 300 return (var - (((int64_t)avg * avg) >> 9)); 301 } 302 303 unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, 304 int source_stride, 305 const uint8_t *ref_ptr, 306 int recon_stride, 307 unsigned int *sse) { 308 unsigned int var; 309 int avg; 310 311 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64, 312 &var, &avg, vp9_get16x16var_sse2, 16); 313 *sse = var; 314 return (var - (((int64_t)avg * avg) >> 12)); 315 } 316 317 unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, 318 int source_stride, 319 const uint8_t *ref_ptr, 320 int recon_stride, 321 unsigned int *sse) { 322 unsigned int var; 323 int avg; 324 325 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32, 326 &var, &avg, vp9_get16x16var_sse2, 16); 327 *sse = var; 328 return (var - (((int64_t)avg * avg) >> 11)); 329 } 330 331 unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, 332 int source_stride, 333 const uint8_t *ref_ptr, 334 int recon_stride, 335 unsigned int *sse) { 336 unsigned int var; 337 int avg; 338 339 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64, 340 &var, &avg, vp9_get16x16var_sse2, 16); 341 *sse = var; 342 return (var - (((int64_t)avg * avg) >> 11)); 343 } 344 345 #define DECL(w, opt) \ 346 int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \ 347 ptrdiff_t src_stride, \ 348 int x_offset, int y_offset, \ 349 const uint8_t *dst, \ 350 ptrdiff_t dst_stride, \ 351 int height, unsigned int *sse) 352 #define DECLS(opt1, opt2) \ 353 DECL(4, opt2); \ 354 DECL(8, opt1); \ 355 DECL(16, opt1) 356 357 DECLS(sse2, sse); 358 DECLS(ssse3, ssse3); 359 #undef DECLS 360 #undef DECL 361 362 #define FN(w, h, wf, wlog2, hlog2, opt, cast) \ 363 unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \ 364 int src_stride, \ 365 int x_offset, \ 366 int y_offset, \ 367 const uint8_t *dst, \ 368 int dst_stride, \ 369 unsigned int *sse_ptr) { \ 370 unsigned int sse; \ 371 int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ 372 y_offset, dst, dst_stride, \ 373 h, &sse); \ 374 if (w > wf) { \ 375 unsigned int sse2; \ 376 int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \ 377 x_offset, y_offset, \ 378 dst + 16, dst_stride, \ 379 h, &sse2); \ 380 se += se2; \ 381 sse += sse2; \ 382 if (w > wf * 2) { \ 383 se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ 384 x_offset, y_offset, \ 385 dst + 32, dst_stride, \ 386 h, &sse2); \ 387 se += se2; \ 388 sse += sse2; \ 389 se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ 390 x_offset, y_offset, \ 391 dst + 48, dst_stride, \ 392 h, &sse2); \ 393 se += se2; \ 394 sse += sse2; \ 395 } \ 396 } \ 397 *sse_ptr = sse; \ 398 return sse - ((cast se * se) >> (wlog2 + hlog2)); \ 399 } 400 401 #define FNS(opt1, opt2) \ 402 FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ 403 FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ 404 FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ 405 FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ 406 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ 407 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ 408 FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \ 409 FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \ 410 FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \ 411 FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \ 412 FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \ 413 FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \ 414 FN(4, 4, 4, 2, 2, opt2, (unsigned int)) 415 416 FNS(sse2, sse); 417 FNS(ssse3, ssse3); 418 419 #undef FNS 420 #undef FN 421 422 #define DECL(w, opt) \ 423 int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \ 424 ptrdiff_t src_stride, \ 425 int x_offset, int y_offset, \ 426 const uint8_t *dst, \ 427 ptrdiff_t dst_stride, \ 428 const uint8_t *sec, \ 429 ptrdiff_t sec_stride, \ 430 int height, unsigned int *sse) 431 #define DECLS(opt1, opt2) \ 432 DECL(4, opt2); \ 433 DECL(8, opt1); \ 434 DECL(16, opt1) 435 436 DECLS(sse2, sse); 437 DECLS(ssse3, ssse3); 438 #undef DECL 439 #undef DECLS 440 441 #define FN(w, h, wf, wlog2, hlog2, opt, cast) \ 442 unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \ 443 int src_stride, \ 444 int x_offset, \ 445 int y_offset, \ 446 const uint8_t *dst, \ 447 int dst_stride, \ 448 unsigned int *sseptr, \ 449 const uint8_t *sec) { \ 450 unsigned int sse; \ 451 int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \ 452 y_offset, dst, dst_stride, \ 453 sec, w, h, &sse); \ 454 if (w > wf) { \ 455 unsigned int sse2; \ 456 int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \ 457 x_offset, y_offset, \ 458 dst + 16, dst_stride, \ 459 sec + 16, w, h, &sse2); \ 460 se += se2; \ 461 sse += sse2; \ 462 if (w > wf * 2) { \ 463 se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \ 464 x_offset, y_offset, \ 465 dst + 32, dst_stride, \ 466 sec + 32, w, h, &sse2); \ 467 se += se2; \ 468 sse += sse2; \ 469 se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \ 470 x_offset, y_offset, \ 471 dst + 48, dst_stride, \ 472 sec + 48, w, h, &sse2); \ 473 se += se2; \ 474 sse += sse2; \ 475 } \ 476 } \ 477 *sseptr = sse; \ 478 return sse - ((cast se * se) >> (wlog2 + hlog2)); \ 479 } 480 481 #define FNS(opt1, opt2) \ 482 FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ 483 FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ 484 FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ 485 FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ 486 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ 487 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ 488 FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \ 489 FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \ 490 FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \ 491 FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \ 492 FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \ 493 FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \ 494 FN(4, 4, 4, 2, 2, opt2, (unsigned int)) 495 496 FNS(sse2, sse); 497 FNS(ssse3, ssse3); 498 499 #undef FNS 500 #undef FN 501 502 unsigned int vp9_variance_halfpixvar16x16_h_sse2( 503 const unsigned char *src_ptr, 504 int src_pixels_per_line, 505 const unsigned char *dst_ptr, 506 int dst_pixels_per_line, 507 unsigned int *sse) { 508 int xsum0; 509 unsigned int xxsum0; 510 511 vp9_half_horiz_variance16x_h_sse2( 512 src_ptr, src_pixels_per_line, 513 dst_ptr, dst_pixels_per_line, 16, 514 &xsum0, &xxsum0); 515 516 *sse = xxsum0; 517 return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); 518 } 519 520 521 unsigned int vp9_variance_halfpixvar16x16_v_sse2( 522 const unsigned char *src_ptr, 523 int src_pixels_per_line, 524 const unsigned char *dst_ptr, 525 int dst_pixels_per_line, 526 unsigned int *sse) { 527 int xsum0; 528 unsigned int xxsum0; 529 vp9_half_vert_variance16x_h_sse2( 530 src_ptr, src_pixels_per_line, 531 dst_ptr, dst_pixels_per_line, 16, 532 &xsum0, &xxsum0); 533 534 *sse = xxsum0; 535 return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); 536 } 537 538 539 unsigned int vp9_variance_halfpixvar16x16_hv_sse2( 540 const unsigned char *src_ptr, 541 int src_pixels_per_line, 542 const unsigned char *dst_ptr, 543 int dst_pixels_per_line, 544 unsigned int *sse) { 545 int xsum0; 546 unsigned int xxsum0; 547 548 vp9_half_horiz_vert_variance16x_h_sse2( 549 src_ptr, src_pixels_per_line, 550 dst_ptr, dst_pixels_per_line, 16, 551 &xsum0, &xxsum0); 552 553 *sse = xxsum0; 554 return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); 555 } 556