1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "./vpx_config.h" 12 13 #include "vp9/encoder/vp9_variance.h" 14 #include "vp9/common/vp9_pragmas.h" 15 #include "vpx_ports/mem.h" 16 17 extern unsigned int vp9_get4x4var_mmx 18 ( 19 const unsigned char *src_ptr, 20 int source_stride, 21 const unsigned char *ref_ptr, 22 int recon_stride, 23 unsigned int *SSE, 24 int *Sum 25 ); 26 27 unsigned int vp9_get16x16var_sse2 28 ( 29 const unsigned char *src_ptr, 30 int source_stride, 31 const unsigned char *ref_ptr, 32 int recon_stride, 33 unsigned int *SSE, 34 int *Sum 35 ); 36 unsigned int vp9_get8x8var_sse2 37 ( 38 const unsigned char *src_ptr, 39 int source_stride, 40 const unsigned char *ref_ptr, 41 int recon_stride, 42 unsigned int *SSE, 43 int *Sum 44 ); 45 void vp9_half_horiz_vert_variance8x_h_sse2 46 ( 47 const unsigned char *ref_ptr, 48 int ref_pixels_per_line, 49 const unsigned char *src_ptr, 50 int src_pixels_per_line, 51 unsigned int Height, 52 int *sum, 53 unsigned int *sumsquared 54 ); 55 void vp9_half_horiz_vert_variance16x_h_sse2 56 ( 57 const unsigned char *ref_ptr, 58 int ref_pixels_per_line, 59 const unsigned char *src_ptr, 60 int src_pixels_per_line, 61 unsigned int Height, 62 int *sum, 63 unsigned int *sumsquared 64 ); 65 void vp9_half_horiz_variance8x_h_sse2 66 ( 67 const unsigned char *ref_ptr, 68 int ref_pixels_per_line, 69 const unsigned char *src_ptr, 70 int src_pixels_per_line, 71 unsigned int Height, 72 int *sum, 73 unsigned int *sumsquared 74 ); 75 void vp9_half_horiz_variance16x_h_sse2 76 ( 77 const unsigned char *ref_ptr, 78 int ref_pixels_per_line, 79 const unsigned char *src_ptr, 80 int src_pixels_per_line, 81 unsigned int Height, 82 int *sum, 83 unsigned int *sumsquared 84 ); 85 void vp9_half_vert_variance8x_h_sse2 86 ( 87 const unsigned char *ref_ptr, 88 int ref_pixels_per_line, 89 const unsigned char *src_ptr, 90 int src_pixels_per_line, 91 unsigned int Height, 92 int *sum, 93 unsigned int *sumsquared 94 ); 95 void vp9_half_vert_variance16x_h_sse2 96 ( 97 const unsigned char *ref_ptr, 98 int ref_pixels_per_line, 99 const unsigned char *src_ptr, 100 int src_pixels_per_line, 101 unsigned int Height, 102 int *sum, 103 unsigned int *sumsquared 104 ); 105 106 typedef unsigned int (*get_var_sse2) ( 107 const unsigned char *src_ptr, 108 int source_stride, 109 const unsigned char *ref_ptr, 110 int recon_stride, 111 unsigned int *SSE, 112 int *Sum 113 ); 114 115 static void variance_sse2(const unsigned char *src_ptr, int source_stride, 116 const unsigned char *ref_ptr, int recon_stride, 117 int w, int h, unsigned int *sse, int *sum, 118 get_var_sse2 var_fn, int block_size) { 119 unsigned int sse0; 120 int sum0; 121 int i, j; 122 123 *sse = 0; 124 *sum = 0; 125 126 for (i = 0; i < h; i += block_size) { 127 for (j = 0; j < w; j += block_size) { 128 var_fn(src_ptr + source_stride * i + j, source_stride, 129 ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0); 130 *sse += sse0; 131 *sum += sum0; 132 } 133 } 134 } 135 136 unsigned int vp9_variance4x4_sse2( 137 const unsigned char *src_ptr, 138 int source_stride, 139 const unsigned char *ref_ptr, 140 int recon_stride, 141 unsigned int *sse) { 142 unsigned int var; 143 int avg; 144 145 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, 146 &var, &avg, vp9_get4x4var_mmx, 4); 147 *sse = var; 148 return (var - (((unsigned int)avg * avg) >> 4)); 149 } 150 151 unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, 152 int source_stride, 153 const uint8_t *ref_ptr, 154 int recon_stride, 155 unsigned int *sse) { 156 unsigned int var; 157 int avg; 158 159 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4, 160 &var, &avg, vp9_get4x4var_mmx, 4); 161 *sse = var; 162 return (var - (((unsigned int)avg * avg) >> 5)); 163 } 164 165 unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, 166 int source_stride, 167 const uint8_t *ref_ptr, 168 int recon_stride, 169 unsigned int *sse) { 170 unsigned int var; 171 int avg; 172 173 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8, 174 &var, &avg, vp9_get4x4var_mmx, 4); 175 *sse = var; 176 return (var - (((unsigned int)avg * avg) >> 5)); 177 } 178 179 unsigned int vp9_variance8x8_sse2 180 ( 181 const unsigned char *src_ptr, 182 int source_stride, 183 const unsigned char *ref_ptr, 184 int recon_stride, 185 unsigned int *sse) { 186 unsigned int var; 187 int avg; 188 189 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, 190 &var, &avg, vp9_get8x8var_sse2, 8); 191 *sse = var; 192 return (var - (((unsigned int)avg * avg) >> 6)); 193 } 194 195 unsigned int vp9_variance16x8_sse2 196 ( 197 const unsigned char *src_ptr, 198 int source_stride, 199 const unsigned char *ref_ptr, 200 int recon_stride, 201 unsigned int *sse) { 202 unsigned int var; 203 int avg; 204 205 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, 206 &var, &avg, vp9_get8x8var_sse2, 8); 207 *sse = var; 208 return (var - (((unsigned int)avg * avg) >> 7)); 209 } 210 211 unsigned int vp9_variance8x16_sse2 212 ( 213 const unsigned char *src_ptr, 214 int source_stride, 215 const unsigned char *ref_ptr, 216 int recon_stride, 217 unsigned int *sse) { 218 unsigned int var; 219 int avg; 220 221 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, 222 &var, &avg, vp9_get8x8var_sse2, 8); 223 *sse = var; 224 return (var - (((unsigned int)avg * avg) >> 7)); 225 } 226 227 unsigned int vp9_variance16x16_sse2 228 ( 229 const unsigned char *src_ptr, 230 int source_stride, 231 const unsigned char *ref_ptr, 232 int recon_stride, 233 unsigned int *sse) { 234 unsigned int var; 235 int avg; 236 237 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, 238 &var, &avg, vp9_get16x16var_sse2, 16); 239 *sse = var; 240 return (var - (((unsigned int)avg * avg) >> 8)); 241 } 242 243 unsigned int vp9_mse16x16_sse2( 244 const unsigned char *src_ptr, 245 int source_stride, 246 const unsigned char *ref_ptr, 247 int recon_stride, 248 unsigned int *sse) { 249 unsigned int sse0; 250 int sum0; 251 vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, 252 &sum0); 253 *sse = sse0; 254 return sse0; 255 } 256 257 unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, 258 int source_stride, 259 const uint8_t *ref_ptr, 260 int recon_stride, 261 unsigned int *sse) { 262 unsigned int var; 263 int avg; 264 265 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, 266 &var, &avg, vp9_get16x16var_sse2, 16); 267 *sse = var; 268 return (var - (((int64_t)avg * avg) >> 10)); 269 } 270 271 unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, 272 int source_stride, 273 const uint8_t *ref_ptr, 274 int recon_stride, 275 unsigned int *sse) { 276 unsigned int var; 277 int avg; 278 279 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16, 280 &var, &avg, vp9_get16x16var_sse2, 16); 281 *sse = var; 282 return (var - (((int64_t)avg * avg) >> 9)); 283 } 284 285 unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, 286 int source_stride, 287 const uint8_t *ref_ptr, 288 int recon_stride, 289 unsigned int *sse) { 290 unsigned int var; 291 int avg; 292 293 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32, 294 &var, &avg, vp9_get16x16var_sse2, 16); 295 *sse = var; 296 return (var - (((int64_t)avg * avg) >> 9)); 297 } 298 299 unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, 300 int source_stride, 301 const uint8_t *ref_ptr, 302 int recon_stride, 303 unsigned int *sse) { 304 unsigned int var; 305 int avg; 306 307 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64, 308 &var, &avg, vp9_get16x16var_sse2, 16); 309 *sse = var; 310 return (var - (((int64_t)avg * avg) >> 12)); 311 } 312 313 unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, 314 int source_stride, 315 const uint8_t *ref_ptr, 316 int recon_stride, 317 unsigned int *sse) { 318 unsigned int var; 319 int avg; 320 321 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32, 322 &var, &avg, vp9_get16x16var_sse2, 16); 323 *sse = var; 324 return (var - (((int64_t)avg * avg) >> 11)); 325 } 326 327 unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, 328 int source_stride, 329 const uint8_t *ref_ptr, 330 int recon_stride, 331 unsigned int *sse) { 332 unsigned int var; 333 int avg; 334 335 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64, 336 &var, &avg, vp9_get16x16var_sse2, 16); 337 *sse = var; 338 return (var - (((int64_t)avg * avg) >> 11)); 339 } 340 341 #define DECL(w, opt) \ 342 int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \ 343 ptrdiff_t src_stride, \ 344 int x_offset, int y_offset, \ 345 const uint8_t *dst, \ 346 ptrdiff_t dst_stride, \ 347 int height, unsigned int *sse) 348 #define DECLS(opt1, opt2) \ 349 DECL(4, opt2); \ 350 DECL(8, opt1); \ 351 DECL(16, opt1) 352 353 DECLS(sse2, sse); 354 DECLS(ssse3, ssse3); 355 #undef DECLS 356 #undef DECL 357 358 #define FN(w, h, wf, wlog2, hlog2, opt, cast) \ 359 unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \ 360 int src_stride, \ 361 int x_offset, \ 362 int y_offset, \ 363 const uint8_t *dst, \ 364 int dst_stride, \ 365 unsigned int *sse_ptr) { \ 366 unsigned int sse; \ 367 int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ 368 y_offset, dst, dst_stride, \ 369 h, &sse); \ 370 if (w > wf) { \ 371 unsigned int sse2; \ 372 int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \ 373 x_offset, y_offset, \ 374 dst + 16, dst_stride, \ 375 h, &sse2); \ 376 se += se2; \ 377 sse += sse2; \ 378 if (w > wf * 2) { \ 379 se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ 380 x_offset, y_offset, \ 381 dst + 32, dst_stride, \ 382 h, &sse2); \ 383 se += se2; \ 384 sse += sse2; \ 385 se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ 386 x_offset, y_offset, \ 387 dst + 48, dst_stride, \ 388 h, &sse2); \ 389 se += se2; \ 390 sse += sse2; \ 391 } \ 392 } \ 393 *sse_ptr = sse; \ 394 return sse - ((cast se * se) >> (wlog2 + hlog2)); \ 395 } 396 397 #define FNS(opt1, opt2) \ 398 FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ 399 FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ 400 FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ 401 FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ 402 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ 403 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ 404 FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \ 405 FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \ 406 FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \ 407 FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \ 408 FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \ 409 FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \ 410 FN(4, 4, 4, 2, 2, opt2, (unsigned int)) 411 412 FNS(sse2, sse); 413 FNS(ssse3, ssse3); 414 415 #undef FNS 416 #undef FN 417 418 #define DECL(w, opt) \ 419 int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \ 420 ptrdiff_t src_stride, \ 421 int x_offset, int y_offset, \ 422 const uint8_t *dst, \ 423 ptrdiff_t dst_stride, \ 424 const uint8_t *sec, \ 425 ptrdiff_t sec_stride, \ 426 int height, unsigned int *sse) 427 #define DECLS(opt1, opt2) \ 428 DECL(4, opt2); \ 429 DECL(8, opt1); \ 430 DECL(16, opt1) 431 432 DECLS(sse2, sse); 433 DECLS(ssse3, ssse3); 434 #undef DECL 435 #undef DECLS 436 437 #define FN(w, h, wf, wlog2, hlog2, opt, cast) \ 438 unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \ 439 int src_stride, \ 440 int x_offset, \ 441 int y_offset, \ 442 const uint8_t *dst, \ 443 int dst_stride, \ 444 unsigned int *sseptr, \ 445 const uint8_t *sec) { \ 446 unsigned int sse; \ 447 int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \ 448 y_offset, dst, dst_stride, \ 449 sec, w, h, &sse); \ 450 if (w > wf) { \ 451 unsigned int sse2; \ 452 int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \ 453 x_offset, y_offset, \ 454 dst + 16, dst_stride, \ 455 sec + 16, w, h, &sse2); \ 456 se += se2; \ 457 sse += sse2; \ 458 if (w > wf * 2) { \ 459 se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \ 460 x_offset, y_offset, \ 461 dst + 32, dst_stride, \ 462 sec + 32, w, h, &sse2); \ 463 se += se2; \ 464 sse += sse2; \ 465 se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \ 466 x_offset, y_offset, \ 467 dst + 48, dst_stride, \ 468 sec + 48, w, h, &sse2); \ 469 se += se2; \ 470 sse += sse2; \ 471 } \ 472 } \ 473 *sseptr = sse; \ 474 return sse - ((cast se * se) >> (wlog2 + hlog2)); \ 475 } 476 477 #define FNS(opt1, opt2) \ 478 FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ 479 FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ 480 FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ 481 FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ 482 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ 483 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ 484 FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \ 485 FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \ 486 FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \ 487 FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \ 488 FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \ 489 FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \ 490 FN(4, 4, 4, 2, 2, opt2, (unsigned int)) 491 492 FNS(sse2, sse); 493 FNS(ssse3, ssse3); 494 495 #undef FNS 496 #undef FN 497 498 unsigned int vp9_variance_halfpixvar16x16_h_sse2( 499 const unsigned char *src_ptr, 500 int src_pixels_per_line, 501 const unsigned char *dst_ptr, 502 int dst_pixels_per_line, 503 unsigned int *sse) { 504 int xsum0; 505 unsigned int xxsum0; 506 507 vp9_half_horiz_variance16x_h_sse2( 508 src_ptr, src_pixels_per_line, 509 dst_ptr, dst_pixels_per_line, 16, 510 &xsum0, &xxsum0); 511 512 *sse = xxsum0; 513 return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); 514 } 515 516 517 unsigned int vp9_variance_halfpixvar16x16_v_sse2( 518 const unsigned char *src_ptr, 519 int src_pixels_per_line, 520 const unsigned char *dst_ptr, 521 int dst_pixels_per_line, 522 unsigned int *sse) { 523 int xsum0; 524 unsigned int xxsum0; 525 vp9_half_vert_variance16x_h_sse2( 526 src_ptr, src_pixels_per_line, 527 dst_ptr, dst_pixels_per_line, 16, 528 &xsum0, &xxsum0); 529 530 *sse = xxsum0; 531 return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); 532 } 533 534 535 unsigned int vp9_variance_halfpixvar16x16_hv_sse2( 536 const unsigned char *src_ptr, 537 int src_pixels_per_line, 538 const unsigned char *dst_ptr, 539 int dst_pixels_per_line, 540 unsigned int *sse) { 541 int xsum0; 542 unsigned int xxsum0; 543 544 vp9_half_horiz_vert_variance16x_h_sse2( 545 src_ptr, src_pixels_per_line, 546 dst_ptr, dst_pixels_per_line, 16, 547 &xsum0, &xxsum0); 548 549 *sse = xxsum0; 550 return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); 551 } 552