/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include "variance.h"
#include "pragmas.h"
#include "vpx_ports/mem.h"

extern void filter_block1d_h6_mmx
(
    unsigned char *src_ptr,
    unsigned short *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    short *vp7_filter
);
extern void filter_block1d_v6_mmx
(
    short *src_ptr,
    unsigned char *output_ptr,
    unsigned int pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    short *vp7_filter
);

extern unsigned int vp8_get_mb_ss_mmx(short *src_ptr);
extern unsigned int vp8_get8x8var_mmx
(
    unsigned char *src_ptr,
    int source_stride,
    unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum
);
extern unsigned int vp8_get4x4var_mmx
(
    unsigned char *src_ptr,
    int source_stride,
    unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum
);
extern unsigned int vp8_get4x4sse_cs_mmx
(
    unsigned char *src_ptr,
    int source_stride,
    unsigned char *ref_ptr,
    int recon_stride
);
extern void vp8_filter_block2d_bil4x4_var_mmx
(
    unsigned char *ref_ptr,
    int ref_pixels_per_line,
    unsigned char *src_ptr,
    int src_pixels_per_line,
    const short *HFilter,
    const short *VFilter,
    int *sum,
    unsigned int *sumsquared
);
extern void vp8_filter_block2d_bil_var_mmx
(
    unsigned char *ref_ptr,
    int ref_pixels_per_line,
    unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
    const short *HFilter,
    const short *VFilter,
    int *sum,
    unsigned int *sumsquared
);
extern unsigned int vp8_get16x16pred_error_mmx
(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride
);


void vp8_test_get_mb_ss(void)
{
    short zz[] =
    {
        -4, -4, -4, -4,  4,  4,  4,  4, -4, -4, -4, -4,  4,  4,  4,  4,
        -2, -2, -2, -2,  2,  2,  2,  2, -2, -2, -2, -2,  2,  2,  2,  2,
        -4, -4, -4, -4,  4,  4,  4,  4, -4, -4, -4, -4,  4,  4,  4,  4,
        -4, -4, -4, -4,  4,  4,  4,  4, -4, -4, -4, -4,  4,  4,  4,  4,
        -4, -4, -4, -4,  4,  4,  4,  4, -4, -4, -4, -4,  4,  4,  4,  4,
        -4, -4, -4, -4,  4,  4,  4,  4, -4, -4, -4, -4,  4,  4,  4,  4,
        -4, -4, -4, -4,  4,  4,  4,  4, -4, -4, -4, -4,  4,  4,  4,  4,
        -4, -4, -4, -4,  4,  4,  4,  4, -4, -4, -4, -4,  4,  4,  4,  4,
        -4, -4, -4, -4,  4,  4,  4,  4, -4, -4, -4, -4,  4,  4,  4,  4,
        -4, -4, -4, -4,  4,  4,  4,  4, -4, -4, -4, -4,  4,  4,  4,  4,
        -4, -4, -4, -4,  4,  4,  4,  4, -4, -4, -4, -4,  4,  4,  4,  4,
        -4, -4, -4, -4,  4,  4,  4,  4, -4, -4, -4, -4,  4,  4,  4,  4,
        -4, -4, -4, -4,  4,  4,  4,  4, -4, -4, -4, -4,  4,  4,  4,  4,
        -3, -3, -3, -3,  3,  3,  3,  3, -3, -3, -3, -3,  3,  3,  3,  3,
        -4, -4, -4, -4,  4,  4,  4,  4, -4, -4, -4, -4,  4,  4,  4,  4,
        -4, -4, -4, -4,  4,  4,  4,  4, -4, -4, -4, -4,  4,  4,  4,  4,
    };
    int s = 0, x = vp8_get_mb_ss_mmx(zz);
    {
        int y;

        for (y = 0; y < 256; y++)
            s += (zz[y] * zz[y]);
    }

    x += 0;
}
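
/* Note: the vp8_variance*_mmx wrappers below combine per-block MMX sums and
 * return SSE - Sum*Sum/N, where N is the pixel count of the block
 * (>> 4 for 4x4, >> 6 for 8x8, >> 7 for 16x8/8x16, >> 8 for 16x16).
 * For example, the 16x16 case tiles four vp8_get8x8var_mmx calls over the
 * macroblock and subtracts Sum*Sum >> 8, i.e. Sum*Sum / 256.
 */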

unsigned int vp8_get16x16var_mmx(
    unsigned char *src_ptr,
    int source_stride,
    unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    unsigned int *SUM
)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3, avg;


    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    avg = sum0 + sum1 + sum2 + sum3;

    *SSE = var;
    *SUM = avg;
    return (var - ((avg * avg) >> 8));
}


unsigned int vp8_variance4x4_mmx(
    unsigned char *src_ptr,
    int source_stride,
    unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int var;
    int avg;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
    *sse = var;
    return (var - ((avg * avg) >> 4));
}

unsigned int vp8_variance8x8_mmx(
    unsigned char *src_ptr,
    int source_stride,
    unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int var;
    int avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
    *sse = var;

    return (var - ((avg * avg) >> 6));
}

unsigned int vp8_mse16x16_mmx(
    unsigned char *src_ptr,
    int source_stride,
    unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3;


    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    *sse = var;
    return var;
}


unsigned int vp8_variance16x16_mmx(
    unsigned char *src_ptr,
    int source_stride,
    unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3, avg;


    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    avg = sum0 + sum1 + sum2 + sum3;
    *sse = var;
    return (var - ((avg * avg) >> 8));
}

unsigned int vp8_variance16x8_mmx(
    unsigned char *src_ptr,
    int source_stride,
    unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;
    return (var - ((avg * avg) >> 7));
}
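
/* The 16x8 function above and the 8x16 function below each combine two 8x8
 * partial results (128 pixels in total), hence the >> 7 in the mean-square
 * correction.
 */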

unsigned int vp8_variance8x16_mmx(
    unsigned char *src_ptr,
    int source_stride,
    unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;

    return (var - ((avg * avg) >> 7));
}


///////////////////////////////////////////////////////////////////////////
// the mmx function that does the bilinear filtering and var calculation //
// in one pass                                                           //
///////////////////////////////////////////////////////////////////////////
DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) =
{
    { 128, 128, 128, 128,   0,   0,   0,   0 },
    { 112, 112, 112, 112,  16,  16,  16,  16 },
    {  96,  96,  96,  96,  32,  32,  32,  32 },
    {  80,  80,  80,  80,  48,  48,  48,  48 },
    {  64,  64,  64,  64,  64,  64,  64,  64 },
    {  48,  48,  48,  48,  80,  80,  80,  80 },
    {  32,  32,  32,  32,  96,  96,  96,  96 },
    {  16,  16,  16,  16, 112, 112, 112, 112 }
};

unsigned int vp8_sub_pixel_variance4x4_mmx
(
    unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum;
    unsigned int xxsum;
    vp8_filter_block2d_bil4x4_var_mmx(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum, &xxsum
    );
    *sse = xxsum;
    return (xxsum - ((xsum * xsum) >> 4));
}


unsigned int vp8_sub_pixel_variance8x8_mmx
(
    unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum;
    unsigned int xxsum;
    vp8_filter_block2d_bil_var_mmx(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum, &xxsum
    );
    *sse = xxsum;
    return (xxsum - ((xsum * xsum) >> 6));
}

unsigned int vp8_sub_pixel_variance16x16_mmx
(
    unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;


    vp8_filter_block2d_bil_var_mmx(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum0, &xxsum0
    );


    vp8_filter_block2d_bil_var_mmx(
        src_ptr + 8, src_pixels_per_line,
        dst_ptr + 8, dst_pixels_per_line, 16,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum1, &xxsum1
    );

    xsum0 += xsum1;
    xxsum0 += xxsum1;

    *sse = xxsum0;
    return (xxsum0 - ((xsum0 * xsum0) >> 8));
}

unsigned int vp8_sub_pixel_mse16x16_mmx(
    unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
    return *sse;
}
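
/* Note: vp8_filter_block2d_bil_var_mmx filters one 8-pixel-wide column, so
 * the 16-pixel-wide block sizes below issue a second call offset by +8.
 * The xoffset/yoffset sub-pixel positions each select a row of
 * vp8_vp7_bilinear_filters_mmx; every tap pair sums to 128, blending the
 * two neighbouring pixels according to the 1/8-pel offset.
 */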

unsigned int vp8_sub_pixel_variance16x8_mmx
(
    unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;


    vp8_filter_block2d_bil_var_mmx(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum0, &xxsum0
    );


    vp8_filter_block2d_bil_var_mmx(
        src_ptr + 8, src_pixels_per_line,
        dst_ptr + 8, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum1, &xxsum1
    );

    xsum0 += xsum1;
    xxsum0 += xxsum1;

    *sse = xxsum0;
    return (xxsum0 - ((xsum0 * xsum0) >> 7));
}

unsigned int vp8_sub_pixel_variance8x16_mmx
(
    unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum;
    unsigned int xxsum;
    vp8_filter_block2d_bil_var_mmx(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum, &xxsum
    );
    *sse = xxsum;
    return (xxsum - ((xsum * xsum) >> 7));
}
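
/* The vp8_i_* variants below differ from the plain versions only in how they
 * reach the lower 8x8 blocks: they step by half the stride (stride >> 1)
 * rather than by 8 full rows, presumably to address buffers whose two
 * halves are stored contiguously.
 */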

unsigned int vp8_i_variance16x16_mmx(
    unsigned char *src_ptr,
    int source_stride,
    unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3, avg;


    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2);
    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    avg = sum0 + sum1 + sum2 + sum3;
    *sse = var;
    return (var - ((avg * avg) >> 8));
}

unsigned int vp8_i_variance8x16_mmx(
    unsigned char *src_ptr,
    int source_stride,
    unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;
    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;

    *sse = var;
    return (var - ((avg * avg) >> 7));
}

unsigned int vp8_i_sub_pixel_variance16x16_mmx
(
    unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;
    int f2soffset = (src_pixels_per_line >> 1);
    int f2doffset = (dst_pixels_per_line >> 1);


    vp8_filter_block2d_bil_var_mmx(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum0, &xxsum0
    );


    vp8_filter_block2d_bil_var_mmx(
        src_ptr + 8, src_pixels_per_line,
        dst_ptr + 8, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum1, &xxsum1
    );

    xsum0 += xsum1;
    xxsum0 += xxsum1;

    vp8_filter_block2d_bil_var_mmx(
        src_ptr + f2soffset, src_pixels_per_line,
        dst_ptr + f2doffset, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum1, &xxsum1
    );

    xsum0 += xsum1;
    xxsum0 += xxsum1;

    vp8_filter_block2d_bil_var_mmx(
        src_ptr + f2soffset + 8, src_pixels_per_line,
        dst_ptr + f2doffset + 8, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum1, &xxsum1
    );

    xsum0 += xsum1;
    xxsum0 += xxsum1;
    *sse = xxsum0;
    return (xxsum0 - ((xsum0 * xsum0) >> 8));
}


unsigned int vp8_i_sub_pixel_variance8x16_mmx
(
    unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;
    int f2soffset = (src_pixels_per_line >> 1);
    int f2doffset = (dst_pixels_per_line >> 1);


    vp8_filter_block2d_bil_var_mmx(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum0, &xxsum0
    );


    vp8_filter_block2d_bil_var_mmx(
        src_ptr + f2soffset, src_pixels_per_line,
        dst_ptr + f2doffset, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum1, &xxsum1
    );

    xsum0 += xsum1;
    xxsum0 += xxsum1;
    *sse = xxsum0;
    return (xxsum0 - ((xsum0 * xsum0) >> 7));
}