1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 12 #include "variance.h" 13 #include "pragmas.h" 14 #include "vpx_ports/mem.h" 15 16 extern void filter_block1d_h6_mmx(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); 17 extern void filter_block1d_v6_mmx(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); 18 extern void filter_block1d8_h6_sse2(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); 19 extern void filter_block1d8_v6_sse2(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); 20 21 extern void vp8_filter_block2d_bil4x4_var_mmx 22 ( 23 unsigned char *ref_ptr, 24 int ref_pixels_per_line, 25 unsigned char *src_ptr, 26 int src_pixels_per_line, 27 const short *HFilter, 28 const short *VFilter, 29 int *sum, 30 unsigned int *sumsquared 31 ); 32 33 extern unsigned int vp8_get4x4var_mmx 34 ( 35 unsigned char *src_ptr, 36 int source_stride, 37 unsigned char *ref_ptr, 38 int recon_stride, 39 unsigned int *SSE, 40 int *Sum 41 ); 42 43 unsigned int vp8_get_mb_ss_sse2 44 ( 45 short *src_ptr 46 ); 47 unsigned int vp8_get16x16var_sse2 48 ( 49 unsigned char *src_ptr, 50 int source_stride, 51 unsigned char *ref_ptr, 52 int recon_stride, 53 unsigned int *SSE, 54 int *Sum 55 ); 56 unsigned int vp8_get16x16pred_error_sse2 57 ( 58 unsigned char *src_ptr, 59 int src_stride, 60 unsigned char *ref_ptr, 61 int ref_stride 62 ); 63 unsigned int vp8_get8x8var_sse2 64 ( 65 unsigned char *src_ptr, 66 int source_stride, 67 unsigned char *ref_ptr, 68 int recon_stride, 69 unsigned int *SSE, 70 int *Sum 71 ); 72 void vp8_filter_block2d_bil_var_sse2 73 ( 74 unsigned char *ref_ptr, 75 int ref_pixels_per_line, 76 unsigned char *src_ptr, 77 int src_pixels_per_line, 78 unsigned int Height, 79 const short *HFilter, 80 const short *VFilter, 81 int *sum, 82 unsigned int *sumsquared 83 ); 84 void vp8_half_horiz_vert_variance16x_h_sse2 85 ( 86 unsigned char *ref_ptr, 87 int ref_pixels_per_line, 88 unsigned char *src_ptr, 89 int src_pixels_per_line, 90 unsigned int Height, 91 int *sum, 92 unsigned int *sumsquared 93 ); 94 void vp8_half_horiz_variance16x_h_sse2 95 ( 96 unsigned char *ref_ptr, 97 int ref_pixels_per_line, 98 unsigned char *src_ptr, 99 int src_pixels_per_line, 100 unsigned int Height, 101 int *sum, 102 unsigned int *sumsquared 103 ); 104 void vp8_half_vert_variance16x_h_sse2 105 ( 106 unsigned char *ref_ptr, 107 int ref_pixels_per_line, 108 unsigned char *src_ptr, 109 int src_pixels_per_line, 110 unsigned int Height, 111 int *sum, 112 unsigned int *sumsquared 113 ); 114 115 DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]); 116 117 unsigned int vp8_variance4x4_wmt( 118 unsigned char *src_ptr, 119 int source_stride, 120 unsigned char *ref_ptr, 121 int recon_stride) 122 { 123 unsigned int var; 124 int avg; 125 126 vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; 127 return (var - ((avg * avg) >> 4)); 128 129 } 130 131 132 133 unsigned int vp8_variance8x8_wmt 134 ( 135 unsigned char *src_ptr, 136 int source_stride, 137 unsigned char *ref_ptr, 138 int recon_stride) 139 { 140 unsigned int var; 141 int avg; 142 143 vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; 144 145 return (var - ((avg * avg) >> 6)); 146 147 } 148 149 150 unsigned int vp8_variance16x16_wmt 151 ( 152 unsigned char *src_ptr, 153 int source_stride, 154 unsigned char *ref_ptr, 155 int recon_stride, 156 unsigned int *sse) 157 { 158 unsigned int sse0; 159 int sum0; 160 161 162 vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 163 *sse = sse0; 164 return (sse0 - ((sum0 * sum0) >> 8)); 165 } 166 unsigned int vp8_mse16x16_wmt( 167 unsigned char *src_ptr, 168 int source_stride, 169 unsigned char *ref_ptr, 170 int recon_stride, 171 unsigned int *sse) 172 { 173 174 unsigned int sse0; 175 int sum0; 176 vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 177 *sse = sse0; 178 return sse0; 179 180 } 181 182 183 unsigned int vp8_variance16x8_wmt 184 ( 185 unsigned char *src_ptr, 186 int source_stride, 187 unsigned char *ref_ptr, 188 int recon_stride, 189 unsigned int *sse) 190 { 191 unsigned int sse0, sse1, var; 192 int sum0, sum1, avg; 193 194 vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 195 vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); 196 197 var = sse0 + sse1; 198 avg = sum0 + sum1; 199 *sse = var; 200 return (var - ((avg * avg) >> 7)); 201 202 } 203 204 unsigned int vp8_variance8x16_wmt 205 ( 206 unsigned char *src_ptr, 207 int source_stride, 208 unsigned char *ref_ptr, 209 int recon_stride, 210 unsigned int *sse) 211 { 212 unsigned int sse0, sse1, var; 213 int sum0, sum1, avg; 214 215 vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 216 vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ; 217 218 var = sse0 + sse1; 219 avg = sum0 + sum1; 220 *sse = var; 221 return (var - ((avg * avg) >> 7)); 222 223 } 224 225 /////////////////////////////////////////////////////////////////////////// 226 // the mmx function that does the bilinear filtering and var calculation // 227 // int one pass // 228 /////////////////////////////////////////////////////////////////////////// 229 DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) = 230 { 231 { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 }, 232 { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 }, 233 { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 }, 234 { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 }, 235 { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, 236 { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 }, 237 { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 }, 238 { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 } 239 }; 240 unsigned int vp8_sub_pixel_variance4x4_wmt 241 ( 242 unsigned char *src_ptr, 243 int src_pixels_per_line, 244 int xoffset, 245 int yoffset, 246 unsigned char *dst_ptr, 247 int dst_pixels_per_line, 248 unsigned int *sse 249 ) 250 { 251 int xsum; 252 unsigned int xxsum; 253 vp8_filter_block2d_bil4x4_var_mmx( 254 src_ptr, src_pixels_per_line, 255 dst_ptr, dst_pixels_per_line, 256 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], 257 &xsum, &xxsum 258 ); 259 *sse = xxsum; 260 return (xxsum - ((xsum * xsum) >> 4)); 261 } 262 263 264 unsigned int vp8_sub_pixel_variance8x8_wmt 265 ( 266 unsigned char *src_ptr, 267 int src_pixels_per_line, 268 int xoffset, 269 int yoffset, 270 unsigned char *dst_ptr, 271 int dst_pixels_per_line, 272 unsigned int *sse 273 ) 274 { 275 276 int xsum; 277 unsigned int xxsum; 278 vp8_filter_block2d_bil_var_sse2( 279 src_ptr, src_pixels_per_line, 280 dst_ptr, dst_pixels_per_line, 8, 281 vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], 282 &xsum, &xxsum 283 ); 284 285 *sse = xxsum; 286 return (xxsum - ((xsum * xsum) >> 6)); 287 } 288 289 unsigned int vp8_sub_pixel_variance16x16_wmt 290 ( 291 unsigned char *src_ptr, 292 int src_pixels_per_line, 293 int xoffset, 294 int yoffset, 295 unsigned char *dst_ptr, 296 int dst_pixels_per_line, 297 unsigned int *sse 298 ) 299 { 300 int xsum0, xsum1; 301 unsigned int xxsum0, xxsum1; 302 303 304 // note we could avoid these if statements if the calling function 305 // just called the appropriate functions inside. 306 if (xoffset == 4 && yoffset == 0) 307 { 308 vp8_half_horiz_variance16x_h_sse2( 309 src_ptr, src_pixels_per_line, 310 dst_ptr, dst_pixels_per_line, 16, 311 &xsum0, &xxsum0); 312 313 vp8_half_horiz_variance16x_h_sse2( 314 src_ptr + 8, src_pixels_per_line, 315 dst_ptr + 8, dst_pixels_per_line, 16, 316 &xsum1, &xxsum1); 317 } 318 else if (xoffset == 0 && yoffset == 4) 319 { 320 vp8_half_vert_variance16x_h_sse2( 321 src_ptr, src_pixels_per_line, 322 dst_ptr, dst_pixels_per_line, 16, 323 &xsum0, &xxsum0); 324 325 vp8_half_vert_variance16x_h_sse2( 326 src_ptr + 8, src_pixels_per_line, 327 dst_ptr + 8, dst_pixels_per_line, 16, 328 &xsum1, &xxsum1); 329 } 330 else if (xoffset == 4 && yoffset == 4) 331 { 332 vp8_half_horiz_vert_variance16x_h_sse2( 333 src_ptr, src_pixels_per_line, 334 dst_ptr, dst_pixels_per_line, 16, 335 &xsum0, &xxsum0); 336 337 vp8_half_horiz_vert_variance16x_h_sse2( 338 src_ptr + 8, src_pixels_per_line, 339 dst_ptr + 8, dst_pixels_per_line, 16, 340 &xsum1, &xxsum1); 341 } 342 else 343 { 344 vp8_filter_block2d_bil_var_sse2( 345 src_ptr, src_pixels_per_line, 346 dst_ptr, dst_pixels_per_line, 16, 347 vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], 348 &xsum0, &xxsum0 349 ); 350 351 352 vp8_filter_block2d_bil_var_sse2( 353 src_ptr + 8, src_pixels_per_line, 354 dst_ptr + 8, dst_pixels_per_line, 16, 355 vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], 356 &xsum1, &xxsum1 357 ); 358 } 359 360 xsum0 += xsum1; 361 xxsum0 += xxsum1; 362 *sse = xxsum0; 363 return (xxsum0 - ((xsum0 * xsum0) >> 8)); 364 } 365 366 unsigned int vp8_sub_pixel_mse16x16_wmt( 367 unsigned char *src_ptr, 368 int src_pixels_per_line, 369 int xoffset, 370 int yoffset, 371 unsigned char *dst_ptr, 372 int dst_pixels_per_line, 373 unsigned int *sse 374 ) 375 { 376 vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); 377 return *sse; 378 } 379 380 unsigned int vp8_sub_pixel_variance16x8_wmt 381 ( 382 unsigned char *src_ptr, 383 int src_pixels_per_line, 384 int xoffset, 385 int yoffset, 386 unsigned char *dst_ptr, 387 int dst_pixels_per_line, 388 unsigned int *sse 389 390 ) 391 { 392 int xsum0, xsum1; 393 unsigned int xxsum0, xxsum1; 394 395 396 vp8_filter_block2d_bil_var_sse2( 397 src_ptr, src_pixels_per_line, 398 dst_ptr, dst_pixels_per_line, 8, 399 vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], 400 &xsum0, &xxsum0 401 ); 402 403 404 vp8_filter_block2d_bil_var_sse2( 405 src_ptr + 8, src_pixels_per_line, 406 dst_ptr + 8, dst_pixels_per_line, 8, 407 vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], 408 &xsum1, &xxsum1 409 ); 410 411 xsum0 += xsum1; 412 xxsum0 += xxsum1; 413 414 *sse = xxsum0; 415 return (xxsum0 - ((xsum0 * xsum0) >> 7)); 416 } 417 418 unsigned int vp8_sub_pixel_variance8x16_wmt 419 ( 420 unsigned char *src_ptr, 421 int src_pixels_per_line, 422 int xoffset, 423 int yoffset, 424 unsigned char *dst_ptr, 425 int dst_pixels_per_line, 426 unsigned int *sse 427 ) 428 { 429 int xsum; 430 unsigned int xxsum; 431 vp8_filter_block2d_bil_var_sse2( 432 src_ptr, src_pixels_per_line, 433 dst_ptr, dst_pixels_per_line, 16, 434 vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], 435 &xsum, &xxsum 436 ); 437 438 *sse = xxsum; 439 return (xxsum - ((xsum * xsum) >> 7)); 440 } 441 442 unsigned int vp8_i_variance16x16_wmt( 443 unsigned char *src_ptr, 444 int source_stride, 445 unsigned char *ref_ptr, 446 int recon_stride, 447 unsigned int *sse) 448 { 449 unsigned int sse0, sse1, sse2, sse3, var; 450 int sum0, sum1, sum2, sum3, avg; 451 452 453 vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 454 vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); 455 vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ; 456 vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3); 457 458 var = sse0 + sse1 + sse2 + sse3; 459 avg = sum0 + sum1 + sum2 + sum3; 460 461 *sse = var; 462 return (var - ((avg * avg) >> 8)); 463 464 } 465 466 unsigned int vp8_i_variance8x16_wmt( 467 unsigned char *src_ptr, 468 int source_stride, 469 unsigned char *ref_ptr, 470 int recon_stride, 471 unsigned int *sse) 472 { 473 unsigned int sse0, sse1, var; 474 int sum0, sum1, avg; 475 vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 476 vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ; 477 478 var = sse0 + sse1; 479 avg = sum0 + sum1; 480 481 *sse = var; 482 return (var - ((avg * avg) >> 7)); 483 484 } 485 486 487 unsigned int vp8_i_sub_pixel_variance16x16_wmt 488 ( 489 unsigned char *src_ptr, 490 int src_pixels_per_line, 491 int xoffset, 492 int yoffset, 493 unsigned char *dst_ptr, 494 int dst_pixels_per_line, 495 unsigned int *sse 496 ) 497 { 498 return vp8_sub_pixel_variance16x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse); 499 } 500 501 502 unsigned int vp8_i_sub_pixel_variance8x16_wmt 503 ( 504 unsigned char *src_ptr, 505 int src_pixels_per_line, 506 int xoffset, 507 int yoffset, 508 unsigned char *dst_ptr, 509 int dst_pixels_per_line, 510 unsigned int *sse 511 ) 512 { 513 514 return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse); 515 } 516