1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 12 #include "vp8/encoder/variance.h" 13 #include "vp8/common/pragmas.h" 14 #include "vpx_ports/mem.h" 15 16 extern void filter_block1d_h6_mmx 17 ( 18 const unsigned char *src_ptr, 19 unsigned short *output_ptr, 20 unsigned int src_pixels_per_line, 21 unsigned int pixel_step, 22 unsigned int output_height, 23 unsigned int output_width, 24 short *vp7_filter 25 ); 26 extern void filter_block1d_v6_mmx 27 ( 28 const short *src_ptr, 29 unsigned char *output_ptr, 30 unsigned int pixels_per_line, 31 unsigned int pixel_step, 32 unsigned int output_height, 33 unsigned int output_width, 34 short *vp7_filter 35 ); 36 37 extern unsigned int vp8_get_mb_ss_mmx(short *src_ptr); 38 extern unsigned int vp8_get8x8var_mmx 39 ( 40 const unsigned char *src_ptr, 41 int source_stride, 42 const unsigned char *ref_ptr, 43 int recon_stride, 44 unsigned int *SSE, 45 int *Sum 46 ); 47 extern unsigned int vp8_get4x4var_mmx 48 ( 49 const unsigned char *src_ptr, 50 int source_stride, 51 const unsigned char *ref_ptr, 52 int recon_stride, 53 unsigned int *SSE, 54 int *Sum 55 ); 56 extern void vp8_filter_block2d_bil4x4_var_mmx 57 ( 58 const unsigned char *ref_ptr, 59 int ref_pixels_per_line, 60 const unsigned char *src_ptr, 61 int src_pixels_per_line, 62 const short *HFilter, 63 const short *VFilter, 64 int *sum, 65 unsigned int *sumsquared 66 ); 67 extern void vp8_filter_block2d_bil_var_mmx 68 ( 69 const unsigned char *ref_ptr, 70 int ref_pixels_per_line, 71 const unsigned char *src_ptr, 72 int src_pixels_per_line, 73 unsigned int Height, 74 const short *HFilter, 75 const short *VFilter, 76 int *sum, 77 unsigned int *sumsquared 78 ); 79 extern unsigned int vp8_get16x16pred_error_mmx 80 ( 81 unsigned char *src_ptr, 82 int src_stride, 83 unsigned char *ref_ptr, 84 int ref_stride 85 ); 86 87 88 unsigned int vp8_get16x16var_mmx( 89 const unsigned char *src_ptr, 90 int source_stride, 91 const unsigned char *ref_ptr, 92 int recon_stride, 93 unsigned *SSE, 94 unsigned *SUM 95 ) 96 { 97 unsigned int sse0, sse1, sse2, sse3, var; 98 int sum0, sum1, sum2, sum3, avg; 99 100 101 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 102 vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); 103 vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; 104 vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); 105 106 var = sse0 + sse1 + sse2 + sse3; 107 avg = sum0 + sum1 + sum2 + sum3; 108 109 *SSE = var; 110 *SUM = avg; 111 return (var - ((avg * avg) >> 8)); 112 113 } 114 115 116 117 118 119 unsigned int vp8_variance4x4_mmx( 120 const unsigned char *src_ptr, 121 int source_stride, 122 const unsigned char *ref_ptr, 123 int recon_stride, 124 unsigned int *sse) 125 { 126 unsigned int var; 127 int avg; 128 129 vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; 130 *sse = var; 131 return (var - ((avg * avg) >> 4)); 132 133 } 134 135 unsigned int vp8_variance8x8_mmx( 136 const unsigned char *src_ptr, 137 int source_stride, 138 const unsigned char *ref_ptr, 139 int recon_stride, 140 unsigned int *sse) 141 { 142 unsigned int var; 143 int avg; 144 145 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; 146 *sse = var; 147 148 return (var - ((avg * avg) >> 6)); 149 150 } 151 152 unsigned int vp8_mse16x16_mmx( 153 const unsigned char *src_ptr, 154 int source_stride, 155 const unsigned char *ref_ptr, 156 int recon_stride, 157 unsigned int *sse) 158 { 159 unsigned int sse0, sse1, sse2, sse3, var; 160 int sum0, sum1, sum2, sum3; 161 162 163 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 164 vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); 165 vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; 166 vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); 167 168 var = sse0 + sse1 + sse2 + sse3; 169 *sse = var; 170 return var; 171 } 172 173 174 unsigned int vp8_variance16x16_mmx( 175 const unsigned char *src_ptr, 176 int source_stride, 177 const unsigned char *ref_ptr, 178 int recon_stride, 179 int *sse) 180 { 181 unsigned int sse0, sse1, sse2, sse3, var; 182 int sum0, sum1, sum2, sum3, avg; 183 184 185 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 186 vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); 187 vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; 188 vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); 189 190 var = sse0 + sse1 + sse2 + sse3; 191 avg = sum0 + sum1 + sum2 + sum3; 192 *sse = var; 193 return (var - ((avg * avg) >> 8)); 194 } 195 196 unsigned int vp8_variance16x8_mmx( 197 const unsigned char *src_ptr, 198 int source_stride, 199 const unsigned char *ref_ptr, 200 int recon_stride, 201 unsigned int *sse) 202 { 203 unsigned int sse0, sse1, var; 204 int sum0, sum1, avg; 205 206 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 207 vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); 208 209 var = sse0 + sse1; 210 avg = sum0 + sum1; 211 *sse = var; 212 return (var - ((avg * avg) >> 7)); 213 214 } 215 216 217 unsigned int vp8_variance8x16_mmx( 218 const unsigned char *src_ptr, 219 int source_stride, 220 const unsigned char *ref_ptr, 221 int recon_stride, 222 unsigned int *sse) 223 { 224 unsigned int sse0, sse1, var; 225 int sum0, sum1, avg; 226 227 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 228 vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ; 229 230 var = sse0 + sse1; 231 avg = sum0 + sum1; 232 *sse = var; 233 234 return (var - ((avg * avg) >> 7)); 235 236 } 237 238 239 240 241 /////////////////////////////////////////////////////////////////////////// 242 // the mmx function that does the bilinear filtering and var calculation // 243 // int one pass // 244 /////////////////////////////////////////////////////////////////////////// 245 DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) = 246 { 247 { 128, 128, 128, 128, 0, 0, 0, 0 }, 248 { 112, 112, 112, 112, 16, 16, 16, 16 }, 249 { 96, 96, 96, 96, 32, 32, 32, 32 }, 250 { 80, 80, 80, 80, 48, 48, 48, 48 }, 251 { 64, 64, 64, 64, 64, 64, 64, 64 }, 252 { 48, 48, 48, 48, 80, 80, 80, 80 }, 253 { 32, 32, 32, 32, 96, 96, 96, 96 }, 254 { 16, 16, 16, 16, 112, 112, 112, 112 } 255 }; 256 257 unsigned int vp8_sub_pixel_variance4x4_mmx 258 ( 259 const unsigned char *src_ptr, 260 int src_pixels_per_line, 261 int xoffset, 262 int yoffset, 263 const unsigned char *dst_ptr, 264 int dst_pixels_per_line, 265 unsigned int *sse) 266 267 { 268 int xsum; 269 unsigned int xxsum; 270 vp8_filter_block2d_bil4x4_var_mmx( 271 src_ptr, src_pixels_per_line, 272 dst_ptr, dst_pixels_per_line, 273 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], 274 &xsum, &xxsum 275 ); 276 *sse = xxsum; 277 return (xxsum - ((xsum * xsum) >> 4)); 278 } 279 280 281 unsigned int vp8_sub_pixel_variance8x8_mmx 282 ( 283 const unsigned char *src_ptr, 284 int src_pixels_per_line, 285 int xoffset, 286 int yoffset, 287 const unsigned char *dst_ptr, 288 int dst_pixels_per_line, 289 unsigned int *sse 290 ) 291 { 292 293 int xsum; 294 unsigned int xxsum; 295 vp8_filter_block2d_bil_var_mmx( 296 src_ptr, src_pixels_per_line, 297 dst_ptr, dst_pixels_per_line, 8, 298 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], 299 &xsum, &xxsum 300 ); 301 *sse = xxsum; 302 return (xxsum - ((xsum * xsum) >> 6)); 303 } 304 305 unsigned int vp8_sub_pixel_variance16x16_mmx 306 ( 307 const unsigned char *src_ptr, 308 int src_pixels_per_line, 309 int xoffset, 310 int yoffset, 311 const unsigned char *dst_ptr, 312 int dst_pixels_per_line, 313 unsigned int *sse 314 ) 315 { 316 317 int xsum0, xsum1; 318 unsigned int xxsum0, xxsum1; 319 320 321 vp8_filter_block2d_bil_var_mmx( 322 src_ptr, src_pixels_per_line, 323 dst_ptr, dst_pixels_per_line, 16, 324 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], 325 &xsum0, &xxsum0 326 ); 327 328 329 vp8_filter_block2d_bil_var_mmx( 330 src_ptr + 8, src_pixels_per_line, 331 dst_ptr + 8, dst_pixels_per_line, 16, 332 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], 333 &xsum1, &xxsum1 334 ); 335 336 xsum0 += xsum1; 337 xxsum0 += xxsum1; 338 339 *sse = xxsum0; 340 return (xxsum0 - ((xsum0 * xsum0) >> 8)); 341 342 343 } 344 345 unsigned int vp8_sub_pixel_mse16x16_mmx( 346 const unsigned char *src_ptr, 347 int src_pixels_per_line, 348 int xoffset, 349 int yoffset, 350 const unsigned char *dst_ptr, 351 int dst_pixels_per_line, 352 unsigned int *sse 353 ) 354 { 355 vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); 356 return *sse; 357 } 358 359 unsigned int vp8_sub_pixel_variance16x8_mmx 360 ( 361 const unsigned char *src_ptr, 362 int src_pixels_per_line, 363 int xoffset, 364 int yoffset, 365 const unsigned char *dst_ptr, 366 int dst_pixels_per_line, 367 unsigned int *sse 368 ) 369 { 370 int xsum0, xsum1; 371 unsigned int xxsum0, xxsum1; 372 373 374 vp8_filter_block2d_bil_var_mmx( 375 src_ptr, src_pixels_per_line, 376 dst_ptr, dst_pixels_per_line, 8, 377 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], 378 &xsum0, &xxsum0 379 ); 380 381 382 vp8_filter_block2d_bil_var_mmx( 383 src_ptr + 8, src_pixels_per_line, 384 dst_ptr + 8, dst_pixels_per_line, 8, 385 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], 386 &xsum1, &xxsum1 387 ); 388 389 xsum0 += xsum1; 390 xxsum0 += xxsum1; 391 392 *sse = xxsum0; 393 return (xxsum0 - ((xsum0 * xsum0) >> 7)); 394 } 395 396 unsigned int vp8_sub_pixel_variance8x16_mmx 397 ( 398 const unsigned char *src_ptr, 399 int src_pixels_per_line, 400 int xoffset, 401 int yoffset, 402 const unsigned char *dst_ptr, 403 int dst_pixels_per_line, 404 int *sse 405 ) 406 { 407 int xsum; 408 unsigned int xxsum; 409 vp8_filter_block2d_bil_var_mmx( 410 src_ptr, src_pixels_per_line, 411 dst_ptr, dst_pixels_per_line, 16, 412 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], 413 &xsum, &xxsum 414 ); 415 *sse = xxsum; 416 return (xxsum - ((xsum * xsum) >> 7)); 417 } 418 419 420 unsigned int vp8_variance_halfpixvar16x16_h_mmx( 421 const unsigned char *src_ptr, 422 int source_stride, 423 const unsigned char *ref_ptr, 424 int recon_stride, 425 unsigned int *sse) 426 { 427 return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0, 428 ref_ptr, recon_stride, sse); 429 } 430 431 432 unsigned int vp8_variance_halfpixvar16x16_v_mmx( 433 const unsigned char *src_ptr, 434 int source_stride, 435 const unsigned char *ref_ptr, 436 int recon_stride, 437 unsigned int *sse) 438 { 439 return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4, 440 ref_ptr, recon_stride, sse); 441 } 442 443 444 unsigned int vp8_variance_halfpixvar16x16_hv_mmx( 445 const unsigned char *src_ptr, 446 int source_stride, 447 const unsigned char *ref_ptr, 448 int recon_stride, 449 unsigned int *sse) 450 { 451 return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4, 452 ref_ptr, recon_stride, sse); 453 } 454