1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 12 #include "vpx_ports/config.h" 13 #include "vpx_ports/mem.h" 14 #include "vp8/common/subpixel.h" 15 16 extern const short vp8_six_tap_mmx[8][6*8]; 17 extern const short vp8_bilinear_filters_mmx[8][2*8]; 18 19 extern void vp8_filter_block1d_h6_mmx 20 ( 21 unsigned char *src_ptr, 22 unsigned short *output_ptr, 23 unsigned int src_pixels_per_line, 24 unsigned int pixel_step, 25 unsigned int output_height, 26 unsigned int output_width, 27 const short *vp8_filter 28 ); 29 extern void vp8_filter_block1dc_v6_mmx 30 ( 31 unsigned short *src_ptr, 32 unsigned char *output_ptr, 33 int output_pitch, 34 unsigned int pixels_per_line, 35 unsigned int pixel_step, 36 unsigned int output_height, 37 unsigned int output_width, 38 const short *vp8_filter 39 ); 40 extern void vp8_filter_block1d8_h6_sse2 41 ( 42 unsigned char *src_ptr, 43 unsigned short *output_ptr, 44 unsigned int src_pixels_per_line, 45 unsigned int pixel_step, 46 unsigned int output_height, 47 unsigned int output_width, 48 const short *vp8_filter 49 ); 50 extern void vp8_filter_block1d16_h6_sse2 51 ( 52 unsigned char *src_ptr, 53 unsigned short *output_ptr, 54 unsigned int src_pixels_per_line, 55 unsigned int pixel_step, 56 unsigned int output_height, 57 unsigned int output_width, 58 const short *vp8_filter 59 ); 60 extern void vp8_filter_block1d8_v6_sse2 61 ( 62 unsigned short *src_ptr, 63 unsigned char *output_ptr, 64 int dst_ptich, 65 unsigned int pixels_per_line, 66 unsigned int pixel_step, 67 unsigned int output_height, 68 unsigned int output_width, 69 const short *vp8_filter 70 ); 71 extern void vp8_filter_block1d16_v6_sse2 72 ( 73 unsigned short *src_ptr, 74 unsigned char *output_ptr, 75 int dst_ptich, 76 unsigned int pixels_per_line, 77 unsigned int pixel_step, 78 unsigned int output_height, 79 unsigned int output_width, 80 const short *vp8_filter 81 ); 82 extern void vp8_unpack_block1d16_h6_sse2 83 ( 84 unsigned char *src_ptr, 85 unsigned short *output_ptr, 86 unsigned int src_pixels_per_line, 87 unsigned int output_height, 88 unsigned int output_width 89 ); 90 extern void vp8_filter_block1d8_h6_only_sse2 91 ( 92 unsigned char *src_ptr, 93 unsigned int src_pixels_per_line, 94 unsigned char *output_ptr, 95 int dst_ptich, 96 unsigned int output_height, 97 const short *vp8_filter 98 ); 99 extern void vp8_filter_block1d16_h6_only_sse2 100 ( 101 unsigned char *src_ptr, 102 unsigned int src_pixels_per_line, 103 unsigned char *output_ptr, 104 int dst_ptich, 105 unsigned int output_height, 106 const short *vp8_filter 107 ); 108 extern void vp8_filter_block1d8_v6_only_sse2 109 ( 110 unsigned char *src_ptr, 111 unsigned int src_pixels_per_line, 112 unsigned char *output_ptr, 113 int dst_ptich, 114 unsigned int output_height, 115 const short *vp8_filter 116 ); 117 extern prototype_subpixel_predict(vp8_bilinear_predict8x8_mmx); 118 119 120 #if HAVE_MMX 121 void vp8_sixtap_predict4x4_mmx 122 ( 123 unsigned char *src_ptr, 124 int src_pixels_per_line, 125 int xoffset, 126 int yoffset, 127 unsigned char *dst_ptr, 128 int dst_pitch 129 ) 130 { 131 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16); /* Temp data bufffer used in filtering */ 132 const short *HFilter, *VFilter; 133 HFilter = vp8_six_tap_mmx[xoffset]; 134 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter); 135 VFilter = vp8_six_tap_mmx[yoffset]; 136 vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter); 137 138 } 139 140 141 void vp8_sixtap_predict16x16_mmx 142 ( 143 unsigned char *src_ptr, 144 int src_pixels_per_line, 145 int xoffset, 146 int yoffset, 147 unsigned char *dst_ptr, 148 int dst_pitch 149 ) 150 { 151 152 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data bufffer used in filtering */ 153 154 const short *HFilter, *VFilter; 155 156 157 HFilter = vp8_six_tap_mmx[xoffset]; 158 159 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); 160 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter); 161 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter); 162 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter); 163 164 VFilter = vp8_six_tap_mmx[yoffset]; 165 vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, 16, VFilter); 166 vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter); 167 vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter); 168 vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter); 169 170 } 171 172 173 void vp8_sixtap_predict8x8_mmx 174 ( 175 unsigned char *src_ptr, 176 int src_pixels_per_line, 177 int xoffset, 178 int yoffset, 179 unsigned char *dst_ptr, 180 int dst_pitch 181 ) 182 { 183 184 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ 185 186 const short *HFilter, *VFilter; 187 188 HFilter = vp8_six_tap_mmx[xoffset]; 189 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); 190 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter); 191 192 VFilter = vp8_six_tap_mmx[yoffset]; 193 vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, 8, VFilter); 194 vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter); 195 196 } 197 198 199 void vp8_sixtap_predict8x4_mmx 200 ( 201 unsigned char *src_ptr, 202 int src_pixels_per_line, 203 int xoffset, 204 int yoffset, 205 unsigned char *dst_ptr, 206 int dst_pitch 207 ) 208 { 209 210 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ 211 212 const short *HFilter, *VFilter; 213 214 HFilter = vp8_six_tap_mmx[xoffset]; 215 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); 216 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter); 217 218 VFilter = vp8_six_tap_mmx[yoffset]; 219 vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, 8, VFilter); 220 vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter); 221 222 } 223 224 225 226 void vp8_bilinear_predict16x16_mmx 227 ( 228 unsigned char *src_ptr, 229 int src_pixels_per_line, 230 int xoffset, 231 int yoffset, 232 unsigned char *dst_ptr, 233 int dst_pitch 234 ) 235 { 236 vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pitch); 237 vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + 8, dst_pitch); 238 vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8, dst_pitch); 239 vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8 + 8, dst_pitch); 240 } 241 #endif 242 243 244 #if HAVE_SSE2 245 void vp8_sixtap_predict16x16_sse2 246 ( 247 unsigned char *src_ptr, 248 int src_pixels_per_line, 249 int xoffset, 250 int yoffset, 251 unsigned char *dst_ptr, 252 int dst_pitch 253 254 ) 255 { 256 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data bufffer used in filtering */ 257 258 const short *HFilter, *VFilter; 259 260 if (xoffset) 261 { 262 if (yoffset) 263 { 264 HFilter = vp8_six_tap_mmx[xoffset]; 265 vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); 266 VFilter = vp8_six_tap_mmx[yoffset]; 267 vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter); 268 } 269 else 270 { 271 /* First-pass only */ 272 HFilter = vp8_six_tap_mmx[xoffset]; 273 vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter); 274 } 275 } 276 else 277 { 278 /* Second-pass only */ 279 VFilter = vp8_six_tap_mmx[yoffset]; 280 vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32); 281 vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter); 282 } 283 } 284 285 286 void vp8_sixtap_predict8x8_sse2 287 ( 288 unsigned char *src_ptr, 289 int src_pixels_per_line, 290 int xoffset, 291 int yoffset, 292 unsigned char *dst_ptr, 293 int dst_pitch 294 ) 295 { 296 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ 297 const short *HFilter, *VFilter; 298 299 if (xoffset) 300 { 301 if (yoffset) 302 { 303 HFilter = vp8_six_tap_mmx[xoffset]; 304 vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); 305 VFilter = vp8_six_tap_mmx[yoffset]; 306 vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, dst_pitch, VFilter); 307 } 308 else 309 { 310 /* First-pass only */ 311 HFilter = vp8_six_tap_mmx[xoffset]; 312 vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter); 313 } 314 } 315 else 316 { 317 /* Second-pass only */ 318 VFilter = vp8_six_tap_mmx[yoffset]; 319 vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter); 320 } 321 } 322 323 324 void vp8_sixtap_predict8x4_sse2 325 ( 326 unsigned char *src_ptr, 327 int src_pixels_per_line, 328 int xoffset, 329 int yoffset, 330 unsigned char *dst_ptr, 331 int dst_pitch 332 ) 333 { 334 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ 335 const short *HFilter, *VFilter; 336 337 if (xoffset) 338 { 339 if (yoffset) 340 { 341 HFilter = vp8_six_tap_mmx[xoffset]; 342 vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); 343 VFilter = vp8_six_tap_mmx[yoffset]; 344 vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, dst_pitch, VFilter); 345 } 346 else 347 { 348 /* First-pass only */ 349 HFilter = vp8_six_tap_mmx[xoffset]; 350 vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter); 351 } 352 } 353 else 354 { 355 /* Second-pass only */ 356 VFilter = vp8_six_tap_mmx[yoffset]; 357 vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter); 358 } 359 } 360 361 #endif 362 363 #if HAVE_SSSE3 364 365 extern void vp8_filter_block1d8_h6_ssse3 366 ( 367 unsigned char *src_ptr, 368 unsigned int src_pixels_per_line, 369 unsigned char *output_ptr, 370 unsigned int output_pitch, 371 unsigned int output_height, 372 unsigned int vp8_filter_index 373 ); 374 375 extern void vp8_filter_block1d16_h6_ssse3 376 ( 377 unsigned char *src_ptr, 378 unsigned int src_pixels_per_line, 379 unsigned char *output_ptr, 380 unsigned int output_pitch, 381 unsigned int output_height, 382 unsigned int vp8_filter_index 383 ); 384 385 extern void vp8_filter_block1d16_v6_ssse3 386 ( 387 unsigned char *src_ptr, 388 unsigned int src_pitch, 389 unsigned char *output_ptr, 390 unsigned int out_pitch, 391 unsigned int output_height, 392 unsigned int vp8_filter_index 393 ); 394 395 extern void vp8_filter_block1d8_v6_ssse3 396 ( 397 unsigned char *src_ptr, 398 unsigned int src_pitch, 399 unsigned char *output_ptr, 400 unsigned int out_pitch, 401 unsigned int output_height, 402 unsigned int vp8_filter_index 403 ); 404 405 extern void vp8_filter_block1d4_h6_ssse3 406 ( 407 unsigned char *src_ptr, 408 unsigned int src_pixels_per_line, 409 unsigned char *output_ptr, 410 unsigned int output_pitch, 411 unsigned int output_height, 412 unsigned int vp8_filter_index 413 ); 414 415 extern void vp8_filter_block1d4_v6_ssse3 416 ( 417 unsigned char *src_ptr, 418 unsigned int src_pitch, 419 unsigned char *output_ptr, 420 unsigned int out_pitch, 421 unsigned int output_height, 422 unsigned int vp8_filter_index 423 ); 424 425 void vp8_sixtap_predict16x16_ssse3 426 ( 427 unsigned char *src_ptr, 428 int src_pixels_per_line, 429 int xoffset, 430 int yoffset, 431 unsigned char *dst_ptr, 432 int dst_pitch 433 434 ) 435 { 436 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24); 437 438 if (xoffset) 439 { 440 if (yoffset) 441 { 442 vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 16, 21, xoffset); 443 vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch, 16, yoffset); 444 } 445 else 446 { 447 /* First-pass only */ 448 vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, xoffset); 449 } 450 } 451 else 452 { 453 /* Second-pass only */ 454 vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line) , src_pixels_per_line, dst_ptr, dst_pitch, 16, yoffset); 455 } 456 } 457 458 void vp8_sixtap_predict8x8_ssse3 459 ( 460 unsigned char *src_ptr, 461 int src_pixels_per_line, 462 int xoffset, 463 int yoffset, 464 unsigned char *dst_ptr, 465 int dst_pitch 466 ) 467 { 468 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256); 469 470 if (xoffset) 471 { 472 if (yoffset) 473 { 474 vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 8, 13, xoffset); 475 vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset); 476 } 477 else 478 { 479 vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, xoffset); 480 } 481 } 482 else 483 { 484 /* Second-pass only */ 485 vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, yoffset); 486 } 487 } 488 489 490 void vp8_sixtap_predict8x4_ssse3 491 ( 492 unsigned char *src_ptr, 493 int src_pixels_per_line, 494 int xoffset, 495 int yoffset, 496 unsigned char *dst_ptr, 497 int dst_pitch 498 ) 499 { 500 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256); 501 502 if (xoffset) 503 { 504 if (yoffset) 505 { 506 vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 8, 9, xoffset); 507 vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset); 508 } 509 else 510 { 511 /* First-pass only */ 512 vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, xoffset); 513 } 514 } 515 else 516 { 517 /* Second-pass only */ 518 vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset); 519 } 520 } 521 522 void vp8_sixtap_predict4x4_ssse3 523 ( 524 unsigned char *src_ptr, 525 int src_pixels_per_line, 526 int xoffset, 527 int yoffset, 528 unsigned char *dst_ptr, 529 int dst_pitch 530 ) 531 { 532 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9); 533 534 if (xoffset) 535 { 536 if (yoffset) 537 { 538 vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 4, 9, xoffset); 539 vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset); 540 } 541 else 542 { 543 vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, xoffset); 544 } 545 } 546 else 547 { 548 vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset); 549 } 550 551 } 552 553 #endif 554