;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%define BLOCK_HEIGHT_WIDTH 4
%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT  7


;------------------------------------------------------------------------------
; HFILTER6_8PIX_TO_WORDS data
;
; Emits the horizontal 6-tap kernel for 8 output pixels of a first-pass filter.
;
; In:   %1    xmm register holding 16 consecutive source bytes; byte 0 is the
;             pixel two positions left of the first output pixel
;       xmm3  the same source bytes in its low 8 bytes (used for tap 1)
;       xmm0  all zero (unpack helper)
;       rdx   filter table: six 16-byte vectors at rdx+0 .. rdx+80, one per
;             tap (presumably each tap replicated in all 8 words - confirm
;             against the caller). Accessed with aligned-only instructions.
; Out:  xmm4  8 words: (sum of the six products + rd) >> VP8_FILTER_SHIFT,
;             clamped to 0..255 by the pack and widened back to words
; Clobbers: %1, xmm3, xmm5, xmm6, xmm7
;
; The saturating adds are performed in the order tap 2, 5, 3, 1, 4, 6, rd.
;------------------------------------------------------------------------------
%macro HFILTER6_8PIX_TO_WORDS 1
    movdqa      xmm4,       %1
    movdqa      xmm5,       %1

    movdqa      xmm6,       %1
    movdqa      xmm7,       %1

    punpcklbw   xmm3,       xmm0                ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    psrldq      xmm4,       1                   ; bytes now start at x[-1]

    pmullw      xmm3,       XMMWORD PTR [rdx]   ; x[-2] * H[-2]; Tap 1
    punpcklbw   xmm4,       xmm0                ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

    psrldq      xmm5,       2                   ; bytes now start at x[0]
    pmullw      xmm4,       XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2

    punpcklbw   xmm5,       xmm0                ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    psrldq      xmm6,       3                   ; bytes now start at x[1]

    pmullw      xmm5,       [rdx+32]            ; x[ 0] * H[ 0]; Tap 3

    punpcklbw   xmm6,       xmm0                ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    psrldq      xmm7,       4                   ; bytes now start at x[2]

    pmullw      xmm6,       [rdx+48]            ; x[ 1] * H[ 1]; Tap 4

    punpcklbw   xmm7,       xmm0                ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    psrldq      %1,         5                   ; bytes now start at x[3]

    pmullw      xmm7,       [rdx+64]            ; x[ 2] * H[ 2]; Tap 5

    punpcklbw   %1,         xmm0                ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    pmullw      %1,         [rdx+80]            ; x[ 3] * H[ 3]; Tap 6

    paddsw      xmm4,       xmm7
    paddsw      xmm4,       xmm5

    paddsw      xmm4,       xmm3
    paddsw      xmm4,       xmm6

    paddsw      xmm4,       %1
    paddsw      xmm4,       [GLOBAL(rd)]        ; rounding term (rd, end of file)

    psraw       xmm4,       VP8_FILTER_SHIFT

    packuswb    xmm4,       xmm0                ; clamp to 0..255
    punpcklbw   xmm4,       xmm0                ; back to 8 words
%endmacro


;void vp8_filter_block1d8_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           *vp8_filter
;)
;/************************************************************************************
; Notes: applies a 6 tap filter horizontally to 8 pixels per row and stores the
; results as 16-bit words. ONE row is produced per loop iteration.
;   rsi = current source row; bytes [rsi-2 .. rsi+13] are read
;   rdi = current output row; written with movdqa, so it must be 16-byte aligned
;   rax = src_pixels_per_line, added to rsi after every row
;   rcx = output_height, rows remaining; the loop is dec/jnz, so it must be
;         non-zero on entry
;   rdx = vp8_filter
;   output_width is added to rdi after every row, i.e. it is used as a byte stride
;   pixel_step is not read by this routine
;*************************************************************************************/
global sym(vp8_filter_block1d8_h6_sse2)
sym(vp8_filter_block1d8_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx,        arg(6) ;vp8_filter
    mov         rsi,        arg(0) ;src_ptr

    mov         rdi,        arg(1) ;output_ptr

    movsxd      rcx,        dword ptr arg(4) ;output_height
    movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(5) ;output_width, cached for the loop
%endif
    pxor        xmm0,       xmm0                ; clear xmm0 for unpack

.rowloop:
    movq        xmm3,       MMWORD PTR [rsi - 2]
    movq        xmm1,       MMWORD PTR [rsi + 6]

    prefetcht2  [rsi+rax-2]                     ; next source row

    pslldq      xmm1,       8
    por         xmm1,       xmm3                ; xmm1 = source bytes -2 .. 13

    HFILTER6_8PIX_TO_WORDS  xmm1

    movdqa      XMMWORD Ptr [rdi],         xmm4
    lea         rsi,        [rsi + rax]

%if ABI_IS_32BIT
    add         rdi,        DWORD Ptr arg(5) ;[output_width]
%else
    add         rdi,        r8
%endif
    dec         rcx

    jnz         .rowloop                        ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           *vp8_filter
;)
;/************************************************************************************
; Notes: applies a 6 tap filter horizontally to 16 pixels per row, as two
; 8-pixel halves, and stores the results as 16-bit words (32 bytes per row).
; ONE row is produced per loop iteration.
;   rsi = current source row; bytes [rsi-2 .. rsi+21] are read
;   rdi = current output row; written with movdqa at [rdi] and [rdi+16], so it
;         must be 16-byte aligned
;   rax = src_pixels_per_line, added to rsi after every row
;   rcx = output_height, rows remaining; must be non-zero on entry (dec/jnz)
;   rdx = vp8_filter
;   output_width is added to rdi after every row, i.e. it is used as a byte stride
;   pixel_step is not read by this routine
;*************************************************************************************/
global sym(vp8_filter_block1d16_h6_sse2)
sym(vp8_filter_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx,        arg(6) ;vp8_filter
    mov         rsi,        arg(0) ;src_ptr

    mov         rdi,        arg(1) ;output_ptr

    movsxd      rcx,        dword ptr arg(4) ;output_height
    movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(5) ;output_width, cached for the loop
%endif

    pxor        xmm0,       xmm0                ; clear xmm0 for unpack

.rowloop:
    movq        xmm3,       MMWORD PTR [rsi - 2]
    movq        xmm1,       MMWORD PTR [rsi + 6]

    movq        xmm2,       MMWORD PTR [rsi +14]
    pslldq      xmm2,       8

    por         xmm2,       xmm1                ; xmm2 = source bytes 6 .. 21 (right half)
    prefetcht2  [rsi+rax-2]                     ; next source row

    pslldq      xmm1,       8
    por         xmm1,       xmm3                ; xmm1 = source bytes -2 .. 13 (left half)

    ; left 8 pixels
    HFILTER6_8PIX_TO_WORDS  xmm1

    movdqa      XMMWORD Ptr [rdi],         xmm4

    ; right 8 pixels
    movdqa      xmm3,       xmm2
    HFILTER6_8PIX_TO_WORDS  xmm2

    movdqa      XMMWORD Ptr [rdi+16],      xmm4

    lea         rsi,        [rsi + rax]
%if ABI_IS_32BIT
    add         rdi,        DWORD Ptr arg(5) ;[output_width]
%else
    add         rdi,        r8
%endif

    dec         rcx
    jnz         .rowloop                        ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_v6_sse2
;(
;    short *src_ptr,
;    unsigned char *output_ptr,
;    int dst_pitch,
;    unsigned int pixels_per_line,
;    unsigned int pixel_step,
;    unsigned int output_height,
;    unsigned int output_width,
;    short * vp8_filter
;)
;/************************************************************************************
; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
; input pixel array has output_height rows.
;
; Register roles inside the routine:
;   rsi = top row of the 6-row window; set to src_ptr - 2*pixels_per_line and
;         advanced by pixels_per_line per output row. pixels_per_line is added
;         to the pointer unscaled, i.e. it is used as a byte stride.
;   rdi = destination row, 8 bytes stored per row; advanced by dst_pitch
;   rax = vp8_filter: six 16-byte vectors at rax+0 .. rax+80, one per tap
;   rcx = output_height, rows remaining (dec/jnz loop, must be non-zero)
;   xmm7 = rounding term rd, xmm0 = zero
; Source rows and the filter table are read with aligned-only instructions.
; pixel_step and output_width are not read by this routine.
;*************************************************************************************/
global sym(vp8_filter_block1d8_v6_sse2)
sym(vp8_filter_block1d8_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,        arg(7) ;vp8_filter
    movsxd      rdx,        dword ptr arg(3) ;pixels_per_line

    mov         rdi,        arg(1) ;output_ptr
    mov         rsi,        arg(0) ;src_ptr

    sub         rsi,        rdx                 ; window starts two rows
    sub         rsi,        rdx                 ; above src_ptr

    movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
    pxor        xmm0,       xmm0                ; zero, upper half of the final pack

    movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)] ; rounding term, kept for the whole loop
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(2) ; dst_pitch, cached for the loop
%endif

.next_row:
    movdqa      xmm1,       XMMWORD PTR [rsi]           ; row 1
    pmullw      xmm1,       [rax]                       ; * tap 1

    movdqa      xmm2,       XMMWORD PTR [rsi + rdx]     ; row 2
    pmullw      xmm2,       [rax + 16]                  ; * tap 2

    movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2] ; row 3
    pmullw      xmm3,       [rax + 32]                  ; * tap 3

    movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4] ; row 5
    pmullw      xmm5,       [rax + 64]                  ; * tap 5

    add         rsi,        rdx                         ; slide the window down one row
    movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2] ; row 4 (relative to the old rsi)

    pmullw      xmm4,       [rax + 48]                  ; * tap 4
    movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4] ; row 6 (relative to the old rsi)

    pmullw      xmm6,       [rax + 80]                  ; * tap 6

    ; saturating accumulation in the order 2, 5, 3, 1, 4, 6, rounding
    paddsw      xmm2,       xmm5
    paddsw      xmm2,       xmm3

    paddsw      xmm2,       xmm1
    paddsw      xmm2,       xmm4

    paddsw      xmm2,       xmm6
    paddsw      xmm2,       xmm7

    psraw       xmm2,       VP8_FILTER_SHIFT
    packuswb    xmm2,       xmm0                ; clamp to 0..255, 8 result bytes in the low half

    movq        QWORD PTR [rdi],        xmm2    ; write 8 pixels
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_v6_sse2
;(
;    unsigned short *src_ptr,
;    unsigned char *output_ptr,
;    int dst_pitch,
;    unsigned int pixels_per_line,
;    unsigned int pixel_step,
;    unsigned int output_height,
;    unsigned int output_width,
;    const short    *vp8_filter
;)
;/************************************************************************************
; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
; input pixel array has output_height rows.
;
; Each loop iteration produces one output row of 16 bytes from six input rows
; of 16 words each (two 16-byte vectors per row).
;   rsi = top row of the 6-row window; set to src_ptr - 2*pixels_per_line and
;         advanced by pixels_per_line per output row. pixels_per_line is added
;         to the pointer unscaled, i.e. it is used as a byte stride.
;   rdi = destination row; written with movdqa, so it must be 16-byte aligned
;   rax = vp8_filter: six 16-byte vectors at rax+0 .. rax+80, one per tap
;   rcx = output_height, rows remaining (dec/jnz loop, must be non-zero)
; Source rows and the filter table are read with aligned-only instructions.
; pixel_step and output_width are not read by this routine.
;*************************************************************************************/
global sym(vp8_filter_block1d16_v6_sse2)
sym(vp8_filter_block1d16_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,        arg(7) ;vp8_filter
    movsxd      rdx,        dword ptr arg(3) ;pixels_per_line

    mov         rdi,        arg(1) ;output_ptr
    mov         rsi,        arg(0) ;src_ptr

    sub         rsi,        rdx                 ; window starts two rows
    sub         rsi,        rdx                 ; above src_ptr

    movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(2) ; dst_pitch, cached for the loop
%endif

.next_row:
; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
; xmm1 accumulates pixels 0..7, xmm2 accumulates pixels 8..15.
    movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
    movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
    pmullw      xmm1,       [rax + 16]
    pmullw      xmm2,       [rax + 16]

    movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
    movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
    pmullw      xmm3,       [rax + 64]
    pmullw      xmm4,       [rax + 64]

    movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3
    movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
    pmullw      xmm5,       [rax + 32]
    pmullw      xmm6,       [rax + 32]

    movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
    movdqa      xmm0,       XMMWORD PTR [rsi + 16]
    pmullw      xmm7,       [rax]
    pmullw      xmm0,       [rax]

    paddsw      xmm1,       xmm3
    paddsw      xmm2,       xmm4
    paddsw      xmm1,       xmm5
    paddsw      xmm2,       xmm6
    paddsw      xmm1,       xmm7
    paddsw      xmm2,       xmm0

    add         rsi,        rdx                 ; slide the window down one row

    movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
    movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
    pmullw      xmm3,       [rax + 48]
    pmullw      xmm4,       [rax + 48]

    movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
    movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
    pmullw      xmm5,       [rax + 80]
    pmullw      xmm6,       [rax + 80]

    ; xmm7 doubles as line-1 data above, so the rounding term is reloaded
    ; every iteration. xmm0 is not needed as a zero register here: the final
    ; pack combines xmm1 and xmm2 directly.
    movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]

    paddsw      xmm1,       xmm3
    paddsw      xmm2,       xmm4
    paddsw      xmm1,       xmm5
    paddsw      xmm2,       xmm6

    paddsw      xmm1,       xmm7
    paddsw      xmm2,       xmm7

    psraw       xmm1,       VP8_FILTER_SHIFT
    psraw       xmm2,       VP8_FILTER_SHIFT

    packuswb    xmm1,       xmm2                ; pack and saturate to 16 bytes
    movdqa      XMMWORD PTR [rdi],      xmm1    ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx                             ; decrement count
    jnz         .next_row                       ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_h6_only_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    int dst_pitch,
;    unsigned int    output_height,
;    const short    *vp8_filter
;)
; First-pass filter only when yoffset==0
;
; Produces 8 output bytes per row, one row per loop iteration.
;   rsi = current source row; bytes [rsi-2 .. rsi+13] are read
;   rdi = current output row (8 bytes stored), advanced by dst_pitch
;   rax = src_pixels_per_line, rcx = output_height (dec/jnz, must be non-zero)
;   rdx = vp8_filter

;------------------------------------------------------------------------------
; HFILTER6_8PIX_TO_BYTES data
;
; Emits the horizontal 6-tap kernel for 8 output pixels, final result as bytes.
;
; In:   %1    xmm register holding 16 consecutive source bytes; byte 0 is the
;             pixel two positions left of the first output pixel
;       xmm3  the same source bytes in its low 8 bytes (used for tap 1)
;       xmm0  all zero (unpack helper)
;       rdx   filter table: six 16-byte vectors at rdx+0 .. rdx+80, one per
;             tap (presumably each tap replicated in all 8 words - confirm
;             against the caller). Accessed with aligned-only instructions.
; Out:  xmm4  low 8 bytes = (sum of the six products + rd) >> VP8_FILTER_SHIFT,
;             saturated to 0..255; high 8 bytes are zero
; Clobbers: %1, xmm3, xmm5, xmm6, xmm7
;
; The saturating adds are performed in the order tap 2, 5, 3, 1, 4, 6, rd.
;------------------------------------------------------------------------------
%macro HFILTER6_8PIX_TO_BYTES 1
    movdqa      xmm4,       %1
    movdqa      xmm5,       %1

    movdqa      xmm6,       %1
    movdqa      xmm7,       %1

    punpcklbw   xmm3,       xmm0                ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    psrldq      xmm4,       1                   ; bytes now start at x[-1]

    pmullw      xmm3,       XMMWORD PTR [rdx]   ; x[-2] * H[-2]; Tap 1
    punpcklbw   xmm4,       xmm0                ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

    psrldq      xmm5,       2                   ; bytes now start at x[0]
    pmullw      xmm4,       XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2

    punpcklbw   xmm5,       xmm0                ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    psrldq      xmm6,       3                   ; bytes now start at x[1]

    pmullw      xmm5,       [rdx+32]            ; x[ 0] * H[ 0]; Tap 3

    punpcklbw   xmm6,       xmm0                ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    psrldq      xmm7,       4                   ; bytes now start at x[2]

    pmullw      xmm6,       [rdx+48]            ; x[ 1] * H[ 1]; Tap 4

    punpcklbw   xmm7,       xmm0                ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    psrldq      %1,         5                   ; bytes now start at x[3]

    pmullw      xmm7,       [rdx+64]            ; x[ 2] * H[ 2]; Tap 5

    punpcklbw   %1,         xmm0                ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    pmullw      %1,         [rdx+80]            ; x[ 3] * H[ 3]; Tap 6

    paddsw      xmm4,       xmm7
    paddsw      xmm4,       xmm5

    paddsw      xmm4,       xmm3
    paddsw      xmm4,       xmm6

    paddsw      xmm4,       %1
    paddsw      xmm4,       [GLOBAL(rd)]        ; rounding term (rd, end of file)

    psraw       xmm4,       VP8_FILTER_SHIFT

    packuswb    xmm4,       xmm0                ; saturate to bytes
%endmacro

;------------------------------------------------------------------------------
; BILINEAR_H16_SUMS
;
; Horizontal 2-tap sums for 16 pixels, before rounding.
;
; In:   rsi   source row; bytes [rsi .. rsi+16] are read (unaligned loads)
;       xmm1  weight applied to pixel x, xmm2 weight applied to pixel x+1
;       xmm0  all zero
; Out:  xmm3  words for pixels 0..7,  src[x]*xmm1 + src[x+1]*xmm2
;       xmm4  words for pixels 8..15, same formula
; Clobbers: xmm5, xmm6
;------------------------------------------------------------------------------
%macro BILINEAR_H16_SUMS 0
    movdqu      xmm3,       [rsi]               ; pixels x
    movdqa      xmm4,       xmm3                ; make a copy of current line

    punpcklbw   xmm3,       xmm0                ; low 8 pixels as words
    punpckhbw   xmm4,       xmm0                ; high 8 pixels as words

    pmullw      xmm3,       xmm1
    pmullw      xmm4,       xmm1

    movdqu      xmm5,       [rsi+1]             ; pixels x+1
    movdqa      xmm6,       xmm5

    punpcklbw   xmm5,       xmm0
    punpckhbw   xmm6,       xmm0

    pmullw      xmm5,       xmm2
    pmullw      xmm6,       xmm2

    paddw       xmm3,       xmm5
    paddw       xmm4,       xmm6
%endmacro

global sym(vp8_filter_block1d8_h6_only_sse2)
sym(vp8_filter_block1d8_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx,        arg(5) ;vp8_filter
    mov         rsi,        arg(0) ;src_ptr

    mov         rdi,        arg(2) ;output_ptr

    movsxd      rcx,        dword ptr arg(4) ;output_height
    movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(3) ;dst_pitch, cached for the loop
%endif
    pxor        xmm0,       xmm0                ; clear xmm0 for unpack

.rowloop:
    movq        xmm3,       MMWORD PTR [rsi - 2]
    movq        xmm1,       MMWORD PTR [rsi + 6]

    prefetcht2  [rsi+rax-2]                     ; next source row

    pslldq      xmm1,       8
    por         xmm1,       xmm3                ; xmm1 = source bytes -2 .. 13

    HFILTER6_8PIX_TO_BYTES  xmm1

    movq        QWORD PTR [rdi],   xmm4         ; store the results in the destination
    lea         rsi,        [rsi + rax]

%if ABI_IS_32BIT
    add         rdi,        DWORD Ptr arg(3) ;dst_pitch
%else
    add         rdi,        r8
%endif
    dec         rcx

    jnz         .rowloop                        ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_h6_only_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    int dst_pitch,
;    unsigned int    output_height,
;    const short    *vp8_filter
;)
; First-pass filter only when yoffset==0
;
; Produces 16 output bytes per row (two 8-byte stores), one row per iteration.
;   rsi = current source row; bytes [rsi-2 .. rsi+21] are read
;   rdi = current output row, advanced by dst_pitch
;   rax = src_pixels_per_line, rcx = output_height (dec/jnz, must be non-zero)
;   rdx = vp8_filter
global sym(vp8_filter_block1d16_h6_only_sse2)
sym(vp8_filter_block1d16_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx,        arg(5) ;vp8_filter
    mov         rsi,        arg(0) ;src_ptr

    mov         rdi,        arg(2) ;output_ptr

    movsxd      rcx,        dword ptr arg(4) ;output_height
    movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(3) ;dst_pitch, cached for the loop
%endif

    pxor        xmm0,       xmm0                ; clear xmm0 for unpack

.rowloop:
    movq        xmm3,       MMWORD PTR [rsi - 2]
    movq        xmm1,       MMWORD PTR [rsi + 6]

    movq        xmm2,       MMWORD PTR [rsi +14]
    pslldq      xmm2,       8

    por         xmm2,       xmm1                ; xmm2 = source bytes 6 .. 21 (right half)
    prefetcht2  [rsi+rax-2]                     ; next source row

    pslldq      xmm1,       8
    por         xmm1,       xmm3                ; xmm1 = source bytes -2 .. 13 (left half)

    ; left 8 pixels
    HFILTER6_8PIX_TO_BYTES  xmm1

    movq        QWORD Ptr [rdi],   xmm4         ; lower 8 bytes

    ; right 8 pixels
    movdqa      xmm3,       xmm2
    HFILTER6_8PIX_TO_BYTES  xmm2

    movq        QWORD Ptr [rdi+8], xmm4         ; higher 8 bytes

    lea         rsi,        [rsi + rax]
%if ABI_IS_32BIT
    add         rdi,        DWORD Ptr arg(3) ;dst_pitch
%else
    add         rdi,        r8
%endif

    dec         rcx
    jnz         .rowloop                        ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_v6_only_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    int dst_pitch,
;    unsigned int   output_height,
;    const short   *vp8_filter
;)
; Second-pass filter only when xoffset==0
;
; Vertical 6-tap filter on byte input, 8 pixels wide, one row per iteration.
;   rsi = top row of the 6-row window. It starts at src_ptr itself: no
;         back-off is applied here, so rows src_ptr + 0 .. 5 strides feed the
;         first output row.
;   rdi = output row (8 bytes stored), advanced by dst_pitch
;   rdx = src_pixels_per_line, rcx = output_height (dec/jnz, must be non-zero)
;   rax = vp8_filter: six 16-byte vectors at rax+0 .. rax+80, one per tap
;   xmm7 = rounding term rd, xmm0 = zero
global sym(vp8_filter_block1d8_v6_only_sse2)
sym(vp8_filter_block1d8_v6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0) ;src_ptr
    mov         rdi,        arg(2) ;output_ptr

    movsxd      rcx,        dword ptr arg(4) ;output_height
    movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

    mov         rax,        arg(5) ;vp8_filter

    pxor        xmm0,       xmm0                ; clear xmm0

    movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(3) ; dst_pitch, cached for the loop
%endif

.next_row:
    movq        xmm1,       MMWORD PTR [rsi]            ; row 1
    movq        xmm2,       MMWORD PTR [rsi + rdx]      ; row 2
    movq        xmm3,       MMWORD PTR [rsi + rdx * 2]  ; row 3
    movq        xmm5,       MMWORD PTR [rsi + rdx * 4]  ; row 5
    add         rsi,        rdx                         ; slide the window down one row
    movq        xmm4,       MMWORD PTR [rsi + rdx * 2]  ; row 4 (relative to the old rsi)
    movq        xmm6,       MMWORD PTR [rsi + rdx * 4]  ; row 6 (relative to the old rsi)

    punpcklbw   xmm1,       xmm0
    pmullw      xmm1,       [rax]

    punpcklbw   xmm2,       xmm0
    pmullw      xmm2,       [rax + 16]

    punpcklbw   xmm3,       xmm0
    pmullw      xmm3,       [rax + 32]

    punpcklbw   xmm5,       xmm0
    pmullw      xmm5,       [rax + 64]

    punpcklbw   xmm4,       xmm0
    pmullw      xmm4,       [rax + 48]

    punpcklbw   xmm6,       xmm0
    pmullw      xmm6,       [rax + 80]

    ; saturating accumulation in the order 2, 5, 3, 1, 4, 6, rounding
    paddsw      xmm2,       xmm5
    paddsw      xmm2,       xmm3

    paddsw      xmm2,       xmm1
    paddsw      xmm2,       xmm4

    paddsw      xmm2,       xmm6
    paddsw      xmm2,       xmm7

    psraw       xmm2,       VP8_FILTER_SHIFT
    packuswb    xmm2,       xmm0                ; pack and saturate

    movq        QWORD PTR [rdi],        xmm2    ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[dst_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx                             ; decrement count
    jnz         .next_row                       ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_unpack_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    output_height,
;    unsigned int    output_width
;)
; Widens 16 source bytes per row to 16 words (zero-extended), no filtering.
;   rsi = source row, advanced by src_pixels_per_line
;   rdi = output row; written with movdqa at [rdi] and [rdi+16], so it must be
;         16-byte aligned. output_width is added to rdi per row, i.e. it is
;         used as a byte stride.
;   rcx = output_height (dec/jnz, must be non-zero)
global sym(vp8_unpack_block1d16_h6_sse2)
sym(vp8_unpack_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    ;SAVE_XMM                                   ;xmm6, xmm7 are not used here.
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0) ;src_ptr
    mov         rdi,        arg(1) ;output_ptr

    movsxd      rcx,        dword ptr arg(3) ;output_height
    movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source

    pxor        xmm0,       xmm0                ; clear xmm0 for unpack
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(4) ;output_width ; stride for the output
%endif

.rowloop:
    movq        xmm1,       MMWORD PTR [rsi]    ; source bytes 0 .. 7
    movq        xmm3,       MMWORD PTR [rsi+8]  ; source bytes 8 .. 15

    punpcklbw   xmm3,       xmm0                ; bytes 8..15 -> words
    punpcklbw   xmm1,       xmm0                ; bytes 0..7  -> words

    movdqa      XMMWORD Ptr [rdi],         xmm1
    movdqa      XMMWORD Ptr [rdi + 16],    xmm3

    lea         rsi,        [rsi + rax]
%if ABI_IS_32BIT
    add         rdi,        DWORD Ptr arg(4) ;[output_width]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         .rowloop                        ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    ;RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_bilinear_predict16x16_sse2
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
; 16x16 bilinear prediction. Three code paths:
;   xoffset != 0, yoffset != 0 : horizontal then vertical pass per row
;   xoffset == 0               : vertical pass only   (.sp_only)
;   xoffset != 0, yoffset == 0 : horizontal pass only (.fp_only)
; Filter weights come from vp8_bilinear_filters_mmx, indexed as
; table + offset*32; each entry is read as two 16-byte vectors.
; All paths stop when rdi reaches dst_ptr + 16*dst_pitch, i.e. after 16 rows.
; The destination is written with movdqa, so every row must be 16-byte aligned.
extern sym(vp8_bilinear_filters_mmx)
global sym(vp8_bilinear_predict16x16_sse2)
sym(vp8_bilinear_predict16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset]
    ;const short *VFilter = bilinear_filters_mmx[yoffset]

    lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]
    movsxd      rax,        dword ptr arg(2) ;xoffset

    test        rax,        rax                 ;skip first_pass filter if xoffset=0
    je          .sp_only

    shl         rax,        5
    add         rax,        rcx ;HFilter

    mov         rdi,        arg(4) ;dst_ptr
    mov         rsi,        arg(0) ;src_ptr
    movsxd      rdx,        dword ptr arg(5) ;dst_pitch

    movdqa      xmm1,       [rax]               ; horizontal weight for pixel x
    movdqa      xmm2,       [rax+16]            ; horizontal weight for pixel x+1

    movsxd      rax,        dword ptr arg(3) ;yoffset

    test        rax,        rax                 ;skip second_pass filter if yoffset=0
    je          .fp_only

    shl         rax,        5
    add         rax,        rcx ;VFilter

    lea         rcx,        [rdi+rdx*8]
    lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch (end marker)
    movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

    pxor        xmm0,       xmm0

%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(5) ;dst_pitch (rdx now holds the source stride)
%endif
    ; get the first horizontal line done
    BILINEAR_H16_SUMS

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    paddw       xmm4,       [GLOBAL(rd)]
    psraw       xmm4,       VP8_FILTER_SHIFT

    movdqa      xmm7,       xmm3
    packuswb    xmm7,       xmm4                ; xmm7 = previous filtered row, as bytes

    add         rsi,        rdx                 ; next line
.next_row:
    BILINEAR_H16_SUMS                           ; current row, horizontal sums

    movdqa      xmm5,       xmm7
    movdqa      xmm6,       xmm7

    punpcklbw   xmm5,       xmm0
    punpckhbw   xmm6,       xmm0

    pmullw      xmm5,       [rax]               ; previous row * first vertical weight
    pmullw      xmm6,       [rax]

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    paddw       xmm4,       [GLOBAL(rd)]
    psraw       xmm4,       VP8_FILTER_SHIFT

    movdqa      xmm7,       xmm3
    packuswb    xmm7,       xmm4                ; keep this row for the next iteration

    pmullw      xmm3,       [rax+16]            ; current row * second vertical weight
    pmullw      xmm4,       [rax+16]

    paddw       xmm3,       xmm5
    paddw       xmm4,       xmm6

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    paddw       xmm4,       [GLOBAL(rd)]
    psraw       xmm4,       VP8_FILTER_SHIFT

    packuswb    xmm3,       xmm4
    movdqa      [rdi],      xmm3                ; store the results in the destination

    add         rsi,        rdx                 ; next line
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(5) ;dst_pitch
%else
    add         rdi,        r8
%endif

    cmp         rdi,        rcx
    jne         .next_row

    jmp         .done

.sp_only:
    ; vertical pass only; rcx still holds the filter table base here
    movsxd      rax,        dword ptr arg(3) ;yoffset
    shl         rax,        5
    add         rax,        rcx ;VFilter

    mov         rdi,        arg(4) ;dst_ptr
    mov         rsi,        arg(0) ;src_ptr
    movsxd      rdx,        dword ptr arg(5) ;dst_pitch

    movdqa      xmm1,       [rax]               ; weight for the previous row
    movdqa      xmm2,       [rax+16]            ; weight for the current row

    lea         rcx,        [rdi+rdx*8]
    lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch (end marker)
    movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line

    pxor        xmm0,       xmm0

    ; get the first line
    movdqu      xmm7,       [rsi]

    add         rsi,        rax                 ; next line
.next_row_spo:
    movdqu      xmm3,       [rsi]               ; current row

    movdqa      xmm5,       xmm7                ; previous row
    movdqa      xmm6,       xmm7

    movdqa      xmm4,       xmm3                ; make a copy of current line
    movdqa      xmm7,       xmm3                ; becomes "previous" next time round

    punpcklbw   xmm5,       xmm0
    punpckhbw   xmm6,       xmm0
    punpcklbw   xmm3,       xmm0
    punpckhbw   xmm4,       xmm0

    pmullw      xmm5,       xmm1
    pmullw      xmm6,       xmm1
    pmullw      xmm3,       xmm2
    pmullw      xmm4,       xmm2

    paddw       xmm3,       xmm5
    paddw       xmm4,       xmm6

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    paddw       xmm4,       [GLOBAL(rd)]
    psraw       xmm4,       VP8_FILTER_SHIFT

    packuswb    xmm3,       xmm4
    movdqa      [rdi],      xmm3                ; store the results in the destination

    add         rsi,        rax                 ; next line
    add         rdi,        rdx                 ;dst_pitch
    cmp         rdi,        rcx
    jne         .next_row_spo

    jmp         .done

.fp_only:
    ; horizontal pass only; rdi, rsi, rdx, xmm1 and xmm2 were set up above
    lea         rcx,        [rdi+rdx*8]
    lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch (end marker)
    movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
    pxor        xmm0,       xmm0

.next_row_fpo:
    BILINEAR_H16_SUMS

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    paddw       xmm4,       [GLOBAL(rd)]
    psraw       xmm4,       VP8_FILTER_SHIFT

    packuswb    xmm3,       xmm4
    movdqa      [rdi],      xmm3                ; store the results in the destination

    add         rsi,        rax                 ; next line
    add         rdi,        rdx                 ; dst_pitch
    cmp         rdi,        rcx
    jne         .next_row_fpo

.done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_bilinear_predict8x8_sse2
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
; 8x8 bilinear prediction; both passes always run (no zero-offset shortcuts).
; Nine source rows are copied to a 144-byte (9 x 16) aligned stack buffer
; with unaligned 16-byte loads; only the low 9 bytes of each row feed the
; filter. Eight rows of 8 bytes are written to the destination.
;
; NOTE: rsp itself is used as the row cursor. It is advanced by 16 once
; before the loop and once in each of the 8 iterations (9 x 16 = 144), which
; brings it back to where it was right after ALIGN_STACK, so the final
; "pop rsp" needs no explicit "add rsp, 144". This only holds while the loop
; runs exactly 8 times. (ALIGN_STACK comes from the include and presumably
; leaves the caller's rsp on top of the aligned stack - confirm there.)
extern sym(vp8_bilinear_filters_mmx)
global sym(vp8_bilinear_predict8x8_sse2)
sym(vp8_bilinear_predict8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 144                        ; reserve 144 bytes

    ;const short *HFilter = bilinear_filters_mmx[xoffset]
    ;const short *VFilter = bilinear_filters_mmx[yoffset]
    lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]

    mov         rsi,        arg(0) ;src_ptr
    movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

    ;Read 9-line unaligned data in and put them on stack. This gives a big
    ;performance boost.
    movdqu      xmm0,       [rsi]               ; row 0
    lea         rax,        [rdx + rdx*2]       ; rax = 3 * stride
    movdqu      xmm1,       [rsi+rdx]           ; row 1
    movdqu      xmm2,       [rsi+rdx*2]         ; row 2
    add         rsi,        rax
    movdqu      xmm3,       [rsi]               ; row 3
    movdqu      xmm4,       [rsi+rdx]           ; row 4
    movdqu      xmm5,       [rsi+rdx*2]         ; row 5
    add         rsi,        rax
    movdqu      xmm6,       [rsi]               ; row 6
    movdqu      xmm7,       [rsi+rdx]           ; row 7

    movdqa      XMMWORD PTR [rsp],            xmm0

    movdqu      xmm0,       [rsi+rdx*2]         ; row 8 (xmm0 is free again)

    movdqa      XMMWORD PTR [rsp+16],         xmm1
    movdqa      XMMWORD PTR [rsp+32],         xmm2
    movdqa      XMMWORD PTR [rsp+48],         xmm3
    movdqa      XMMWORD PTR [rsp+64],         xmm4
    movdqa      XMMWORD PTR [rsp+80],         xmm5
    movdqa      XMMWORD PTR [rsp+96],         xmm6
    movdqa      XMMWORD PTR [rsp+112],        xmm7
    movdqa      XMMWORD PTR [rsp+128],        xmm0

    movsxd      rax,        dword ptr arg(2) ;xoffset
    shl         rax,        5
    add         rax,        rcx ;HFilter

    mov         rdi,        arg(4) ;dst_ptr
    movsxd      rdx,        dword ptr arg(5) ;dst_pitch

    movdqa      xmm1,       [rax]               ; horizontal weight for pixel x
    movdqa      xmm2,       [rax+16]            ; horizontal weight for pixel x+1

    movsxd      rax,        dword ptr arg(3) ;yoffset
    shl         rax,        5
    add         rax,        rcx ;VFilter

    lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8*dst_pitch (end marker)

    movdqa      xmm5,       [rax]               ; vertical weight for the previous row
    movdqa      xmm6,       [rax+16]            ; vertical weight for the current row

    pxor        xmm0,       xmm0

    ; get the first horizontal line done
    movdqa      xmm3,       XMMWORD PTR [rsp]
    movdqa      xmm4,       xmm3                ; make a copy of current line
    psrldq      xmm4,       1

    punpcklbw   xmm3,       xmm0                ; 00 01 02 03 04 05 06 07
    punpcklbw   xmm4,       xmm0                ; 01 02 03 04 05 06 07 08

    pmullw      xmm3,       xmm1
    pmullw      xmm4,       xmm2

    paddw       xmm3,       xmm4

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    movdqa      xmm7,       xmm3                ; xmm7 = previous filtered row (words)
    add         rsp,        16                  ; next line
.next_row:
    movdqa      xmm3,       XMMWORD PTR [rsp]   ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    movdqa      xmm4,       xmm3                ; make a copy of current line
    psrldq      xmm4,       1

    punpcklbw   xmm3,       xmm0                ; 00 01 02 03 04 05 06 07
    punpcklbw   xmm4,       xmm0                ; 01 02 03 04 05 06 07 08

    pmullw      xmm3,       xmm1
    pmullw      xmm4,       xmm2

    paddw       xmm3,       xmm4
    pmullw      xmm7,       xmm5                ; previous row * first vertical weight

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    movdqa      xmm4,       xmm3                ; keep the filtered current row

    pmullw      xmm3,       xmm6                ; current row * second vertical weight
    paddw       xmm3,       xmm7

    movdqa      xmm7,       xmm4                ; becomes "previous" next time round

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    packuswb    xmm3,       xmm0
    movq        [rdi],      xmm3                ; store the results in the destination

    add         rsp,        16                  ; next line
    add         rdi,        rdx

    cmp         rdi,        rcx
    jne         .next_row

    ; rsp has advanced by the full 144 bytes at this point (see NOTE above)
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
; Rounding term added before every >> VP8_FILTER_SHIFT: 0x40 in each of 8 words.
rd:
    times 8 dw 0x40