;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; High-bit-depth 8-tap sub-pixel filters, SSE2.
;
; Syntax: Intel operand order (NASM/yasm style). Argument access, symbol
; decoration and prologue/epilogue helpers (arg(), sym(), PRIVATE,
; SHADOW_ARGS_TO_STACK, SAVE_XMM, ALIGN_STACK, ...) come from
; vpx_ports/x86_abi_support.asm and are not defined in this file.
;
; Every exported function in this file takes the same seven arguments:
;   arg(0) src_ptr        source pixels, 16 bits per sample
;   arg(1) src_pitch      source stride in pixels  (read as signed 32-bit and
;                         doubled here to get bytes)
;   arg(2) output_ptr     destination pixels, 16 bits per sample
;   arg(3) out_pitch      destination stride in pixels (doubled here as well)
;   arg(4) output_height  number of rows to produce; must be non-zero because
;                         each loop tests its counter only after the first row
;   arg(5) filter         eight 16-bit taps k0..k7; loaded with movdqa, so the
;                         table must be 16-byte aligned
;   arg(6) bps            bits per sample; outputs are clamped to
;                         [0, (1 << bps) - 1]
;
; Per output pixel:
;   out = clamp((sum(k[i] * p[i], i = 0..7) + 64) >> 7)
; The "_avg" variants then store the rounded average (pavgw) of that value
; and the pixel already present at the destination.
;
; Vertical (v8) functions read p[i] from row i counted from src_ptr.
; Horizontal (h8) functions read p[i] from pixel (i - 3) relative to src_ptr,
; i.e. they start loading 6 bytes before src_ptr.
;-----------------------------------------------------------------------------

;Note: tap3 and tap4 have to be applied and added after other taps to avoid
;overflow.

;-----------------------------------------------------------------------------
; HIGH_GET_FILTERS_4
; Set-up for the 4-pixel-wide kernels.
; In:    arg(5) = filter pointer, arg(6) = bps
; Out:   stack slots k0k6, k2k5, k3k4, k1k7 (tap pairs interleaved as words so
;        that one pmaddwd multiplies and adds two taps),
;        krd (0x40 in every dword), max ((1 << bps) - 1 in every word),
;        min (all zero)
; Clobb: rcx, rdx, xmm0-xmm7
; The caller must have %define'd the seven slot names before invoking this.
;-----------------------------------------------------------------------------
%macro HIGH_GET_FILTERS_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rcx, 0x00000040             ;rounding term: half of 1 << 7

    movdqa      xmm7, [rdx]                 ;load filters
    pshuflw     xmm0, xmm7, 0b              ;k0 in the four low words
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    psrldq      xmm7, 8                     ;bring k4..k7 down to the low qword
    pshuflw     xmm4, xmm7, 0b              ;k4
    pshuflw     xmm5, xmm7, 01010101b       ;k5
    pshuflw     xmm6, xmm7, 10101010b       ;k6
    pshuflw     xmm7, xmm7, 11111111b       ;k7

    punpcklwd   xmm0, xmm6                  ;k0 k6 k0 k6 ...
    punpcklwd   xmm2, xmm5                  ;k2 k5 k2 k5 ...
    punpcklwd   xmm3, xmm4                  ;k3 k4 k3 k4 ...
    punpcklwd   xmm1, xmm7                  ;k1 k7 k1 k7 ...

    movdqa      k0k6, xmm0                  ;store filter factors on stack
    movdqa      k2k5, xmm2
    movdqa      k3k4, xmm3
    movdqa      k1k7, xmm1

    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;broadcast 0x40 to all four dwords
    movdqa      krd, xmm6                   ;rounding

    ;Compute max and min values of a pixel
    mov         rdx, 0x00010001             ;word pattern 1, 1
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm0, rdx
    movq        xmm1, rcx                   ;shift count for psllw
    pshufd      xmm0, xmm0, 0b              ;1 in every word
    movdqa      xmm2, xmm0
    psllw       xmm0, xmm1                  ;1 << bps in every word
    psubw       xmm0, xmm2                  ;(1 << bps) - 1
    pxor        xmm1, xmm1
    movdqa      max, xmm0                   ;max value (for clamping)
    movdqa      min, xmm1                   ;min value (for clamping)

%endm

;-----------------------------------------------------------------------------
; HIGH_APPLY_FILTER_4 avg
; Filter four pixels and store them (8 bytes) at [rdi].
; %1:    0 = plain store, 1 = average with the existing destination pixels
; In:    xmm0..xmm7 = inputs for taps 0..7, four words each in the low qword
;        rdi        = destination
;        slots written by HIGH_GET_FILTERS_4
; Clobb: xmm0-xmm3
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_4 1
    punpcklwd   xmm0, xmm6                  ;interleave the inputs of two taps
    punpcklwd   xmm1, xmm7                  ;in one register: (0,6) (1,7)
    punpcklwd   xmm2, xmm5                  ;(2,5)
    punpcklwd   xmm3, xmm4                  ;(3,4)

    pmaddwd     xmm0, k0k6                  ;multiply the filter factors
    pmaddwd     xmm1, k1k7
    pmaddwd     xmm2, k2k5
    pmaddwd     xmm3, k3k4

    paddd       xmm0, xmm1                  ;sum
    paddd       xmm0, xmm2
    paddd       xmm0, xmm3                  ;taps 3 and 4 are added last

    paddd       xmm0, krd                   ;rounding
    psrad       xmm0, 7                     ;shift
    packssdw    xmm0, xmm0                  ;pack to word (signed saturation)

    ;clamp the values
    pminsw      xmm0, max
    pmaxsw      xmm0, min

%if %1
    movq        xmm1, [rdi]
    pavgw       xmm0, xmm1                  ;rounded average with destination
%endif
    movq        [rdi], xmm0
%endm

;-----------------------------------------------------------------------------
; HIGH_GET_FILTERS
; Set-up for the 8- and 16-pixel-wide kernels.
; In:    arg(0) = src_ptr, arg(2) = output_ptr, arg(5) = filter, arg(6) = bps
; Out:   rsi = src_ptr, rdi = output_ptr
;        stack slots k0k1, k6k7, k2k5, k3k4 (interleaved tap pairs),
;        krd (0x40 in every dword), max ((1 << bps) - 1 in every word),
;        min (all zero)
; Clobb: rcx, rdx, xmm0-xmm7
; The caller must have %define'd the slot names before invoking this.
;-----------------------------------------------------------------------------
%macro HIGH_GET_FILTERS 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x00000040             ;rounding term: half of 1 << 7

    movdqa      xmm7, [rdx]                 ;load filters
    pshuflw     xmm0, xmm7, 0b              ;k0 in the four low words
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    pshufhw     xmm4, xmm7, 0b              ;k4 in the four high words
    pshufhw     xmm5, xmm7, 01010101b       ;k5
    pshufhw     xmm6, xmm7, 10101010b       ;k6
    pshufhw     xmm7, xmm7, 11111111b       ;k7
    punpcklqdq  xmm2, xmm2                  ;copy k2 into the high words too
    punpcklqdq  xmm3, xmm3                  ;copy k3 into the high words too
    punpcklwd   xmm0, xmm1                  ;k0 k1 k0 k1 ...
    punpckhwd   xmm6, xmm7                  ;k6 k7 k6 k7 ...
    punpckhwd   xmm2, xmm5                  ;k2 k5 k2 k5 ...
    punpckhwd   xmm3, xmm4                  ;k3 k4 k3 k4 ...

    movdqa      k0k1, xmm0                  ;store filter factors on stack
    movdqa      k6k7, xmm6
    movdqa      k2k5, xmm2
    movdqa      k3k4, xmm3

    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;broadcast 0x40 to all four dwords
    movdqa      krd, xmm6                   ;rounding

    ;Compute max and min values of a pixel
    mov         rdx, 0x00010001             ;word pattern 1, 1
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm0, rdx
    movq        xmm1, rcx                   ;shift count for psllw
    pshufd      xmm0, xmm0, 0b              ;1 in every word
    movdqa      xmm2, xmm0
    psllw       xmm0, xmm1                  ;1 << bps in every word
    psubw       xmm0, xmm2                  ;(1 << bps) - 1
    pxor        xmm1, xmm1
    movdqa      max, xmm0                   ;max value (for clamping)
    movdqa      min, xmm1                   ;min value (for clamping)
%endm

;-----------------------------------------------------------------------------
; LOAD_VERT_8 offset
; Load eight pixels from each of eight consecutive rows for a vertical filter.
; %1:    byte offset inside each row
; In:    rsi = address of row 0, rax = row stride in bytes, rdx = 3 * rax
; Out:   xmm0..xmm7 = rows 0..7
;        rsi is advanced by one row (rsi += rax) as a side effect
;-----------------------------------------------------------------------------
%macro LOAD_VERT_8 1
    movdqu      xmm0, [rsi + %1]            ;0
    movdqu      xmm1, [rsi + rax + %1]      ;1
    movdqu      xmm6, [rsi + rdx * 2 + %1]  ;6
    lea         rsi,  [rsi + rax]           ;step to the next row
    movdqu      xmm7, [rsi + rdx * 2 + %1]  ;7
    movdqu      xmm2, [rsi + rax + %1]      ;2
    movdqu      xmm3, [rsi + rax * 2 + %1]  ;3
    movdqu      xmm4, [rsi + rdx + %1]      ;4
    movdqu      xmm5, [rsi + rax * 4 + %1]  ;5
%endm

;-----------------------------------------------------------------------------
; HIGH_APPLY_FILTER_8 avg, offset
; Filter eight pixels and store them (16 bytes) at [rdi + offset].
; %1:    0 = plain store, 1 = average with the existing destination pixels
; %2:    byte offset added to rdi
; In:    xmm0..xmm7 = inputs for taps 0..7, eight words each
;        rdi        = destination
;        slots written by HIGH_GET_FILTERS, plus the scratch slot "temp"
; Clobb: xmm0-xmm7, temp
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_8 2
    movdqu      temp, xmm4                  ;spill tap 4 input: xmm4 is needed
    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm1                  ;taps (0,1), pixels 0-3
    punpckhwd   xmm4, xmm1                  ;taps (0,1), pixels 4-7
    movdqa      xmm1, xmm6
    punpcklwd   xmm6, xmm7                  ;taps (6,7), pixels 0-3
    punpckhwd   xmm1, xmm7                  ;taps (6,7), pixels 4-7
    movdqa      xmm7, xmm2
    punpcklwd   xmm2, xmm5                  ;taps (2,5), pixels 0-3
    punpckhwd   xmm7, xmm5                  ;taps (2,5), pixels 4-7

    movdqu      xmm5, temp                  ;reload tap 4 input
    movdqu      temp, xmm4                  ;spill taps (0,1), pixels 4-7
    movdqa      xmm4, xmm3
    punpcklwd   xmm3, xmm5                  ;taps (3,4), pixels 0-3
    punpckhwd   xmm4, xmm5                  ;taps (3,4), pixels 4-7
    movdqu      xmm5, temp                  ;taps (0,1), pixels 4-7

    pmaddwd     xmm0, k0k1                  ;multiply the filter factors
    pmaddwd     xmm5, k0k1
    pmaddwd     xmm6, k6k7
    pmaddwd     xmm1, k6k7
    pmaddwd     xmm2, k2k5
    pmaddwd     xmm7, k2k5
    pmaddwd     xmm3, k3k4
    pmaddwd     xmm4, k3k4

    paddd       xmm0, xmm6                  ;sum, pixels 0-3
    paddd       xmm0, xmm2
    paddd       xmm0, xmm3                  ;taps 3 and 4 are added last
    paddd       xmm5, xmm1                  ;sum, pixels 4-7
    paddd       xmm5, xmm7
    paddd       xmm5, xmm4

    paddd       xmm0, krd                   ;rounding
    paddd       xmm5, krd
    psrad       xmm0, 7                     ;shift
    psrad       xmm5, 7
    packssdw    xmm0, xmm5                  ;pack back to word

    ;clamp the values
    pminsw      xmm0, max
    pmaxsw      xmm0, min

%if %1
    movdqu      xmm1, [rdi + %2]
    pavgw       xmm0, xmm1                  ;rounded average with destination
%endif
    movdqu      [rdi + %2], xmm0
%endm

;-----------------------------------------------------------------------------
; Vertical filters.
;
; Register roles inside the row loops:
;   rsi = source row for tap 0      rdi = destination row
;   rax = source stride in bytes    rbx = destination stride in bytes
;   rdx = 3 * rax                   rcx = rows remaining
; rbx is pushed/popped by each vertical function because it is used for the
; destination stride.
;
; NOTE(review): the epilogue "add rsp, N / pop rsp" presumably undoes
; ALIGN_STACK, which is defined in x86_abi_support.asm - confirm there.
;-----------------------------------------------------------------------------

;void vpx_highbd_filter_block1d4_v8_sse2
;(
;    src_ptr,         arg(0)  16-bit pixels
;    src_pitch,       arg(1)  in pixels
;    output_ptr,      arg(2)  16-bit pixels
;    out_pitch,       arg(3)  in pixels
;    output_height,   arg(4)
;    filter,          arg(5)  eight 16-bit taps
;    bps              arg(6)  bits per sample
;)
; Vertical 8-tap filter, 4 pixels per row, plain store.
global sym(vpx_highbd_filter_block1d4_v8_sse2) PRIVATE
sym(vpx_highbd_filter_block1d4_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movq        xmm0, [rsi]                 ;load src: row 0
    movq        xmm1, [rsi + rax]           ;1
    movq        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi,  [rsi + rax]           ;step to the next source row
    movq        xmm7, [rsi + rdx * 2]       ;7
    movq        xmm2, [rsi + rax]           ;2
    movq        xmm3, [rsi + rax * 2]       ;3
    movq        xmm4, [rsi + rdx]           ;4
    movq        xmm5, [rsi + rax * 4]       ;5

    HIGH_APPLY_FILTER_4 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add rsp, 16 * 7
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_highbd_filter_block1d8_v8_sse2
;(
;    src_ptr,         arg(0)  16-bit pixels
;    src_pitch,       arg(1)  in pixels
;    output_ptr,      arg(2)  16-bit pixels
;    out_pitch,       arg(3)  in pixels
;    output_height,   arg(4)
;    filter,          arg(5)  eight 16-bit taps
;    bps              arg(6)  bits per sample
;)
; Vertical 8-tap filter, 8 pixels per row, plain store.
global sym(vpx_highbd_filter_block1d8_v8_sse2) PRIVATE
sym(vpx_highbd_filter_block1d8_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    LOAD_VERT_8 0                           ;advances rsi by one row
    HIGH_APPLY_FILTER_8 0, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add rsp, 16 * 8
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_highbd_filter_block1d16_v8_sse2
;(
;    src_ptr,         arg(0)  16-bit pixels
;    src_pitch,       arg(1)  in pixels
;    output_ptr,      arg(2)  16-bit pixels
;    out_pitch,       arg(3)  in pixels
;    output_height,   arg(4)
;    filter,          arg(5)  eight 16-bit taps
;    bps              arg(6)  bits per sample
;)
; Vertical 8-tap filter, 16 pixels per row (two 8-pixel halves), plain store.
global sym(vpx_highbd_filter_block1d16_v8_sse2) PRIVATE
sym(vpx_highbd_filter_block1d16_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    LOAD_VERT_8 0                           ;left half; advances rsi one row
    HIGH_APPLY_FILTER_8 0, 0
    sub         rsi, rax                    ;undo the advance for the 2nd half

    LOAD_VERT_8 16                          ;right half; advances rsi one row
    HIGH_APPLY_FILTER_8 0, 16
    add         rdi, rbx

    dec         rcx
    jnz         .loop

    add rsp, 16 * 8
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_highbd_filter_block1d4_v8_avg_sse2
;    (src_ptr, src_pitch, output_ptr, out_pitch, output_height, filter, bps)
; Vertical 8-tap filter, 4 pixels per row, averaged with the destination.
global sym(vpx_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d4_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movq        xmm0, [rsi]                 ;load src: row 0
    movq        xmm1, [rsi + rax]           ;1
    movq        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi,  [rsi + rax]           ;step to the next source row
    movq        xmm7, [rsi + rdx * 2]       ;7
    movq        xmm2, [rsi + rax]           ;2
    movq        xmm3, [rsi + rax * 2]       ;3
    movq        xmm4, [rsi + rdx]           ;4
    movq        xmm5, [rsi + rax * 4]       ;5

    HIGH_APPLY_FILTER_4 1

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add rsp, 16 * 7
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_highbd_filter_block1d8_v8_avg_sse2
;    (src_ptr, src_pitch, output_ptr, out_pitch, output_height, filter, bps)
; Vertical 8-tap filter, 8 pixels per row, averaged with the destination.
global sym(vpx_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d8_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
.loop:
    LOAD_VERT_8 0                           ;advances rsi by one row
    HIGH_APPLY_FILTER_8 1, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add rsp, 16 * 8
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_highbd_filter_block1d16_v8_avg_sse2
;    (src_ptr, src_pitch, output_ptr, out_pitch, output_height, filter, bps)
; Vertical 8-tap filter, 16 pixels per row, averaged with the destination.
global sym(vpx_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d16_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
.loop:
    LOAD_VERT_8 0                           ;left half; advances rsi one row
    HIGH_APPLY_FILTER_8 1, 0
    sub         rsi, rax                    ;undo the advance for the 2nd half

    LOAD_VERT_8 16                          ;right half; advances rsi one row
    HIGH_APPLY_FILTER_8 1, 16
    add         rdi, rbx

    dec         rcx
    jnz         .loop

    add rsp, 16 * 8
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; Horizontal filters.
;
; Register roles inside the row loops:
;   rsi = source row (loads start at rsi - 6, i.e. three pixels to the left)
;   rdi = destination row
;   rax = source stride in bytes    rdx = destination stride in bytes
;   rcx = rows remaining
; rbx is not used here, so it is not saved.
;-----------------------------------------------------------------------------

;void vpx_highbd_filter_block1d4_h8_sse2
;(
;    src_ptr,               arg(0)  16-bit pixels
;    src_pixels_per_line,   arg(1)  in pixels
;    output_ptr,            arg(2)  16-bit pixels
;    output_pitch,          arg(3)  in pixels
;    output_height,         arg(4)
;    filter,                arg(5)  eight 16-bit taps
;    bps                    arg(6)  bits per sample
;)
; Horizontal 8-tap filter, 4 pixels per row, plain store.
global sym(vpx_highbd_filter_block1d4_h8_sse2) PRIVATE
sym(vpx_highbd_filter_block1d4_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src: tap 0 input
    movdqu      xmm4, [rsi + 2]             ;tap 4 input
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm4
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm4

    psrldq      xmm1, 2                     ;tap 1 input (one pixel right)
    psrldq      xmm6, 4                     ;tap 6 input
    psrldq      xmm7, 6                     ;tap 7 input
    psrldq      xmm2, 4                     ;tap 2 input
    psrldq      xmm3, 6                     ;tap 3 input
    psrldq      xmm5, 2                     ;tap 5 input

    HIGH_APPLY_FILTER_4 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add rsp, 16 * 7
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_highbd_filter_block1d8_h8_sse2
;(
;    src_ptr,               arg(0)  16-bit pixels
;    src_pixels_per_line,   arg(1)  in pixels
;    output_ptr,            arg(2)  16-bit pixels
;    output_pitch,          arg(3)  in pixels
;    output_height,         arg(4)
;    filter,                arg(5)  eight 16-bit taps
;    bps                    arg(6)  bits per sample
;)
; Horizontal 8-tap filter, 8 pixels per row, plain store.
global sym(vpx_highbd_filter_block1d8_h8_sse2) PRIVATE
sym(vpx_highbd_filter_block1d8_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src: tap 0 input
    movdqu      xmm1, [rsi - 4]             ;1
    movdqu      xmm2, [rsi - 2]             ;2
    movdqu      xmm3, [rsi]                 ;3
    movdqu      xmm4, [rsi + 2]             ;4
    movdqu      xmm5, [rsi + 4]             ;5
    movdqu      xmm6, [rsi + 6]             ;6
    movdqu      xmm7, [rsi + 8]             ;7

    HIGH_APPLY_FILTER_8 0, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add rsp, 16 * 8
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_highbd_filter_block1d16_h8_sse2
;(
;    src_ptr,               arg(0)  16-bit pixels
;    src_pixels_per_line,   arg(1)  in pixels
;    output_ptr,            arg(2)  16-bit pixels
;    output_pitch,          arg(3)  in pixels
;    output_height,         arg(4)
;    filter,                arg(5)  eight 16-bit taps
;    bps                    arg(6)  bits per sample
;)
; Horizontal 8-tap filter, 16 pixels per row (two 8-pixel halves), plain store.
global sym(vpx_highbd_filter_block1d16_h8_sse2) PRIVATE
sym(vpx_highbd_filter_block1d16_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src: left half, tap 0 input
    movdqu      xmm1, [rsi - 4]             ;1
    movdqu      xmm2, [rsi - 2]             ;2
    movdqu      xmm3, [rsi]                 ;3
    movdqu      xmm4, [rsi + 2]             ;4
    movdqu      xmm5, [rsi + 4]             ;5
    movdqu      xmm6, [rsi + 6]             ;6
    movdqu      xmm7, [rsi + 8]             ;7

    HIGH_APPLY_FILTER_8 0, 0

    movdqu      xmm0, [rsi + 10]            ;load src: right half (+16 bytes)
    movdqu      xmm1, [rsi + 12]            ;1
    movdqu      xmm2, [rsi + 14]            ;2
    movdqu      xmm3, [rsi + 16]            ;3
    movdqu      xmm4, [rsi + 18]            ;4
    movdqu      xmm5, [rsi + 20]            ;5
    movdqu      xmm6, [rsi + 22]            ;6
    movdqu      xmm7, [rsi + 24]            ;7

    HIGH_APPLY_FILTER_8 0, 16

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add rsp, 16 * 8
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_highbd_filter_block1d4_h8_avg_sse2
;    (src_ptr, src_pixels_per_line, output_ptr, output_pitch, output_height,
;     filter, bps)
; Horizontal 8-tap filter, 4 pixels per row, averaged with the destination.
global sym(vpx_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d4_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src: tap 0 input
    movdqu      xmm4, [rsi + 2]             ;tap 4 input
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm4
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm4

    psrldq      xmm1, 2                     ;tap 1 input (one pixel right)
    psrldq      xmm6, 4                     ;tap 6 input
    psrldq      xmm7, 6                     ;tap 7 input
    psrldq      xmm2, 4                     ;tap 2 input
    psrldq      xmm3, 6                     ;tap 3 input
    psrldq      xmm5, 2                     ;tap 5 input

    HIGH_APPLY_FILTER_4 1

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add rsp, 16 * 7
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_highbd_filter_block1d8_h8_avg_sse2
;    (src_ptr, src_pixels_per_line, output_ptr, output_pitch, output_height,
;     filter, bps)
; Horizontal 8-tap filter, 8 pixels per row, averaged with the destination.
global sym(vpx_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d8_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src: tap 0 input
    movdqu      xmm1, [rsi - 4]             ;1
    movdqu      xmm2, [rsi - 2]             ;2
    movdqu      xmm3, [rsi]                 ;3
    movdqu      xmm4, [rsi + 2]             ;4
    movdqu      xmm5, [rsi + 4]             ;5
    movdqu      xmm6, [rsi + 6]             ;6
    movdqu      xmm7, [rsi + 8]             ;7

    HIGH_APPLY_FILTER_8 1, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add rsp, 16 * 8
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_highbd_filter_block1d16_h8_avg_sse2
;    (src_ptr, src_pixels_per_line, output_ptr, output_pitch, output_height,
;     filter, bps)
; Horizontal 8-tap filter, 16 pixels per row, averaged with the destination.
global sym(vpx_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d16_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src: left half, tap 0 input
    movdqu      xmm1, [rsi - 4]             ;1
    movdqu      xmm2, [rsi - 2]             ;2
    movdqu      xmm3, [rsi]                 ;3
    movdqu      xmm4, [rsi + 2]             ;4
    movdqu      xmm5, [rsi + 4]             ;5
    movdqu      xmm6, [rsi + 6]             ;6
    movdqu      xmm7, [rsi + 8]             ;7

    HIGH_APPLY_FILTER_8 1, 0

    movdqu      xmm0, [rsi + 10]            ;load src: right half (+16 bytes)
    movdqu      xmm1, [rsi + 12]            ;1
    movdqu      xmm2, [rsi + 14]            ;2
    movdqu      xmm3, [rsi + 16]            ;3
    movdqu      xmm4, [rsi + 18]            ;4
    movdqu      xmm5, [rsi + 20]            ;5
    movdqu      xmm6, [rsi + 22]            ;6
    movdqu      xmm7, [rsi + 24]            ;7

    HIGH_APPLY_FILTER_8 1, 16

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add rsp, 16 * 8
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret