;
;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
; Rounding constant for the >>7 at the end of each 8-tap filter sum.
pw_64:  times 8 dw 64

; %define USE_PMULHRSW
; NOTE: pmulhrsw has a latency of 5 cycles.  Tests showed a performance loss
; when using this instruction.
;
; The add order below (based on ffvp9) must be followed to prevent outranges.
; x = k0k1 + k4k5
; y = k2k3 + k6k7
; z = signed SAT(x + y)

SECTION .text
; Stack space for the four packed coefficient pairs plus two spill slots.
%define LOCAL_VARS_SIZE 16*6

; Packs the 8 sixteen-bit filter taps (preloaded in m4) into bytes and
; broadcasts them as four coefficient-pair vectors (k0k1..k6k7) stored on the
; stack.  Also materializes krd = pw_64 (rounding), in m12 on x86-64 or on the
; stack (built without memory access when CONFIG_PIC) on x86-32.
%macro SETUP_LOCAL_VARS 0
    ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
    ; pmaddubsw has a higher latency on some platforms, this might be eased by
    ; interleaving the instructions.
    %define    k0k1  [rsp + 16*0]
    %define    k2k3  [rsp + 16*1]
    %define    k4k5  [rsp + 16*2]
    %define    k6k7  [rsp + 16*3]
    packsswb     m4, m4
    ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
    ; some platforms.
    pshuflw      m0, m4, 0b              ;k0_k1
    pshuflw      m1, m4, 01010101b       ;k2_k3
    pshuflw      m2, m4, 10101010b       ;k4_k5
    pshuflw      m3, m4, 11111111b       ;k6_k7
    punpcklqdq   m0, m0
    punpcklqdq   m1, m1
    punpcklqdq   m2, m2
    punpcklqdq   m3, m3
    mova       k0k1, m0
    mova       k2k3, m1
    mova       k4k5, m2
    mova       k6k7, m3
%if ARCH_X86_64
    %define     krd  m12
    %define    tmp0  [rsp + 16*4]
    %define    tmp1  [rsp + 16*5]
    mova        krd, [GLOBAL(pw_64)]
%else
    %define     krd  [rsp + 16*4]
%if CONFIG_PIC=0
    mova         m6, [GLOBAL(pw_64)]
%else
    ; build constants without accessing global memory
    pcmpeqb      m6, m6                  ;all ones
    psrlw        m6, 15
    psllw        m6, 6                   ;aka pw_64
%endif
    mova        krd, m6
%endif
%endm

;-------------------------------------------------------------------------------
%if ARCH_X86_64
  %define LOCAL_VARS_SIZE_H4 0
%else
  %define LOCAL_VARS_SIZE_H4 16*4
%endif

; Horizontal 8-tap filter over a 4-pixel-wide column of `height` rows.
; %1 = h8 (plain store) or h8_avg (pavgb the result with dst before storing).
; Two output rows are produced per loop iteration; an odd trailing row is
; handled after the loop.
%macro SUBPIX_HFILTER4 1
cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
                            src, sstride, dst, dstride, height, filter
    mova                m4, [filterq]
    packsswb            m4, m4
%if ARCH_X86_64
    ; Coefficients fit in two registers here: each holds two pair-vectors.
    %define       k0k1k4k5  m8
    %define       k2k3k6k7  m9
    %define            krd  m10
    mova               krd, [GLOBAL(pw_64)]
    pshuflw       k0k1k4k5, m4, 0b              ;k0_k1
    pshufhw       k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
    pshuflw       k2k3k6k7, m4, 01010101b       ;k2_k3
    pshufhw       k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
%else
    %define       k0k1k4k5  [rsp + 16*0]
    %define       k2k3k6k7  [rsp + 16*1]
    %define            krd  [rsp + 16*2]
    pshuflw             m6, m4, 0b              ;k0_k1
    pshufhw             m6, m6, 10101010b       ;k0_k1_k4_k5
    pshuflw             m7, m4, 01010101b       ;k2_k3
    pshufhw             m7, m7, 11111111b       ;k2_k3_k6_k7
%if CONFIG_PIC=0
    mova                m1, [GLOBAL(pw_64)]
%else
    ; build constants without accessing global memory
    pcmpeqb             m1, m1                  ;all ones
    psrlw               m1, 15
    psllw               m1, 6                   ;aka pw_64
%endif
    mova          k0k1k4k5, m6
    mova          k2k3k6k7, m7
    mova               krd, m1
%endif
    dec            heightd

.loop:
    ;Do two rows at once
    movu                m4, [srcq - 3]              ;row 0, taps start at x-3
    movu                m5, [srcq + sstrideq - 3]   ;row 1
    punpckhbw           m1, m4, m4
    punpcklbw           m4, m4
    punpckhbw           m3, m5, m5
    punpcklbw           m5, m5
    palignr             m0, m1, m4, 1
    pmaddubsw           m0, k0k1k4k5
    palignr             m1, m4, 5
    pmaddubsw           m1, k2k3k6k7
    palignr             m2, m3, m5, 1
    pmaddubsw           m2, k0k1k4k5
    palignr             m3, m5, 5
    pmaddubsw           m3, k2k3k6k7
    punpckhqdq          m4, m0, m2
    punpcklqdq          m0, m2
    punpckhqdq          m5, m1, m3
    punpcklqdq          m1, m3
    ; Saturating add order per the header note: (k0k1+k4k5)+(k2k3+k6k7).
    paddsw              m0, m4
    paddsw              m1, m5
%ifidn %1, h8_avg
    movd                m4, [dstq]
    movd                m5, [dstq + dstrideq]
%endif
    paddsw              m0, m1
    paddsw              m0, krd                     ;round
    psraw               m0, 7
    packuswb            m0, m0
    psrldq              m1, m0, 4                   ;second row's 4 bytes

%ifidn %1, h8_avg
    pavgb               m0, m4
    pavgb               m1, m5
%endif
    movd            [dstq], m0
    movd [dstq + dstrideq], m1

    lea               srcq, [srcq + sstrideq        ]
    prefetcht0              [srcq + 4 * sstrideq - 3]
    lea               srcq, [srcq + sstrideq        ]
    lea               dstq, [dstq + 2 * dstrideq    ]
    prefetcht0              [srcq + 2 * sstrideq - 3]

    sub            heightd, 2
    jg               .loop

    ; Do last row if output_height is odd
    jne              .done

    movu                m4, [srcq - 3]
    punpckhbw           m1, m4, m4
    punpcklbw           m4, m4
    palignr             m0, m1, m4, 1
    palignr             m1, m4, 5
    pmaddubsw           m0, k0k1k4k5
    pmaddubsw           m1, k2k3k6k7
    psrldq              m2, m0, 8
    psrldq              m3, m1, 8
    paddsw              m0, m2
    paddsw              m1, m3
    paddsw              m0, m1
    paddsw              m0, krd
    psraw               m0, 7
    packuswb            m0, m0
%ifidn %1, h8_avg
    movd                m4, [dstq]
    pavgb               m0, m4
%endif
    movd            [dstq], m0
.done:
    REP_RET
%endm

;-------------------------------------------------------------------------------
; Horizontal 8-tap filter over an 8-pixel-wide column of `height` rows.
; %1 = h8 or h8_avg.  Two rows per iteration, odd trailing row after the loop.
%macro SUBPIX_HFILTER8 1
cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
                            src, sstride, dst, dstride, height, filter
    mova        m4, [filterq]
    SETUP_LOCAL_VARS
    dec    heightd

.loop:
    ;Do two rows at once
    movu        m0, [srcq - 3]
    movu        m4, [srcq + sstrideq - 3]
    punpckhbw   m1, m0, m0
    punpcklbw   m0, m0
    palignr     m5, m1, m0, 13
    pmaddubsw   m5, k6k7
    palignr     m2, m1, m0, 5
    palignr     m3, m1, m0, 9
    palignr     m1, m0, 1
    pmaddubsw   m1, k0k1
    punpckhbw   m6, m4, m4
    punpcklbw   m4, m4
    pmaddubsw   m2, k2k3
    pmaddubsw   m3, k4k5

    palignr     m7, m6, m4, 13
    palignr     m0, m6, m4, 5
    pmaddubsw   m7, k6k7
    ; Saturating add order: (k0k1+k4k5)+(k2k3+k6k7).
    paddsw      m1, m3
    paddsw      m2, m5
    paddsw      m1, m2
%ifidn %1, h8_avg
    movh        m2, [dstq]
    movhps      m2, [dstq + dstrideq]
%endif
    palignr     m5, m6, m4, 9
    palignr     m6, m4, 1
    pmaddubsw   m0, k2k3
    pmaddubsw   m6, k0k1
    paddsw      m1, krd
    pmaddubsw   m5, k4k5
    psraw       m1, 7
    paddsw      m0, m7
    paddsw      m6, m5
    paddsw      m6, m0
    paddsw      m6, krd
    psraw       m6, 7
    packuswb    m1, m6
%ifidn %1, h8_avg
    pavgb       m1, m2
%endif
    movh    [dstq], m1
    movhps  [dstq + dstrideq], m1

    lea       srcq, [srcq + sstrideq        ]
    prefetcht0      [srcq + 4 * sstrideq - 3]
    lea       srcq, [srcq + sstrideq        ]
    lea       dstq, [dstq + 2 * dstrideq    ]
    prefetcht0      [srcq + 2 * sstrideq - 3]
    sub    heightd, 2
    jg       .loop

    ; Do last row if output_height is odd
    jne      .done

    movu        m0, [srcq - 3]
    punpckhbw   m3, m0, m0
    punpcklbw   m0, m0
    palignr     m1, m3, m0, 1
    palignr     m2, m3, m0, 5
    palignr     m4, m3, m0, 13
    palignr     m3, m0, 9
    pmaddubsw   m1, k0k1
    pmaddubsw   m2, k2k3
    pmaddubsw   m3, k4k5
    pmaddubsw   m4, k6k7
    paddsw      m1, m3
    paddsw      m4, m2
    paddsw      m1, m4
    paddsw      m1, krd
    psraw       m1, 7
    packuswb    m1, m1
%ifidn %1, h8_avg
    movh        m0, [dstq]
    pavgb       m1, m0
%endif
    movh    [dstq], m1
.done:
    REP_RET
%endm

;-------------------------------------------------------------------------------
; Horizontal 8-tap filter over a 16-pixel-wide column of `height` rows.
; %1 = h8 or h8_avg.  One row per iteration; even/odd output pixels are
; computed in separate accumulators (m0/m4) and interleaved at the end.
%macro SUBPIX_HFILTER16 1
cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
                             src, sstride, dst, dstride, height, filter
    mova          m4, [filterq]
    SETUP_LOCAL_VARS

.loop:
    prefetcht0        [srcq + 2 * sstrideq -3]

    movu          m0, [srcq - 3]
    movu          m4, [srcq - 2]
    pmaddubsw     m0, k0k1
    pmaddubsw     m4, k0k1
    movu          m1, [srcq - 1]
    movu          m5, [srcq + 0]
    pmaddubsw     m1, k2k3
    pmaddubsw     m5, k2k3
    movu          m2, [srcq + 1]
    movu          m6, [srcq + 2]
    pmaddubsw     m2, k4k5
    pmaddubsw     m6, k4k5
    movu          m3, [srcq + 3]
    movu          m7, [srcq + 4]
    pmaddubsw     m3, k6k7
    pmaddubsw     m7, k6k7
    ; Saturating add order: (k0k1+k4k5)+(k2k3+k6k7), per the header note.
    paddsw        m0, m2
    paddsw        m1, m3
    paddsw        m0, m1
    paddsw        m4, m6
    paddsw        m5, m7
    paddsw        m4, m5
    paddsw        m0, krd
    paddsw        m4, krd
    psraw         m0, 7
    psraw         m4, 7
    packuswb      m0, m0
    packuswb      m4, m4
    punpcklbw     m0, m4       ;interleave even/odd output pixels
%ifidn %1, h8_avg
    pavgb         m0, [dstq]
%endif
    lea         srcq, [srcq + sstrideq]
    mova      [dstq], m0
    lea         dstq, [dstq + dstrideq]
    dec      heightd
    jnz        .loop
    REP_RET
%endm

INIT_XMM ssse3
SUBPIX_HFILTER16 h8      ; vpx_filter_block1d16_h8_ssse3
SUBPIX_HFILTER16 h8_avg  ; vpx_filter_block1d16_h8_avg_ssse3
SUBPIX_HFILTER8  h8      ; vpx_filter_block1d8_h8_ssse3
SUBPIX_HFILTER8  h8_avg  ; vpx_filter_block1d8_h8_avg_ssse3
SUBPIX_HFILTER4  h8      ; vpx_filter_block1d4_h8_ssse3
SUBPIX_HFILTER4  h8_avg  ; vpx_filter_block1d4_h8_avg_ssse3

;-------------------------------------------------------------------------------

; TODO(Linfeng): Detect cpu type and choose the code with better performance.
; When set, x86-64 builds also use the two-pointer (srcq/src1q) code path that
; was tuned for slow Celeron cores, instead of the register-rich pipeline.
%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1

%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
    %define NUM_GENERAL_REG_USED 9
%else
    %define NUM_GENERAL_REG_USED 6
%endif

; Vertical 8-tap filter.  %1 = v8 (plain store) or v8_avg (pavgb with dst);
; %2 = block width (8 -> movh loads/stores, 4 -> movd).
; Two output rows per loop iteration; an odd trailing row is handled after
; the loop ("jne .done" falls through only when heightd reached exactly 0).
%macro SUBPIX_VFILTER 2
cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
                             src, sstride, dst, dstride, height, filter
    mova          m4, [filterq]
    SETUP_LOCAL_VARS

%ifidn %2, 8
    %define                movx  movh
%else
    %define                movx  movd
%endif

    dec heightd

%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON

%if ARCH_X86_64
    %define               src1q  r7
    %define           sstride6q  r8
    %define          dst_stride  dstrideq
%else
    ; x86-32 is register-starved: reuse filterq/dstrideq (dst stride is
    ; re-read from its stack slot, dstridemp, at store time).
    %define               src1q  filterq
    %define           sstride6q  dstrideq
    %define          dst_stride  dstridemp
%endif
    mov                   src1q, srcq
    add                   src1q, sstrideq               ;src1q = src + 1 row
    lea               sstride6q, [sstrideq + sstrideq * 4]
    add               sstride6q, sstrideq               ;pitch * 6

.loop:
    ;Do two rows at once
    movx                     m0, [srcq                ]  ;A
    movx                     m1, [src1q               ]  ;B
    punpcklbw                m0, m1                      ;A B
    movx                     m2, [srcq + sstrideq * 2 ]  ;C
    pmaddubsw                m0, k0k1
    mova                     m6, m2
    movx                     m3, [src1q + sstrideq * 2]  ;D
    punpcklbw                m2, m3                      ;C D
    pmaddubsw                m2, k2k3
    movx                     m4, [srcq + sstrideq * 4 ]  ;E
    mova                     m7, m4
    movx                     m5, [src1q + sstrideq * 4]  ;F
    punpcklbw                m4, m5                      ;E F
    pmaddubsw                m4, k4k5
    punpcklbw                m1, m6                      ;A B next iter
    movx                     m6, [srcq + sstride6q    ]  ;G
    punpcklbw                m5, m6                      ;E F next iter
    punpcklbw                m3, m7                      ;C D next iter
    pmaddubsw                m5, k4k5
    movx                     m7, [src1q + sstride6q   ]  ;H
    punpcklbw                m6, m7                      ;G H
    pmaddubsw                m6, k6k7
    pmaddubsw                m3, k2k3
    pmaddubsw                m1, k0k1
    ; Saturating add order: (k0k1+k4k5)+(k2k3+k6k7).
    paddsw                   m0, m4
    paddsw                   m2, m6
    movx                     m6, [srcq + sstrideq * 8 ]  ;H next iter
    punpcklbw                m7, m6
    pmaddubsw                m7, k6k7
    paddsw                   m0, m2
    paddsw                   m0, krd
    psraw                    m0, 7
    paddsw                   m1, m5
    packuswb                 m0, m0

    paddsw                   m3, m7
    paddsw                   m1, m3
    paddsw                   m1, krd
    psraw                    m1, 7
    lea                    srcq, [srcq + sstrideq * 2 ]
    lea                   src1q, [src1q + sstrideq * 2]
    packuswb                 m1, m1

%ifidn %1, v8_avg
    movx                     m2, [dstq]
    pavgb                    m0, m2
%endif
    movx                 [dstq], m0
    add                    dstq, dst_stride
%ifidn %1, v8_avg
    movx                     m3, [dstq]
    pavgb                    m1, m3
%endif
    movx                 [dstq], m1
    add                    dstq, dst_stride
    sub                 heightd, 2
    jg                    .loop

    ; Do last row if output_height is odd
    jne                   .done

    movx                     m0, [srcq                ]  ;A
    movx                     m1, [srcq + sstrideq     ]  ;B
    movx                     m6, [srcq + sstride6q    ]  ;G
    punpcklbw                m0, m1                      ;A B
    movx                     m7, [src1q + sstride6q   ]  ;H
    pmaddubsw                m0, k0k1
    movx                     m2, [srcq + sstrideq * 2 ]  ;C
    punpcklbw                m6, m7                      ;G H
    movx                     m3, [src1q + sstrideq * 2]  ;D
    pmaddubsw                m6, k6k7
    movx                     m4, [srcq + sstrideq * 4 ]  ;E
    punpcklbw                m2, m3                      ;C D
    movx                     m5, [src1q + sstrideq * 4]  ;F
    punpcklbw                m4, m5                      ;E F
    pmaddubsw                m2, k2k3
    pmaddubsw                m4, k4k5
    paddsw                   m2, m6
    paddsw                   m0, m4
    paddsw                   m0, m2
    paddsw                   m0, krd
    psraw                    m0, 7
    packuswb                 m0, m0
%ifidn %1, v8_avg
    movx                     m1, [dstq]
    pavgb                    m0, m1
%endif
    movx                 [dstq], m0

%else
    ; ARCH_X86_64
    ; Software-pipelined variant: the seven row-pair products for the current
    ; and next output rows are kept live across iterations and rotated down
    ; (m0..m7) each pass, so only the newest rows are loaded per iteration.

    movx                     m0, [srcq                ]  ;A
    movx                     m1, [srcq + sstrideq     ]  ;B
    lea                    srcq, [srcq + sstrideq * 2 ]
    movx                     m2, [srcq]                  ;C
    movx                     m3, [srcq + sstrideq]       ;D
    lea                    srcq, [srcq + sstrideq * 2 ]
    movx                     m4, [srcq]                  ;E
    movx                     m5, [srcq + sstrideq]       ;F
    lea                    srcq, [srcq + sstrideq * 2 ]
    movx                     m6, [srcq]                  ;G
    punpcklbw                m0, m1                      ;A B
    punpcklbw                m1, m2                      ;A B next iter
    punpcklbw                m2, m3                      ;C D
    punpcklbw                m3, m4                      ;C D next iter
    punpcklbw                m4, m5                      ;E F
    punpcklbw                m5, m6                      ;E F next iter

.loop:
    ;Do two rows at once
    movx                     m7, [srcq + sstrideq]       ;H
    lea                    srcq, [srcq + sstrideq * 2 ]
    movx                    m14, [srcq]                  ;H next iter
    punpcklbw                m6, m7                      ;G H
    punpcklbw                m7, m14                     ;G H next iter
    pmaddubsw                m8, m0, k0k1
    pmaddubsw                m9, m1, k0k1
    mova                     m0, m2                      ;rotate pipeline
    mova                     m1, m3
    pmaddubsw               m10, m2, k2k3
    pmaddubsw               m11, m3, k2k3
    mova                     m2, m4
    mova                     m3, m5
    pmaddubsw                m4, k4k5
    pmaddubsw                m5, k4k5
    paddsw                   m8, m4
    paddsw                   m9, m5
    mova                     m4, m6
    mova                     m5, m7
    pmaddubsw                m6, k6k7
    pmaddubsw                m7, k6k7
    paddsw                  m10, m6
    paddsw                  m11, m7
    paddsw                   m8, m10
    paddsw                   m9, m11
    mova                     m6, m14
    paddsw                   m8, krd
    paddsw                   m9, krd
    psraw                    m8, 7
    psraw                    m9, 7
%ifidn %2, 4
    packuswb                 m8, m8
    packuswb                 m9, m9
%else
    packuswb                 m8, m9
%endif

%ifidn %1, v8_avg
    movx                     m7, [dstq]
%ifidn %2, 4
    movx                    m10, [dstq + dstrideq]
    pavgb                    m9, m10
%else
    movhpd                   m7, [dstq + dstrideq]
%endif
    pavgb                    m8, m7
%endif
    movx                 [dstq], m8
%ifidn %2, 4
    movx    [dstq + dstrideq], m9
%else
    movhpd  [dstq + dstrideq], m8
%endif

    lea                    dstq, [dstq + dstrideq * 2 ]
    sub                 heightd, 2
    jg                    .loop

    ; Do last row if output_height is odd
    jne                   .done

    movx                     m7, [srcq + sstrideq]       ;H
    punpcklbw                m6, m7                      ;G H
    pmaddubsw                m0, k0k1
    pmaddubsw                m2, k2k3
    pmaddubsw                m4, k4k5
    pmaddubsw                m6, k6k7
    paddsw                   m0, m4
    paddsw                   m2, m6
    paddsw                   m0, m2
    paddsw                   m0, krd
    psraw                    m0, 7
    packuswb                 m0, m0
%ifidn %1, v8_avg
    movx                     m1, [dstq]
    pavgb                    m0, m1
%endif
    movx                 [dstq], m0

%endif ; ARCH_X86_64

.done:
    REP_RET

%endm

;-------------------------------------------------------------------------------
; Vertical 8-tap filter for 16-pixel-wide blocks.  %1 = v8 or v8_avg.
; The slow-Celeron/x86-32 path does one 16-wide row per iteration as two
; 8-wide halves; the x86-64 pipelined path does two rows per iteration with
; low/high byte lanes split across register pairs (tmp0/tmp1 spill the
; "next iter" A B products).
%macro SUBPIX_VFILTER16 1
cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
                             src, sstride, dst, dstride, height, filter
    mova          m4, [filterq]
    SETUP_LOCAL_VARS

%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON

%if ARCH_X86_64
    %define               src1q  r7
    %define           sstride6q  r8
    %define          dst_stride  dstrideq
%else
    ; x86-32: reuse filterq/dstrideq (dst stride re-read from dstridemp).
    %define               src1q  filterq
    %define           sstride6q  dstrideq
    %define          dst_stride  dstridemp
%endif
    lea                   src1q, [srcq + sstrideq]      ;src + 1 row
    lea               sstride6q, [sstrideq + sstrideq * 4]
    add               sstride6q, sstrideq               ;pitch * 6

.loop:
    ; Left 8 pixels of rows A..H.
    movh                     m0, [srcq                ]  ;A
    movh                     m1, [src1q               ]  ;B
    movh                     m2, [srcq + sstrideq * 2 ]  ;C
    movh                     m3, [src1q + sstrideq * 2]  ;D
    movh                     m4, [srcq + sstrideq * 4 ]  ;E
    movh                     m5, [src1q + sstrideq * 4]  ;F

    punpcklbw                m0, m1                      ;A B
    movh                     m6, [srcq + sstride6q]      ;G
    punpcklbw                m2, m3                      ;C D
    movh                     m7, [src1q + sstride6q]     ;H
    punpcklbw                m4, m5                      ;E F
    pmaddubsw                m0, k0k1
    ; Right 8 pixels (offset +8), interleaved with the left-half math.
    movh                     m3, [srcq + 8]              ;A
    pmaddubsw                m2, k2k3
    punpcklbw                m6, m7                      ;G H
    movh                     m5, [srcq + sstrideq + 8]   ;B
    pmaddubsw                m4, k4k5
    punpcklbw                m3, m5                      ;A B
    movh                     m7, [srcq + sstrideq * 2 + 8]  ;C
    pmaddubsw                m6, k6k7
    movh                     m5, [src1q + sstrideq * 2 + 8] ;D
    punpcklbw                m7, m5                      ;C D
    paddsw                   m2, m6
    pmaddubsw                m3, k0k1
    movh                     m1, [srcq + sstrideq * 4 + 8]  ;E
    paddsw                   m0, m4
    pmaddubsw                m7, k2k3
    movh                     m6, [src1q + sstrideq * 4 + 8] ;F
    punpcklbw                m1, m6                      ;E F
    paddsw                   m0, m2
    paddsw                   m0, krd
    movh                     m2, [srcq + sstride6q + 8]  ;G
    pmaddubsw                m1, k4k5
    movh                     m5, [src1q + sstride6q + 8] ;H
    psraw                    m0, 7
    punpcklbw                m2, m5                      ;G H
    pmaddubsw                m2, k6k7
    paddsw                   m7, m2
    paddsw                   m3, m1
    paddsw                   m3, m7
    paddsw                   m3, krd
    psraw                    m3, 7
    packuswb                 m0, m3                      ;left | right halves

    add                    srcq, sstrideq
    add                   src1q, sstrideq
%ifidn %1, v8_avg
    pavgb                    m0, [dstq]
%endif
    mova                 [dstq], m0
    add                    dstq, dst_stride
    dec                 heightd
    jnz                   .loop
    REP_RET

%else
    ; ARCH_X86_64
    dec                 heightd

    ; Prime the pipeline: products for rows A..G, low and high byte lanes,
    ; for both the current and the next output row ("next iter" pairs are
    ; spilled to tmp0/tmp1 when registers run out).
    movu                     m1, [srcq                ]  ;A
    movu                     m3, [srcq + sstrideq     ]  ;B
    lea                    srcq, [srcq + sstrideq * 2]
    punpcklbw                m0, m1, m3                  ;A B
    punpckhbw                m1, m3                      ;A B
    movu                     m5, [srcq]                  ;C
    punpcklbw                m2, m3, m5                  ;A B next iter
    punpckhbw                m3, m5                      ;A B next iter
    mova                   tmp0, m2                      ;store to stack
    mova                   tmp1, m3                      ;store to stack
    movu                     m7, [srcq + sstrideq]       ;D
    lea                    srcq, [srcq + sstrideq * 2]
    punpcklbw                m4, m5, m7                  ;C D
    punpckhbw                m5, m7                      ;C D
    movu                     m9, [srcq]                  ;E
    punpcklbw                m6, m7, m9                  ;C D next iter
    punpckhbw                m7, m9                      ;C D next iter
    movu                    m11, [srcq + sstrideq]       ;F
    lea                    srcq, [srcq + sstrideq * 2]
    punpcklbw                m8, m9, m11                 ;E F
    punpckhbw                m9, m11                     ;E F
    movu                     m2, [srcq]                  ;G
    punpcklbw               m10, m11, m2                 ;E F next iter
    punpckhbw               m11, m2                      ;E F next iter

.loop:
    ;Do two rows at once
    pmaddubsw               m13, m0, k0k1
    mova                     m0, m4
    pmaddubsw               m14, m8, k4k5
    pmaddubsw               m15, m4, k2k3
    mova                     m4, m8
    paddsw                  m13, m14
    movu                     m3, [srcq + sstrideq]       ;H
    lea                    srcq, [srcq + sstrideq * 2]
    punpcklbw               m14, m2, m3                  ;G H
    mova                     m8, m14
    pmaddubsw               m14, k6k7
    paddsw                  m15, m14
    paddsw                  m13, m15
    paddsw                  m13, krd
    psraw                   m13, 7

    pmaddubsw               m14, m1, k0k1
    pmaddubsw                m1, m9, k4k5
    pmaddubsw               m15, m5, k2k3
    paddsw                  m14, m1
    mova                     m1, m5
    mova                     m5, m9
    punpckhbw                m2, m3                      ;G H
    mova                     m9, m2
    pmaddubsw                m2, k6k7
    paddsw                  m15, m2
    paddsw                  m14, m15
    paddsw                  m14, krd
    psraw                   m14, 7
    packuswb                m13, m14
%ifidn %1, v8_avg
    pavgb                   m13, [dstq]
%endif
    mova                 [dstq], m13

    ; next iter
    pmaddubsw               m15, tmp0, k0k1
    pmaddubsw               m14, m10, k4k5
    pmaddubsw               m13, m6, k2k3
    paddsw                  m15, m14
    mova                   tmp0, m6
    mova                     m6, m10
    movu                     m2, [srcq]                  ;G next iter
    punpcklbw               m14, m3, m2                  ;G H next iter
    mova                    m10, m14
    pmaddubsw               m14, k6k7
    paddsw                  m13, m14
    paddsw                  m15, m13
    paddsw                  m15, krd
    psraw                   m15, 7

    pmaddubsw               m14, tmp1, k0k1
    mova                   tmp1, m7
    pmaddubsw               m13, m7, k2k3
    mova                     m7, m11
    pmaddubsw               m11, k4k5
    paddsw                  m14, m11
    punpckhbw                m3, m2                      ;G H next iter
    mova                    m11, m3
    pmaddubsw                m3, k6k7
    paddsw                  m13, m3
    paddsw                  m14, m13
    paddsw                  m14, krd
    psraw                   m14, 7
    packuswb                m15, m14
%ifidn %1, v8_avg
    pavgb                   m15, [dstq + dstrideq]
%endif
    mova     [dstq + dstrideq], m15
    lea                    dstq, [dstq + dstrideq * 2]
    sub                 heightd, 2
    jg                    .loop

    ; Do last row if output_height is odd
    jne                   .done

    movu                     m3, [srcq + sstrideq]       ;H
    punpcklbw                m6, m2, m3                  ;G H
    punpckhbw                m2, m3                      ;G H
    pmaddubsw                m0, k0k1
    pmaddubsw                m1, k0k1
    pmaddubsw                m4, k2k3
    pmaddubsw                m5, k2k3
    pmaddubsw                m8, k4k5
    pmaddubsw                m9, k4k5
    pmaddubsw                m6, k6k7
    pmaddubsw                m2, k6k7
    paddsw                   m0, m8
    paddsw                   m1, m9
    paddsw                   m4, m6
    paddsw                   m5, m2
    paddsw                   m0, m4
    paddsw                   m1, m5
    paddsw                   m0, krd
    paddsw                   m1, krd
    psraw                    m0, 7
    psraw                    m1, 7
    packuswb                 m0, m1
%ifidn %1, v8_avg
    pavgb                    m0, [dstq]
%endif
    mova                 [dstq], m0

.done:
    REP_RET

%endif ; ARCH_X86_64

%endm

INIT_XMM ssse3
SUBPIX_VFILTER16     v8  ; vpx_filter_block1d16_v8_ssse3
SUBPIX_VFILTER16 v8_avg  ; vpx_filter_block1d16_v8_avg_ssse3
SUBPIX_VFILTER       v8, 8   ; vpx_filter_block1d8_v8_ssse3
SUBPIX_VFILTER   v8_avg, 8   ; vpx_filter_block1d8_v8_avg_ssse3
SUBPIX_VFILTER       v8, 4   ; vpx_filter_block1d4_v8_ssse3
SUBPIX_VFILTER   v8_avg, 4   ; vpx_filter_block1d4_v8_avg_ssse3