;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
; Rounding constant added before the >>4 normalization of the bilinear
; filters below (the taps of each filter pair sum to 16).
pw_8: times  8 dw  8
; Bilinear filter taps for subpel offsets 0..15, SSE2/MMX layout
; (filter_idx_shift == 5, i.e. 32 bytes per offset): for each offset x,
; first a vector of eight words holding the (16 - x) tap, then a vector of
; eight words holding the x tap, consumed by two pmullw + paddw.
; The offset-8 entry collapses to "times 16 dw 8" since both taps are 8.
bilin_filter_m_sse2: times  8 dw 16
                     times  8 dw  0
                     times  8 dw 15
                     times  8 dw  1
                     times  8 dw 14
                     times  8 dw  2
                     times  8 dw 13
                     times  8 dw  3
                     times  8 dw 12
                     times  8 dw  4
                     times  8 dw 11
                     times  8 dw  5
                     times  8 dw 10
                     times  8 dw  6
                     times  8 dw  9
                     times  8 dw  7
                     times 16 dw  8
                     times  8 dw  7
                     times  8 dw  9
                     times  8 dw  6
                     times  8 dw 10
                     times  8 dw  5
                     times  8 dw 11
                     times  8 dw  4
                     times  8 dw 12
                     times  8 dw  3
                     times  8 dw 13
                     times  8 dw  2
                     times  8 dw 14
                     times  8 dw  1
                     times  8 dw 15

; Same taps in SSSE3 layout (filter_idx_shift == 4, 16 bytes per offset):
; interleaved (16 - x, x) byte pairs, so one pmaddubsw applies both taps
; to vertically/horizontally interleaved pixels.
bilin_filter_m_ssse3: times  8 db 16,  0
                      times  8 db 15,  1
                      times  8 db 14,  2
                      times  8 db 13,  3
                      times  8 db 12,  4
                      times  8 db 11,  5
                      times  8 db 10,  6
                      times  8 db  9,  7
                      times 16 db  8
                      times  8 db  7,  9
                      times  8 db  6, 10
                      times  8 db  5, 11
                      times  8 db  4, 12
                      times  8 db  3, 13
                      times  8 db  2, 14
                      times  8 db  1, 15

SECTION .text

; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
;
; This function returns the SE (the sum of differences, used as the return
; value) and stores SSE in the given pointer.
;-----------------------------------------------------------------------------
; SUM_SSE src1, dst1, src2, dst2, sum, sse
;
; Accumulate two (src, dst) word-vector pairs into the running totals:
;   sum += (src1 - dst1) + (src2 - dst2)          (word accumulator)
;   sse += (src1 - dst1)^2 + (src2 - dst2)^2      (dword accumulator, pmaddwd)
; Clobbers %1 and %3 (they hold the differences afterwards).
;-----------------------------------------------------------------------------
%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
  psubw                %3, %4
  psubw                %1, %2
  paddw                %5, %3
  pmaddwd              %3, %3
  paddw                %5, %1
  pmaddwd              %1, %1
  paddd                %6, %3
  paddd                %6, %1
%endmacro

;-----------------------------------------------------------------------------
; STORE_AND_RET
;
; Horizontally reduce the word sum in m6 (sign-extending word->dword first)
; and the dword SSE in m7, store SSE through the sse pointer argument, and
; return the sum in rax/eax.  m5 must still hold zero on entry; it is
; consumed to build the sign mask.
;-----------------------------------------------------------------------------
%macro STORE_AND_RET 0
%if mmsize == 16
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputing to a dword.
  pcmpgtw              m5, m6           ; mask for 0 > x
  movhlps              m3, m7
  punpcklwd            m4, m6, m5
  punpckhwd            m6, m5           ; sign-extend m6 word->dword
  paddd                m7, m3
  paddd                m6, m4
  pshufd               m3, m7, 0x1
  movhlps              m4, m6
  paddd                m7, m3
  paddd                m6, m4
  mov                  r1, ssem         ; r1 = unsigned int *sse
  pshufd               m4, m6, 0x1
  movd               [r1], m7           ; store sse
  paddd                m6, m4
  movd                rax, m6           ; store sum as return value
%else ; mmsize == 8
  pshufw               m4, m6, 0xe
  pshufw               m3, m7, 0xe
  paddw                m6, m4
  paddd                m7, m3
  pcmpgtw              m5, m6           ; mask for 0 > x
  mov                  r1, ssem         ; r1 = unsigned int *sse
  punpcklwd            m6, m5           ; sign-extend m6 word->dword
  movd               [r1], m7           ; store sse
  pshufw               m4, m6, 0xe
  paddd                m6, m4
  movd                rax, m6           ; store sum as return value
%endif
  RET
%endmacro

;-----------------------------------------------------------------------------
; INC_SRC_BY_SRC_STRIDE
;
; Advance srcq by one source row.  On x86-32 PIC builds the src_stride
; register can be repurposed as a filter-table pointer (see the
; .x_nonhalf_y_nonhalf setup below), so the stride is reloaded from its
; stack slot (src_stridemp) instead of the register.
;-----------------------------------------------------------------------------
%macro INC_SRC_BY_SRC_STRIDE  0
%if ARCH_X86=1 && CONFIG_PIC=1
  add                srcq, src_stridemp
%else
  add                srcq, src_strideq
%endif
%endmacro

;-----------------------------------------------------------------------------
; SUBPEL_VARIANCE W[, avg=0]
;
; Emit sub_pixel_variance<W>xh (avg == 0) or sub_pixel_avg_variance<W>xh
; (avg == 1) for the instruction set selected by the preceding INIT_*
; (cglobal applies the project prefix), implementing:
;
;   int sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                             int x_offset, int y_offset,
;                             const uint8_t *dst, ptrdiff_t dst_stride,
;                             int height, unsigned int *sse);
;
; x_offset/y_offset are 1/16th-pel offsets in [0, 15].  The code dispatches
; to one of nine specialized loops, crossing three cases per axis:
; offset 0 (plain copy), offset 8 (half-pel via pavgb) and the general
; bilinear case (taps from bilin_filter_m, indexed via filter_idx_shift).
; The avg variant additionally averages the filtered source against the
; sec[] predictor (pavgb m0, [secq]) before accumulation.
;
; Register roles throughout: m5 = constant zero (byte->word unpacking),
; m6 = running sum (words), m7 = running SSE (dwords).  Blocks narrower
; than 16 process two rows per iteration, so h is pre-halved (and sec_str
; doubled) below.
;-----------------------------------------------------------------------------
%macro SUBPEL_VARIANCE 1-2 0 ; W
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
%else
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5
%endif
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64

%ifdef PIC    ; 64bit PIC
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                      x_offset, y_offset, \
                                      dst, dst_stride, \
                                      sec, sec_stride, height, sse
%define sec_str sec_strideq
%else
cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
                                  y_offset, dst, dst_stride, height, sse
%endif
%define h heightd
%define bilin_filter sseq
%else
%if ARCH_X86=1 && CONFIG_PIC=1
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                      x_offset, y_offset, \
                                      dst, dst_stride, \
                                      sec, sec_stride, \
                                      height, sse, g_bilin_filter, g_pw_8
%define h dword heightm
%define sec_str sec_stridemp

;Store bilin_filter and pw_8 location in stack
GET_GOT eax
add esp, 4                ; restore esp

lea ecx, [GLOBAL(bilin_filter_m)]
mov g_bilin_filterm, ecx

lea ecx, [GLOBAL(pw_8)]
mov g_pw_8m, ecx

LOAD_IF_USED 0, 1         ; load eax, ecx back
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                                y_offset, dst, dst_stride, height, sse, \
                                g_bilin_filter, g_pw_8
%define h heightd

;Store bilin_filter and pw_8 location in stack
GET_GOT eax
add esp, 4                ; restore esp

lea ecx, [GLOBAL(bilin_filter_m)]
mov g_bilin_filterm, ecx

lea ecx, [GLOBAL(pw_8)]
mov g_pw_8m, ecx

LOAD_IF_USED 0, 1         ; load eax, ecx back
%endif
%else
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
                      7 + 2 * ARCH_X86_64, 13, src, src_stride, \
                                             x_offset, y_offset, \
                                             dst, dst_stride, \
                                             sec, sec_stride, \
                                             height, sse
%if ARCH_X86_64
%define h heightd
%define sec_str sec_strideq
%else
%define h dword heightm
%define sec_str sec_stridemp
%endif
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                              y_offset, dst, dst_stride, height, sse
%define h heightd
%endif

%define bilin_filter bilin_filter_m
%endif
%endif

  ASSERT               %1 <= 16         ; m6 overflows if w > 16
  pxor                 m6, m6           ; sum
  pxor                 m7, m7           ; sse
  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
  ; could perhaps use it for something more productive then
  pxor                 m5, m5           ; dedicated zero register
%if %1 < 16
  sar                   h, 1            ; narrow blocks do two rows per pass
%if %2 == 1 ; avg
  shl             sec_str, 1
%endif
%endif

  ; FIXME(rbultje) replace by jumptable?
  test          x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test          y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  mova                 m1, [dstq]
%if %2 == 1 ; avg
  pavgb                m0, [secq]
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%endif
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%if %2 == 0 ; !avg
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps               m0, [srcq+src_strideq]
%else ; mmsize == 8
  punpckldq            m0, [srcq+src_strideq]
%endif
%else ; !avg
  movh                 m2, [srcq+src_strideq]
%endif
  movh                 m1, [dstq]
  movh                 m3, [dstq+dst_strideq]
%if %2 == 1 ; avg
  pavgb                m0, [secq]
  punpcklbw            m3, m5
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; !avg
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   h
  jg .x_zero_y_zero_loop
  STORE_AND_RET

.x_zero_y_nonzero:
  cmp           y_offsetd, 8
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+src_strideq]
  mova                 m1, [dstq]
  pavgb                m0, m4           ; vertical half-pel average
  punpckhbw            m3, m1, m5
%if %2 == 1 ; avg
  pavgb                m0, [secq]
%endif
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m2, [srcq+src_strideq]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps               m2, [srcq+src_strideq*2]
%else ; mmsize == 8
%if %1 == 4
  movh                 m1, [srcq+src_strideq*2]
  punpckldq            m2, m1
%else
  punpckldq            m2, [srcq+src_strideq*2]
%endif
%endif
  movh                 m1, [dstq]
%if mmsize == 16
  movlhps              m0, m2
%else ; mmsize == 8
  punpckldq            m0, m2
%endif
  movh                 m3, [dstq+dst_strideq]
  pavgb                m0, m2
  punpcklbw            m1, m5
  pavgb                m0, [secq]
  punpcklbw            m3, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; !avg
  movh                 m4, [srcq+src_strideq*2]
  movh                 m1, [dstq]
  pavgb                m0, m2
  movh                 m3, [dstq+dst_strideq]
  pavgb                m2, m4
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   h
  jg .x_zero_y_half_loop
  STORE_AND_RET

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           y_offsetd, filter_idx_shift  ; scale offset into table index
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+y_offsetq+16]
%endif
  mova                m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+src_strideq]
  mova                 m1, [dstq]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4       ; interleave rows for pmaddubsw
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  punpcklbw            m0, m5
  punpcklbw            m4, m5
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
  pmullw               m2, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m2, m3
  paddw                m0, m4
%endif
  psraw                m2, 4            ; >>4: taps sum to 16
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m2, [srcq+src_strideq]
  movh                 m4, [srcq+src_strideq*2]
  movh                 m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movh                 m1, [dstq]
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_y_a
  pmullw               m1, m2, filter_y_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, m1
  paddw                m2, filter_rnd
  movh                 m1, [dstq]
  paddw                m2, m4
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   h
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonzero:
  cmp           x_offsetd, 8
  jne .x_nonhalf
  ; x_offset == 0.5
  test          y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+1]
  mova                 m1, [dstq]
  pavgb                m0, m4           ; horizontal half-pel average
  punpckhbw            m3, m1, m5
%if %2 == 1 ; avg
  pavgb                m0, [secq]
%endif
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m4, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps               m0, [srcq+src_strideq]
  movhps               m4, [srcq+src_strideq+1]
%else ; mmsize == 8
  punpckldq            m0, [srcq+src_strideq]
  punpckldq            m4, [srcq+src_strideq+1]
%endif
  movh                 m1, [dstq]
  movh                 m3, [dstq+dst_strideq]
  pavgb                m0, m4
  punpcklbw            m3, m5
  pavgb                m0, [secq]
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; !avg
  movh                 m2, [srcq+src_strideq]
  movh                 m1, [dstq]
  pavgb                m0, m4
  movh                 m4, [srcq+src_strideq+1]
  movh                 m3, [dstq+dst_strideq]
  pavgb                m2, m4
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   h
  jg .x_half_y_zero_loop
  STORE_AND_RET

.x_half_y_nonzero:
  cmp           y_offsetd, 8
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
  ; prime m0 with the horizontally-averaged row above the loop, so each
  ; iteration only loads/averages one new row
  movu                 m0, [srcq]
  movu                 m3, [srcq+1]
  add                srcq, src_strideq
  pavgb                m0, m3
.x_half_y_half_loop:
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
  mova                 m1, [dstq]
  pavgb                m4, m3
  punpckhbw            m3, m1, m5
  pavgb                m0, m4           ; vertical average with previous row
%if %2 == 1 ; avg
  punpcklbw            m1, m5
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4           ; carry row to next iteration

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m3, [srcq+1]
  add                srcq, src_strideq
  pavgb                m0, m3
.x_half_y_half_loop:
  movh                 m2, [srcq]
  movh                 m3, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps               m2, [srcq+src_strideq]
  movhps               m3, [srcq+src_strideq+1]
%else
%if %1 == 4
  movh                 m1, [srcq+src_strideq]
  punpckldq            m2, m1
  movh                 m1, [srcq+src_strideq+1]
  punpckldq            m3, m1
%else
  punpckldq            m2, [srcq+src_strideq]
  punpckldq            m3, [srcq+src_strideq+1]
%endif
%endif
  pavgb                m2, m3
%if mmsize == 16
  movlhps              m0, m2
  movhlps              m4, m2
%else ; mmsize == 8
  punpckldq            m0, m2
  pshufw               m4, m2, 0xe
%endif
  movh                 m1, [dstq]
  pavgb                m0, m2
  movh                 m3, [dstq+dst_strideq]
  pavgb                m0, [secq]
  punpcklbw            m3, m5
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; !avg
  movh                 m4, [srcq+src_strideq]
  movh                 m1, [srcq+src_strideq+1]
  pavgb                m2, m3
  pavgb                m4, m1
  pavgb                m0, m2
  pavgb                m2, m4
  movh                 m1, [dstq]
  movh                 m3, [dstq+dst_strideq]
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   h
  jg .x_half_y_half_loop
  STORE_AND_RET

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+y_offsetq+16]
%endif
  mova                m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else  ;x86_32
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  ; prime m0 with the first horizontally-averaged row
  movu                 m0, [srcq]
  movu                 m3, [srcq+1]
  add                srcq, src_strideq
  pavgb                m0, m3
.x_half_y_other_loop:
  movu                 m4, [srcq]
  movu                 m2, [srcq+1]
  mova                 m1, [dstq]
  pavgb                m4, m2
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
  psraw                m2, 4
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  pmullw               m2, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, filter_rnd
  punpcklbw            m0, m5
  paddw                m2, m3
  punpcklbw            m3, m4, m5
  pmullw               m0, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, filter_rnd
  psraw                m2, 4
  paddw                m0, m3
%endif
  punpckhbw            m3, m1, m5
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m3, [srcq+1]
  add                srcq, src_strideq
  pavgb                m0, m3
%if notcpuflag(ssse3)
  punpcklbw            m0, m5           ; keep carried row in word domain
%endif
.x_half_y_other_loop:
  movh                 m2, [srcq]
  movh                 m1, [srcq+1]
  movh                 m4, [srcq+src_strideq]
  movh                 m3, [srcq+src_strideq+1]
  pavgb                m2, m1
  pavgb                m4, m3
  movh                 m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movh                 m1, [dstq]
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
%else
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_y_a
  pmullw               m1, m2, filter_y_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  paddw                m0, m1
  pmullw               m1, m4, filter_y_b
  paddw                m2, filter_rnd
  paddw                m2, m1
  movh                 m1, [dstq]
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   h
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf:
  test          y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else    ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
;y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+1]
  mova                 m1, [dstq]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  punpcklbw            m0, m5
  punpcklbw            m4, m5
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m0, filter_rnd
  paddw                m2, m3
  paddw                m0, m4
%endif
  psraw                m2, 4
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m1, [srcq+1]
  movh                 m2, [srcq+src_strideq]
  movh                 m4, [srcq+src_strideq+1]
  movh                 m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  movh                 m1, [dstq]
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_x_a
  pmaddubsw            m2, filter_x_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m0, m1
  paddw                m2, filter_rnd
  movh                 m1, [dstq]
  paddw                m2, m4
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   h
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonzero:
  cmp           y_offsetd, 8
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else    ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  ; prime m0 with the first horizontally-filtered row (packed back to bytes)
  movu                 m0, [srcq]
  movu                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m1
  punpcklbw            m0, m1
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m1, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  paddw                m0, m1
  paddw                m2, m3
%endif
  psraw                m0, 4
  psraw                m2, 4
  add                srcq, src_strideq
  packuswb             m0, m2
.x_other_y_half_loop:
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
%if cpuflag(ssse3)
  mova                 m1, [dstq]
  punpckhbw            m2, m4, m3
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m4, m2
  pavgb                m0, m4           ; vertical half-pel in byte domain
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%else
  punpckhbw            m2, m4, m5
  punpckhbw            m1, m3, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  paddw                m4, m3
  paddw                m2, m1
  mova                 m1, [dstq]
  psraw                m4, 4
  psraw                m2, 4
  punpckhbw            m3, m1, m5
  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
  ; have a 1-register shortage to be able to store the backup of the bilin
  ; filtered second line as words as cache for the next line. Packing into
  ; a byte costs 1 pack and 2 unpacks, but saves a register.
  packuswb             m4, m2
  punpcklbw            m1, m5
  pavgb                m0, m4
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  pavgb                m0, [secq]
%endif
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  pmaddubsw            m0, filter_x_a
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m1
%endif
  add                srcq, src_strideq
  psraw                m0, 4
.x_other_y_half_loop:
  movh                 m2, [srcq]
  movh                 m1, [srcq+1]
  movh                 m4, [srcq+src_strideq]
  movh                 m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
  punpcklbw            m2, m1
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  movh                 m1, [dstq]
  movh                 m3, [dstq+dst_strideq]
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
%else
  punpcklbw            m2, m5
  punpcklbw            m1, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  paddw                m2, m1
  movh                 m1, [dstq]
  paddw                m4, m3
  movh                 m3, [dstq+dst_strideq]
%endif
  psraw                m2, 4
  psraw                m4, 4
  pavgw                m0, m2           ; vertical half-pel in word domain
  pavgw                m2, m4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline - also consider going to bytes here
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m3, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   h
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonhalf:
  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           x_offsetd, filter_idx_shift
  shl           y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                m10, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                m11, [bilin_filter+y_offsetq+16]
%endif
  mova                m12, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else   ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; In this case, there is NO unused register. Used src_stride register. Later,
; src_stride has to be loaded from stack when it is needed.
%define tempq src_strideq
  mov tempq, g_bilin_filterm
  add           x_offsetq, tempq
  add           y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
  add           y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  ; prime m0 with the first horizontally-filtered row (packed back to bytes)
  movu                 m0, [srcq]
  movu                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m1
  punpcklbw            m0, m1
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m1, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  paddw                m0, m1
  paddw                m2, m3
%endif
  psraw                m0, 4
  psraw                m2, 4

  INC_SRC_BY_SRC_STRIDE

  packuswb             m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
  mova                 m1, [dstq]
  punpckhbw            m2, m4, m3
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  punpckhbw            m3, m1, m5
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m4, m2           ; new row, byte domain
  punpckhbw            m2, m0, m4       ; interleave prev/new for y filter
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  punpcklbw            m1, m5
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
  psraw                m2, 4
  psraw                m0, 4
%else
  movu                 m3, [srcq]
  movu                 m4, [srcq+1]
  punpckhbw            m1, m3, m5
  punpckhbw            m2, m4, m5
  punpcklbw            m3, m5
  punpcklbw            m4, m5
  pmullw               m3, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m3, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m1, filter_rnd
  paddw                m3, m4
  paddw                m1, m2
  psraw                m3, 4
  psraw                m1, 4
  packuswb             m4, m3, m1       ; byte copy of new row for next pass
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  pmullw               m2, filter_y_a
  pmullw               m1, filter_y_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, m1
  mova                 m1, [dstq]
  paddw                m0, filter_rnd
  psraw                m2, 4
  paddw                m0, m3
  punpckhbw            m3, m1, m5
  psraw                m0, 4
  punpcklbw            m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  INC_SRC_BY_SRC_STRIDE
  add                dstq, dst_strideq
%else ; %1 < 16
  movh                 m0, [srcq]
  movh                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  pmaddubsw            m0, filter_x_a
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m1
%endif
  psraw                m0, 4
%if cpuflag(ssse3)
  packuswb             m0, m0
%endif

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movh                 m2, [srcq]
  movh                 m1, [srcq+1]

  INC_SRC_BY_SRC_STRIDE
  movh                 m4, [srcq]
  movh                 m3, [srcq+1]

%if cpuflag(ssse3)
  punpcklbw            m2, m1
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  movh                 m3, [dstq+dst_strideq]
  movh                 m1, [dstq]
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m2, m2
  packuswb             m4, m4
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
  psraw                m0, 4
  psraw                m2, 4
  punpcklbw            m1, m5
%else
  punpcklbw            m2, m5
  punpcklbw            m1, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  paddw                m2, m1
  paddw                m4, m3
  psraw                m2, 4
  psraw                m4, 4
  pmullw               m0, filter_y_a
  pmullw               m3, m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  pmullw               m1, m4, filter_y_b
  paddw                m2, filter_rnd
  paddw                m0, m3
  movh                 m3, [dstq+dst_strideq]
  paddw                m2, m1
  movh                 m1, [dstq]
  psraw                m0, 4
  psraw                m2, 4
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  INC_SRC_BY_SRC_STRIDE
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   h
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
%endmacro

; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
; between the ssse3 and non-ssse3 version. It may make sense to merge their
; code in the sense that the ssse3 version would jump to the appropriate
; location in the sse/2 version, rather than duplicating that code in the
; binary.
; Instantiate every variant.  Each SUBPEL_VARIANCE expansion targets the
; instruction set selected by the preceding INIT_* line: the 4-wide kernels
; run on MMX registers (mmsize == 8, "sse" cpuflag for pavgb/movh), while
; the 8/16-wide kernels use XMM registers.  The trailing ", 1" generates the
; *_avg_* flavor, which blends in the sec[] predictor before accumulating.

INIT_MMX sse
SUBPEL_VARIANCE  4
INIT_XMM sse2
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

INIT_MMX ssse3
SUBPEL_VARIANCE  4
INIT_XMM ssse3
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

INIT_MMX sse
SUBPEL_VARIANCE  4, 1
INIT_XMM sse2
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1

INIT_MMX ssse3
SUBPEL_VARIANCE  4, 1
INIT_XMM ssse3
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1