; libvpx vp9 sub-pixel variance (x86) -- code-viewer breadcrumb removed
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

     11 %include "third_party/x86inc/x86inc.asm"
     12 
     13 SECTION_RODATA
     14 pw_8: times  8 dw  8
     15 bilin_filter_m_sse2: times  8 dw 16
     16                      times  8 dw  0
     17                      times  8 dw 15
     18                      times  8 dw  1
     19                      times  8 dw 14
     20                      times  8 dw  2
     21                      times  8 dw 13
     22                      times  8 dw  3
     23                      times  8 dw 12
     24                      times  8 dw  4
     25                      times  8 dw 11
     26                      times  8 dw  5
     27                      times  8 dw 10
     28                      times  8 dw  6
     29                      times  8 dw  9
     30                      times  8 dw  7
     31                      times 16 dw  8
     32                      times  8 dw  7
     33                      times  8 dw  9
     34                      times  8 dw  6
     35                      times  8 dw 10
     36                      times  8 dw  5
     37                      times  8 dw 11
     38                      times  8 dw  4
     39                      times  8 dw 12
     40                      times  8 dw  3
     41                      times  8 dw 13
     42                      times  8 dw  2
     43                      times  8 dw 14
     44                      times  8 dw  1
     45                      times  8 dw 15
     46 
     47 bilin_filter_m_ssse3: times  8 db 16,  0
     48                       times  8 db 15,  1
     49                       times  8 db 14,  2
     50                       times  8 db 13,  3
     51                       times  8 db 12,  4
     52                       times  8 db 11,  5
     53                       times  8 db 10,  6
     54                       times  8 db  9,  7
     55                       times 16 db  8
     56                       times  8 db  7,  9
     57                       times  8 db  6, 10
     58                       times  8 db  5, 11
     59                       times  8 db  4, 12
     60                       times  8 db  3, 13
     61                       times  8 db  2, 14
     62                       times  8 db  1, 15
     63 
     64 SECTION .text
     65 
     66 ; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
     67 ;                               int x_offset, int y_offset,
     68 ;                               const uint8_t *dst, ptrdiff_t dst_stride,
     69 ;                               int height, unsigned int *sse);
     70 ;
     71 ; This function returns the SE and stores SSE in the given pointer.
     72 
     73 %macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
     74   psubw                %3, %4
     75   psubw                %1, %2
     76   paddw                %5, %3
     77   pmaddwd              %3, %3
     78   paddw                %5, %1
     79   pmaddwd              %1, %1
     80   paddd                %6, %3
     81   paddd                %6, %1
     82 %endmacro
     83 
     84 %macro STORE_AND_RET 0
     85 %if mmsize == 16
     86   ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
     87   ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
     88   ; We have to sign-extend it before adding the words within the register
     89   ; and outputing to a dword.
     90   pcmpgtw              m5, m6           ; mask for 0 > x
     91   movhlps              m3, m7
     92   punpcklwd            m4, m6, m5
     93   punpckhwd            m6, m5           ; sign-extend m6 word->dword
     94   paddd                m7, m3
     95   paddd                m6, m4
     96   pshufd               m3, m7, 0x1
     97   movhlps              m4, m6
     98   paddd                m7, m3
     99   paddd                m6, m4
    100   mov                  r1, ssem         ; r1 = unsigned int *sse
    101   pshufd               m4, m6, 0x1
    102   movd               [r1], m7           ; store sse
    103   paddd                m6, m4
    104   movd                rax, m6           ; store sum as return value
    105 %else ; mmsize == 8
    106   pshufw               m4, m6, 0xe
    107   pshufw               m3, m7, 0xe
    108   paddw                m6, m4
    109   paddd                m7, m3
    110   pcmpgtw              m5, m6           ; mask for 0 > x
    111   mov                  r1, ssem         ; r1 = unsigned int *sse
    112   punpcklwd            m6, m5           ; sign-extend m6 word->dword
    113   movd               [r1], m7           ; store sse
    114   pshufw               m4, m6, 0xe
    115   paddd                m6, m4
    116   movd                rax, m6           ; store sum as return value
    117 %endif
    118   RET
    119 %endmacro
    120 
    121 %macro INC_SRC_BY_SRC_STRIDE  0
    122 %if ARCH_X86=1 && CONFIG_PIC=1
    123   add                srcq, src_stridemp
    124 %else
    125   add                srcq, src_strideq
    126 %endif
    127 %endmacro
    128 
    129 %macro SUBPEL_VARIANCE 1-2 0 ; W
    130 %if cpuflag(ssse3)
    131 %define bilin_filter_m bilin_filter_m_ssse3
    132 %define filter_idx_shift 4
    133 %else
    134 %define bilin_filter_m bilin_filter_m_sse2
    135 %define filter_idx_shift 5
    136 %endif
    137 ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
    138 ; 11, not 13, if the registers are ordered correctly. May make a minor speed
    139 ; difference on Win64
    140 
    141 %ifdef PIC    ; 64bit PIC
    142   %if %2 == 1 ; avg
    143     cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
    144                                       x_offset, y_offset, \
    145                                       dst, dst_stride, \
    146                                       sec, sec_stride, height, sse
    147     %define sec_str sec_strideq
    148   %else
    149     cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
    150                                   y_offset, dst, dst_stride, height, sse
    151   %endif
    152   %define h heightd
    153   %define bilin_filter sseq
    154 %else
    155   %if ARCH_X86=1 && CONFIG_PIC=1
    156     %if %2 == 1 ; avg
    157       cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
    158                                   x_offset, y_offset, \
    159                                   dst, dst_stride, \
    160                                   sec, sec_stride, \
    161                                   height, sse, g_bilin_filter, g_pw_8
    162       %define h dword heightm
    163       %define sec_str sec_stridemp
    164 
    165       ;Store bilin_filter and pw_8 location in stack
    166       GET_GOT eax
    167       add esp, 4                ; restore esp
    168 
    169       lea ecx, [GLOBAL(bilin_filter_m)]
    170       mov g_bilin_filterm, ecx
    171 
    172       lea ecx, [GLOBAL(pw_8)]
    173       mov g_pw_8m, ecx
    174 
    175       LOAD_IF_USED 0, 1         ; load eax, ecx back
    176     %else
    177       cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
    178                                 y_offset, dst, dst_stride, height, sse, \
    179                                 g_bilin_filter, g_pw_8
    180       %define h heightd
    181 
    182       ;Store bilin_filter and pw_8 location in stack
    183       GET_GOT eax
    184       add esp, 4                ; restore esp
    185 
    186       lea ecx, [GLOBAL(bilin_filter_m)]
    187       mov g_bilin_filterm, ecx
    188 
    189       lea ecx, [GLOBAL(pw_8)]
    190       mov g_pw_8m, ecx
    191 
    192       LOAD_IF_USED 0, 1         ; load eax, ecx back
    193     %endif
    194   %else
    195     %if %2 == 1 ; avg
    196       cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
    197                         7 + 2 * ARCH_X86_64, 13, src, src_stride, \
    198                                              x_offset, y_offset, \
    199                                              dst, dst_stride, \
    200                                              sec, sec_stride, \
    201                                              height, sse
    202       %if ARCH_X86_64
    203       %define h heightd
    204       %define sec_str sec_strideq
    205       %else
    206       %define h dword heightm
    207       %define sec_str sec_stridemp
    208       %endif
    209     %else
    210       cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
    211                               y_offset, dst, dst_stride, height, sse
    212       %define h heightd
    213     %endif
    214 
    215     %define bilin_filter bilin_filter_m
    216   %endif
    217 %endif
    218 
    219   ASSERT               %1 <= 16         ; m6 overflows if w > 16
    220   pxor                 m6, m6           ; sum
    221   pxor                 m7, m7           ; sse
    222   ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
    223   ; could perhaps use it for something more productive then
    224   pxor                 m5, m5           ; dedicated zero register
    225 %if %1 < 16
    226   sar                   h, 1
    227 %if %2 == 1 ; avg
    228   shl             sec_str, 1
    229 %endif
    230 %endif
    231 
    232   ; FIXME(rbultje) replace by jumptable?
    233   test          x_offsetd, x_offsetd
    234   jnz .x_nonzero
    235   ; x_offset == 0
    236   test          y_offsetd, y_offsetd
    237   jnz .x_zero_y_nonzero
    238 
    239   ; x_offset == 0 && y_offset == 0
    240 .x_zero_y_zero_loop:
    241 %if %1 == 16
    242   movu                 m0, [srcq]
    243   mova                 m1, [dstq]
    244 %if %2 == 1 ; avg
    245   pavgb                m0, [secq]
    246   punpckhbw            m3, m1, m5
    247   punpcklbw            m1, m5
    248 %endif
    249   punpckhbw            m2, m0, m5
    250   punpcklbw            m0, m5
    251 %if %2 == 0 ; !avg
    252   punpckhbw            m3, m1, m5
    253   punpcklbw            m1, m5
    254 %endif
    255   SUM_SSE              m0, m1, m2, m3, m6, m7
    256 
    257   add                srcq, src_strideq
    258   add                dstq, dst_strideq
    259 %else ; %1 < 16
    260   movh                 m0, [srcq]
    261 %if %2 == 1 ; avg
    262 %if mmsize == 16
    263   movhps               m0, [srcq+src_strideq]
    264 %else ; mmsize == 8
    265   punpckldq            m0, [srcq+src_strideq]
    266 %endif
    267 %else ; !avg
    268   movh                 m2, [srcq+src_strideq]
    269 %endif
    270   movh                 m1, [dstq]
    271   movh                 m3, [dstq+dst_strideq]
    272 %if %2 == 1 ; avg
    273   pavgb                m0, [secq]
    274   punpcklbw            m3, m5
    275   punpcklbw            m1, m5
    276   punpckhbw            m2, m0, m5
    277   punpcklbw            m0, m5
    278 %else ; !avg
    279   punpcklbw            m0, m5
    280   punpcklbw            m2, m5
    281   punpcklbw            m3, m5
    282   punpcklbw            m1, m5
    283 %endif
    284   SUM_SSE              m0, m1, m2, m3, m6, m7
    285 
    286   lea                srcq, [srcq+src_strideq*2]
    287   lea                dstq, [dstq+dst_strideq*2]
    288 %endif
    289 %if %2 == 1 ; avg
    290   add                secq, sec_str
    291 %endif
    292   dec                   h
    293   jg .x_zero_y_zero_loop
    294   STORE_AND_RET
    295 
    296 .x_zero_y_nonzero:
    297   cmp           y_offsetd, 8
    298   jne .x_zero_y_nonhalf
    299 
    300   ; x_offset == 0 && y_offset == 0.5
    301 .x_zero_y_half_loop:
    302 %if %1 == 16
    303   movu                 m0, [srcq]
    304   movu                 m4, [srcq+src_strideq]
    305   mova                 m1, [dstq]
    306   pavgb                m0, m4
    307   punpckhbw            m3, m1, m5
    308 %if %2 == 1 ; avg
    309   pavgb                m0, [secq]
    310 %endif
    311   punpcklbw            m1, m5
    312   punpckhbw            m2, m0, m5
    313   punpcklbw            m0, m5
    314   SUM_SSE              m0, m1, m2, m3, m6, m7
    315 
    316   add                srcq, src_strideq
    317   add                dstq, dst_strideq
    318 %else ; %1 < 16
    319   movh                 m0, [srcq]
    320   movh                 m2, [srcq+src_strideq]
    321 %if %2 == 1 ; avg
    322 %if mmsize == 16
    323   movhps               m2, [srcq+src_strideq*2]
    324 %else ; mmsize == 8
    325 %if %1 == 4
    326   movh                 m1, [srcq+src_strideq*2]
    327   punpckldq            m2, m1
    328 %else
    329   punpckldq            m2, [srcq+src_strideq*2]
    330 %endif
    331 %endif
    332   movh                 m1, [dstq]
    333 %if mmsize == 16
    334   movlhps              m0, m2
    335 %else ; mmsize == 8
    336   punpckldq            m0, m2
    337 %endif
    338   movh                 m3, [dstq+dst_strideq]
    339   pavgb                m0, m2
    340   punpcklbw            m1, m5
    341   pavgb                m0, [secq]
    342   punpcklbw            m3, m5
    343   punpckhbw            m2, m0, m5
    344   punpcklbw            m0, m5
    345 %else ; !avg
    346   movh                 m4, [srcq+src_strideq*2]
    347   movh                 m1, [dstq]
    348   pavgb                m0, m2
    349   movh                 m3, [dstq+dst_strideq]
    350   pavgb                m2, m4
    351   punpcklbw            m0, m5
    352   punpcklbw            m2, m5
    353   punpcklbw            m3, m5
    354   punpcklbw            m1, m5
    355 %endif
    356   SUM_SSE              m0, m1, m2, m3, m6, m7
    357 
    358   lea                srcq, [srcq+src_strideq*2]
    359   lea                dstq, [dstq+dst_strideq*2]
    360 %endif
    361 %if %2 == 1 ; avg
    362   add                secq, sec_str
    363 %endif
    364   dec                   h
    365   jg .x_zero_y_half_loop
    366   STORE_AND_RET
    367 
    368 .x_zero_y_nonhalf:
    369   ; x_offset == 0 && y_offset == bilin interpolation
    370 %ifdef PIC
    371   lea        bilin_filter, [bilin_filter_m]
    372 %endif
    373   shl           y_offsetd, filter_idx_shift
    374 %if ARCH_X86_64 && mmsize == 16
    375   mova                 m8, [bilin_filter+y_offsetq]
    376 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
    377   mova                 m9, [bilin_filter+y_offsetq+16]
    378 %endif
    379   mova                m10, [pw_8]
    380 %define filter_y_a m8
    381 %define filter_y_b m9
    382 %define filter_rnd m10
    383 %else ; x86-32 or mmx
    384 %if ARCH_X86=1 && CONFIG_PIC=1
    385 ; x_offset == 0, reuse x_offset reg
    386 %define tempq x_offsetq
    387   add y_offsetq, g_bilin_filterm
    388 %define filter_y_a [y_offsetq]
    389 %define filter_y_b [y_offsetq+16]
    390   mov tempq, g_pw_8m
    391 %define filter_rnd [tempq]
    392 %else
    393   add           y_offsetq, bilin_filter
    394 %define filter_y_a [y_offsetq]
    395 %define filter_y_b [y_offsetq+16]
    396 %define filter_rnd [pw_8]
    397 %endif
    398 %endif
    399 
    400 .x_zero_y_other_loop:
    401 %if %1 == 16
    402   movu                 m0, [srcq]
    403   movu                 m4, [srcq+src_strideq]
    404   mova                 m1, [dstq]
    405 %if cpuflag(ssse3)
    406   punpckhbw            m2, m0, m4
    407   punpcklbw            m0, m4
    408   pmaddubsw            m2, filter_y_a
    409   pmaddubsw            m0, filter_y_a
    410   paddw                m2, filter_rnd
    411   paddw                m0, filter_rnd
    412 %else
    413   punpckhbw            m2, m0, m5
    414   punpckhbw            m3, m4, m5
    415   punpcklbw            m0, m5
    416   punpcklbw            m4, m5
    417   ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
    418   ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
    419   ; instructions is the same (5), but it is 1 mul instead of 2, so might be
    420   ; slightly faster because of pmullw latency. It would also cut our rodata
    421   ; tables in half for this function, and save 1-2 registers on x86-64.
    422   pmullw               m2, filter_y_a
    423   pmullw               m3, filter_y_b
    424   paddw                m2, filter_rnd
    425   pmullw               m0, filter_y_a
    426   pmullw               m4, filter_y_b
    427   paddw                m0, filter_rnd
    428   paddw                m2, m3
    429   paddw                m0, m4
    430 %endif
    431   psraw                m2, 4
    432   psraw                m0, 4
    433 %if %2 == 1 ; avg
    434   ; FIXME(rbultje) pipeline
    435   packuswb             m0, m2
    436   pavgb                m0, [secq]
    437   punpckhbw            m2, m0, m5
    438   punpcklbw            m0, m5
    439 %endif
    440   punpckhbw            m3, m1, m5
    441   punpcklbw            m1, m5
    442   SUM_SSE              m0, m1, m2, m3, m6, m7
    443 
    444   add                srcq, src_strideq
    445   add                dstq, dst_strideq
    446 %else ; %1 < 16
    447   movh                 m0, [srcq]
    448   movh                 m2, [srcq+src_strideq]
    449   movh                 m4, [srcq+src_strideq*2]
    450   movh                 m3, [dstq+dst_strideq]
    451 %if cpuflag(ssse3)
    452   movh                 m1, [dstq]
    453   punpcklbw            m0, m2
    454   punpcklbw            m2, m4
    455   pmaddubsw            m0, filter_y_a
    456   pmaddubsw            m2, filter_y_a
    457   punpcklbw            m3, m5
    458   paddw                m2, filter_rnd
    459   paddw                m0, filter_rnd
    460 %else
    461   punpcklbw            m0, m5
    462   punpcklbw            m2, m5
    463   punpcklbw            m4, m5
    464   pmullw               m0, filter_y_a
    465   pmullw               m1, m2, filter_y_b
    466   punpcklbw            m3, m5
    467   paddw                m0, filter_rnd
    468   pmullw               m2, filter_y_a
    469   pmullw               m4, filter_y_b
    470   paddw                m0, m1
    471   paddw                m2, filter_rnd
    472   movh                 m1, [dstq]
    473   paddw                m2, m4
    474 %endif
    475   psraw                m0, 4
    476   psraw                m2, 4
    477 %if %2 == 1 ; avg
    478   ; FIXME(rbultje) pipeline
    479   packuswb             m0, m2
    480   pavgb                m0, [secq]
    481   punpckhbw            m2, m0, m5
    482   punpcklbw            m0, m5
    483 %endif
    484   punpcklbw            m1, m5
    485   SUM_SSE              m0, m1, m2, m3, m6, m7
    486 
    487   lea                srcq, [srcq+src_strideq*2]
    488   lea                dstq, [dstq+dst_strideq*2]
    489 %endif
    490 %if %2 == 1 ; avg
    491   add                secq, sec_str
    492 %endif
    493   dec                   h
    494   jg .x_zero_y_other_loop
    495 %undef filter_y_a
    496 %undef filter_y_b
    497 %undef filter_rnd
    498   STORE_AND_RET
    499 
    500 .x_nonzero:
    501   cmp           x_offsetd, 8
    502   jne .x_nonhalf
    503   ; x_offset == 0.5
    504   test          y_offsetd, y_offsetd
    505   jnz .x_half_y_nonzero
    506 
    507   ; x_offset == 0.5 && y_offset == 0
    508 .x_half_y_zero_loop:
    509 %if %1 == 16
    510   movu                 m0, [srcq]
    511   movu                 m4, [srcq+1]
    512   mova                 m1, [dstq]
    513   pavgb                m0, m4
    514   punpckhbw            m3, m1, m5
    515 %if %2 == 1 ; avg
    516   pavgb                m0, [secq]
    517 %endif
    518   punpcklbw            m1, m5
    519   punpckhbw            m2, m0, m5
    520   punpcklbw            m0, m5
    521   SUM_SSE              m0, m1, m2, m3, m6, m7
    522 
    523   add                srcq, src_strideq
    524   add                dstq, dst_strideq
    525 %else ; %1 < 16
    526   movh                 m0, [srcq]
    527   movh                 m4, [srcq+1]
    528 %if %2 == 1 ; avg
    529 %if mmsize == 16
    530   movhps               m0, [srcq+src_strideq]
    531   movhps               m4, [srcq+src_strideq+1]
    532 %else ; mmsize == 8
    533   punpckldq            m0, [srcq+src_strideq]
    534   punpckldq            m4, [srcq+src_strideq+1]
    535 %endif
    536   movh                 m1, [dstq]
    537   movh                 m3, [dstq+dst_strideq]
    538   pavgb                m0, m4
    539   punpcklbw            m3, m5
    540   pavgb                m0, [secq]
    541   punpcklbw            m1, m5
    542   punpckhbw            m2, m0, m5
    543   punpcklbw            m0, m5
    544 %else ; !avg
    545   movh                 m2, [srcq+src_strideq]
    546   movh                 m1, [dstq]
    547   pavgb                m0, m4
    548   movh                 m4, [srcq+src_strideq+1]
    549   movh                 m3, [dstq+dst_strideq]
    550   pavgb                m2, m4
    551   punpcklbw            m0, m5
    552   punpcklbw            m2, m5
    553   punpcklbw            m3, m5
    554   punpcklbw            m1, m5
    555 %endif
    556   SUM_SSE              m0, m1, m2, m3, m6, m7
    557 
    558   lea                srcq, [srcq+src_strideq*2]
    559   lea                dstq, [dstq+dst_strideq*2]
    560 %endif
    561 %if %2 == 1 ; avg
    562   add                secq, sec_str
    563 %endif
    564   dec                   h
    565   jg .x_half_y_zero_loop
    566   STORE_AND_RET
    567 
    568 .x_half_y_nonzero:
    569   cmp           y_offsetd, 8
    570   jne .x_half_y_nonhalf
    571 
    572   ; x_offset == 0.5 && y_offset == 0.5
    573 %if %1 == 16
    574   movu                 m0, [srcq]
    575   movu                 m3, [srcq+1]
    576   add                srcq, src_strideq
    577   pavgb                m0, m3
    578 .x_half_y_half_loop:
    579   movu                 m4, [srcq]
    580   movu                 m3, [srcq+1]
    581   mova                 m1, [dstq]
    582   pavgb                m4, m3
    583   punpckhbw            m3, m1, m5
    584   pavgb                m0, m4
    585 %if %2 == 1 ; avg
    586   punpcklbw            m1, m5
    587   pavgb                m0, [secq]
    588   punpckhbw            m2, m0, m5
    589   punpcklbw            m0, m5
    590 %else
    591   punpckhbw            m2, m0, m5
    592   punpcklbw            m0, m5
    593   punpcklbw            m1, m5
    594 %endif
    595   SUM_SSE              m0, m1, m2, m3, m6, m7
    596   mova                 m0, m4
    597 
    598   add                srcq, src_strideq
    599   add                dstq, dst_strideq
    600 %else ; %1 < 16
    601   movh                 m0, [srcq]
    602   movh                 m3, [srcq+1]
    603   add                srcq, src_strideq
    604   pavgb                m0, m3
    605 .x_half_y_half_loop:
    606   movh                 m2, [srcq]
    607   movh                 m3, [srcq+1]
    608 %if %2 == 1 ; avg
    609 %if mmsize == 16
    610   movhps               m2, [srcq+src_strideq]
    611   movhps               m3, [srcq+src_strideq+1]
    612 %else
    613 %if %1 == 4
    614   movh                 m1, [srcq+src_strideq]
    615   punpckldq            m2, m1
    616   movh                 m1, [srcq+src_strideq+1]
    617   punpckldq            m3, m1
    618 %else
    619   punpckldq            m2, [srcq+src_strideq]
    620   punpckldq            m3, [srcq+src_strideq+1]
    621 %endif
    622 %endif
    623   pavgb                m2, m3
    624 %if mmsize == 16
    625   movlhps              m0, m2
    626   movhlps              m4, m2
    627 %else ; mmsize == 8
    628   punpckldq            m0, m2
    629   pshufw               m4, m2, 0xe
    630 %endif
    631   movh                 m1, [dstq]
    632   pavgb                m0, m2
    633   movh                 m3, [dstq+dst_strideq]
    634   pavgb                m0, [secq]
    635   punpcklbw            m3, m5
    636   punpcklbw            m1, m5
    637   punpckhbw            m2, m0, m5
    638   punpcklbw            m0, m5
    639 %else ; !avg
    640   movh                 m4, [srcq+src_strideq]
    641   movh                 m1, [srcq+src_strideq+1]
    642   pavgb                m2, m3
    643   pavgb                m4, m1
    644   pavgb                m0, m2
    645   pavgb                m2, m4
    646   movh                 m1, [dstq]
    647   movh                 m3, [dstq+dst_strideq]
    648   punpcklbw            m0, m5
    649   punpcklbw            m2, m5
    650   punpcklbw            m3, m5
    651   punpcklbw            m1, m5
    652 %endif
    653   SUM_SSE              m0, m1, m2, m3, m6, m7
    654   mova                 m0, m4
    655 
    656   lea                srcq, [srcq+src_strideq*2]
    657   lea                dstq, [dstq+dst_strideq*2]
    658 %endif
    659 %if %2 == 1 ; avg
    660   add                secq, sec_str
    661 %endif
    662   dec                   h
    663   jg .x_half_y_half_loop
    664   STORE_AND_RET
    665 
    666 .x_half_y_nonhalf:
    667   ; x_offset == 0.5 && y_offset == bilin interpolation
    668 %ifdef PIC
    669   lea        bilin_filter, [bilin_filter_m]
    670 %endif
    671   shl           y_offsetd, filter_idx_shift
    672 %if ARCH_X86_64 && mmsize == 16
    673   mova                 m8, [bilin_filter+y_offsetq]
    674 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
    675   mova                 m9, [bilin_filter+y_offsetq+16]
    676 %endif
    677   mova                m10, [pw_8]
    678 %define filter_y_a m8
    679 %define filter_y_b m9
    680 %define filter_rnd m10
    681 %else  ;x86_32
    682 %if ARCH_X86=1 && CONFIG_PIC=1
    683 ; x_offset == 0.5. We can reuse x_offset reg
    684 %define tempq x_offsetq
    685   add y_offsetq, g_bilin_filterm
    686 %define filter_y_a [y_offsetq]
    687 %define filter_y_b [y_offsetq+16]
    688   mov tempq, g_pw_8m
    689 %define filter_rnd [tempq]
    690 %else
    691   add           y_offsetq, bilin_filter
    692 %define filter_y_a [y_offsetq]
    693 %define filter_y_b [y_offsetq+16]
    694 %define filter_rnd [pw_8]
    695 %endif
    696 %endif
    697 
    698 %if %1 == 16
    699   movu                 m0, [srcq]
    700   movu                 m3, [srcq+1]
    701   add                srcq, src_strideq
    702   pavgb                m0, m3
    703 .x_half_y_other_loop:
    704   movu                 m4, [srcq]
    705   movu                 m2, [srcq+1]
    706   mova                 m1, [dstq]
    707   pavgb                m4, m2
    708 %if cpuflag(ssse3)
    709   punpckhbw            m2, m0, m4
    710   punpcklbw            m0, m4
    711   pmaddubsw            m2, filter_y_a
    712   pmaddubsw            m0, filter_y_a
    713   paddw                m2, filter_rnd
    714   paddw                m0, filter_rnd
    715   psraw                m2, 4
    716 %else
    717   punpckhbw            m2, m0, m5
    718   punpckhbw            m3, m4, m5
    719   pmullw               m2, filter_y_a
    720   pmullw               m3, filter_y_b
    721   paddw                m2, filter_rnd
    722   punpcklbw            m0, m5
    723   paddw                m2, m3
    724   punpcklbw            m3, m4, m5
    725   pmullw               m0, filter_y_a
    726   pmullw               m3, filter_y_b
    727   paddw                m0, filter_rnd
    728   psraw                m2, 4
    729   paddw                m0, m3
    730 %endif
    731   punpckhbw            m3, m1, m5
    732   psraw                m0, 4
    733 %if %2 == 1 ; avg
    734   ; FIXME(rbultje) pipeline
    735   packuswb             m0, m2
    736   pavgb                m0, [secq]
    737   punpckhbw            m2, m0, m5
    738   punpcklbw            m0, m5
    739 %endif
    740   punpcklbw            m1, m5
    741   SUM_SSE              m0, m1, m2, m3, m6, m7
    742   mova                 m0, m4
    743 
    744   add                srcq, src_strideq
    745   add                dstq, dst_strideq
    746 %else ; %1 < 16
    747   movh                 m0, [srcq]
    748   movh                 m3, [srcq+1]
    749   add                srcq, src_strideq
    750   pavgb                m0, m3
    751 %if notcpuflag(ssse3)
    752   punpcklbw            m0, m5
    753 %endif
    754 .x_half_y_other_loop:
    755   movh                 m2, [srcq]
    756   movh                 m1, [srcq+1]
    757   movh                 m4, [srcq+src_strideq]
    758   movh                 m3, [srcq+src_strideq+1]
    759   pavgb                m2, m1
    760   pavgb                m4, m3
    761   movh                 m3, [dstq+dst_strideq]
    762 %if cpuflag(ssse3)
    763   movh                 m1, [dstq]
    764   punpcklbw            m0, m2
    765   punpcklbw            m2, m4
    766   pmaddubsw            m0, filter_y_a
    767   pmaddubsw            m2, filter_y_a
    768   punpcklbw            m3, m5
    769   paddw                m0, filter_rnd
    770   paddw                m2, filter_rnd
    771 %else
    772   punpcklbw            m2, m5
    773   punpcklbw            m4, m5
    774   pmullw               m0, filter_y_a
    775   pmullw               m1, m2, filter_y_b
    776   punpcklbw            m3, m5
    777   paddw                m0, filter_rnd
    778   pmullw               m2, filter_y_a
    779   paddw                m0, m1
    780   pmullw               m1, m4, filter_y_b
    781   paddw                m2, filter_rnd
    782   paddw                m2, m1
    783   movh                 m1, [dstq]
    784 %endif
    785   psraw                m0, 4
    786   psraw                m2, 4
    787 %if %2 == 1 ; avg
    788   ; FIXME(rbultje) pipeline
    789   packuswb             m0, m2
    790   pavgb                m0, [secq]
    791   punpckhbw            m2, m0, m5
    792   punpcklbw            m0, m5
    793 %endif
    794   punpcklbw            m1, m5
    795   SUM_SSE              m0, m1, m2, m3, m6, m7
    796   mova                 m0, m4
    797 
    798   lea                srcq, [srcq+src_strideq*2]
    799   lea                dstq, [dstq+dst_strideq*2]
    800 %endif
    801 %if %2 == 1 ; avg
    802   add                secq, sec_str
    803 %endif
    804   dec                   h
    805   jg .x_half_y_other_loop
    806 %undef filter_y_a
    807 %undef filter_y_b
    808 %undef filter_rnd
    809   STORE_AND_RET
    810 
    811 .x_nonhalf:
    812   test          y_offsetd, y_offsetd
    813   jnz .x_nonhalf_y_nonzero
    814 
    815   ; x_offset == bilin interpolation && y_offset == 0
    816 %ifdef PIC
    817   lea        bilin_filter, [bilin_filter_m]
    818 %endif
    819   shl           x_offsetd, filter_idx_shift
    820 %if ARCH_X86_64 && mmsize == 16
    821   mova                 m8, [bilin_filter+x_offsetq]
    822 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
    823   mova                 m9, [bilin_filter+x_offsetq+16]
    824 %endif
    825   mova                m10, [pw_8]
    826 %define filter_x_a m8
    827 %define filter_x_b m9
    828 %define filter_rnd m10
    829 %else    ; x86-32
    830 %if ARCH_X86=1 && CONFIG_PIC=1
    831 ;y_offset == 0. We can reuse y_offset reg.
    832 %define tempq y_offsetq
    833   add x_offsetq, g_bilin_filterm
    834 %define filter_x_a [x_offsetq]
    835 %define filter_x_b [x_offsetq+16]
    836   mov tempq, g_pw_8m
    837 %define filter_rnd [tempq]
    838 %else
    839   add           x_offsetq, bilin_filter
    840 %define filter_x_a [x_offsetq]
    841 %define filter_x_b [x_offsetq+16]
    842 %define filter_rnd [pw_8]
    843 %endif
    844 %endif
    845 
    846 .x_other_y_zero_loop:
    847 %if %1 == 16
    848   movu                 m0, [srcq]
    849   movu                 m4, [srcq+1]
    850   mova                 m1, [dstq]
    851 %if cpuflag(ssse3)
    852   punpckhbw            m2, m0, m4
    853   punpcklbw            m0, m4
    854   pmaddubsw            m2, filter_x_a
    855   pmaddubsw            m0, filter_x_a
    856   paddw                m2, filter_rnd
    857   paddw                m0, filter_rnd
    858 %else
    859   punpckhbw            m2, m0, m5
    860   punpckhbw            m3, m4, m5
    861   punpcklbw            m0, m5
    862   punpcklbw            m4, m5
    863   pmullw               m2, filter_x_a
    864   pmullw               m3, filter_x_b
    865   paddw                m2, filter_rnd
    866   pmullw               m0, filter_x_a
    867   pmullw               m4, filter_x_b
    868   paddw                m0, filter_rnd
    869   paddw                m2, m3
    870   paddw                m0, m4
    871 %endif
    872   psraw                m2, 4
    873   psraw                m0, 4
    874 %if %2 == 1 ; avg
    875   ; FIXME(rbultje) pipeline
    876   packuswb             m0, m2
    877   pavgb                m0, [secq]
    878   punpckhbw            m2, m0, m5
    879   punpcklbw            m0, m5
    880 %endif
    881   punpckhbw            m3, m1, m5
    882   punpcklbw            m1, m5
    883   SUM_SSE              m0, m1, m2, m3, m6, m7
    884 
    885   add                srcq, src_strideq
    886   add                dstq, dst_strideq
    887 %else ; %1 < 16
    888   movh                 m0, [srcq]
    889   movh                 m1, [srcq+1]
    890   movh                 m2, [srcq+src_strideq]
    891   movh                 m4, [srcq+src_strideq+1]
    892   movh                 m3, [dstq+dst_strideq]
    893 %if cpuflag(ssse3)
    894   punpcklbw            m0, m1
    895   movh                 m1, [dstq]
    896   punpcklbw            m2, m4
    897   pmaddubsw            m0, filter_x_a
    898   pmaddubsw            m2, filter_x_a
    899   punpcklbw            m3, m5
    900   paddw                m0, filter_rnd
    901   paddw                m2, filter_rnd
    902 %else
    903   punpcklbw            m0, m5
    904   punpcklbw            m1, m5
    905   punpcklbw            m2, m5
    906   punpcklbw            m4, m5
    907   pmullw               m0, filter_x_a
    908   pmullw               m1, filter_x_b
    909   punpcklbw            m3, m5
    910   paddw                m0, filter_rnd
    911   pmullw               m2, filter_x_a
    912   pmullw               m4, filter_x_b
    913   paddw                m0, m1
    914   paddw                m2, filter_rnd
    915   movh                 m1, [dstq]
    916   paddw                m2, m4
    917 %endif
    918   psraw                m0, 4
    919   psraw                m2, 4
    920 %if %2 == 1 ; avg
    921   ; FIXME(rbultje) pipeline
    922   packuswb             m0, m2
    923   pavgb                m0, [secq]
    924   punpckhbw            m2, m0, m5
    925   punpcklbw            m0, m5
    926 %endif
    927   punpcklbw            m1, m5
    928   SUM_SSE              m0, m1, m2, m3, m6, m7
    929 
    930   lea                srcq, [srcq+src_strideq*2]
    931   lea                dstq, [dstq+dst_strideq*2]
    932 %endif
    933 %if %2 == 1 ; avg
    934   add                secq, sec_str
    935 %endif
    936   dec                   h
    937   jg .x_other_y_zero_loop
    938 %undef filter_x_a
    939 %undef filter_x_b
    940 %undef filter_rnd
    941   STORE_AND_RET
    942 
    943 .x_nonhalf_y_nonzero:
    944   cmp           y_offsetd, 8
    945   jne .x_nonhalf_y_nonhalf
    946 
    947   ; x_offset == bilin interpolation && y_offset == 0.5
    948 %ifdef PIC
    949   lea        bilin_filter, [bilin_filter_m]
    950 %endif
    951   shl           x_offsetd, filter_idx_shift
    952 %if ARCH_X86_64 && mmsize == 16
    953   mova                 m8, [bilin_filter+x_offsetq]
    954 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
    955   mova                 m9, [bilin_filter+x_offsetq+16]
    956 %endif
    957   mova                m10, [pw_8]
    958 %define filter_x_a m8
    959 %define filter_x_b m9
    960 %define filter_rnd m10
    961 %else    ; x86-32
    962 %if ARCH_X86=1 && CONFIG_PIC=1
    963 ; y_offset == 0.5. We can reuse y_offset reg.
    964 %define tempq y_offsetq
    965   add x_offsetq, g_bilin_filterm
    966 %define filter_x_a [x_offsetq]
    967 %define filter_x_b [x_offsetq+16]
    968   mov tempq, g_pw_8m
    969 %define filter_rnd [tempq]
    970 %else
    971   add           x_offsetq, bilin_filter
    972 %define filter_x_a [x_offsetq]
    973 %define filter_x_b [x_offsetq+16]
    974 %define filter_rnd [pw_8]
    975 %endif
    976 %endif
    977 
    978 %if %1 == 16
    979   movu                 m0, [srcq]
    980   movu                 m1, [srcq+1]
    981 %if cpuflag(ssse3)
    982   punpckhbw            m2, m0, m1
    983   punpcklbw            m0, m1
    984   pmaddubsw            m2, filter_x_a
    985   pmaddubsw            m0, filter_x_a
    986   paddw                m2, filter_rnd
    987   paddw                m0, filter_rnd
    988 %else
    989   punpckhbw            m2, m0, m5
    990   punpckhbw            m3, m1, m5
    991   punpcklbw            m0, m5
    992   punpcklbw            m1, m5
    993   pmullw               m0, filter_x_a
    994   pmullw               m1, filter_x_b
    995   paddw                m0, filter_rnd
    996   pmullw               m2, filter_x_a
    997   pmullw               m3, filter_x_b
    998   paddw                m2, filter_rnd
    999   paddw                m0, m1
   1000   paddw                m2, m3
   1001 %endif
   1002   psraw                m0, 4
   1003   psraw                m2, 4
   1004   add                srcq, src_strideq
   1005   packuswb             m0, m2
   1006 .x_other_y_half_loop:
   1007   movu                 m4, [srcq]
   1008   movu                 m3, [srcq+1]
   1009 %if cpuflag(ssse3)
   1010   mova                 m1, [dstq]
   1011   punpckhbw            m2, m4, m3
   1012   punpcklbw            m4, m3
   1013   pmaddubsw            m2, filter_x_a
   1014   pmaddubsw            m4, filter_x_a
   1015   paddw                m2, filter_rnd
   1016   paddw                m4, filter_rnd
   1017   psraw                m2, 4
   1018   psraw                m4, 4
   1019   packuswb             m4, m2
   1020   pavgb                m0, m4
   1021   punpckhbw            m3, m1, m5
   1022   punpcklbw            m1, m5
   1023 %else
   1024   punpckhbw            m2, m4, m5
   1025   punpckhbw            m1, m3, m5
   1026   punpcklbw            m4, m5
   1027   punpcklbw            m3, m5
   1028   pmullw               m4, filter_x_a
   1029   pmullw               m3, filter_x_b
   1030   paddw                m4, filter_rnd
   1031   pmullw               m2, filter_x_a
   1032   pmullw               m1, filter_x_b
   1033   paddw                m2, filter_rnd
   1034   paddw                m4, m3
   1035   paddw                m2, m1
   1036   mova                 m1, [dstq]
   1037   psraw                m4, 4
   1038   psraw                m2, 4
   1039   punpckhbw            m3, m1, m5
   1040   ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
   1041   ; have a 1-register shortage to be able to store the backup of the bilin
   1042   ; filtered second line as words as cache for the next line. Packing into
   1043   ; a byte costs 1 pack and 2 unpacks, but saves a register.
   1044   packuswb             m4, m2
   1045   punpcklbw            m1, m5
   1046   pavgb                m0, m4
   1047 %endif
   1048 %if %2 == 1 ; avg
   1049   ; FIXME(rbultje) pipeline
   1050   pavgb                m0, [secq]
   1051 %endif
   1052   punpckhbw            m2, m0, m5
   1053   punpcklbw            m0, m5
   1054   SUM_SSE              m0, m1, m2, m3, m6, m7
   1055   mova                 m0, m4
   1056 
   1057   add                srcq, src_strideq
   1058   add                dstq, dst_strideq
   1059 %else ; %1 < 16
   1060   movh                 m0, [srcq]
   1061   movh                 m1, [srcq+1]
   1062 %if cpuflag(ssse3)
   1063   punpcklbw            m0, m1
   1064   pmaddubsw            m0, filter_x_a
   1065   paddw                m0, filter_rnd
   1066 %else
   1067   punpcklbw            m0, m5
   1068   punpcklbw            m1, m5
   1069   pmullw               m0, filter_x_a
   1070   pmullw               m1, filter_x_b
   1071   paddw                m0, filter_rnd
   1072   paddw                m0, m1
   1073 %endif
   1074   add                srcq, src_strideq
   1075   psraw                m0, 4
   1076 .x_other_y_half_loop:
   1077   movh                 m2, [srcq]
   1078   movh                 m1, [srcq+1]
   1079   movh                 m4, [srcq+src_strideq]
   1080   movh                 m3, [srcq+src_strideq+1]
   1081 %if cpuflag(ssse3)
   1082   punpcklbw            m2, m1
   1083   punpcklbw            m4, m3
   1084   pmaddubsw            m2, filter_x_a
   1085   pmaddubsw            m4, filter_x_a
   1086   movh                 m1, [dstq]
   1087   movh                 m3, [dstq+dst_strideq]
   1088   paddw                m2, filter_rnd
   1089   paddw                m4, filter_rnd
   1090 %else
   1091   punpcklbw            m2, m5
   1092   punpcklbw            m1, m5
   1093   punpcklbw            m4, m5
   1094   punpcklbw            m3, m5
   1095   pmullw               m2, filter_x_a
   1096   pmullw               m1, filter_x_b
   1097   paddw                m2, filter_rnd
   1098   pmullw               m4, filter_x_a
   1099   pmullw               m3, filter_x_b
   1100   paddw                m4, filter_rnd
   1101   paddw                m2, m1
   1102   movh                 m1, [dstq]
   1103   paddw                m4, m3
   1104   movh                 m3, [dstq+dst_strideq]
   1105 %endif
   1106   psraw                m2, 4
   1107   psraw                m4, 4
   1108   pavgw                m0, m2
   1109   pavgw                m2, m4
   1110 %if %2 == 1 ; avg
   1111   ; FIXME(rbultje) pipeline - also consider going to bytes here
   1112   packuswb             m0, m2
   1113   pavgb                m0, [secq]
   1114   punpckhbw            m2, m0, m5
   1115   punpcklbw            m0, m5
   1116 %endif
   1117   punpcklbw            m3, m5
   1118   punpcklbw            m1, m5
   1119   SUM_SSE              m0, m1, m2, m3, m6, m7
   1120   mova                 m0, m4
   1121 
   1122   lea                srcq, [srcq+src_strideq*2]
   1123   lea                dstq, [dstq+dst_strideq*2]
   1124 %endif
   1125 %if %2 == 1 ; avg
   1126   add                secq, sec_str
   1127 %endif
   1128   dec                   h
   1129   jg .x_other_y_half_loop
   1130 %undef filter_x_a
   1131 %undef filter_x_b
   1132 %undef filter_rnd
   1133   STORE_AND_RET
   1134 
   1135 .x_nonhalf_y_nonhalf:
   1136 %ifdef PIC
   1137   lea        bilin_filter, [bilin_filter_m]
   1138 %endif
   1139   shl           x_offsetd, filter_idx_shift
   1140   shl           y_offsetd, filter_idx_shift
   1141 %if ARCH_X86_64 && mmsize == 16
   1142   mova                 m8, [bilin_filter+x_offsetq]
   1143 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   1144   mova                 m9, [bilin_filter+x_offsetq+16]
   1145 %endif
   1146   mova                m10, [bilin_filter+y_offsetq]
   1147 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   1148   mova                m11, [bilin_filter+y_offsetq+16]
   1149 %endif
   1150   mova                m12, [pw_8]
   1151 %define filter_x_a m8
   1152 %define filter_x_b m9
   1153 %define filter_y_a m10
   1154 %define filter_y_b m11
   1155 %define filter_rnd m12
   1156 %else   ; x86-32
   1157 %if ARCH_X86=1 && CONFIG_PIC=1
   1158 ; In this case, there is NO unused register. Used src_stride register. Later,
   1159 ; src_stride has to be loaded from stack when it is needed.
   1160 %define tempq src_strideq
   1161   mov tempq, g_bilin_filterm
   1162   add           x_offsetq, tempq
   1163   add           y_offsetq, tempq
   1164 %define filter_x_a [x_offsetq]
   1165 %define filter_x_b [x_offsetq+16]
   1166 %define filter_y_a [y_offsetq]
   1167 %define filter_y_b [y_offsetq+16]
   1168 
   1169   mov tempq, g_pw_8m
   1170 %define filter_rnd [tempq]
   1171 %else
   1172   add           x_offsetq, bilin_filter
   1173   add           y_offsetq, bilin_filter
   1174 %define filter_x_a [x_offsetq]
   1175 %define filter_x_b [x_offsetq+16]
   1176 %define filter_y_a [y_offsetq]
   1177 %define filter_y_b [y_offsetq+16]
   1178 %define filter_rnd [pw_8]
   1179 %endif
   1180 %endif
   1181 
   1182   ; x_offset == bilin interpolation && y_offset == bilin interpolation
   1183 %if %1 == 16
   1184   movu                 m0, [srcq]
   1185   movu                 m1, [srcq+1]
   1186 %if cpuflag(ssse3)
   1187   punpckhbw            m2, m0, m1
   1188   punpcklbw            m0, m1
   1189   pmaddubsw            m2, filter_x_a
   1190   pmaddubsw            m0, filter_x_a
   1191   paddw                m2, filter_rnd
   1192   paddw                m0, filter_rnd
   1193 %else
   1194   punpckhbw            m2, m0, m5
   1195   punpckhbw            m3, m1, m5
   1196   punpcklbw            m0, m5
   1197   punpcklbw            m1, m5
   1198   pmullw               m0, filter_x_a
   1199   pmullw               m1, filter_x_b
   1200   paddw                m0, filter_rnd
   1201   pmullw               m2, filter_x_a
   1202   pmullw               m3, filter_x_b
   1203   paddw                m2, filter_rnd
   1204   paddw                m0, m1
   1205   paddw                m2, m3
   1206 %endif
   1207   psraw                m0, 4
   1208   psraw                m2, 4
   1209 
   1210   INC_SRC_BY_SRC_STRIDE
   1211 
   1212   packuswb             m0, m2
   1213 .x_other_y_other_loop:
   1214 %if cpuflag(ssse3)
   1215   movu                 m4, [srcq]
   1216   movu                 m3, [srcq+1]
   1217   mova                 m1, [dstq]
   1218   punpckhbw            m2, m4, m3
   1219   punpcklbw            m4, m3
   1220   pmaddubsw            m2, filter_x_a
   1221   pmaddubsw            m4, filter_x_a
   1222   punpckhbw            m3, m1, m5
   1223   paddw                m2, filter_rnd
   1224   paddw                m4, filter_rnd
   1225   psraw                m2, 4
   1226   psraw                m4, 4
   1227   packuswb             m4, m2
   1228   punpckhbw            m2, m0, m4
   1229   punpcklbw            m0, m4
   1230   pmaddubsw            m2, filter_y_a
   1231   pmaddubsw            m0, filter_y_a
   1232   punpcklbw            m1, m5
   1233   paddw                m2, filter_rnd
   1234   paddw                m0, filter_rnd
   1235   psraw                m2, 4
   1236   psraw                m0, 4
   1237 %else
   1238   movu                 m3, [srcq]
   1239   movu                 m4, [srcq+1]
   1240   punpckhbw            m1, m3, m5
   1241   punpckhbw            m2, m4, m5
   1242   punpcklbw            m3, m5
   1243   punpcklbw            m4, m5
   1244   pmullw               m3, filter_x_a
   1245   pmullw               m4, filter_x_b
   1246   paddw                m3, filter_rnd
   1247   pmullw               m1, filter_x_a
   1248   pmullw               m2, filter_x_b
   1249   paddw                m1, filter_rnd
   1250   paddw                m3, m4
   1251   paddw                m1, m2
   1252   psraw                m3, 4
   1253   psraw                m1, 4
   1254   packuswb             m4, m3, m1
   1255   punpckhbw            m2, m0, m5
   1256   punpcklbw            m0, m5
   1257   pmullw               m2, filter_y_a
   1258   pmullw               m1, filter_y_b
   1259   paddw                m2, filter_rnd
   1260   pmullw               m0, filter_y_a
   1261   pmullw               m3, filter_y_b
   1262   paddw                m2, m1
   1263   mova                 m1, [dstq]
   1264   paddw                m0, filter_rnd
   1265   psraw                m2, 4
   1266   paddw                m0, m3
   1267   punpckhbw            m3, m1, m5
   1268   psraw                m0, 4
   1269   punpcklbw            m1, m5
   1270 %endif
   1271 %if %2 == 1 ; avg
   1272   ; FIXME(rbultje) pipeline
   1273   packuswb             m0, m2
   1274   pavgb                m0, [secq]
   1275   punpckhbw            m2, m0, m5
   1276   punpcklbw            m0, m5
   1277 %endif
   1278   SUM_SSE              m0, m1, m2, m3, m6, m7
   1279   mova                 m0, m4
   1280 
   1281   INC_SRC_BY_SRC_STRIDE
   1282   add                dstq, dst_strideq
   1283 %else ; %1 < 16
   1284   movh                 m0, [srcq]
   1285   movh                 m1, [srcq+1]
   1286 %if cpuflag(ssse3)
   1287   punpcklbw            m0, m1
   1288   pmaddubsw            m0, filter_x_a
   1289   paddw                m0, filter_rnd
   1290 %else
   1291   punpcklbw            m0, m5
   1292   punpcklbw            m1, m5
   1293   pmullw               m0, filter_x_a
   1294   pmullw               m1, filter_x_b
   1295   paddw                m0, filter_rnd
   1296   paddw                m0, m1
   1297 %endif
   1298   psraw                m0, 4
   1299 %if cpuflag(ssse3)
   1300   packuswb             m0, m0
   1301 %endif
   1302 
   1303   INC_SRC_BY_SRC_STRIDE
   1304 
   1305 .x_other_y_other_loop:
   1306   movh                 m2, [srcq]
   1307   movh                 m1, [srcq+1]
   1308 
   1309   INC_SRC_BY_SRC_STRIDE
   1310   movh                 m4, [srcq]
   1311   movh                 m3, [srcq+1]
   1312 
   1313 %if cpuflag(ssse3)
   1314   punpcklbw            m2, m1
   1315   punpcklbw            m4, m3
   1316   pmaddubsw            m2, filter_x_a
   1317   pmaddubsw            m4, filter_x_a
   1318   movh                 m3, [dstq+dst_strideq]
   1319   movh                 m1, [dstq]
   1320   paddw                m2, filter_rnd
   1321   paddw                m4, filter_rnd
   1322   psraw                m2, 4
   1323   psraw                m4, 4
   1324   packuswb             m2, m2
   1325   packuswb             m4, m4
   1326   punpcklbw            m0, m2
   1327   punpcklbw            m2, m4
   1328   pmaddubsw            m0, filter_y_a
   1329   pmaddubsw            m2, filter_y_a
   1330   punpcklbw            m3, m5
   1331   paddw                m0, filter_rnd
   1332   paddw                m2, filter_rnd
   1333   psraw                m0, 4
   1334   psraw                m2, 4
   1335   punpcklbw            m1, m5
   1336 %else
   1337   punpcklbw            m2, m5
   1338   punpcklbw            m1, m5
   1339   punpcklbw            m4, m5
   1340   punpcklbw            m3, m5
   1341   pmullw               m2, filter_x_a
   1342   pmullw               m1, filter_x_b
   1343   paddw                m2, filter_rnd
   1344   pmullw               m4, filter_x_a
   1345   pmullw               m3, filter_x_b
   1346   paddw                m4, filter_rnd
   1347   paddw                m2, m1
   1348   paddw                m4, m3
   1349   psraw                m2, 4
   1350   psraw                m4, 4
   1351   pmullw               m0, filter_y_a
   1352   pmullw               m3, m2, filter_y_b
   1353   paddw                m0, filter_rnd
   1354   pmullw               m2, filter_y_a
   1355   pmullw               m1, m4, filter_y_b
   1356   paddw                m2, filter_rnd
   1357   paddw                m0, m3
   1358   movh                 m3, [dstq+dst_strideq]
   1359   paddw                m2, m1
   1360   movh                 m1, [dstq]
   1361   psraw                m0, 4
   1362   psraw                m2, 4
   1363   punpcklbw            m3, m5
   1364   punpcklbw            m1, m5
   1365 %endif
   1366 %if %2 == 1 ; avg
   1367   ; FIXME(rbultje) pipeline
   1368   packuswb             m0, m2
   1369   pavgb                m0, [secq]
   1370   punpckhbw            m2, m0, m5
   1371   punpcklbw            m0, m5
   1372 %endif
   1373   SUM_SSE              m0, m1, m2, m3, m6, m7
   1374   mova                 m0, m4
   1375 
   1376   INC_SRC_BY_SRC_STRIDE
   1377   lea                dstq, [dstq+dst_strideq*2]
   1378 %endif
   1379 %if %2 == 1 ; avg
   1380   add                secq, sec_str
   1381 %endif
   1382   dec                   h
   1383   jg .x_other_y_other_loop
   1384 %undef filter_x_a
   1385 %undef filter_x_b
   1386 %undef filter_y_a
   1387 %undef filter_y_b
   1388 %undef filter_rnd
   1389   STORE_AND_RET
   1390 %endmacro
   1391 
    1392 ; FIXME(rbultje) the non-bilinear versions (i.e. x_offset == 0 or 8 and
    1393 ; y_offset == 0 or 8) are identical between the ssse3 and non-ssse3
    1394 ; versions. It may make sense to merge their code by having the ssse3
    1395 ; version jump to the appropriate location in the sse/sse2 version,
    1396 ; rather than duplicating that code in the binary.
   1397 
    1398 INIT_MMX sse                ; mmsize == 8 (MMX regs) for the narrow variant
    1399 SUBPEL_VARIANCE  4          ; %1 = 4: takes the macro's "%1 < 16" (movh) paths
    1400 INIT_XMM sse2               ; mmsize == 16 (XMM regs) for the wider variants
    1401 SUBPEL_VARIANCE  8
    1402 SUBPEL_VARIANCE 16          ; %1 = 16: takes the "%1 == 16" (movu/mova) paths
    1403 
    1404 INIT_MMX ssse3              ; same widths with the cpuflag(ssse3) code paths
    1405 SUBPEL_VARIANCE  4          ; (pmaddubsw instead of punpck+pmullw filtering)
    1406 INIT_XMM ssse3
    1407 SUBPEL_VARIANCE  8
    1408 SUBPEL_VARIANCE 16
    1409 
    1410 INIT_MMX sse                ; %2 = 1 enables the macro's "avg" paths:
    1411 SUBPEL_VARIANCE  4, 1       ; pavgb with [secq] and the "add secq, sec_str"
    1412 INIT_XMM sse2               ; per-row step (second-predictor averaging)
    1413 SUBPEL_VARIANCE  8, 1
    1414 SUBPEL_VARIANCE 16, 1
    1415 
    1416 INIT_MMX ssse3              ; avg variants again for ssse3
    1417 SUBPEL_VARIANCE  4, 1
    1418 INIT_XMM ssse3
    1419 SUBPEL_VARIANCE  8, 1
    1420 SUBPEL_VARIANCE 16, 1
    1421
   1421