Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 %define BLOCK_HEIGHT_WIDTH 4
     15 %define VP8_FILTER_WEIGHT 128
     16 %define VP8_FILTER_SHIFT  7
     17 
     18 
     19 ;/************************************************************************************
     20 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
     21 ; input pixel array has output_height rows. This routine assumes that output_height is an
     22 ; even number. This function handles 8 pixels in horizontal direction, calculating ONE
     23 ; row each iteration to take advantage of the 128-bit operations.
     24 ;
     25 ; This is an implementation of some of the SSE optimizations first seen in ffvp8
     26 ;
     27 ;*************************************************************************************/
     28 ;void vp8_filter_block1d8_h6_ssse3
     29 ;(
     30 ;    unsigned char  *src_ptr,
     31 ;    unsigned int    src_pixels_per_line,
     32 ;    unsigned char *output_ptr,
     33 ;    unsigned int    output_pitch,
     34 ;    unsigned int    output_height,
     35 ;    unsigned int    vp8_filter_index
     36 ;)
        ;
        ; Horizontal 6-tap (or 4-tap) filter producing 8 output pixels per row,
        ; one row per loop iteration.
        ;
        ; vp8_filter_index * 16 selects a 16-byte entry in k0_k5; the matching
        ; k1_k3 and k2_k4 entries are read 128 and 256 bytes past that address.
        ; When the first dword of the k0_k5 entry is zero, the outer taps are
        ; skipped and the 4-tap loop is used instead.
        ;
        ; Every row reads the 13 bytes src_ptr[-2 .. 10] and writes 8 bytes at
        ; the current output row. Each result is (sum + rd) >> VP8_FILTER_SHIFT,
        ; packed to bytes with unsigned saturation.
        ;
        ; Both loops are bottom-tested, so output_height must be at least 1.
     37 global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
     38 sym(vp8_filter_block1d8_h6_ssse3):
     39     push        rbp
     40     mov         rbp, rsp
     41     SHADOW_ARGS_TO_STACK 6
     42     SAVE_XMM 7
     43     GET_GOT     rbx
     44     push        rsi
     45     push        rdi
     46     ; end prolog
     47 
     48     movsxd      rdx, DWORD PTR arg(5)   ;table index
     49     xor         rsi, rsi                ;esi = 0 for the zero test below
     50     shl         rdx, 4                  ;16 bytes per table entry
     51 
     52     movdqa      xmm7, [GLOBAL(rd)]      ;rounding constant, kept for both loops
     53 
     54     lea         rax, [GLOBAL(k0_k5)]
     55     add         rax, rdx
     56     mov         rdi, arg(2)             ;output_ptr
     57 
     58     cmp         esi, DWORD PTR [rax]    ;first dword of the k0_k5 entry == 0?
     59     je          .vp8_filter_block1d8_h4_ssse3
     60 
     61     movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
     62     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
     63     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
     64 
     65     mov         rsi, arg(0)             ;src_ptr
     66     movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
     67     movsxd      rcx, dword ptr arg(4)   ;output_height
     68 
     69     movsxd      rdx, dword ptr arg(3)   ;output_pitch
     70 
     71     sub         rdi, rdx                ;pre-bias: the loop advances rdi before it stores
     72 ;xmm3 free
     73 .filter_block1d8_h6_rowloop_ssse3:
     74     movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
     75 
     76     movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
     77 
     78     punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
     79 
     80     movdqa      xmm1,   xmm0
     81     pmaddubsw   xmm0,   xmm4                    ; outer taps: pixel pairs (i-2, i+3) * k0_k5
     82 
     83     movdqa      xmm2,   xmm1
     84     pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
     85 
     86     pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
     87     pmaddubsw   xmm1,   xmm5                    ; shuffled pairs * k2_k4
     88 
     89     lea         rdi,    [rdi + rdx]
     90     pmaddubsw   xmm2,   xmm6                    ; shuffled pairs * k1_k3
     91 
     92     lea         rsi,    [rsi + rax]
     93     dec         rcx                             ; ZF is consumed by the jnz at the loop bottom;
                                                        ; the SIMD ops in between leave flags alone
     94 
     95     paddsw      xmm0,   xmm1
     96     paddsw      xmm2,   xmm7                    ; add rounding
     97 
     98     paddsw      xmm0,   xmm2
     99 
    100     psraw       xmm0,   VP8_FILTER_SHIFT
    101 
    102     packuswb    xmm0,   xmm0
    103 
    104     movq        MMWORD Ptr [rdi], xmm0
    105     jnz         .filter_block1d8_h6_rowloop_ssse3
    106 
    107     ; begin epilog
    108     pop rdi
    109     pop rsi
    110     RESTORE_GOT
    111     RESTORE_XMM
    112     UNSHADOW_ARGS
    113     pop         rbp
    114     ret
    115 
        ; 4-tap path: k0_k5 is not loaded or applied. rax still points at the
        ; selected k0_k5 entry and rdi already holds output_ptr on entry.
        ; This is a local label, like the 4-tap entry points of the sibling
        ; routines, so no extra symbol is emitted and the local labels below
        ; stay scoped to this function.
    116 .vp8_filter_block1d8_h4_ssse3:
    117     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    118     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    119 
    120     movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
    121     movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
    122 
    123     mov         rsi, arg(0)             ;src_ptr
    124 
    125     movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    126     movsxd      rcx, dword ptr arg(4)   ;output_height
    127 
    128     movsxd      rdx, dword ptr arg(3)   ;output_pitch
    129 
    130     sub         rdi, rdx                ;pre-bias: the loop advances rdi before it stores
    131 
    132 .filter_block1d8_h4_rowloop_ssse3:
    133     movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
    134 
    135     movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
    136 
    137     punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
    138 
    139     movdqa      xmm2,   xmm0
    140     pshufb      xmm0,   xmm3                    ; shuf2bfrom1
    141 
    142     pshufb      xmm2,   xmm4                    ; shuf3bfrom1
    143     pmaddubsw   xmm0,   xmm5                    ; * k2_k4
    144 
    145     lea         rdi,    [rdi + rdx]
    146     pmaddubsw   xmm2,   xmm6                    ; * k1_k3
    147 
    148     lea         rsi,    [rsi + rax]
    149     dec         rcx                             ; ZF is consumed by the jnz at the loop bottom
    150 
    151     paddsw      xmm0,   xmm7                    ; add rounding
    152 
    153     paddsw      xmm0,   xmm2
    154 
    155     psraw       xmm0,   VP8_FILTER_SHIFT
    156 
    157     packuswb    xmm0,   xmm0
    158 
    159     movq        MMWORD Ptr [rdi], xmm0
    160 
    161     jnz         .filter_block1d8_h4_rowloop_ssse3
    162 
    163     ; begin epilog
    164     pop rdi
    165     pop rsi
    166     RESTORE_GOT
    167     RESTORE_XMM
    168     UNSHADOW_ARGS
    169     pop         rbp
    170     ret
    171 ;void vp8_filter_block1d16_h6_ssse3
    172 ;(
    173 ;    unsigned char  *src_ptr,
    174 ;    unsigned int    src_pixels_per_line,
    175 ;    unsigned char  *output_ptr,
    176 ;    unsigned int    output_pitch,
    177 ;    unsigned int    output_height,
    178 ;    unsigned int    vp8_filter_index
    179 ;)
        ;
        ; Horizontal 6-tap filter producing 16 output pixels per row, one row
        ; per loop iteration, computed as two 8-pixel halves that are merged
        ; before a single 16-byte store.
        ;
        ; vp8_filter_index * 16 selects a 16-byte entry in k0_k5; the matching
        ; k1_k3 and k2_k4 entries are read 128 and 256 bytes past that address.
        ; Unlike the 8- and 4-pixel horizontal routines there is no 4-tap
        ; shortcut: all three coefficient pairs are always applied.
        ;
        ; Every row reads the 21 bytes src_ptr[-2 .. 18]. The row is written
        ; with movdqa, so each output row address must be 16-byte aligned.
        ; xmm7 is used as scratch, so the rounding constant rd is added
        ; straight from memory.
        ;
        ; The loop is bottom-tested, so output_height must be at least 1.
    180 global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
    181 sym(vp8_filter_block1d16_h6_ssse3):
    182     push        rbp
    183     mov         rbp, rsp
    184     SHADOW_ARGS_TO_STACK 6
    185     SAVE_XMM 7
    186     GET_GOT     rbx
    187     push        rsi
    188     push        rdi
    189     ; end prolog
    190 
    191     movsxd      rdx, DWORD PTR arg(5)           ;table index
    193     shl         rdx, 4                          ;16 bytes per table entry
    194 
    195     lea         rax, [GLOBAL(k0_k5)]
    196     add         rax, rdx
    197 
    198     mov         rdi, arg(2)                     ;output_ptr
    199 
    200     mov         rsi, arg(0)                     ;src_ptr
    201 
    202     movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    203     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    204     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    205 
    206     movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
    207     movsxd      rcx, dword ptr arg(4)           ;output_height
    208     movsxd      rdx, dword ptr arg(3)           ;output_pitch
    209 
    210 .filter_block1d16_h6_rowloop_ssse3:
            ; ---- left half: output pixels 0..7 (accumulated in xmm0) ----
    211     movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
    212 
    213     movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
    214 
    215     punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
    216 
    217     movdqa      xmm1,   xmm0
    218     pmaddubsw   xmm0,   xmm4                    ; * k0_k5
    219 
    220     movdqa      xmm2,   xmm1
    221     pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
    222 
    223     pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
            ; ---- right half loads are interleaved with the left-half math ----
    224     movq        xmm3,   MMWORD PTR [rsi +  6]   ;  6  7  8  9 10 11 12 13
    225 
    226     pmaddubsw   xmm1,   xmm5                    ; * k2_k4
    227     movq        xmm7,   MMWORD PTR [rsi + 11]   ; 11 12 13 14 15 16 17 18
    228 
    229     pmaddubsw   xmm2,   xmm6                    ; * k1_k3
    230     punpcklbw   xmm3,   xmm7
    231 
    232     paddsw      xmm0,   xmm1
    233     movdqa      xmm1,   xmm3
    234 
    235     pmaddubsw   xmm3,   xmm4                    ; right half * k0_k5
    236     paddsw      xmm0,   xmm2
    237 
    238     movdqa      xmm2,   xmm1
    239     paddsw      xmm0,   [GLOBAL(rd)]            ; add rounding
    240 
    241     pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
    242     pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
    243 
    244     psraw       xmm0,   VP8_FILTER_SHIFT
    245     pmaddubsw   xmm1,   xmm5                    ; right half * k2_k4
    246 
    247     pmaddubsw   xmm2,   xmm6                    ; right half * k1_k3
    248     packuswb    xmm0,   xmm0
    249 
    250     lea         rsi,    [rsi + rax]
    251     paddsw      xmm3,   xmm1
    252 
    253     paddsw      xmm3,   xmm2
    254 
    255     paddsw      xmm3,   [GLOBAL(rd)]            ; add rounding
    256 
    257     psraw       xmm3,   VP8_FILTER_SHIFT
    258 
    259     packuswb    xmm3,   xmm3
    260 
    261     punpcklqdq  xmm0,   xmm3                    ; low qword = pixels 0..7, high = 8..15
    262 
    263     movdqa      XMMWORD Ptr [rdi], xmm0         ; aligned 16-byte store
    264 
    265     lea         rdi,    [rdi + rdx]
    266     dec         rcx
    267     jnz         .filter_block1d16_h6_rowloop_ssse3
    268 
    269     ; begin epilog
    270     pop rdi
    271     pop rsi
    272     RESTORE_GOT
    273     RESTORE_XMM
    274     UNSHADOW_ARGS
    275     pop         rbp
    276     ret
    277 
    278 ;void vp8_filter_block1d4_h6_ssse3
    279 ;(
    280 ;    unsigned char  *src_ptr,
    281 ;    unsigned int    src_pixels_per_line,
    282 ;    unsigned char  *output_ptr,
    283 ;    unsigned int    output_pitch,
    284 ;    unsigned int    output_height,
    285 ;    unsigned int    vp8_filter_index
    286 ;)
        ;
        ; Horizontal 6-tap (or 4-tap) filter producing 4 output pixels per row,
        ; one row per loop iteration.
        ;
        ; vp8_filter_index * 16 selects a 16-byte entry in k0_k5; the matching
        ; k1_k3 and k2_k4 entries are read 128 and 256 bytes past that address.
        ; When the first dword of the k0_k5 entry is zero, the outer taps are
        ; skipped and the 4-tap loop is used instead.
        ;
        ; Every row does one unaligned 16-byte load at src_ptr - 2, i.e. it
        ; reads src_ptr[-2 .. 13], and stores the low 4 packed bytes at the
        ; current output row. Each result is (sum + rd) >> VP8_FILTER_SHIFT,
        ; packed to bytes with unsigned saturation.
        ;
        ; Both loops are bottom-tested, so output_height must be at least 1.
    287 global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
    288 sym(vp8_filter_block1d4_h6_ssse3):
    289     push        rbp
    290     mov         rbp, rsp
    291     SHADOW_ARGS_TO_STACK 6
    292     SAVE_XMM 7
    293     GET_GOT     rbx
    294     push        rsi
    295     push        rdi
    296     ; end prolog
    297 
    298     movsxd      rdx, DWORD PTR arg(5)   ;table index
    299     xor         rsi, rsi                ;esi = 0 for the zero test below
    300     shl         rdx, 4                  ;16 bytes per table entry
    301 
    302     lea         rax, [GLOBAL(k0_k5)]
    303     add         rax, rdx
    304     movdqa      xmm7, [GLOBAL(rd)]      ;rounding constant, kept for both loops
    305 
    306     cmp         esi, DWORD PTR [rax]    ;first dword of the k0_k5 entry == 0?
    307     je          .vp8_filter_block1d4_h4_ssse3
    308 
    309     movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    310     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    311     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    312 
    313     mov         rsi, arg(0)             ;src_ptr
    314     mov         rdi, arg(2)             ;output_ptr
    315     movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    316     movsxd      rcx, dword ptr arg(4)   ;output_height
    317 
    318     movsxd      rdx, dword ptr arg(3)   ;output_pitch
    319 
    320 ;xmm3 free
    321 .filter_block1d4_h6_rowloop_ssse3:
    322     movdqu      xmm0,   XMMWORD PTR [rsi - 2]
    323 
    324     movdqa      xmm1, xmm0
    325     pshufb      xmm0, [GLOBAL(shuf1b)]
    326 
    327     movdqa      xmm2, xmm1
    328     pshufb      xmm1, [GLOBAL(shuf2b)]
    329     pmaddubsw   xmm0, xmm4              ; shuf1b pairs * k0_k5
    330     pshufb      xmm2, [GLOBAL(shuf3b)]
    331     pmaddubsw   xmm1, xmm5              ; shuf2b pairs * k2_k4
    332 
    333 ;--
    334     pmaddubsw   xmm2, xmm6              ; shuf3b pairs * k1_k3
    335 
    336     lea         rsi,    [rsi + rax]
    337 ;--
    338     paddsw      xmm0, xmm1
    339     paddsw      xmm0, xmm7              ; add rounding
    341     paddsw      xmm0, xmm2
    342     psraw       xmm0, VP8_FILTER_SHIFT
    343     packuswb    xmm0, xmm0
    344 
    345     movd        DWORD PTR [rdi], xmm0
    346 
    347     add         rdi, rdx
    348     dec         rcx
    349     jnz         .filter_block1d4_h6_rowloop_ssse3
    350 
    351     ; begin epilog
    352     pop rdi
    353     pop rsi
    354     RESTORE_GOT
    355     RESTORE_XMM
    356     UNSHADOW_ARGS
    357     pop         rbp
    358     ret
    359 
        ; 4-tap path: k0_k5 is not loaded or applied. rax still points at the
        ; selected k0_k5 entry. The two shuffle masks are hoisted into xmm0 and
        ; xmm3, so the row data lives in xmm1/xmm2 here.
    360 .vp8_filter_block1d4_h4_ssse3:
    361     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    362     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    363     movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
    364     movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
    365 
    366     mov         rsi, arg(0)             ;src_ptr
    367     mov         rdi, arg(2)             ;output_ptr
    368     movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    369     movsxd      rcx, dword ptr arg(4)   ;output_height
    370 
    371     movsxd      rdx, dword ptr arg(3)   ;output_pitch
    372 
    373 .filter_block1d4_h4_rowloop_ssse3:
    374     movdqu      xmm1,   XMMWORD PTR [rsi - 2]
    375 
    376     movdqa      xmm2, xmm1
    377     pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]
    378     pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]
    379     pmaddubsw   xmm1, xmm5              ; * k2_k4
    380 
    381 ;--
    382     pmaddubsw   xmm2, xmm6              ; * k1_k3
    383 
    384     lea         rsi,    [rsi + rax]
    385 ;--
    386     paddsw      xmm1, xmm7              ; add rounding
    387     paddsw      xmm1, xmm2
    388     psraw       xmm1, VP8_FILTER_SHIFT
    389     packuswb    xmm1, xmm1
    390 
    391     movd        DWORD PTR [rdi], xmm1
    392 
    393     add         rdi, rdx
    394     dec         rcx
    395     jnz         .filter_block1d4_h4_rowloop_ssse3
    396 
    397     ; begin epilog
    398     pop rdi
    399     pop rsi
    400     RESTORE_GOT
    401     RESTORE_XMM
    402     UNSHADOW_ARGS
    403     pop         rbp
    404     ret
    405 
    406 
    407 
    408 ;void vp8_filter_block1d16_v6_ssse3
    409 ;(
    410 ;    unsigned char *src_ptr,
    411 ;    unsigned int   src_pitch,
    412 ;    unsigned char *output_ptr,
    413 ;    unsigned int   out_pitch,
    414 ;    unsigned int   output_height,
    415 ;    unsigned int   vp8_filter_index
    416 ;)
        ;
        ; Vertical 6-tap (or 4-tap) filter producing 16 output pixels per row,
        ; one output row per loop iteration, computed as two 8-pixel halves.
        ;
        ; The six input rows A..F of an output row are read at
        ; src_ptr + 0..5 * src_pitch, so src_ptr addresses the topmost tap row.
        ; rax tracks rsi + src_pitch, which lets rows D and F be addressed as
        ; [rax + rdx*2] and [rax + rdx*4]. Rows are interleaved bytewise as
        ; (B,D), (C,E) and (A,F) and multiplied by k1_k3, k2_k4 and k0_k5.
        ;
        ; vp8_filter_index * 16 selects a 16-byte entry in k0_k5; the matching
        ; k1_k3 and k2_k4 entries are read 128 and 256 bytes past that address.
        ; When the first dword of the k0_k5 entry is zero, rows A and F are
        ; not read and the 4-tap loop is used instead.
        ;
        ; Each result is (sum + rd) >> VP8_FILTER_SHIFT, packed to bytes with
        ; unsigned saturation. Both paths write each row as two 8-byte stores,
        ; so neither path needs output_ptr to be 16-byte aligned.
        ;
        ; On the non-32-bit ABI out_pitch is kept in r8; on the 32-bit ABI it is
        ; re-read from the argument slot every row.
        ;
        ; Both loops are bottom-tested, so output_height must be at least 1.
    417 global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
    418 sym(vp8_filter_block1d16_v6_ssse3):
    419     push        rbp
    420     mov         rbp, rsp
    421     SHADOW_ARGS_TO_STACK 6
    422     SAVE_XMM 7
    423     GET_GOT     rbx
    424     push        rsi
    425     push        rdi
    426     ; end prolog
    427 
    428     movsxd      rdx, DWORD PTR arg(5)   ;table index
    429     xor         rsi, rsi                ;esi = 0 for the zero test below
    430     shl         rdx, 4                  ;16 bytes per table entry
    431 
    432     lea         rax, [GLOBAL(k0_k5)]
    433     add         rax, rdx
    434 
    435     cmp         esi, DWORD PTR [rax]    ;first dword of the k0_k5 entry == 0?
    436     je          .vp8_filter_block1d16_v4_ssse3
    437 
    438     movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    439     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    440     movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    441 
    442     mov         rsi, arg(0)             ;src_ptr
    443     movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    444     mov         rdi, arg(2)             ;output_ptr
    445 
    446 %if ABI_IS_32BIT=0
    447     movsxd      r8, DWORD PTR arg(3)    ;out_pitch
    448 %endif
    449     mov         rax, rsi
    450     movsxd      rcx, DWORD PTR arg(4)   ;output_height
    451     add         rax, rdx                ;rax = rsi + pitch (addresses rows D and F)
    452 
    453 
    454 .vp8_filter_block1d16_v6_ssse3_loop:
            ; ---- left half: columns 0..7 ----
    455     movq        xmm1, MMWORD PTR [rsi]                  ;A
    456     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    457     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    458     movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    459     movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
    460 
    461     punpcklbw   xmm2, xmm4                  ;B D
    462     punpcklbw   xmm3, xmm0                  ;C E
    463 
    464     movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
    465 
    466     pmaddubsw   xmm3, xmm6
    467     punpcklbw   xmm1, xmm0                  ;A F
    468     pmaddubsw   xmm2, xmm7
    469     pmaddubsw   xmm1, xmm5
    470 
    471     paddsw      xmm2, xmm3
    472     paddsw      xmm2, xmm1
    473     paddsw      xmm2, [GLOBAL(rd)]
    474     psraw       xmm2, VP8_FILTER_SHIFT
    475     packuswb    xmm2, xmm2
    476 
    477     movq        MMWORD PTR [rdi], xmm2          ;store the results
    478 
            ; ---- right half: columns 8..15 ----
    479     movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
    480     movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
    481     movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
    482     movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
    483     movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
    484 
    485     punpcklbw   xmm2, xmm4                  ;B D
    486     punpcklbw   xmm3, xmm0                  ;C E
    487 
    488     movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
    489     pmaddubsw   xmm3, xmm6
    490     punpcklbw   xmm1, xmm0                  ;A F
    491     pmaddubsw   xmm2, xmm7
    492     pmaddubsw   xmm1, xmm5
    493 
    494     add         rsi,  rdx
    495     add         rax,  rdx
    496 ;--
    497 ;--
    498     paddsw      xmm2, xmm3
    499     paddsw      xmm2, xmm1
    500     paddsw      xmm2, [GLOBAL(rd)]
    501     psraw       xmm2, VP8_FILTER_SHIFT
    502     packuswb    xmm2, xmm2
    503 
    504     movq        MMWORD PTR [rdi+8], xmm2
    505 
    506 %if ABI_IS_32BIT
    507     add         rdi,        DWORD PTR arg(3) ;out_pitch
    508 %else
    509     add         rdi,        r8
    510 %endif
    511     dec         rcx
    512     jnz         .vp8_filter_block1d16_v6_ssse3_loop
    513 
    514     ; begin epilog
    515     pop rdi
    516     pop rsi
    517     RESTORE_GOT
    518     RESTORE_XMM
    519     UNSHADOW_ARGS
    520     pop         rbp
    521     ret
    522 
        ; 4-tap path: k0_k5 is not loaded and rows A and F are never read.
        ; rax still points at the selected k0_k5 entry on entry.
    523 .vp8_filter_block1d16_v4_ssse3:
    524     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    525     movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    526 
    527     mov         rsi, arg(0)             ;src_ptr
    528     movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    529     mov         rdi, arg(2)             ;output_ptr
    530 
    531 %if ABI_IS_32BIT=0
    532     movsxd      r8, DWORD PTR arg(3)    ;out_pitch
    533 %endif
    534     mov         rax, rsi
    535     movsxd      rcx, DWORD PTR arg(4)   ;output_height
    536     add         rax, rdx                ;rax = rsi + pitch (addresses row D)
    537 
    538 .vp8_filter_block1d16_v4_ssse3_loop:
            ; ---- left half: columns 0..7 (result in xmm2) ----
    539     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    540     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    541     movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    542     movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
    543 
    544     punpcklbw   xmm2, xmm4                  ;B D
    545     punpcklbw   xmm3, xmm0                  ;C E
    546 
    547     pmaddubsw   xmm3, xmm6
    548     pmaddubsw   xmm2, xmm7
            ; ---- right half: columns 8..15 (result in xmm5, free on this path) ----
    549     movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
    550     movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
    551     movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
    552     movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
    553 
    554     paddsw      xmm2, [GLOBAL(rd)]
    555     paddsw      xmm2, xmm3
    556     psraw       xmm2, VP8_FILTER_SHIFT
    557     packuswb    xmm2, xmm2
    558 
    559     punpcklbw   xmm5, xmm4                  ;B D
    560     punpcklbw   xmm1, xmm0                  ;C E
    561 
    562     pmaddubsw   xmm1, xmm6
    563     pmaddubsw   xmm5, xmm7
    564 
    565     movdqa      xmm4, [GLOBAL(rd)]
    566     add         rsi,  rdx
    567     add         rax,  rdx
    568 ;--
    569 ;--
    570     paddsw      xmm5, xmm1
    571     paddsw      xmm5, xmm4
    572     psraw       xmm5, VP8_FILTER_SHIFT
    573     packuswb    xmm5, xmm5
    574 
            ; Store the two 8-byte halves separately, exactly as the 6-tap loop
            ; does, so the alignment requirement on output_ptr does not depend
            ; on which filter index was selected.
            movq        MMWORD PTR [rdi], xmm2
            movq        MMWORD PTR [rdi+8], xmm5
    577 
    578 %if ABI_IS_32BIT
    579     add         rdi,        DWORD PTR arg(3) ;out_pitch
    580 %else
    581     add         rdi,        r8
    582 %endif
    583     dec         rcx
    584     jnz         .vp8_filter_block1d16_v4_ssse3_loop
    585 
    586     ; begin epilog
    587     pop rdi
    588     pop rsi
    589     RESTORE_GOT
    590     RESTORE_XMM
    591     UNSHADOW_ARGS
    592     pop         rbp
    593     ret
    594 
    595 
    596 ;void vp8_filter_block1d8_v6_ssse3
    597 ;(
    598 ;    unsigned char *src_ptr,
    599 ;    unsigned int   src_pitch,
    600 ;    unsigned char *output_ptr,
    601 ;    unsigned int   out_pitch,
    602 ;    unsigned int   output_height,
    603 ;    unsigned int   vp8_filter_index
    604 ;)
        ;
        ; Vertical 6-tap (or 4-tap) filter producing 8 output pixels per row,
        ; one output row per loop iteration.
        ;
        ; The six input rows A..F of an output row are read at
        ; src_ptr + 0..5 * src_pitch, so src_ptr addresses the topmost tap row.
        ; rax tracks rsi + src_pitch, which lets rows D and F be addressed as
        ; [rax + rdx*2] and [rax + rdx*4]. Rows are interleaved bytewise as
        ; (B,D), (C,E) and (A,F) and multiplied by k1_k3, k2_k4 and k0_k5.
        ;
        ; vp8_filter_index * 16 selects a 16-byte entry in k0_k5; the matching
        ; k1_k3 and k2_k4 entries are read 128 and 256 bytes past that address.
        ; When the first dword of the k0_k5 entry is zero, rows A and F are
        ; not read and the 4-tap loop is used instead.
        ;
        ; Each result is (sum + rd) >> 7, packed to bytes with unsigned
        ; saturation and written with an 8-byte store.
        ;
        ; On the non-32-bit ABI out_pitch is kept in r8; on the 32-bit ABI it is
        ; re-read from the argument slot every row.
        ;
        ; Both loops are bottom-tested, so output_height must be at least 1.
    605 global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
    606 sym(vp8_filter_block1d8_v6_ssse3):
    607     push        rbp
    608     mov         rbp, rsp
    609     SHADOW_ARGS_TO_STACK 6
    610     SAVE_XMM 7
    611     GET_GOT     rbx
    612     push        rsi
    613     push        rdi
    614     ; end prolog
    615 
    616     movsxd      rdx, DWORD PTR arg(5)   ;table index
            ; rsi is zeroed only so esi can serve as the 0 operand of the cmp below
    617     xor         rsi, rsi
    618     shl         rdx, 4      ;
    619 
    620     lea         rax, [GLOBAL(k0_k5)]
    621     add         rax, rdx
    622 
            ; rdx, rdi, r8 and rcx are set up before the branch and are shared
            ; by the 6-tap and 4-tap paths
    623     movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    624     mov         rdi, arg(2)             ;output_ptr
    625 %if ABI_IS_32BIT=0
    626     movsxd      r8, DWORD PTR arg(3)    ; out_pitch
    627 %endif
    628     movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
    629 
            ; first dword of the selected k0_k5 entry == 0 -> take the 4-tap path
    630     cmp         esi, DWORD PTR [rax]
    631     je          .vp8_filter_block1d8_v4_ssse3
    632 
    633     movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    634     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    635     movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    636 
    637     mov         rsi, arg(0)             ;src_ptr
    638 
            ; rax = rsi + pitch, used to address rows D and F
    639     mov         rax, rsi
    640     add         rax, rdx
    641 
    642 .vp8_filter_block1d8_v6_ssse3_loop:
    643     movq        xmm1, MMWORD PTR [rsi]                  ;A
    644     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    645     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    646     movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    647     movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
    648 
    649     punpcklbw   xmm2, xmm4                  ;B D
    650     punpcklbw   xmm3, xmm0                  ;C E
    651 
    652     movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
            ; row D has been consumed above, so xmm4 is reused for the rounding
            ; constant; it is reloaded on every iteration for that reason
    653     movdqa      xmm4, [GLOBAL(rd)]
    654 
    655     pmaddubsw   xmm3, xmm6
    656     punpcklbw   xmm1, xmm0                  ;A F
    657     pmaddubsw   xmm2, xmm7
    658     pmaddubsw   xmm1, xmm5
    659     add         rsi,  rdx
    660     add         rax,  rdx
    661 ;--
    662 ;--
    663     paddsw      xmm2, xmm3
    664     paddsw      xmm2, xmm1
    665     paddsw      xmm2, xmm4
    666     psraw       xmm2, 7
    667     packuswb    xmm2, xmm2
    668 
    669     movq        MMWORD PTR [rdi], xmm2
    670 
    671 %if ABI_IS_32BIT
    672     add         rdi,        DWORD PTR arg(3) ;[out_pitch]
    673 %else
    674     add         rdi,        r8
    675 %endif
    676     dec         rcx
    677     jnz         .vp8_filter_block1d8_v6_ssse3_loop
    678 
    679     ; begin epilog
    680     pop rdi
    681     pop rsi
    682     RESTORE_GOT
    683     RESTORE_XMM
    684     UNSHADOW_ARGS
    685     pop         rbp
    686     ret
    687 
        ; 4-tap path: k0_k5 is not loaded, which frees xmm5 to hold the rounding
        ; constant for the whole loop. rax still points at the selected k0_k5
        ; entry on entry; rdx, rdi, r8 and rcx were set up before the branch.
    688 .vp8_filter_block1d8_v4_ssse3:
    689     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    690     movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    691     movdqa      xmm5, [GLOBAL(rd)]
    692 
    693     mov         rsi, arg(0)             ;src_ptr
    694 
            ; rax = rsi + pitch, used to address row D
    695     mov         rax, rsi
    696     add         rax, rdx
    697 
    698 .vp8_filter_block1d8_v4_ssse3_loop:
    699     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    700     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    701     movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    702     movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
    703 
    704     punpcklbw   xmm2, xmm4                  ;B D
    705     punpcklbw   xmm3, xmm0                  ;C E
    706 
    707     pmaddubsw   xmm3, xmm6
    708     pmaddubsw   xmm2, xmm7
    709     add         rsi,  rdx
    710     add         rax,  rdx
    711 ;--
    712 ;--
    713     paddsw      xmm2, xmm3
    714     paddsw      xmm2, xmm5
    715     psraw       xmm2, 7
    716     packuswb    xmm2, xmm2
    717 
    718     movq        MMWORD PTR [rdi], xmm2
    719 
    720 %if ABI_IS_32BIT
    721     add         rdi,        DWORD PTR arg(3) ;[out_pitch]
    722 %else
    723     add         rdi,        r8
    724 %endif
    725     dec         rcx
    726     jnz         .vp8_filter_block1d8_v4_ssse3_loop
    727 
    728     ; begin epilog
    729     pop rdi
    730     pop rsi
    731     RESTORE_GOT
    732     RESTORE_XMM
    733     UNSHADOW_ARGS
    734     pop         rbp
    735     ret
    736 ;void vp8_filter_block1d4_v6_ssse3
    737 ;(
    738 ;    unsigned char *src_ptr,
    739 ;    unsigned int   src_pitch,
    740 ;    unsigned char *output_ptr,
    741 ;    unsigned int   out_pitch,
    742 ;    unsigned int   output_height,
    743 ;    unsigned int   vp8_filter_index
    744 ;)
        ;
        ; Vertical 6-tap (or 4-tap) filter producing 4 output pixels per row,
        ; one output row per loop iteration. Unlike the wider vertical routines
        ; this one works entirely in the 64-bit MMX registers mm0..mm7, so the
        ; prologue has no SAVE_XMM and only the low 8 bytes of each 16-byte
        ; coefficient / rounding table entry are loaded.
        ;
        ; The six input rows A..F of an output row are read (4 bytes each) at
        ; src_ptr + 0..5 * src_pitch, so src_ptr addresses the topmost tap row.
        ; rax tracks rsi + src_pitch, which lets rows D and F be addressed as
        ; [rax + rdx*2] and [rax + rdx*4]. Rows are interleaved bytewise as
        ; (B,D), (C,E) and (A,F) and multiplied by k1_k3, k2_k4 and k0_k5.
        ;
        ; vp8_filter_index * 16 selects an entry in k0_k5; the matching k1_k3
        ; and k2_k4 entries are read 128 and 256 bytes past that address. When
        ; the first dword of the k0_k5 entry is zero, rows A and F are not read
        ; and the 4-tap loop is used instead.
        ;
        ; Each result is (sum + rd) >> 7, packed to bytes with unsigned
        ; saturation; the low 4 bytes are stored.
        ;
        ; NOTE(review): no emms is executed before returning, so the MMX/x87
        ; state is left in use - presumably callers clear it themselves before
        ; any x87 floating-point code runs; confirm against the call sites.
        ;
        ; Both loops are bottom-tested, so output_height must be at least 1.
    745 global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
    746 sym(vp8_filter_block1d4_v6_ssse3):
    747     push        rbp
    748     mov         rbp, rsp
    749     SHADOW_ARGS_TO_STACK 6
    750     GET_GOT     rbx
    751     push        rsi
    752     push        rdi
    753     ; end prolog
    754 
    755     movsxd      rdx, DWORD PTR arg(5)   ;table index
            ; rsi is zeroed only so esi can serve as the 0 operand of the cmp below
    756     xor         rsi, rsi
    757     shl         rdx, 4      ;
    758 
    759     lea         rax, [GLOBAL(k0_k5)]
    760     add         rax, rdx
    761 
            ; rdx, rdi, r8 and rcx are set up before the branch and are shared
            ; by the 6-tap and 4-tap paths
    762     movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    763     mov         rdi, arg(2)             ;output_ptr
    764 %if ABI_IS_32BIT=0
    765     movsxd      r8, DWORD PTR arg(3)    ; out_pitch
    766 %endif
    767     movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
    768 
            ; first dword of the selected k0_k5 entry == 0 -> take the 4-tap path
    769     cmp         esi, DWORD PTR [rax]
    770     je          .vp8_filter_block1d4_v4_ssse3
    771 
    772     movq        mm5, MMWORD PTR [rax]         ;k0_k5
    773     movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
    774     movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
    775 
    776     mov         rsi, arg(0)             ;src_ptr
    777 
            ; rax = rsi + pitch, used to address rows D and F
    778     mov         rax, rsi
    779     add         rax, rdx
    780 
    781 .vp8_filter_block1d4_v6_ssse3_loop:
    782     movd        mm1, DWORD PTR [rsi]                  ;A
    783     movd        mm2, DWORD PTR [rsi + rdx]            ;B
    784     movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
    785     movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
    786     movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
    787 
    788     punpcklbw   mm2, mm4                  ;B D
    789     punpcklbw   mm3, mm0                  ;C E
    790 
    791     movd        mm0, DWORD PTR [rax + rdx * 4]        ;F
    792 
            ; row D has been consumed above, so mm4 is reused for the rounding
            ; constant; it is reloaded on every iteration for that reason
    793     movq        mm4, [GLOBAL(rd)]
    794 
    795     pmaddubsw   mm3, mm6
    796     punpcklbw   mm1, mm0                  ;A F
    797     pmaddubsw   mm2, mm7
    798     pmaddubsw   mm1, mm5
    799     add         rsi,  rdx
    800     add         rax,  rdx
    801 ;--
    802 ;--
    803     paddsw      mm2, mm3
    804     paddsw      mm2, mm1
    805     paddsw      mm2, mm4
    806     psraw       mm2, 7
    807     packuswb    mm2, mm2
    808 
    809     movd        DWORD PTR [rdi], mm2
    810 
    811 %if ABI_IS_32BIT
    812     add         rdi,        DWORD PTR arg(3) ;[out_pitch]
    813 %else
    814     add         rdi,        r8
    815 %endif
    816     dec         rcx
    817     jnz         .vp8_filter_block1d4_v6_ssse3_loop
    818 
    819     ; begin epilog
    820     pop rdi
    821     pop rsi
    822     RESTORE_GOT
    823     UNSHADOW_ARGS
    824     pop         rbp
    825     ret
    826 
        ; 4-tap path: k0_k5 is not loaded, which frees mm5 to hold the rounding
        ; constant for the whole loop. rax still points at the selected k0_k5
        ; entry on entry; rdx, rdi, r8 and rcx were set up before the branch.
    827 .vp8_filter_block1d4_v4_ssse3:
    828     movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
    829     movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
    830     movq        mm5, MMWORD PTR [GLOBAL(rd)]
    831 
    832     mov         rsi, arg(0)             ;src_ptr
    833 
            ; rax = rsi + pitch, used to address row D
    834     mov         rax, rsi
    835     add         rax, rdx
    836 
    837 .vp8_filter_block1d4_v4_ssse3_loop:
    838     movd        mm2, DWORD PTR [rsi + rdx]            ;B
    839     movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
    840     movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
    841     movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
    842 
    843     punpcklbw   mm2, mm4                  ;B D
    844     punpcklbw   mm3, mm0                  ;C E
    845 
    846     pmaddubsw   mm3, mm6
    847     pmaddubsw   mm2, mm7
    848     add         rsi,  rdx
    849     add         rax,  rdx
    850 ;--
    851 ;--
    852     paddsw      mm2, mm3
    853     paddsw      mm2, mm5
    854     psraw       mm2, 7
    855     packuswb    mm2, mm2
    856 
    857     movd        DWORD PTR [rdi], mm2
    858 
    859 %if ABI_IS_32BIT
    860     add         rdi,        DWORD PTR arg(3) ;[out_pitch]
    861 %else
    862     add         rdi,        r8
    863 %endif
    864     dec         rcx
    865     jnz         .vp8_filter_block1d4_v4_ssse3_loop
    866 
    867     ; begin epilog
    868     pop rdi
    869     pop rsi
    870     RESTORE_GOT
    871     UNSHADOW_ARGS
    872     pop         rbp
    873     ret
    874 
    875 ;void vp8_bilinear_predict16x16_ssse3
    876 ;(
    877 ;    unsigned char  *src_ptr,
    878 ;    int   src_pixels_per_line,
    879 ;    int  xoffset,
    880 ;    int  yoffset,
    881 ;    unsigned char *dst_ptr,
    882 ;    int dst_pitch
    883 ;)
        ;
        ; Bilinear interpolation of a 16x16 block.
        ;   src_ptr              arg(0)  first source pixel
        ;   src_pixels_per_line  arg(1)  source stride in bytes
        ;   xoffset              arg(2)  horizontal filter index
        ;   yoffset              arg(3)  vertical filter index
        ;   dst_ptr              arg(4)  destination; rows are stored with
        ;                                movdqa, so each row must be 16-byte
        ;                                aligned
        ;   dst_pitch            arg(5)  destination stride in bytes
        ;
        ; xoffset / yoffset are scaled by 16 and added to
        ; vp8_bilinear_filters_ssse3 (eight 16-byte entries) to pick the
        ; byte-pair filter fed to pmaddubsw. Every filtered word is rounded
        ; with "+ rd, >> VP8_FILTER_SHIFT" and packed back to unsigned bytes.
        ;
        ; Code paths:
        ;   xoffset != 0, yoffset != 0 : horizontal, then vertical (.next_row)
        ;   xoffset == 0               : vertical only        (.next_row_sp)
        ;   xoffset != 0, yoffset == 0 : horizontal only      (.next_row_fp)
        ;
        ; NOTE(review): the xoffset == 0 path never tests yoffset. Table entry
        ; 0 begins with the byte 128, which pmaddubsw interprets as a signed
        ; -128, so xoffset == yoffset == 0 would not produce a copy of the
        ; source. Presumably callers handle that case themselves - confirm.
    884 global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
    885 sym(vp8_bilinear_predict16x16_ssse3):
    886     push        rbp
    887     mov         rbp, rsp
    888     SHADOW_ARGS_TO_STACK 6
    889     SAVE_XMM 7
    890     GET_GOT     rbx
    891     push        rsi
    892     push        rdi
    893     ; end prolog
    894 
    895         lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)] ; rcx = filter table base
    896         movsxd      rax,        dword ptr arg(2)    ; xoffset
    897 
    898         cmp         rax,        0                   ; skip first_pass filter if xoffset=0
    899         je          .b16x16_sp_only
    900 
    901         shl         rax,        4                   ; xoffset * 16: one 16-byte table entry per index
    902         lea         rax,        [rax + rcx]         ; HFilter
    903 
    904         mov         rdi,        arg(4)              ; dst_ptr
    905         mov         rsi,        arg(0)              ; src_ptr
    906         movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
    907 
    908         movdqa      xmm1,       [rax]               ; xmm1 = horizontal filter byte pairs
    909 
    910         movsxd      rax,        dword ptr arg(3)    ; yoffset
    911 
    912         cmp         rax,        0                   ; skip second_pass filter if yoffset=0
    913         je          .b16x16_fp_only
    914 
    915         shl         rax,        4                   ; yoffset * 16
    916         lea         rax,        [rax + rcx]         ; VFilter
    917 
    918         lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8*dst_pitch ...
    919         lea         rcx,        [rcx+rdx*8]         ; ... + 8*dst_pitch: loop end marker after 16 rows
    920         movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line (rdx is the source stride from here on)
    921 
    922         movdqa      xmm2,       [rax]               ; xmm2 = vertical filter byte pairs
    923 
    924 %if ABI_IS_32BIT=0
    925         movsxd      r8,         dword ptr arg(5)    ; dst_pitch
    926 %endif
        ; Horizontal pass over source row 0; the packed result is kept in
        ; xmm7 as the "previous row" for the first vertical step.
    927         movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
    928         movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
    929 
    930         punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    931         movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
    932 
    933         movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
    934 
    935         lea         rsi,        [rsi + rdx]         ; next line
    936 
    937         pmaddubsw   xmm3,       xmm1                ; 8 words: filtered pixels 00..07
    938 
    939         punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
    940         pmaddubsw   xmm4,       xmm1                ; 8 words: filtered pixels 08..15
    941 
    942         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    943         psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128
    944 
    945         paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
    946         psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
    947 
    948         movdqa      xmm7,       xmm3
    949         packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    950 
        ; Each iteration: horizontally filter the next source row into xmm6,
        ; blend it vertically with the previous filtered row (xmm7), store
        ; one 16-byte output row, then keep xmm6 as the new previous row.
    951 .next_row:
    952         movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
    953         movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
    954 
    955         punpcklbw   xmm6,       xmm5
    956         movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
    957 
    958         movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
    959         lea         rsi,        [rsi + rdx]         ; next line
    960 
    961         pmaddubsw   xmm6,       xmm1                ; 8 words: filtered pixels 00..07
    962 
    963         punpcklbw   xmm4,       xmm5
    964         pmaddubsw   xmm4,       xmm1                ; 8 words: filtered pixels 08..15
    965 
    966         paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
    967         psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128
    968 
    969         paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
    970         psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
    971 
    972         packuswb    xmm6,       xmm4                ; xmm6 = current row, 16 filtered bytes
    973         movdqa      xmm5,       xmm7                ; xmm5 = copy of previous filtered row
    974 
    975         punpcklbw   xmm5,       xmm6                ; interleave previous/current, pixels 00..07
    976         pmaddubsw   xmm5,       xmm2                ; vertical filter, pixels 00..07
    977 
    978         punpckhbw   xmm7,       xmm6                ; interleave previous/current, pixels 08..15
    979         pmaddubsw   xmm7,       xmm2                ; vertical filter, pixels 08..15
    980 
    981         paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
    982         psraw       xmm5,       VP8_FILTER_SHIFT    ; xmm5 /= 128
    983 
    984         paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
    985         psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128
    986 
    987         packuswb    xmm5,       xmm7                ; xmm5 = 16 output bytes
    988         movdqa      xmm7,       xmm6                ; current row becomes the previous row
    989 
    990         movdqa      [rdi],      xmm5                ; store the results in the destination
    991 %if ABI_IS_32BIT
    992         add         rdi,        DWORD PTR arg(5)    ; dst_pitch
    993 %else
    994         add         rdi,        r8
    995 %endif
    996 
    997         cmp         rdi,        rcx                 ; reached dst_ptr + 16*dst_pitch?
    998         jne         .next_row
    999 
   1000         jmp         .done
   1001 
        ; Vertical-only path, taken when xoffset == 0. yoffset is used as the
        ; table index without being tested for zero.
        ; Here rdx = dst_pitch and rax = source stride.
   1002 .b16x16_sp_only:
   1003         movsxd      rax,        dword ptr arg(3)    ; yoffset
   1004         shl         rax,        4                   ; yoffset * 16
   1005         lea         rax,        [rax + rcx]         ; VFilter
   1006 
   1007         mov         rdi,        arg(4)              ; dst_ptr
   1008         mov         rsi,        arg(0)              ; src_ptr
   1009         movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
   1010 
   1011         movdqa      xmm1,       [rax]               ; VFilter
   1012 
   1013         lea         rcx,        [rdi+rdx*8]
   1014         lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch (loop end marker)
   1015         movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
   1016 
   1017         ; get the first horizontal line done
   1018         movq        xmm4,       [rsi]               ; load row 0, bytes 0..7
   1019         movq        xmm2,       [rsi + 8]           ; load row 0, bytes 8..15
   1020 
   1021         lea         rsi,        [rsi + rax]         ; next line
        ; Two output rows per iteration. On entry xmm4 / xmm2 hold the low /
        ; high 8 bytes of the row above the first output row of this pass.
   1022 .next_row_sp:
   1023         movq        xmm3,       [rsi]               ; load row + 1, bytes 0..7
   1024         movq        xmm5,       [rsi + 8]           ; load row + 1, bytes 8..15
   1025 
   1026         punpcklbw   xmm4,       xmm3                ; interleave row / row + 1, bytes 0..7
   1027         punpcklbw   xmm2,       xmm5                ; interleave row / row + 1, bytes 8..15
   1028 
   1029         pmaddubsw   xmm4,       xmm1
   1030         movq        xmm7,       [rsi + rax]         ; load row + 2, bytes 0..7
   1031 
   1032         pmaddubsw   xmm2,       xmm1
   1033         movq        xmm6,       [rsi + rax + 8]     ; load row + 2, bytes 8..15
   1034 
   1035         punpcklbw   xmm3,       xmm7                ; interleave row + 1 / row + 2, bytes 0..7
   1036         punpcklbw   xmm5,       xmm6                ; interleave row + 1 / row + 2, bytes 8..15
   1037 
   1038         pmaddubsw   xmm3,       xmm1
   1039         paddw       xmm4,       [GLOBAL(rd)]
   1040 
   1041         pmaddubsw   xmm5,       xmm1
   1042         paddw       xmm2,       [GLOBAL(rd)]
   1043 
   1044         psraw       xmm4,       VP8_FILTER_SHIFT
   1045         psraw       xmm2,       VP8_FILTER_SHIFT
   1046 
   1047         packuswb    xmm4,       xmm2
   1048         paddw       xmm3,       [GLOBAL(rd)]
   1049 
   1050         movdqa      [rdi],      xmm4                ; store row 0
   1051         paddw       xmm5,       [GLOBAL(rd)]
   1052 
   1053         psraw       xmm3,       VP8_FILTER_SHIFT
   1054         psraw       xmm5,       VP8_FILTER_SHIFT
   1055 
   1056         packuswb    xmm3,       xmm5
   1057         movdqa      xmm4,       xmm7                ; row + 2 (low half) carries into the next iteration
   1058 
   1059         movdqa      [rdi + rdx],xmm3                ; store row 1
   1060         lea         rsi,        [rsi + 2*rax]       ; advance source by two rows
   1061 
   1062         movdqa      xmm2,       xmm6                ; row + 2 (high half) carries into the next iteration
   1063         lea         rdi,        [rdi + 2*rdx]       ; advance destination by two rows
   1064 
   1065         cmp         rdi,        rcx
   1066         jne         .next_row_sp
   1067 
   1068         jmp         .done
   1069 
        ; Horizontal-only path (xoffset != 0, yoffset == 0). Entered with
        ; rdi = dst_ptr, rsi = src_ptr, rdx = dst_pitch, xmm1 = HFilter.
   1070 .b16x16_fp_only:
   1071         lea         rcx,        [rdi+rdx*8]
   1072         lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch (loop end marker)
   1073         movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
   1074 
        ; Two independent rows are filtered per iteration: the first through
        ; xmm2/xmm3, the second through xmm5/xmm6.
   1075 .next_row_fp:
   1076         movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
   1077         movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
   1078 
   1079         punpcklbw   xmm2,       xmm4
   1080         movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15
   1081 
   1082         pmaddubsw   xmm2,       xmm1
   1083         movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16
   1084 
   1085         lea         rsi,        [rsi + rax]         ; next line
   1086         punpcklbw   xmm3,       xmm4
   1087 
   1088         pmaddubsw   xmm3,       xmm1
   1089         movq        xmm5,       [rsi]               ; second row: 00 .. 07
   1090 
   1091         paddw       xmm2,       [GLOBAL(rd)]
   1092         movq        xmm7,       [rsi+1]             ; second row: 01 .. 08
   1093 
   1094         movq        xmm6,       [rsi+8]             ; second row: 08 .. 15
   1095         psraw       xmm2,       VP8_FILTER_SHIFT
   1096 
   1097         punpcklbw   xmm5,       xmm7
   1098         movq        xmm7,       [rsi+9]             ; second row: 09 .. 16
   1099 
   1100         paddw       xmm3,       [GLOBAL(rd)]
   1101         pmaddubsw   xmm5,       xmm1
   1102 
   1103         psraw       xmm3,       VP8_FILTER_SHIFT
   1104         punpcklbw   xmm6,       xmm7
   1105 
   1106         packuswb    xmm2,       xmm3
   1107         pmaddubsw   xmm6,       xmm1
   1108 
   1109         movdqa      [rdi],      xmm2                ; store the results in the destination
   1110         paddw       xmm5,       [GLOBAL(rd)]
   1111 
   1112         lea         rdi,        [rdi + rdx]         ; dst_pitch
   1113         psraw       xmm5,       VP8_FILTER_SHIFT
   1114 
   1115         paddw       xmm6,       [GLOBAL(rd)]
   1116         psraw       xmm6,       VP8_FILTER_SHIFT
   1117 
   1118         packuswb    xmm5,       xmm6
   1119         lea         rsi,        [rsi + rax]         ; next line
   1120 
   1121         movdqa      [rdi],      xmm5                ; store the results in the destination
   1122         lea         rdi,        [rdi + rdx]         ; dst_pitch
   1123 
   1124         cmp         rdi,        rcx
   1125 
   1126         jne         .next_row_fp
   1127 
        ; Common exit for all three paths.
   1128 .done:
   1129     ; begin epilog
   1130     pop         rdi
   1131     pop         rsi
   1132     RESTORE_GOT
   1133     RESTORE_XMM
   1134     UNSHADOW_ARGS
   1135     pop         rbp
   1136     ret
   1137 
   1138 ;void vp8_bilinear_predict8x8_ssse3
   1139 ;(
   1140 ;    unsigned char  *src_ptr,
   1141 ;    int   src_pixels_per_line,
   1142 ;    int  xoffset,
   1143 ;    int  yoffset,
   1144 ;    unsigned char *dst_ptr,
   1145 ;    int dst_pitch
   1146 ;)
        ;
        ; Bilinear interpolation of an 8x8 block.
        ;   src_ptr              arg(0)  first source pixel
        ;   src_pixels_per_line  arg(1)  source stride in bytes
        ;   xoffset              arg(2)  horizontal filter index
        ;   yoffset              arg(3)  vertical filter index
        ;   dst_ptr              arg(4)  destination; rows are stored with
        ;                                8-byte movq
        ;   dst_pitch            arg(5)  destination stride in bytes
        ;
        ; Nine source rows are fetched with 16-byte movdqu loads and copied
        ; into a 144-byte stack buffer (only bytes 0..8 of each row feed the
        ; filters). The filter paths then read that buffer through rsp and
        ; advance rsp as they go. Every path must have raised rsp by exactly
        ; 144 bytes when it reaches .done8x8:
        ;   two-pass        : 16 + 8 iterations * 16
        ;   vertical only   : one "lea rsp, [rsp + 144]"
        ;   horizontal only : 2 iterations * 64, then + 16
        ; NOTE(review): "pop rsp" at .done8x8 presumably reloads the stack
        ; pointer saved by ALIGN_STACK - confirm against that macro.
        ;
        ; NOTE(review): as in the 16x16 routine, the xoffset == 0 path never
        ; tests yoffset, and table entry 0 begins with the byte 128 (signed
        ; -128 to pmaddubsw). Presumably xoffset == yoffset == 0 is never
        ; passed - confirm against the call sites.
   1147 global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
   1148 sym(vp8_bilinear_predict8x8_ssse3):
   1149     push        rbp
   1150     mov         rbp, rsp
   1151     SHADOW_ARGS_TO_STACK 6
   1152     SAVE_XMM 7
   1153     GET_GOT     rbx
   1154     push        rsi
   1155     push        rdi
   1156     ; end prolog
   1157 
   1158     ALIGN_STACK 16, rax
   1159     sub         rsp, 144                         ; reserve 144 bytes
   1160 
   1161         lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)] ; rcx = filter table base
   1162 
   1163         mov         rsi,        arg(0) ;src_ptr
   1164         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
   1165 
   1166     ;Read 9-line unaligned data in and put them on stack. This gives a big
   1167     ;performance boost.
   1168         movdqu      xmm0,       [rsi]               ; row 0
   1169         lea         rax,        [rdx + rdx*2]       ; rax = 3 * source stride
   1170         movdqu      xmm1,       [rsi+rdx]           ; row 1
   1171         movdqu      xmm2,       [rsi+rdx*2]         ; row 2
   1172         add         rsi,        rax
   1173         movdqu      xmm3,       [rsi]               ; row 3
   1174         movdqu      xmm4,       [rsi+rdx]           ; row 4
   1175         movdqu      xmm5,       [rsi+rdx*2]         ; row 5
   1176         add         rsi,        rax
   1177         movdqu      xmm6,       [rsi]               ; row 6
   1178         movdqu      xmm7,       [rsi+rdx]           ; row 7
   1179 
   1180         movdqa      XMMWORD PTR [rsp],            xmm0 ; spill row 0 so xmm0 can take row 8
   1181 
   1182         movdqu      xmm0,       [rsi+rdx*2]         ; row 8
   1183 
   1184         movdqa      XMMWORD PTR [rsp+16],         xmm1
   1185         movdqa      XMMWORD PTR [rsp+32],         xmm2
   1186         movdqa      XMMWORD PTR [rsp+48],         xmm3
   1187         movdqa      XMMWORD PTR [rsp+64],         xmm4
   1188         movdqa      XMMWORD PTR [rsp+80],         xmm5
   1189         movdqa      XMMWORD PTR [rsp+96],         xmm6
   1190         movdqa      XMMWORD PTR [rsp+112],        xmm7
   1191         movdqa      XMMWORD PTR [rsp+128],        xmm0
   1192 
   1193         movsxd      rax,        dword ptr arg(2)    ; xoffset
   1194         cmp         rax,        0                   ; skip first_pass filter if xoffset=0
   1195         je          .b8x8_sp_only
   1196 
   1197         shl         rax,        4                   ; xoffset * 16: one 16-byte table entry per index
   1198         add         rax,        rcx                 ; HFilter
   1199 
   1200         mov         rdi,        arg(4)              ; dst_ptr
   1201         movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
   1202 
   1203         movdqa      xmm0,       [rax]               ; xmm0 = horizontal filter byte pairs
   1204 
   1205         movsxd      rax,        dword ptr arg(3)    ; yoffset
   1206         cmp         rax,        0                   ; skip second_pass filter if yoffset=0
   1207         je          .b8x8_fp_only
   1208 
   1209         shl         rax,        4                   ; yoffset * 16
   1210         lea         rax,        [rax + rcx]         ; VFilter
   1211 
   1212         lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8*dst_pitch (loop end marker)
   1213 
   1214         movdqa      xmm1,       [rax]               ; xmm1 = vertical filter byte pairs
   1215 
   1216         ; get the first horizontal line done
   1217         movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
   1218         movdqa      xmm5,       xmm3                ; copy of row 0
   1219 
   1220         psrldq      xmm5,       1                   ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
   1221         lea         rsp,        [rsp + 16]          ; next line
   1222 
   1223         punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
   1224         pmaddubsw   xmm3,       xmm0                ; 8 words: filtered pixels 00..07
   1225 
   1226         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1227         psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128
   1228 
   1229         movdqa      xmm7,       xmm3
   1230         packuswb    xmm7,       xmm7                ; filtered bytes 00..07, duplicated in both halves
   1231 
        ; Each iteration: horizontally filter the next buffered row into xmm6,
        ; blend it vertically with the previous filtered row (xmm7), store
        ; 8 output bytes, then keep xmm6 as the new previous row.
   1232 .next_row:
   1233         movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
   1234         lea         rsp,        [rsp + 16]          ; next line
   1235 
   1236         movdqa      xmm5,       xmm6
   1237 
   1238         psrldq      xmm5,       1                   ; row shifted by one pixel
   1239 
   1240         punpcklbw   xmm6,       xmm5
   1241         pmaddubsw   xmm6,       xmm0
   1242 
   1243         paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
   1244         psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128
   1245 
   1246         packuswb    xmm6,       xmm6                ; xmm6 = current filtered row (8 bytes, duplicated)
   1247 
   1248         punpcklbw   xmm7,       xmm6                ; interleave previous/current filtered rows
   1249         pmaddubsw   xmm7,       xmm1                ; vertical filter
   1250 
   1251         paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
   1252         psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128
   1253 
   1254         packuswb    xmm7,       xmm7
   1255 
   1256         movq        [rdi],      xmm7                ; store the results in the destination
   1257         lea         rdi,        [rdi + rdx]
   1258 
   1259         movdqa      xmm7,       xmm6                ; current row becomes the previous row
   1260 
   1261         cmp         rdi,        rcx
   1262         jne         .next_row
   1263 
   1264         jmp         .done8x8
   1265 
        ; Vertical-only path, taken when xoffset == 0. Fully unrolled: the
        ; low 8 bytes of buffered rows 0..8 are combined pairwise into the
        ; eight output rows. yoffset is not tested for zero here.
   1266 .b8x8_sp_only:
   1267         movsxd      rax,        dword ptr arg(3)    ; yoffset
   1268         shl         rax,        4                   ; yoffset * 16
   1269         lea         rax,        [rax + rcx]         ; VFilter
   1270 
   1271         mov         rdi,        arg(4) ;dst_ptr
   1272         movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
   1273 
   1274         movdqa      xmm0,       [rax]               ; VFilter
   1275 
   1276         movq        xmm1,       XMMWORD PTR [rsp]        ; row 0
   1277         movq        xmm2,       XMMWORD PTR [rsp+16]     ; row 1
   1278 
   1279         movq        xmm3,       XMMWORD PTR [rsp+32]     ; row 2
   1280         punpcklbw   xmm1,       xmm2                ; rows 0/1 interleaved
   1281 
   1282         movq        xmm4,       XMMWORD PTR [rsp+48]     ; row 3
   1283         punpcklbw   xmm2,       xmm3                ; rows 1/2 interleaved
   1284 
   1285         movq        xmm5,       XMMWORD PTR [rsp+64]     ; row 4
   1286         punpcklbw   xmm3,       xmm4                ; rows 2/3 interleaved
   1287 
   1288         movq        xmm6,       XMMWORD PTR [rsp+80]     ; row 5
   1289         punpcklbw   xmm4,       xmm5                ; rows 3/4 interleaved
   1290 
   1291         movq        xmm7,       XMMWORD PTR [rsp+96]     ; row 6
   1292         punpcklbw   xmm5,       xmm6                ; rows 4/5 interleaved
   1293 
   1294         pmaddubsw   xmm1,       xmm0
   1295         pmaddubsw   xmm2,       xmm0
   1296 
   1297         pmaddubsw   xmm3,       xmm0
   1298         pmaddubsw   xmm4,       xmm0
   1299 
   1300         pmaddubsw   xmm5,       xmm0
   1301         punpcklbw   xmm6,       xmm7                ; rows 5/6 interleaved
   1302 
   1303         pmaddubsw   xmm6,       xmm0
   1304         paddw       xmm1,       [GLOBAL(rd)]
   1305 
   1306         paddw       xmm2,       [GLOBAL(rd)]
   1307         psraw       xmm1,       VP8_FILTER_SHIFT
   1308 
   1309         paddw       xmm3,       [GLOBAL(rd)]
   1310         psraw       xmm2,       VP8_FILTER_SHIFT
   1311 
   1312         paddw       xmm4,       [GLOBAL(rd)]
   1313         psraw       xmm3,       VP8_FILTER_SHIFT
   1314 
   1315         paddw       xmm5,       [GLOBAL(rd)]
   1316         psraw       xmm4,       VP8_FILTER_SHIFT
   1317 
   1318         paddw       xmm6,       [GLOBAL(rd)]
   1319         psraw       xmm5,       VP8_FILTER_SHIFT
   1320 
   1321         psraw       xmm6,       VP8_FILTER_SHIFT
   1322         packuswb    xmm1,       xmm1
   1323 
   1324         packuswb    xmm2,       xmm2
   1325         movq        [rdi],      xmm1                ; output row 0
   1326 
   1327         packuswb    xmm3,       xmm3
   1328         movq        [rdi+rdx],  xmm2                ; output row 1
   1329 
   1330         packuswb    xmm4,       xmm4
   1331         movq        xmm1,       XMMWORD PTR [rsp+112]    ; row 7
   1332 
   1333         lea         rdi,        [rdi + 2*rdx]
   1334         movq        xmm2,       XMMWORD PTR [rsp+128]    ; row 8
   1335 
   1336         packuswb    xmm5,       xmm5
   1337         movq        [rdi],      xmm3                ; output row 2
   1338 
   1339         packuswb    xmm6,       xmm6
   1340         movq        [rdi+rdx],  xmm4                ; output row 3
   1341 
   1342         lea         rdi,        [rdi + 2*rdx]
   1343         punpcklbw   xmm7,       xmm1                ; rows 6/7 interleaved
   1344 
   1345         movq        [rdi],      xmm5                ; output row 4
   1346         pmaddubsw   xmm7,       xmm0
   1347 
   1348         movq        [rdi+rdx],  xmm6                ; output row 5
   1349         punpcklbw   xmm1,       xmm2                ; rows 7/8 interleaved
   1350 
   1351         pmaddubsw   xmm1,       xmm0
   1352         paddw       xmm7,       [GLOBAL(rd)]
   1353 
   1354         psraw       xmm7,       VP8_FILTER_SHIFT
   1355         paddw       xmm1,       [GLOBAL(rd)]
   1356 
   1357         psraw       xmm1,       VP8_FILTER_SHIFT
   1358         packuswb    xmm7,       xmm7
   1359 
   1360         packuswb    xmm1,       xmm1
   1361         lea         rdi,        [rdi + 2*rdx]
   1362 
   1363         movq        [rdi],      xmm7                ; output row 6
   1364 
   1365         movq        [rdi+rdx],  xmm1                ; output row 7
   1366         lea         rsp,        [rsp + 144]         ; step over the whole row buffer
   1367 
   1368         jmp         .done8x8
   1369 
        ; Horizontal-only path (xoffset != 0, yoffset == 0). Entered with
        ; rdi = dst_ptr, rdx = dst_pitch, xmm0 = HFilter.
   1370 .b8x8_fp_only:
   1371         lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8*dst_pitch (loop end marker)
   1372 
        ; Four buffered rows are filtered per iteration (two iterations).
   1373 .next_row_fp:
   1374         movdqa      xmm1,       XMMWORD PTR [rsp]
   1375         movdqa      xmm3,       XMMWORD PTR [rsp+16]
   1376 
   1377         movdqa      xmm2,       xmm1
   1378         movdqa      xmm5,       XMMWORD PTR [rsp+32]
   1379 
   1380         psrldq      xmm2,       1                   ; first row shifted by one pixel
   1381         movdqa      xmm7,       XMMWORD PTR [rsp+48]
   1382 
   1383         movdqa      xmm4,       xmm3
   1384         psrldq      xmm4,       1                   ; second row shifted by one pixel
   1385 
   1386         movdqa      xmm6,       xmm5
   1387         psrldq      xmm6,       1                   ; third row shifted by one pixel
   1388 
   1389         punpcklbw   xmm1,       xmm2
   1390         pmaddubsw   xmm1,       xmm0
   1391 
   1392         punpcklbw   xmm3,       xmm4
   1393         pmaddubsw   xmm3,       xmm0
   1394 
   1395         punpcklbw   xmm5,       xmm6
   1396         pmaddubsw   xmm5,       xmm0
   1397 
   1398         movdqa      xmm2,       xmm7
   1399         psrldq      xmm2,       1                   ; fourth row shifted by one pixel
   1400 
   1401         punpcklbw   xmm7,       xmm2
   1402         pmaddubsw   xmm7,       xmm0
   1403 
   1404         paddw       xmm1,       [GLOBAL(rd)]
   1405         psraw       xmm1,       VP8_FILTER_SHIFT
   1406 
   1407         paddw       xmm3,       [GLOBAL(rd)]
   1408         psraw       xmm3,       VP8_FILTER_SHIFT
   1409 
   1410         paddw       xmm5,       [GLOBAL(rd)]
   1411         psraw       xmm5,       VP8_FILTER_SHIFT
   1412 
   1413         paddw       xmm7,       [GLOBAL(rd)]
   1414         psraw       xmm7,       VP8_FILTER_SHIFT
   1415 
   1416         packuswb    xmm1,       xmm1
   1417         packuswb    xmm3,       xmm3
   1418 
   1419         packuswb    xmm5,       xmm5
   1420         movq        [rdi],      xmm1
   1421 
   1422         packuswb    xmm7,       xmm7
   1423         movq        [rdi+rdx],  xmm3
   1424 
   1425         lea         rdi,        [rdi + 2*rdx]
   1426         movq        [rdi],      xmm5
   1427 
   1428         lea         rsp,        [rsp + 4*16]        ; four buffered rows consumed
   1429         movq        [rdi+rdx],  xmm7
   1430 
   1431         lea         rdi,        [rdi + 2*rdx]
   1432         cmp         rdi,        rcx
   1433 
   1434         jne         .next_row_fp
   1435 
   1436         lea         rsp,        [rsp + 16]          ; skip the unused ninth row: 128 + 16 = 144
   1437 
        ; Common exit: rsp is 144 bytes above the start of the row buffer.
   1438 .done8x8:
   1439     ;add rsp, 144
   1440     pop         rsp
   1441     ; begin epilog
   1442     pop         rdi
   1443     pop         rsi
   1444     RESTORE_GOT
   1445     RESTORE_XMM
   1446     UNSHADOW_ARGS
   1447     pop         rbp
   1448     ret
   1449 
   1450 SECTION_RODATA
   1451 align 16
        ; Byte-index tables, 16 bytes each. shuf1b, shuf2b and shuf3b hold
        ; the index pairs (i, i+5), (i+2, i+4) and (i+1, i+3) respectively,
        ; which matches the tap pairing in the names k0_k5, k2_k4 and k1_k3.
        ; Only shuf1b is preceded by "align 16"; shuf2b and shuf3b follow at
        ; 16-byte steps and so keep that alignment.
        ; NOTE(review): presumably pshufb control masks for the six-tap
        ; filter routines - confirm; the bilinear routines do not reference
        ; any of these tables.
   1452 shuf1b:
   1453     db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
   1454 shuf2b:
   1455     db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
   1456 shuf3b:
   1457     db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
   1458 
        ; NOTE(review): the "from1" suffix suggests these two masks are meant
        ; for a register that has already been permuted by shuf1b - confirm
        ; against the code that loads them.
   1459 align 16
   1460 shuf2bfrom1:
   1461     db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
   1462 align 16
   1463 shuf3bfrom1:
   1464     db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
   1465 
   1466 align 16
        ; Rounding constant: eight words of 0x40 (64). The filter code adds
        ; it with paddw immediately before "psraw ..., VP8_FILTER_SHIFT".
   1467 rd:
   1468     times 8 dw 0x40
   1469 
   1470 align 16
        ; Three coefficient tables of eight 16-byte rows each; every row is a
        ; single byte pair repeated eight times. k0_k5 occupies 8 * 16 = 128
        ; bytes, so k1_k3 and k2_k4 stay 16-byte aligned without their own
        ; "align" directive.
        ; NOTE(review): the names suggest each row pairs two taps of a
        ; six-tap filter (taps 0/5, 1/3, 2/4) and that the row number is the
        ; filter index - confirm against the six-tap routines. Rows tagged
        ; "placeholder" are presumably never selected.
   1471 k0_k5:
   1472     times 8 db 0, 0             ;placeholder
   1473     times 8 db 0, 0
   1474     times 8 db 2, 1
   1475     times 8 db 0, 0
   1476     times 8 db 3, 3
   1477     times 8 db 0, 0
   1478     times 8 db 1, 2
   1479     times 8 db 0, 0
   1480 k1_k3:
   1481     times 8 db  0,    0         ;placeholder
   1482     times 8 db  -6,  12
   1483     times 8 db -11,  36
   1484     times 8 db  -9,  50
   1485     times 8 db -16,  77
   1486     times 8 db  -6,  93
   1487     times 8 db  -8, 108
   1488     times 8 db  -1, 123
   1489 k2_k4:
   1490     times 8 db 128,    0        ;placeholder
   1491     times 8 db 123,   -1
   1492     times 8 db 108,   -8
   1493     times 8 db  93,   -6
   1494     times 8 db  77,  -16
   1495     times 8 db  50,   -9
   1496     times 8 db  36,  -11
   1497     times 8 db  12,   -6
   1498 align 16
        ; Bilinear filter table: eight 16-byte entries. Entry k is the byte
        ; pair (128 - 16*k, 16*k) repeated eight times, so the two weights of
        ; every entry sum to 128. The bilinear routines address it as
        ; table + (offset << 4) and use the entry as the second (signed-byte)
        ; operand of pmaddubsw.
        ; Entry 0 stores 128 as the byte 0x80, which that instruction reads
        ; as -128, so entry 0 does not act as an identity filter.
   1499 vp8_bilinear_filters_ssse3:
   1500     times 8 db 128, 0
   1501     times 8 db 112, 16
   1502     times 8 db 96,  32
   1503     times 8 db 80,  48
   1504     times 8 db 64,  64
   1505     times 8 db 48,  80
   1506     times 8 db 32,  96
   1507     times 8 db 16,  112
   1507     times 8 db 16,  112
   1508 
   1509