Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 %define BLOCK_HEIGHT_WIDTH 4        ; not referenced by any routine in the visible part of this file
     15 %define VP8_FILTER_WEIGHT 128       ; numerically 1 << VP8_FILTER_SHIFT; NOTE(review): presumably the sum of the filter taps - confirm
     16 %define VP8_FILTER_SHIFT  7         ; same value as the literal shift count the filter routines pass to psraw
     17 
     18 
     19 ;/************************************************************************************
     20 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
     21 ; input pixel array has output_height rows. This routine assumes that output_height is an
     22 ; even number. This function handles 8 pixels in the horizontal direction, calculating ONE
     23 ; row per iteration to take advantage of the 128-bit operations.
     24 ;
     25 ; This is an implementation of some of the SSE optimizations first seen in ffvp8
     26 ;
     27 ;*************************************************************************************/
     28 ;void vp8_filter_block1d8_h6_ssse3
     29 ;(
     30 ;    unsigned char  *src_ptr,
     31 ;    unsigned int    src_pixels_per_line,
     32 ;    unsigned char *output_ptr,
     33 ;    unsigned int    output_pitch,
     34 ;    unsigned int    output_height,
     35 ;    unsigned int    vp8_filter_index
     36 ;)
     37 global sym(vp8_filter_block1d8_h6_ssse3)
     38 sym(vp8_filter_block1d8_h6_ssse3):
     39     push        rbp
     40     mov         rbp, rsp
     41     SHADOW_ARGS_TO_STACK 6
     42     GET_GOT     rbx
     43     push        rsi
     44     push        rdi
     45     ; end prolog
     46 
     47     movsxd      rdx, DWORD PTR arg(5)   ;table index
     48     xor         rsi, rsi
     49     shl         rdx, 4
     50 
     51     movdqa      xmm7, [GLOBAL(rd)]
     52 
     53     lea         rax, [GLOBAL(k0_k5)]
     54     add         rax, rdx
     55     mov         rdi, arg(2)             ;output_ptr
     56 
     57     cmp         esi, DWORD PTR [rax]
     58     je          vp8_filter_block1d8_h4_ssse3
     59 
     60     movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
     61     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
     62     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
     63 
     64     mov         rsi, arg(0)             ;src_ptr
     65     movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
     66     movsxd      rcx, dword ptr arg(4)   ;output_height
     67 
     68     movsxd      rdx, dword ptr arg(3)   ;output_pitch
     69 
     70     sub         rdi, rdx
     71 ;xmm3 free
     72 filter_block1d8_h6_rowloop_ssse3:
     73     movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
     74 
     75     movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
     76 
     77     punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
     78 
     79     movdqa      xmm1,   xmm0
     80     pmaddubsw   xmm0,   xmm4
     81 
     82     movdqa      xmm2,   xmm1
     83     pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
     84 
     85     pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
     86     pmaddubsw   xmm1,   xmm5
     87 
     88     lea         rdi,    [rdi + rdx]
     89     pmaddubsw   xmm2,   xmm6
     90 
     91     lea         rsi,    [rsi + rax]
     92     dec         rcx
     93 
     94     paddsw      xmm0,   xmm1
     95     paddsw      xmm2,   xmm7
     96 
     97     paddsw      xmm0,   xmm2
     98 
     99     psraw       xmm0,   7
    100 
    101     packuswb    xmm0,   xmm0
    102 
    103     movq        MMWORD Ptr [rdi], xmm0
    104     jnz         filter_block1d8_h6_rowloop_ssse3
    105 
    106     ; begin epilog
    107     pop rdi
    108     pop rsi
    109     RESTORE_GOT
    110     UNSHADOW_ARGS
    111     pop         rbp
    112     ret
    113 
    114 vp8_filter_block1d8_h4_ssse3:
    115     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    116     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    117 
    118     movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
    119     movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
    120 
    121     mov         rsi, arg(0)             ;src_ptr
    122 
    123     movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    124     movsxd      rcx, dword ptr arg(4)   ;output_height
    125 
    126     movsxd      rdx, dword ptr arg(3)   ;output_pitch
    127 
    128     sub         rdi, rdx
    129 
    130 filter_block1d8_h4_rowloop_ssse3:
    131     movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
    132 
    133     movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
    134 
    135     punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
    136 
    137     movdqa      xmm2,   xmm0
    138     pshufb      xmm0,   xmm3
    139 
    140     pshufb      xmm2,   xmm4
    141     pmaddubsw   xmm0,   xmm5
    142 
    143     lea         rdi,    [rdi + rdx]
    144     pmaddubsw   xmm2,   xmm6
    145 
    146     lea         rsi,    [rsi + rax]
    147     dec         rcx
    148 
    149     paddsw      xmm0,   xmm7
    150 
    151     paddsw      xmm0,   xmm2
    152 
    153     psraw       xmm0,   7
    154 
    155     packuswb    xmm0,   xmm0
    156 
    157     movq        MMWORD Ptr [rdi], xmm0
    158 
    159     jnz         filter_block1d8_h4_rowloop_ssse3
    160 
    161     ; begin epilog
    162     pop rdi
    163     pop rsi
    164     RESTORE_GOT
    165     UNSHADOW_ARGS
    166     pop         rbp
    167     ret
    168 ;void vp8_filter_block1d16_h6_ssse3
    169 ;(
    170 ;    unsigned char  *src_ptr,
    171 ;    unsigned int    src_pixels_per_line,
    172 ;    unsigned char  *output_ptr,
    173 ;    unsigned int    output_pitch,
    174 ;    unsigned int    output_height,
    175 ;    unsigned int    vp8_filter_index
    176 ;)
    177 global sym(vp8_filter_block1d16_h6_ssse3)
    178 sym(vp8_filter_block1d16_h6_ssse3):
    179     push        rbp
    180     mov         rbp, rsp
    181     SHADOW_ARGS_TO_STACK 6
    182     SAVE_XMM
    183     GET_GOT     rbx
    184     push        rsi
    185     push        rdi
    186     ; end prolog
    187 
    188     movsxd      rdx, DWORD PTR arg(5)           ;table index
    189     xor         rsi, rsi
    190     shl         rdx, 4      ;
    191 
    192     lea         rax, [GLOBAL(k0_k5)]
    193     add         rax, rdx
    194 
    195     mov         rdi, arg(2)                     ;output_ptr
    196 
    197 ;;
    198 ;;    cmp         esi, DWORD PTR [rax]
    199 ;;    je          vp8_filter_block1d16_h4_ssse3
    200 
    201     mov         rsi, arg(0)                     ;src_ptr
    202 
    203     movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    204     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    205     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    206 
    207     movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
    208     movsxd      rcx, dword ptr arg(4)           ;output_height
    209     movsxd      rdx, dword ptr arg(3)           ;output_pitch
    210 
    211 filter_block1d16_h6_rowloop_ssse3:
    212     movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
    213 
    214     movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
    215 
    216     punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
    217 
    218     movdqa      xmm1,   xmm0
    219     pmaddubsw   xmm0,   xmm4
    220 
    221     movdqa      xmm2,   xmm1
    222     pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
    223 
    224     pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
    225     movq        xmm3,   MMWORD PTR [rsi +  6]
    226 
    227     pmaddubsw   xmm1,   xmm5
    228     movq        xmm7,   MMWORD PTR [rsi + 11]
    229 
    230     pmaddubsw   xmm2,   xmm6
    231     punpcklbw   xmm3,   xmm7
    232 
    233     paddsw      xmm0,   xmm1
    234     movdqa      xmm1,   xmm3
    235 
    236     pmaddubsw   xmm3,   xmm4
    237     paddsw      xmm0,   xmm2
    238 
    239     movdqa      xmm2,   xmm1
    240     paddsw      xmm0,   [GLOBAL(rd)]
    241 
    242     pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
    243     pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
    244 
    245     psraw       xmm0,   7
    246     pmaddubsw   xmm1,   xmm5
    247 
    248     pmaddubsw   xmm2,   xmm6
    249     packuswb    xmm0,   xmm0
    250 
    251     lea         rsi,    [rsi + rax]
    252     paddsw      xmm3,   xmm1
    253 
    254     paddsw      xmm3,   xmm2
    255 
    256     paddsw      xmm3,   [GLOBAL(rd)]
    257 
    258     psraw       xmm3,   7
    259 
    260     packuswb    xmm3,   xmm3
    261 
    262     punpcklqdq  xmm0,   xmm3
    263 
    264     movdqa      XMMWORD Ptr [rdi], xmm0
    265 
    266     lea         rdi,    [rdi + rdx]
    267     dec         rcx
    268     jnz         filter_block1d16_h6_rowloop_ssse3
    269 
    270     ; begin epilog
    271     pop rdi
    272     pop rsi
    273     RESTORE_GOT
    274     UNSHADOW_ARGS
    275     pop         rbp
    276     ret
    277 
    278 vp8_filter_block1d16_h4_ssse3:
    279     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    280     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    281 
    282     mov         rsi, arg(0)             ;src_ptr
    283     movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    284     movsxd      rcx, dword ptr arg(4)   ;output_height
    285     movsxd      rdx, dword ptr arg(3)   ;output_pitch
    286 
    287 filter_block1d16_h4_rowloop_ssse3:
    288     movdqu      xmm1,   XMMWORD PTR [rsi - 2]
    289 
    290     movdqa      xmm2, xmm1
    291     pshufb      xmm1, [GLOBAL(shuf2b)]
    292     pshufb      xmm2, [GLOBAL(shuf3b)]
    293     pmaddubsw   xmm1, xmm5
    294 
    295     movdqu      xmm3,   XMMWORD PTR [rsi + 6]
    296 
    297     pmaddubsw   xmm2, xmm6
    298     movdqa      xmm0, xmm3
    299     pshufb      xmm3, [GLOBAL(shuf3b)]
    300     pshufb      xmm0, [GLOBAL(shuf2b)]
    301 
    302     paddsw      xmm1, [GLOBAL(rd)]
    303     paddsw      xmm1, xmm2
    304 
    305     pmaddubsw   xmm0, xmm5
    306     pmaddubsw   xmm3, xmm6
    307 
    308     psraw       xmm1, 7
    309     packuswb    xmm1, xmm1
    310     lea         rsi,    [rsi + rax]
    311     paddsw      xmm3, xmm0
    312     paddsw      xmm3, [GLOBAL(rd)]
    313     psraw       xmm3, 7
    314     packuswb    xmm3, xmm3
    315 
    316     punpcklqdq  xmm1, xmm3
    317 
    318     movdqa      XMMWORD Ptr [rdi], xmm1
    319 
    320     add         rdi, rdx
    321     dec         rcx
    322     jnz         filter_block1d16_h4_rowloop_ssse3
    323 
    324 
    325     ; begin epilog
    326     pop rdi
    327     pop rsi
    328     RESTORE_GOT
    329     UNSHADOW_ARGS
    330     pop         rbp
    331     ret
    332 
    333 ;void vp8_filter_block1d4_h6_ssse3
    334 ;(
    335 ;    unsigned char  *src_ptr,
    336 ;    unsigned int    src_pixels_per_line,
    337 ;    unsigned char  *output_ptr,
    338 ;    unsigned int    output_pitch,
    339 ;    unsigned int    output_height,
    340 ;    unsigned int    vp8_filter_index
    341 ;)
    342 global sym(vp8_filter_block1d4_h6_ssse3)
    343 sym(vp8_filter_block1d4_h6_ssse3):
    344     push        rbp
    345     mov         rbp, rsp
    346     SHADOW_ARGS_TO_STACK 6
    347     GET_GOT     rbx
    348     push        rsi
    349     push        rdi
    350     ; end prolog
    351 
    352     movsxd      rdx, DWORD PTR arg(5)   ;table index
    353     xor         rsi, rsi
    354     shl         rdx, 4      ;
    355 
    356     lea         rax, [GLOBAL(k0_k5)]
    357     add         rax, rdx
    358     movdqa      xmm7, [GLOBAL(rd)]
    359 
    360     cmp         esi, DWORD PTR [rax]
    361     je          vp8_filter_block1d4_h4_ssse3
    362 
    363     movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    364     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    365     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    366 
    367     mov         rsi, arg(0)             ;src_ptr
    368     mov         rdi, arg(2)             ;output_ptr
    369     movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    370     movsxd      rcx, dword ptr arg(4)   ;output_height
    371 
    372     movsxd      rdx, dword ptr arg(3)   ;output_pitch
    373 
    374 ;xmm3 free
    375 filter_block1d4_h6_rowloop_ssse3:
    376     movdqu      xmm0,   XMMWORD PTR [rsi - 2]
    377 
    378     movdqa      xmm1, xmm0
    379     pshufb      xmm0, [GLOBAL(shuf1b)]
    380 
    381     movdqa      xmm2, xmm1
    382     pshufb      xmm1, [GLOBAL(shuf2b)]
    383     pmaddubsw   xmm0, xmm4
    384     pshufb      xmm2, [GLOBAL(shuf3b)]
    385     pmaddubsw   xmm1, xmm5
    386 
    387 ;--
    388     pmaddubsw   xmm2, xmm6
    389 
    390     lea         rsi,    [rsi + rax]
    391 ;--
    392     paddsw      xmm0, xmm1
    393     paddsw      xmm0, xmm7
    394     pxor        xmm1, xmm1
    395     paddsw      xmm0, xmm2
    396     psraw       xmm0, 7
    397     packuswb    xmm0, xmm0
    398 
    399     movd        DWORD PTR [rdi], xmm0
    400 
    401     add         rdi, rdx
    402     dec         rcx
    403     jnz         filter_block1d4_h6_rowloop_ssse3
    404 
    405     ; begin epilog
    406     pop rdi
    407     pop rsi
    408     RESTORE_GOT
    409     UNSHADOW_ARGS
    410     pop         rbp
    411     ret
    412 
    413 vp8_filter_block1d4_h4_ssse3:
    414     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    415     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    416     movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
    417     movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
    418 
    419     mov         rsi, arg(0)             ;src_ptr
    420     mov         rdi, arg(2)             ;output_ptr
    421     movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    422     movsxd      rcx, dword ptr arg(4)   ;output_height
    423 
    424     movsxd      rdx, dword ptr arg(3)   ;output_pitch
    425 
    426 filter_block1d4_h4_rowloop_ssse3:
    427     movdqu      xmm1,   XMMWORD PTR [rsi - 2]
    428 
    429     movdqa      xmm2, xmm1
    430     pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]
    431     pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]
    432     pmaddubsw   xmm1, xmm5
    433 
    434 ;--
    435     pmaddubsw   xmm2, xmm6
    436 
    437     lea         rsi,    [rsi + rax]
    438 ;--
    439     paddsw      xmm1, xmm7
    440     paddsw      xmm1, xmm2
    441     psraw       xmm1, 7
    442     packuswb    xmm1, xmm1
    443 
    444     movd        DWORD PTR [rdi], xmm1
    445 
    446     add         rdi, rdx
    447     dec         rcx
    448     jnz         filter_block1d4_h4_rowloop_ssse3
    449 
    450     ; begin epilog
    451     pop rdi
    452     pop rsi
    453     RESTORE_GOT
    454     UNSHADOW_ARGS
    455     pop         rbp
    456     ret
    457 
    458 
    459 
    460 ;void vp8_filter_block1d16_v6_ssse3
    461 ;(
    462 ;    unsigned char *src_ptr,
    463 ;    unsigned int   src_pitch,
    464 ;    unsigned char *output_ptr,
    465 ;    unsigned int   out_pitch,
    466 ;    unsigned int   output_height,
    467 ;    unsigned int   vp8_filter_index
    468 ;)
    469 global sym(vp8_filter_block1d16_v6_ssse3)
    470 sym(vp8_filter_block1d16_v6_ssse3):
    471     push        rbp
    472     mov         rbp, rsp
    473     SHADOW_ARGS_TO_STACK 6
    474     GET_GOT     rbx
    475     push        rsi
    476     push        rdi
    477     ; end prolog
    478 
    479     movsxd      rdx, DWORD PTR arg(5)   ;table index
    480     xor         rsi, rsi
    481     shl         rdx, 4      ;
    482 
    483     lea         rax, [GLOBAL(k0_k5)]
    484     add         rax, rdx
    485 
    486     cmp         esi, DWORD PTR [rax]
    487     je          vp8_filter_block1d16_v4_ssse3
    488 
    489     movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    490     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    491     movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    492 
    493     mov         rsi, arg(0)             ;src_ptr
    494     movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    495     mov         rdi, arg(2)             ;output_ptr
    496 
    497 %if ABI_IS_32BIT=0
    498     movsxd      r8, DWORD PTR arg(3)    ;out_pitch
    499 %endif
    500     mov         rax, rsi
    501     movsxd      rcx, DWORD PTR arg(4)   ;output_height
    502     add         rax, rdx
    503 
    504 
    505 vp8_filter_block1d16_v6_ssse3_loop:
    506     movq        xmm1, MMWORD PTR [rsi]                  ;A
    507     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    508     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    509     movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    510     movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
    511 
    512     punpcklbw   xmm2, xmm4                  ;B D
    513     punpcklbw   xmm3, xmm0                  ;C E
    514 
    515     movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
    516 
    517     pmaddubsw   xmm3, xmm6
    518     punpcklbw   xmm1, xmm0                  ;A F
    519     pmaddubsw   xmm2, xmm7
    520     pmaddubsw   xmm1, xmm5
    521 
    522     paddsw      xmm2, xmm3
    523     paddsw      xmm2, xmm1
    524     paddsw      xmm2, [GLOBAL(rd)]
    525     psraw       xmm2, 7
    526     packuswb    xmm2, xmm2
    527 
    528     movq        MMWORD PTR [rdi], xmm2          ;store the results
    529 
    530     movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
    531     movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
    532     movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
    533     movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
    534     movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
    535 
    536     punpcklbw   xmm2, xmm4                  ;B D
    537     punpcklbw   xmm3, xmm0                  ;C E
    538 
    539     movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
    540     pmaddubsw   xmm3, xmm6
    541     punpcklbw   xmm1, xmm0                  ;A F
    542     pmaddubsw   xmm2, xmm7
    543     pmaddubsw   xmm1, xmm5
    544 
    545     add         rsi,  rdx
    546     add         rax,  rdx
    547 ;--
    548 ;--
    549     paddsw      xmm2, xmm3
    550     paddsw      xmm2, xmm1
    551     paddsw      xmm2, [GLOBAL(rd)]
    552     psraw       xmm2, 7
    553     packuswb    xmm2, xmm2
    554 
    555     movq        MMWORD PTR [rdi+8], xmm2
    556 
    557 %if ABI_IS_32BIT
    558     add         rdi,        DWORD PTR arg(3) ;out_pitch
    559 %else
    560     add         rdi,        r8
    561 %endif
    562     dec         rcx
    563     jnz         vp8_filter_block1d16_v6_ssse3_loop
    564 
    565     ; begin epilog
    566     pop rdi
    567     pop rsi
    568     RESTORE_GOT
    569     UNSHADOW_ARGS
    570     pop         rbp
    571     ret
    572 
    573 vp8_filter_block1d16_v4_ssse3:
    574     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    575     movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    576 
    577     mov         rsi, arg(0)             ;src_ptr
    578     movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    579     mov         rdi, arg(2)             ;output_ptr
    580 
    581 %if ABI_IS_32BIT=0
    582     movsxd      r8, DWORD PTR arg(3)    ;out_pitch
    583 %endif
    584     mov         rax, rsi
    585     movsxd      rcx, DWORD PTR arg(4)   ;output_height
    586     add         rax, rdx
    587 
    588 vp8_filter_block1d16_v4_ssse3_loop:
    589     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    590     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    591     movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    592     movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
    593 
    594     punpcklbw   xmm2, xmm4                  ;B D
    595     punpcklbw   xmm3, xmm0                  ;C E
    596 
    597     pmaddubsw   xmm3, xmm6
    598     pmaddubsw   xmm2, xmm7
    599     movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
    600     movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
    601     movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
    602     movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
    603 
    604     paddsw      xmm2, [GLOBAL(rd)]
    605     paddsw      xmm2, xmm3
    606     psraw       xmm2, 7
    607     packuswb    xmm2, xmm2
    608 
    609     punpcklbw   xmm5, xmm4                  ;B D
    610     punpcklbw   xmm1, xmm0                  ;C E
    611 
    612     pmaddubsw   xmm1, xmm6
    613     pmaddubsw   xmm5, xmm7
    614 
    615     movdqa      xmm4, [GLOBAL(rd)]
    616     add         rsi,  rdx
    617     add         rax,  rdx
    618 ;--
    619 ;--
    620     paddsw      xmm5, xmm1
    621     paddsw      xmm5, xmm4
    622     psraw       xmm5, 7
    623     packuswb    xmm5, xmm5
    624 
    625     punpcklqdq  xmm2, xmm5
    626 
    627     movdqa       XMMWORD PTR [rdi], xmm2
    628 
    629 %if ABI_IS_32BIT
    630     add         rdi,        DWORD PTR arg(3) ;out_pitch
    631 %else
    632     add         rdi,        r8
    633 %endif
    634     dec         rcx
    635     jnz         vp8_filter_block1d16_v4_ssse3_loop
    636 
    637     ; begin epilog
    638     pop rdi
    639     pop rsi
    640     RESTORE_GOT
    641     UNSHADOW_ARGS
    642     pop         rbp
    643     ret
    644 
    645 ;void vp8_filter_block1d8_v6_ssse3
    646 ;(
    647 ;    unsigned char *src_ptr,
    648 ;    unsigned int   src_pitch,
    649 ;    unsigned char *output_ptr,
    650 ;    unsigned int   out_pitch,
    651 ;    unsigned int   output_height,
    652 ;    unsigned int   vp8_filter_index
    653 ;)
    654 global sym(vp8_filter_block1d8_v6_ssse3)
    655 sym(vp8_filter_block1d8_v6_ssse3):
    656     push        rbp
    657     mov         rbp, rsp
    658     SHADOW_ARGS_TO_STACK 6
    659     GET_GOT     rbx
    660     push        rsi
    661     push        rdi
    662     ; end prolog
    663 
    664     movsxd      rdx, DWORD PTR arg(5)   ;table index
    665     xor         rsi, rsi
    666     shl         rdx, 4      ;
    667 
    668     lea         rax, [GLOBAL(k0_k5)]
    669     add         rax, rdx
    670 
    671     movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    672     mov         rdi, arg(2)             ;output_ptr
    673 %if ABI_IS_32BIT=0
    674     movsxd      r8, DWORD PTR arg(3)    ; out_pitch
    675 %endif
    676     movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
    677 
    678     cmp         esi, DWORD PTR [rax]
    679     je          vp8_filter_block1d8_v4_ssse3
    680 
    681     movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    682     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    683     movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    684 
    685     mov         rsi, arg(0)             ;src_ptr
    686 
    687     mov         rax, rsi
    688     add         rax, rdx
    689 
    690 vp8_filter_block1d8_v6_ssse3_loop:
    691     movq        xmm1, MMWORD PTR [rsi]                  ;A
    692     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    693     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    694     movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    695     movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
    696 
    697     punpcklbw   xmm2, xmm4                  ;B D
    698     punpcklbw   xmm3, xmm0                  ;C E
    699 
    700     movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
    701     movdqa      xmm4, [GLOBAL(rd)]
    702 
    703     pmaddubsw   xmm3, xmm6
    704     punpcklbw   xmm1, xmm0                  ;A F
    705     pmaddubsw   xmm2, xmm7
    706     pmaddubsw   xmm1, xmm5
    707     add         rsi,  rdx
    708     add         rax,  rdx
    709 ;--
    710 ;--
    711     paddsw      xmm2, xmm3
    712     paddsw      xmm2, xmm1
    713     paddsw      xmm2, xmm4
    714     psraw       xmm2, 7
    715     packuswb    xmm2, xmm2
    716 
    717     movq        MMWORD PTR [rdi], xmm2
    718 
    719 %if ABI_IS_32BIT
    720     add         rdi,        DWORD PTR arg(3) ;[out_pitch]
    721 %else
    722     add         rdi,        r8
    723 %endif
    724     dec         rcx
    725     jnz         vp8_filter_block1d8_v6_ssse3_loop
    726 
    727     ; begin epilog
    728     pop rdi
    729     pop rsi
    730     RESTORE_GOT
    731     UNSHADOW_ARGS
    732     pop         rbp
    733     ret
    734 
    735 vp8_filter_block1d8_v4_ssse3:
    736     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    737     movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    738     movdqa      xmm5, [GLOBAL(rd)]
    739 
    740     mov         rsi, arg(0)             ;src_ptr
    741 
    742     mov         rax, rsi
    743     add         rax, rdx
    744 
    745 vp8_filter_block1d8_v4_ssse3_loop:
    746     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    747     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    748     movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    749     movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
    750 
    751     punpcklbw   xmm2, xmm4                  ;B D
    752     punpcklbw   xmm3, xmm0                  ;C E
    753 
    754     pmaddubsw   xmm3, xmm6
    755     pmaddubsw   xmm2, xmm7
    756     add         rsi,  rdx
    757     add         rax,  rdx
    758 ;--
    759 ;--
    760     paddsw      xmm2, xmm3
    761     paddsw      xmm2, xmm5
    762     psraw       xmm2, 7
    763     packuswb    xmm2, xmm2
    764 
    765     movq        MMWORD PTR [rdi], xmm2
    766 
    767 %if ABI_IS_32BIT
    768     add         rdi,        DWORD PTR arg(3) ;[out_pitch]
    769 %else
    770     add         rdi,        r8
    771 %endif
    772     dec         rcx
    773     jnz         vp8_filter_block1d8_v4_ssse3_loop
    774 
    775     ; begin epilog
    776     pop rdi
    777     pop rsi
    778     RESTORE_GOT
    779     UNSHADOW_ARGS
    780     pop         rbp
    781     ret
    782 ;void vp8_filter_block1d4_v6_ssse3
    783 ;(
    784 ;    unsigned char *src_ptr,
    785 ;    unsigned int   src_pitch,
    786 ;    unsigned char *output_ptr,
    787 ;    unsigned int   out_pitch,
    788 ;    unsigned int   output_height,
    789 ;    unsigned int   vp8_filter_index
    790 ;)
    791 global sym(vp8_filter_block1d4_v6_ssse3)
    792 sym(vp8_filter_block1d4_v6_ssse3):
    793     push        rbp
    794     mov         rbp, rsp
    795     SHADOW_ARGS_TO_STACK 6
    796     GET_GOT     rbx
    797     push        rsi
    798     push        rdi
    799     ; end prolog
    800 
    801     movsxd      rdx, DWORD PTR arg(5)   ;table index
    802     xor         rsi, rsi
    803     shl         rdx, 4      ;
    804 
    805     lea         rax, [GLOBAL(k0_k5)]
    806     add         rax, rdx
    807 
    808     movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    809     mov         rdi, arg(2)             ;output_ptr
    810 %if ABI_IS_32BIT=0
    811     movsxd      r8, DWORD PTR arg(3)    ; out_pitch
    812 %endif
    813     movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
    814 
    815     cmp         esi, DWORD PTR [rax]
    816     je          vp8_filter_block1d4_v4_ssse3
    817 
    818     movq        mm5, MMWORD PTR [rax]         ;k0_k5
    819     movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
    820     movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
    821 
    822     mov         rsi, arg(0)             ;src_ptr
    823 
    824     mov         rax, rsi
    825     add         rax, rdx
    826 
    827 vp8_filter_block1d4_v6_ssse3_loop:
    828     movd        mm1, DWORD PTR [rsi]                  ;A
    829     movd        mm2, DWORD PTR [rsi + rdx]            ;B
    830     movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
    831     movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
    832     movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
    833 
    834     punpcklbw   mm2, mm4                  ;B D
    835     punpcklbw   mm3, mm0                  ;C E
    836 
    837     movd        mm0, DWORD PTR [rax + rdx * 4]        ;F
    838 
    839     movq        mm4, [GLOBAL(rd)]
    840 
    841     pmaddubsw   mm3, mm6
    842     punpcklbw   mm1, mm0                  ;A F
    843     pmaddubsw   mm2, mm7
    844     pmaddubsw   mm1, mm5
    845     add         rsi,  rdx
    846     add         rax,  rdx
    847 ;--
    848 ;--
    849     paddsw      mm2, mm3
    850     paddsw      mm2, mm1
    851     paddsw      mm2, mm4
    852     psraw       mm2, 7
    853     packuswb    mm2, mm2
    854 
    855     movd        DWORD PTR [rdi], mm2
    856 
    857 %if ABI_IS_32BIT
    858     add         rdi,        DWORD PTR arg(3) ;[out_pitch]
    859 %else
    860     add         rdi,        r8
    861 %endif
    862     dec         rcx
    863     jnz         vp8_filter_block1d4_v6_ssse3_loop
    864 
    865     ; begin epilog
    866     pop rdi
    867     pop rsi
    868     RESTORE_GOT
    869     UNSHADOW_ARGS
    870     pop         rbp
    871     ret
    872 
    873 vp8_filter_block1d4_v4_ssse3:
    874     movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
    875     movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
    876     movq        mm5, MMWORD PTR [GLOBAL(rd)]
    877 
    878     mov         rsi, arg(0)             ;src_ptr
    879 
    880     mov         rax, rsi
    881     add         rax, rdx
    882 
    883 vp8_filter_block1d4_v4_ssse3_loop:
    884     movd        mm2, DWORD PTR [rsi + rdx]            ;B
    885     movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
    886     movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
    887     movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
    888 
    889     punpcklbw   mm2, mm4                  ;B D
    890     punpcklbw   mm3, mm0                  ;C E
    891 
    892     pmaddubsw   mm3, mm6
    893     pmaddubsw   mm2, mm7
    894     add         rsi,  rdx
    895     add         rax,  rdx
    896 ;--
    897 ;--
    898     paddsw      mm2, mm3
    899     paddsw      mm2, mm5
    900     psraw       mm2, 7
    901     packuswb    mm2, mm2
    902 
    903     movd        DWORD PTR [rdi], mm2
    904 
    905 %if ABI_IS_32BIT
    906     add         rdi,        DWORD PTR arg(3) ;[out_pitch]
    907 %else
    908     add         rdi,        r8
    909 %endif
    910     dec         rcx
    911     jnz         vp8_filter_block1d4_v4_ssse3_loop
    912 
    913     ; begin epilog
    914     pop rdi
    915     pop rsi
    916     RESTORE_GOT
    917     UNSHADOW_ARGS
    918     pop         rbp
    919     ret
    920 
    921 ;void vp8_bilinear_predict16x16_ssse3
    922 ;(
    923 ;    unsigned char  *src_ptr,
    924 ;    int   src_pixels_per_line,
    925 ;    int  xoffset,
    926 ;    int  yoffset,
    927 ;    unsigned char *dst_ptr,
    928 ;    int dst_pitch
    929 ;)
        ; Bilinear prediction of a 16x16 block (16 output rows of 16 bytes).
        ; xoffset / yoffset index vp8_bilinear_filters_ssse3, 16 bytes per entry
        ; (hence the shl by 4). Three code paths:
        ;   both offsets non-zero : horizontal pass feeding a vertical pass
        ;   xoffset == 0          : vertical pass only    (b16x16_sp_only)
        ;   yoffset == 0          : horizontal pass only  (b16x16_fp_only)
        ; Each pass computes (a*f0 + b*f1 + rd) >> VP8_FILTER_SHIFT per pixel.
        ; All paths stop when rdi reaches rcx = dst_ptr + 16*dst_pitch.
        ; Output rows are written with movdqa, so every destination row must be
        ; 16-byte aligned.
        ; NOTE(review): when xoffset == 0 the b16x16_sp_only path uses yoffset
        ; unchecked; yoffset == 0 then selects the (128, 0) entry, and pmaddubsw
        ; reads its source bytes as signed, so 128 acts as -128. Presumably callers
        ; never pass both offsets as 0 -- confirm.
    930 global sym(vp8_bilinear_predict16x16_ssse3)
    931 sym(vp8_bilinear_predict16x16_ssse3):
    932     push        rbp
    933     mov         rbp, rsp
    934     SHADOW_ARGS_TO_STACK 6
    935     SAVE_XMM
    936     GET_GOT     rbx
    937     push        rsi
    938     push        rdi
    939     ; end prolog
    940 
        ; rcx = base of the bilinear filter table until it is reused as the
        ; end-of-destination pointer
    941         lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]
    942         movsxd      rax,        dword ptr arg(2)    ; xoffset
    943 
    944         cmp         rax,        0                   ; skip first_pass filter if xoffset=0
    945         je          b16x16_sp_only
    946 
    947         shl         rax,        4
    948         lea         rax,        [rax + rcx]         ; HFilter
    949 
    950         mov         rdi,        arg(4)              ; dst_ptr
    951         mov         rsi,        arg(0)              ; src_ptr
    952         movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
    953 
        ; xmm1 = horizontal filter pair, repeated across all 16 bytes
    954         movdqa      xmm1,       [rax]
    955 
    956         movsxd      rax,        dword ptr arg(3)    ; yoffset
    957 
    958         cmp         rax,        0                   ; skip second_pass filter if yoffset=0
    959         je          b16x16_fp_only
    960 
        ; ---- both passes ----
    961         shl         rax,        4
    962         lea         rax,        [rax + rcx]         ; VFilter
    963 
        ; rcx = dst_ptr + 16*dst_pitch (loop end marker); rdx is then reused as the
        ; source stride
    964         lea         rcx,        [rdi+rdx*8]
    965         lea         rcx,        [rcx+rdx*8]
    966         movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line
    967 
        ; xmm2 = vertical filter pair, repeated across all 16 bytes
    968         movdqa      xmm2,       [rax]
    969 
        ; 64-bit builds keep dst_pitch in r8; 32-bit builds re-read it from the stack
    970 %if ABI_IS_32BIT=0
    971         movsxd      r8,         dword ptr arg(5)    ; dst_pitch
    972 %endif
        ; Horizontal pass over source row 0; the result is kept in xmm7 as the
        ; "previous row" for the first vertical step.
    973         movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
    974         movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
    975 
    976         punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    977         movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
    978 
    979         movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
    980 
    981         lea         rsi,        [rsi + rdx]         ; next line
    982 
    983         pmaddubsw   xmm3,       xmm1                ; 8 words: filtered sums for pixels 00..07
    984 
    985         punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
    986         pmaddubsw   xmm4,       xmm1                ; 8 words: filtered sums for pixels 08..15
    987 
    988         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    989         psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128
    990 
    991         paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
    992         psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
    993 
    994         movdqa      xmm7,       xmm3
    995         packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    996 
        ; Per iteration: horizontal pass on the next source row (-> xmm6), then
        ; vertical pass between the previous row (xmm7) and xmm6; one output row.
    997 .next_row:
    998         movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
    999         movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
   1000 
   1001         punpcklbw   xmm6,       xmm5
   1002         movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
   1003 
   1004         movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
   1005         lea         rsi,        [rsi + rdx]         ; next line
   1006 
   1007         pmaddubsw   xmm6,       xmm1
   1008 
   1009         punpcklbw   xmm4,       xmm5
   1010         pmaddubsw   xmm4,       xmm1
   1011 
   1012         paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
   1013         psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128
   1014 
   1015         paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
   1016         psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
   1017 
        ; xmm6 = horizontally filtered current row (16 bytes); xmm5 = copy of the
        ; previous row
   1018         packuswb    xmm6,       xmm4
   1019         movdqa      xmm5,       xmm7
   1020 
        ; interleave previous/current row bytes (low half in xmm5, high half in
        ; xmm7) and apply the vertical filter
   1021         punpcklbw   xmm5,       xmm6
   1022         pmaddubsw   xmm5,       xmm2
   1023 
   1024         punpckhbw   xmm7,       xmm6
   1025         pmaddubsw   xmm7,       xmm2
   1026 
   1027         paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
   1028         psraw       xmm5,       VP8_FILTER_SHIFT    ; xmm5 /= 128
   1029 
   1030         paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
   1031         psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128
   1032 
        ; xmm5 = finished output row; the current row becomes the previous row
   1033         packuswb    xmm5,       xmm7
   1034         movdqa      xmm7,       xmm6
   1035 
   1036         movdqa      [rdi],      xmm5                ; store the results in the destination
   1037 %if ABI_IS_32BIT
   1038         add         rdi,        DWORD PTR arg(5)    ; dst_pitch
   1039 %else
   1040         add         rdi,        r8
   1041 %endif
   1042 
   1043         cmp         rdi,        rcx
   1044         jne         .next_row
   1045 
   1046         jmp         done
   1047 
        ; ---- vertical pass only (xoffset == 0) ----
        ; On entry rcx still holds the filter table base. Here rax ends up as the
        ; source stride and rdx as dst_pitch. Two output rows per iteration.
   1048 b16x16_sp_only:
   1049         movsxd      rax,        dword ptr arg(3)    ; yoffset
   1050         shl         rax,        4
   1051         lea         rax,        [rax + rcx]         ; VFilter
   1052 
   1053         mov         rdi,        arg(4)              ; dst_ptr
   1054         mov         rsi,        arg(0)              ; src_ptr
   1055         movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
   1056 
   1057         movdqa      xmm1,       [rax]               ; VFilter
   1058 
        ; rcx = dst_ptr + 16*dst_pitch (loop end marker)
   1059         lea         rcx,        [rdi+rdx*8]
   1060         lea         rcx,        [rcx+rdx*8]
   1061         movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
   1062 
        ; preload source row 0 as two 8-byte halves (xmm4 = bytes 0..7,
        ; xmm2 = bytes 8..15)
   1063         ; get the first horizontal line done
   1064         movq        xmm4,       [rsi]               ; load row 0
   1065         movq        xmm2,       [rsi + 8]           ; load row 0
   1066 
   1067         lea         rsi,        [rsi + rax]         ; next line
        ; Loop invariant: xmm4 / xmm2 hold the low / high halves of the row just
        ; above [rsi].
   1068 .next_row:
   1069         movq        xmm3,       [rsi]               ; load row + 1
   1070         movq        xmm5,       [rsi + 8]           ; load row + 1
   1071 
        ; interleave row and row+1 for the first output row
   1072         punpcklbw   xmm4,       xmm3
   1073         punpcklbw   xmm2,       xmm5
   1074 
   1075         pmaddubsw   xmm4,       xmm1
   1076         movq        xmm7,       [rsi + rax]         ; load row + 2
   1077 
   1078         pmaddubsw   xmm2,       xmm1
   1079         movq        xmm6,       [rsi + rax + 8]     ; load row + 2
   1080 
        ; interleave row+1 and row+2 for the second output row
   1081         punpcklbw   xmm3,       xmm7
   1082         punpcklbw   xmm5,       xmm6
   1083 
   1084         pmaddubsw   xmm3,       xmm1
   1085         paddw       xmm4,       [GLOBAL(rd)]
   1086 
   1087         pmaddubsw   xmm5,       xmm1
   1088         paddw       xmm2,       [GLOBAL(rd)]
   1089 
   1090         psraw       xmm4,       VP8_FILTER_SHIFT
   1091         psraw       xmm2,       VP8_FILTER_SHIFT
   1092 
   1093         packuswb    xmm4,       xmm2
   1094         paddw       xmm3,       [GLOBAL(rd)]
   1095 
   1096         movdqa      [rdi],      xmm4                ; store row 0
   1097         paddw       xmm5,       [GLOBAL(rd)]
   1098 
   1099         psraw       xmm3,       VP8_FILTER_SHIFT
   1100         psraw       xmm5,       VP8_FILTER_SHIFT
   1101 
        ; row+2 (xmm7 / xmm6) is carried over as the starting row of the next
        ; iteration
   1102         packuswb    xmm3,       xmm5
   1103         movdqa      xmm4,       xmm7
   1104 
   1105         movdqa      [rdi + rdx],xmm3                ; store row 1
   1106         lea         rsi,        [rsi + 2*rax]
   1107 
   1108         movdqa      xmm2,       xmm6
   1109         lea         rdi,        [rdi + 2*rdx]
   1110 
   1111         cmp         rdi,        rcx
   1112         jne         .next_row
   1113 
   1114         jmp         done
   1115 
        ; ---- horizontal pass only (yoffset == 0) ----
        ; On entry: rdi = dst_ptr, rsi = src_ptr, rdx = dst_pitch, xmm1 = HFilter.
        ; rax becomes the source stride. Two output rows per iteration.
   1116 b16x16_fp_only:
        ; rcx = dst_ptr + 16*dst_pitch (loop end marker)
   1117         lea         rcx,        [rdi+rdx*8]
   1118         lea         rcx,        [rcx+rdx*8]
   1119         movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
   1120 
   1121 .next_row:
        ; first row of the pair: xmm2 = pixels 0..7, xmm3 = pixels 8..15
   1122         movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
   1123         movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
   1124 
   1125         punpcklbw   xmm2,       xmm4
   1126         movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15
   1127 
   1128         pmaddubsw   xmm2,       xmm1
   1129         movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16
   1130 
   1131         lea         rsi,        [rsi + rax]         ; next line
   1132         punpcklbw   xmm3,       xmm4
   1133 
        ; second row of the pair is loaded while the first is still in flight:
        ; xmm5 = pixels 0..7, xmm6 = pixels 8..15
   1134         pmaddubsw   xmm3,       xmm1
   1135         movq        xmm5,       [rsi]
   1136 
   1137         paddw       xmm2,       [GLOBAL(rd)]
   1138         movq        xmm7,       [rsi+1]
   1139 
   1140         movq        xmm6,       [rsi+8]
   1141         psraw       xmm2,       VP8_FILTER_SHIFT
   1142 
   1143         punpcklbw   xmm5,       xmm7
   1144         movq        xmm7,       [rsi+9]
   1145 
   1146         paddw       xmm3,       [GLOBAL(rd)]
   1147         pmaddubsw   xmm5,       xmm1
   1148 
   1149         psraw       xmm3,       VP8_FILTER_SHIFT
   1150         punpcklbw   xmm6,       xmm7
   1151 
   1152         packuswb    xmm2,       xmm3
   1153         pmaddubsw   xmm6,       xmm1
   1154 
   1155         movdqa      [rdi],      xmm2                ; store the results in the destination
   1156         paddw       xmm5,       [GLOBAL(rd)]
   1157 
   1158         lea         rdi,        [rdi + rdx]         ; dst_pitch
   1159         psraw       xmm5,       VP8_FILTER_SHIFT
   1160 
   1161         paddw       xmm6,       [GLOBAL(rd)]
   1162         psraw       xmm6,       VP8_FILTER_SHIFT
   1163 
   1164         packuswb    xmm5,       xmm6
   1165         lea         rsi,        [rsi + rax]         ; next line
   1166 
   1167         movdqa      [rdi],      xmm5                ; store the results in the destination
   1168         lea         rdi,        [rdi + rdx]         ; dst_pitch
   1169 
   1170         cmp         rdi,        rcx
   1171 
   1172         jne         .next_row
   1173 
        ; common exit for all three paths
   1174 done:
   1175     ; begin epilog
   1176     pop         rdi
   1177     pop         rsi
   1178     RESTORE_GOT
   1179     RESTORE_XMM
   1180     UNSHADOW_ARGS
   1181     pop         rbp
   1182     ret
   1183 
   1184 ;void vp8_bilinear_predict8x8_ssse3
   1185 ;(
   1186 ;    unsigned char  *src_ptr,
   1187 ;    int   src_pixels_per_line,
   1188 ;    int  xoffset,
   1189 ;    int  yoffset,
   1190 ;    unsigned char *dst_ptr,
   1191 ;    int dst_pitch
   1192 ;)
        ; Bilinear prediction of an 8x8 block (8 output rows of 8 bytes).
        ; Nine source rows are first copied, 16 bytes each, into a 16-byte-aligned
        ; 144-byte stack area; the filter passes then read from that copy.
        ; xoffset / yoffset index vp8_bilinear_filters_ssse3 (16 bytes per entry).
        ; Three code paths:
        ;   both offsets non-zero : horizontal pass feeding a vertical pass
        ;   xoffset == 0          : vertical pass only    (b8x8_sp_only)
        ;   yoffset == 0          : horizontal pass only  (b8x8_fp_only)
        ; rsp itself is used as the read cursor over the stack copy; every path
        ; advances it by exactly 144 bytes in total before reaching done8x8, where
        ; "pop rsp" reloads the stack pointer.
        ; Each source row is loaded with a 16-byte movdqu although only bytes 0..8
        ; are consumed, so 16 bytes must be readable at each of the 9 rows.
        ; NOTE(review): when xoffset == 0 the b8x8_sp_only path uses yoffset
        ; unchecked; yoffset == 0 then selects the (128, 0) entry, and pmaddubsw
        ; reads its source bytes as signed, so 128 acts as -128. Presumably callers
        ; never pass both offsets as 0 -- confirm.
   1193 global sym(vp8_bilinear_predict8x8_ssse3)
   1194 sym(vp8_bilinear_predict8x8_ssse3):
   1195     push        rbp
   1196     mov         rbp, rsp
   1197     SHADOW_ARGS_TO_STACK 6
   1198     SAVE_XMM
   1199     GET_GOT     rbx
   1200     push        rsi
   1201     push        rdi
   1202     ; end prolog
   1203 
        ; NOTE(review): ALIGN_STACK presumably leaves the pre-alignment rsp saved at
        ; the new top of stack, which the "pop rsp" at done8x8 relies on -- confirm
        ; against the macro definition.
   1204     ALIGN_STACK 16, rax
   1205     sub         rsp, 144                         ; reserve 144 bytes
   1206 
        ; rcx = base of the bilinear filter table until it is reused as the
        ; end-of-destination pointer
   1207         lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]
   1208 
   1209         mov         rsi,        arg(0) ;src_ptr
   1210         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
   1211 
   1212     ;Read 9-line unaligned data in and put them on stack. This gives a big
   1213     ;performance boost.
        ; rax = 3 * source stride, used to step rsi three rows at a time
   1214         movdqu      xmm0,       [rsi]
   1215         lea         rax,        [rdx + rdx*2]
   1216         movdqu      xmm1,       [rsi+rdx]
   1217         movdqu      xmm2,       [rsi+rdx*2]
   1218         add         rsi,        rax
   1219         movdqu      xmm3,       [rsi]
   1220         movdqu      xmm4,       [rsi+rdx]
   1221         movdqu      xmm5,       [rsi+rdx*2]
   1222         add         rsi,        rax
   1223         movdqu      xmm6,       [rsi]
   1224         movdqu      xmm7,       [rsi+rdx]
   1225 
        ; row 0 is spilled first so xmm0 can be reused to fetch row 8
   1226         movdqa      XMMWORD PTR [rsp],            xmm0
   1227 
   1228         movdqu      xmm0,       [rsi+rdx*2]
   1229 
        ; rows 1..8 go to [rsp+16] .. [rsp+128]
   1230         movdqa      XMMWORD PTR [rsp+16],         xmm1
   1231         movdqa      XMMWORD PTR [rsp+32],         xmm2
   1232         movdqa      XMMWORD PTR [rsp+48],         xmm3
   1233         movdqa      XMMWORD PTR [rsp+64],         xmm4
   1234         movdqa      XMMWORD PTR [rsp+80],         xmm5
   1235         movdqa      XMMWORD PTR [rsp+96],         xmm6
   1236         movdqa      XMMWORD PTR [rsp+112],        xmm7
   1237         movdqa      XMMWORD PTR [rsp+128],        xmm0
   1238 
   1239         movsxd      rax,        dword ptr arg(2)    ; xoffset
   1240         cmp         rax,        0                   ; skip first_pass filter if xoffset=0
   1241         je          b8x8_sp_only
   1242 
   1243         shl         rax,        4
   1244         add         rax,        rcx                 ; HFilter
   1245 
   1246         mov         rdi,        arg(4)              ; dst_ptr
   1247         movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
   1248 
        ; xmm0 = horizontal filter pair, repeated across all 16 bytes
   1249         movdqa      xmm0,       [rax]
   1250 
   1251         movsxd      rax,        dword ptr arg(3)    ; yoffset
   1252         cmp         rax,        0                   ; skip second_pass filter if yoffset=0
   1253         je          b8x8_fp_only
   1254 
        ; ---- both passes ----
   1255         shl         rax,        4
   1256         lea         rax,        [rax + rcx]         ; VFilter
   1257 
        ; rcx = dst_ptr + 8*dst_pitch (loop end marker)
   1258         lea         rcx,        [rdi+rdx*8]
   1259 
        ; xmm1 = vertical filter pair, repeated across all 16 bytes
   1260         movdqa      xmm1,       [rax]
   1261 
   1262         ; get the first horizontal line done
   1263         movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
   1264         movdqa      xmm5,       xmm3                ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
   1265 
   1266         psrldq      xmm5,       1
   1267         lea         rsp,        [rsp + 16]          ; next line
   1268 
   1269         punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
   1270         pmaddubsw   xmm3,       xmm0                ; 8 words: filtered sums for pixels 00..07
   1271 
   1272         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1273         psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128
   1274 
        ; xmm7 = horizontally filtered row 0 (8 bytes, duplicated in both halves)
   1275         movdqa      xmm7,       xmm3
   1276         packuswb    xmm7,       xmm7                ; 00 01 02 03 04 05 06 07 00 01 02 03 04 05 06 07
   1277 
        ; Per iteration: horizontal pass on the next stacked row (-> xmm6), then
        ; vertical pass between the previous row (xmm7) and xmm6; one output row.
        ; Eight iterations advance rsp by 128, which with the 16 above makes 144.
   1278 .next_row:
   1279         movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
   1280         lea         rsp,        [rsp + 16]          ; next line
   1281 
   1282         movdqa      xmm5,       xmm6
   1283 
   1284         psrldq      xmm5,       1
   1285 
   1286         punpcklbw   xmm6,       xmm5
   1287         pmaddubsw   xmm6,       xmm0
   1288 
   1289         paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
   1290         psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128
   1291 
   1292         packuswb    xmm6,       xmm6
   1293 
        ; interleave previous/current row bytes and apply the vertical filter
   1294         punpcklbw   xmm7,       xmm6
   1295         pmaddubsw   xmm7,       xmm1
   1296 
   1297         paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
   1298         psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128
   1299 
   1300         packuswb    xmm7,       xmm7
   1301 
   1302         movq        [rdi],      xmm7                ; store the results in the destination
   1303         lea         rdi,        [rdi + rdx]
   1304 
        ; the current row becomes the previous row
   1305         movdqa      xmm7,       xmm6
   1306 
   1307         cmp         rdi,        rcx
   1308         jne         .next_row
   1309 
   1310         jmp         done8x8
   1311 
        ; ---- vertical pass only (xoffset == 0) ----
        ; Fully unrolled: output row i is filtered from stacked rows i and i+1.
        ; On entry rcx still holds the filter table base.
   1312 b8x8_sp_only:
   1313         movsxd      rax,        dword ptr arg(3)    ; yoffset
   1314         shl         rax,        4
   1315         lea         rax,        [rax + rcx]         ; VFilter
   1316 
   1317         mov         rdi,        arg(4) ;dst_ptr
   1318         movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
   1319 
   1320         movdqa      xmm0,       [rax]               ; VFilter
   1321 
        ; load the low 8 bytes of rows 0..6 and interleave each row with the next
   1322         movq        xmm1,       XMMWORD PTR [rsp]
   1323         movq        xmm2,       XMMWORD PTR [rsp+16]
   1324 
   1325         movq        xmm3,       XMMWORD PTR [rsp+32]
   1326         punpcklbw   xmm1,       xmm2
   1327 
   1328         movq        xmm4,       XMMWORD PTR [rsp+48]
   1329         punpcklbw   xmm2,       xmm3
   1330 
   1331         movq        xmm5,       XMMWORD PTR [rsp+64]
   1332         punpcklbw   xmm3,       xmm4
   1333 
   1334         movq        xmm6,       XMMWORD PTR [rsp+80]
   1335         punpcklbw   xmm4,       xmm5
   1336 
   1337         movq        xmm7,       XMMWORD PTR [rsp+96]
   1338         punpcklbw   xmm5,       xmm6
   1339 
        ; xmm1..xmm6 become output rows 0..5
   1340         pmaddubsw   xmm1,       xmm0
   1341         pmaddubsw   xmm2,       xmm0
   1342 
   1343         pmaddubsw   xmm3,       xmm0
   1344         pmaddubsw   xmm4,       xmm0
   1345 
   1346         pmaddubsw   xmm5,       xmm0
   1347         punpcklbw   xmm6,       xmm7
   1348 
   1349         pmaddubsw   xmm6,       xmm0
   1350         paddw       xmm1,       [GLOBAL(rd)]
   1351 
   1352         paddw       xmm2,       [GLOBAL(rd)]
   1353         psraw       xmm1,       VP8_FILTER_SHIFT
   1354 
   1355         paddw       xmm3,       [GLOBAL(rd)]
   1356         psraw       xmm2,       VP8_FILTER_SHIFT
   1357 
   1358         paddw       xmm4,       [GLOBAL(rd)]
   1359         psraw       xmm3,       VP8_FILTER_SHIFT
   1360 
   1361         paddw       xmm5,       [GLOBAL(rd)]
   1362         psraw       xmm4,       VP8_FILTER_SHIFT
   1363 
   1364         paddw       xmm6,       [GLOBAL(rd)]
   1365         psraw       xmm5,       VP8_FILTER_SHIFT
   1366 
   1367         psraw       xmm6,       VP8_FILTER_SHIFT
   1368         packuswb    xmm1,       xmm1
   1369 
   1370         packuswb    xmm2,       xmm2
   1371         movq        [rdi],      xmm1
   1372 
   1373         packuswb    xmm3,       xmm3
   1374         movq        [rdi+rdx],  xmm2
   1375 
        ; xmm1 / xmm2 are free again: reuse them for stacked rows 7 and 8
   1376         packuswb    xmm4,       xmm4
   1377         movq        xmm1,       XMMWORD PTR [rsp+112]
   1378 
   1379         lea         rdi,        [rdi + 2*rdx]
   1380         movq        xmm2,       XMMWORD PTR [rsp+128]
   1381 
   1382         packuswb    xmm5,       xmm5
   1383         movq        [rdi],      xmm3
   1384 
   1385         packuswb    xmm6,       xmm6
   1386         movq        [rdi+rdx],  xmm4
   1387 
        ; xmm7 (row 6) with row 7 -> output row 6; xmm1 (row 7) with row 8 ->
        ; output row 7
   1388         lea         rdi,        [rdi + 2*rdx]
   1389         punpcklbw   xmm7,       xmm1
   1390 
   1391         movq        [rdi],      xmm5
   1392         pmaddubsw   xmm7,       xmm0
   1393 
   1394         movq        [rdi+rdx],  xmm6
   1395         punpcklbw   xmm1,       xmm2
   1396 
   1397         pmaddubsw   xmm1,       xmm0
   1398         paddw       xmm7,       [GLOBAL(rd)]
   1399 
   1400         psraw       xmm7,       VP8_FILTER_SHIFT
   1401         paddw       xmm1,       [GLOBAL(rd)]
   1402 
   1403         psraw       xmm1,       VP8_FILTER_SHIFT
   1404         packuswb    xmm7,       xmm7
   1405 
   1406         packuswb    xmm1,       xmm1
   1407         lea         rdi,        [rdi + 2*rdx]
   1408 
   1409         movq        [rdi],      xmm7
   1410 
        ; release the whole 144-byte staging area in one step
   1411         movq        [rdi+rdx],  xmm1
   1412         lea         rsp,        [rsp + 144]
   1413 
   1414         jmp         done8x8
   1415 
        ; ---- horizontal pass only (yoffset == 0) ----
        ; On entry: rdi = dst_ptr, rdx = dst_pitch, xmm0 = HFilter.
        ; Four output rows per iteration, two iterations.
   1416 b8x8_fp_only:
        ; rcx = dst_ptr + 8*dst_pitch (loop end marker)
   1417         lea         rcx,        [rdi+rdx*8]
   1418 
   1419 .next_row:
        ; xmm1 / xmm3 / xmm5 / xmm7 = four stacked rows; xmm2 / xmm4 / xmm6 (and
        ; xmm2 again for the fourth row) hold the same rows shifted right by one byte
   1420         movdqa      xmm1,       XMMWORD PTR [rsp]
   1421         movdqa      xmm3,       XMMWORD PTR [rsp+16]
   1422 
   1423         movdqa      xmm2,       xmm1
   1424         movdqa      xmm5,       XMMWORD PTR [rsp+32]
   1425 
   1426         psrldq      xmm2,       1
   1427         movdqa      xmm7,       XMMWORD PTR [rsp+48]
   1428 
   1429         movdqa      xmm4,       xmm3
   1430         psrldq      xmm4,       1
   1431 
   1432         movdqa      xmm6,       xmm5
   1433         psrldq      xmm6,       1
   1434 
   1435         punpcklbw   xmm1,       xmm2
   1436         pmaddubsw   xmm1,       xmm0
   1437 
   1438         punpcklbw   xmm3,       xmm4
   1439         pmaddubsw   xmm3,       xmm0
   1440 
   1441         punpcklbw   xmm5,       xmm6
   1442         pmaddubsw   xmm5,       xmm0
   1443 
   1444         movdqa      xmm2,       xmm7
   1445         psrldq      xmm2,       1
   1446 
   1447         punpcklbw   xmm7,       xmm2
   1448         pmaddubsw   xmm7,       xmm0
   1449 
   1450         paddw       xmm1,       [GLOBAL(rd)]
   1451         psraw       xmm1,       VP8_FILTER_SHIFT
   1452 
   1453         paddw       xmm3,       [GLOBAL(rd)]
   1454         psraw       xmm3,       VP8_FILTER_SHIFT
   1455 
   1456         paddw       xmm5,       [GLOBAL(rd)]
   1457         psraw       xmm5,       VP8_FILTER_SHIFT
   1458 
   1459         paddw       xmm7,       [GLOBAL(rd)]
   1460         psraw       xmm7,       VP8_FILTER_SHIFT
   1461 
   1462         packuswb    xmm1,       xmm1
   1463         packuswb    xmm3,       xmm3
   1464 
   1465         packuswb    xmm5,       xmm5
   1466         movq        [rdi],      xmm1
   1467 
   1468         packuswb    xmm7,       xmm7
   1469         movq        [rdi+rdx],  xmm3
   1470 
   1471         lea         rdi,        [rdi + 2*rdx]
   1472         movq        [rdi],      xmm5
   1473 
        ; consume the four stacked rows just processed
   1474         lea         rsp,        [rsp + 4*16]
   1475         movq        [rdi+rdx],  xmm7
   1476 
   1477         lea         rdi,        [rdi + 2*rdx]
   1478         cmp         rdi,        rcx
   1479 
   1480         jne         .next_row
   1481 
        ; skip the ninth stacked row, which this path does not read
        ; (2 * 64 + 16 = 144)
   1482         lea         rsp,        [rsp + 16]
   1483 
        ; Common exit: every path has already moved rsp past the 144-byte staging
        ; area, so no "add rsp, 144" is needed; pop rsp reloads the stack pointer
        ; from the value stored at the current top of stack.
   1484 done8x8:
   1485     ;add rsp, 144
   1486     pop         rsp
   1487     ; begin epilog
   1488     pop         rdi
   1489     pop         rsi
   1490     RESTORE_GOT
   1491     RESTORE_XMM
   1492     UNSHADOW_ARGS
   1493     pop         rbp
   1494     ret
   1495 
   1496 SECTION_RODATA
        ; Read-only constant tables shared by the routines in this file.
        ; shuf1b / shuf2b / shuf3b / shuf2bfrom1 / shuf3bfrom1: 16-entry byte-index
        ; tables. They are not referenced by the routines in this part of the file.
        ; NOTE(review): presumably pshufb masks for the six-tap routines -- confirm.
   1497 align 16
   1498 shuf1b:
   1499     db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
   1500 shuf2b:
   1501     db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
   1502 shuf3b:
   1503     db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
   1504 
   1505 align 16
   1506 shuf2bfrom1:
   1507     db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
   1508 align 16
   1509 shuf3bfrom1:
   1510     db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
   1511 
        ; rd: rounding term, eight words of 0x40 (64 = VP8_FILTER_WEIGHT / 2), added
        ; to the filter sums before the arithmetic shift right by VP8_FILTER_SHIFT.
   1512 align 16
   1513 rd:
   1514     times 8 dw 0x40
   1515 
        ; k0_k5 / k1_k3 / k2_k4: coefficient pairs, one 16-byte row per filter index
        ; (8 rows, each pair repeated 8 times). The three tables are contiguous and
        ; 128 bytes each, so the same row of the next table is 128 bytes further on.
        ; NOTE(review): presumably the pairs are taps (0,5), (1,3) and (2,4) of the
        ; six-tap filters -- confirm against the code that loads them.
        ; The value 128 in k2_k4 row 0 does not fit a signed byte; pmaddubsw reads
        ; its source operand as signed bytes, so that row is only a placeholder.
   1516 align 16
   1517 k0_k5:
   1518     times 8 db 0, 0             ;placeholder
   1519     times 8 db 0, 0
   1520     times 8 db 2, 1
   1521     times 8 db 0, 0
   1522     times 8 db 3, 3
   1523     times 8 db 0, 0
   1524     times 8 db 1, 2
   1525     times 8 db 0, 0
   1526 k1_k3:
   1527     times 8 db  0,    0         ;placeholder
   1528     times 8 db  -6,  12
   1529     times 8 db -11,  36
   1530     times 8 db  -9,  50
   1531     times 8 db -16,  77
   1532     times 8 db  -6,  93
   1533     times 8 db  -8, 108
   1534     times 8 db  -1, 123
   1535 k2_k4:
   1536     times 8 db 128,    0        ;placeholder
   1537     times 8 db 123,   -1
   1538     times 8 db 108,   -8
   1539     times 8 db  93,   -6
   1540     times 8 db  77,  -16
   1541     times 8 db  50,   -9
   1542     times 8 db  36,  -11
   1543     times 8 db  12,   -6
        ; vp8_bilinear_filters_ssse3: entry i (i = 0..7) is the byte pair
        ; (128 - 16*i, 16*i) repeated 8 times to fill 16 bytes; each pair sums to
        ; VP8_FILTER_WEIGHT (128). Indexed as table + offset*16 by the bilinear
        ; predictors.
        ; NOTE(review): entry 0 holds 128, which pmaddubsw reads as -128 when the
        ; table row is its (signed) source operand. The predictors skip a pass
        ; whose offset is 0, except that their *_sp_only paths use yoffset
        ; unchecked -- confirm callers never pass xoffset == yoffset == 0.
   1544 align 16
   1545 vp8_bilinear_filters_ssse3:
   1546     times 8 db 128, 0
   1547     times 8 db 112, 16
   1548     times 8 db 96,  32
   1549     times 8 db 80,  48
   1550     times 8 db 64,  64
   1551     times 8 db 48,  80
   1552     times 8 db 32,  96
   1553     times 8 db 16,  112
   1554 
   1555