Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 extern sym(vp8_bilinear_filters_x86_8)
     14 
     15 %define BLOCK_HEIGHT_WIDTH 4
     16 %define VP8_FILTER_WEIGHT 128
     17 %define VP8_FILTER_SHIFT  7
     18 
     19 
     20 ;/************************************************************************************
     21 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
     22 ; input pixel array has output_height rows. This routine assumes that output_height is an
     23 ; even number. This function handles 8 pixels in horizontal direction, calculating ONE
     24 ; rows each iteration to take advantage of the 128 bits operations.
     25 ;*************************************************************************************/
     26 ;void vp8_filter_block1d8_h6_sse2
     27 ;(
     28 ;    unsigned char  *src_ptr,
     29 ;    unsigned short *output_ptr,
     30 ;    unsigned int    src_pixels_per_line,
     31 ;    unsigned int    pixel_step,
     32 ;    unsigned int    output_height,
     33 ;    unsigned int    output_width,
     34 ;    short           *vp8_filter
     35 ;)
     36 global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
     37 sym(vp8_filter_block1d8_h6_sse2):
     38     push        rbp
     39     mov         rbp, rsp
     40     SHADOW_ARGS_TO_STACK 7
     41     SAVE_XMM 7
     42     GET_GOT     rbx
     43     push        rsi
     44     push        rdi
     45     ; end prolog
     46 
     47         mov         rdx,        arg(6) ;vp8_filter
     48         mov         rsi,        arg(0) ;src_ptr
     49 
     50         mov         rdi,        arg(1) ;output_ptr
     51 
     52         movsxd      rcx,        dword ptr arg(4) ;output_height
     53         movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
     54 %if ABI_IS_32BIT=0
     55         movsxd      r8,         dword ptr arg(5) ;output_width
     56 %endif
     57         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
     58 
     59 .filter_block1d8_h6_rowloop:
     60         movq        xmm3,       MMWORD PTR [rsi - 2]
     61         movq        xmm1,       MMWORD PTR [rsi + 6]
     62 
     63         prefetcht2  [rsi+rax-2]
     64 
     65         pslldq      xmm1,       8
     66         por         xmm1,       xmm3
     67 
     68         movdqa      xmm4,       xmm1
     69         movdqa      xmm5,       xmm1
     70 
     71         movdqa      xmm6,       xmm1
     72         movdqa      xmm7,       xmm1
     73 
     74         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
     75         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
     76 
     77         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
     78         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
     79 
     80         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
     81         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
     82 
     83 
     84         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
     85         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
     86 
     87         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
     88 
     89         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
     90         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
     91 
     92         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
     93 
     94         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
     95         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
     96 
     97 
     98         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
     99 
    100         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    101         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
    102 
    103 
    104         paddsw      xmm4,       xmm7
    105         paddsw      xmm4,       xmm5
    106 
    107         paddsw      xmm4,       xmm3
    108         paddsw      xmm4,       xmm6
    109 
    110         paddsw      xmm4,       xmm1
    111         paddsw      xmm4,       [GLOBAL(rd)]
    112 
    113         psraw       xmm4,       7
    114 
    115         packuswb    xmm4,       xmm0
    116         punpcklbw   xmm4,       xmm0
    117 
    118         movdqa      XMMWORD Ptr [rdi],         xmm4
    119         lea         rsi,        [rsi + rax]
    120 
    121 %if ABI_IS_32BIT
    122         add         rdi,        DWORD Ptr arg(5) ;[output_width]
    123 %else
    124         add         rdi,        r8
    125 %endif
    126         dec         rcx
    127 
    128         jnz         .filter_block1d8_h6_rowloop                ; next row
    129 
    130     ; begin epilog
    131     pop rdi
    132     pop rsi
    133     RESTORE_GOT
    134     RESTORE_XMM
    135     UNSHADOW_ARGS
    136     pop         rbp
    137     ret
    138 
    139 
    140 ;void vp8_filter_block1d16_h6_sse2
    141 ;(
    142 ;    unsigned char  *src_ptr,
    143 ;    unsigned short *output_ptr,
    144 ;    unsigned int    src_pixels_per_line,
    145 ;    unsigned int    pixel_step,
    146 ;    unsigned int    output_height,
    147 ;    unsigned int    output_width,
    148 ;    short           *vp8_filter
    149 ;)
    150 ;/************************************************************************************
    151 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
    152 ; input pixel array has output_height rows. This routine assumes that output_height is an
    153 ; even number. This function handles 8 pixels in horizontal direction, calculating ONE
    154 ; rows each iteration to take advantage of the 128 bits operations.
    155 ;*************************************************************************************/
    156 global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
    157 sym(vp8_filter_block1d16_h6_sse2):
    158     push        rbp
    159     mov         rbp, rsp
    160     SHADOW_ARGS_TO_STACK 7
    161     SAVE_XMM 7
    162     GET_GOT     rbx
    163     push        rsi
    164     push        rdi
    165     ; end prolog
    166 
    167         mov         rdx,        arg(6) ;vp8_filter
    168         mov         rsi,        arg(0) ;src_ptr
    169 
    170         mov         rdi,        arg(1) ;output_ptr
    171 
    172         movsxd      rcx,        dword ptr arg(4) ;output_height
    173         movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
    174 %if ABI_IS_32BIT=0
    175         movsxd      r8,         dword ptr arg(5) ;output_width
    176 %endif
    177 
    178         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
    179 
    180 .filter_block1d16_h6_sse2_rowloop:
    181         movq        xmm3,       MMWORD PTR [rsi - 2]
    182         movq        xmm1,       MMWORD PTR [rsi + 6]
    183 
    184         movq        xmm2,       MMWORD PTR [rsi +14]
    185         pslldq      xmm2,       8
    186 
    187         por         xmm2,       xmm1
    188         prefetcht2  [rsi+rax-2]
    189 
    190         pslldq      xmm1,       8
    191         por         xmm1,       xmm3
    192 
    193         movdqa      xmm4,       xmm1
    194         movdqa      xmm5,       xmm1
    195 
    196         movdqa      xmm6,       xmm1
    197         movdqa      xmm7,       xmm1
    198 
    199         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
    200         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
    201 
    202         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
    203         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
    204 
    205         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    206         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
    207 
    208 
    209         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    210         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
    211 
    212         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
    213 
    214         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    215         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
    216 
    217         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
    218 
    219         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    220         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
    221 
    222 
    223         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
    224 
    225         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    226         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
    227 
    228         paddsw      xmm4,       xmm7
    229         paddsw      xmm4,       xmm5
    230 
    231         paddsw      xmm4,       xmm3
    232         paddsw      xmm4,       xmm6
    233 
    234         paddsw      xmm4,       xmm1
    235         paddsw      xmm4,       [GLOBAL(rd)]
    236 
    237         psraw       xmm4,       7
    238 
    239         packuswb    xmm4,       xmm0
    240         punpcklbw   xmm4,       xmm0
    241 
    242         movdqa      XMMWORD Ptr [rdi],         xmm4
    243 
    244         movdqa      xmm3,       xmm2
    245         movdqa      xmm4,       xmm2
    246 
    247         movdqa      xmm5,       xmm2
    248         movdqa      xmm6,       xmm2
    249 
    250         movdqa      xmm7,       xmm2
    251 
    252         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
    253         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
    254 
    255         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
    256         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
    257 
    258         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    259         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
    260 
    261 
    262         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    263         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
    264 
    265         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
    266 
    267         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    268         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
    269 
    270         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
    271 
    272         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    273         psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
    274 
    275         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
    276 
    277         punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    278         pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
    279 
    280 
    281         paddsw      xmm4,       xmm7
    282         paddsw      xmm4,       xmm5
    283 
    284         paddsw      xmm4,       xmm3
    285         paddsw      xmm4,       xmm6
    286 
    287         paddsw      xmm4,       xmm2
    288         paddsw      xmm4,       [GLOBAL(rd)]
    289 
    290         psraw       xmm4,       7
    291 
    292         packuswb    xmm4,       xmm0
    293         punpcklbw   xmm4,       xmm0
    294 
    295         movdqa      XMMWORD Ptr [rdi+16],      xmm4
    296 
    297         lea         rsi,        [rsi + rax]
    298 %if ABI_IS_32BIT
    299         add         rdi,        DWORD Ptr arg(5) ;[output_width]
    300 %else
    301         add         rdi,        r8
    302 %endif
    303 
    304         dec         rcx
    305         jnz         .filter_block1d16_h6_sse2_rowloop                ; next row
    306 
    307     ; begin epilog
    308     pop rdi
    309     pop rsi
    310     RESTORE_GOT
    311     RESTORE_XMM
    312     UNSHADOW_ARGS
    313     pop         rbp
    314     ret
    315 
    316 
    317 ;void vp8_filter_block1d8_v6_sse2
    318 ;(
    319 ;    short *src_ptr,
    320 ;    unsigned char *output_ptr,
    321 ;    int dst_ptich,
    322 ;    unsigned int pixels_per_line,
    323 ;    unsigned int pixel_step,
    324 ;    unsigned int output_height,
    325 ;    unsigned int output_width,
    326 ;    short * vp8_filter
    327 ;)
    328 ;/************************************************************************************
    329 ; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
    330 ; input pixel array has output_height rows.
    331 ;*************************************************************************************/
    332 global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
    333 sym(vp8_filter_block1d8_v6_sse2):
    334     push        rbp
    335     mov         rbp, rsp
    336     SHADOW_ARGS_TO_STACK 8
    337     SAVE_XMM 7
    338     GET_GOT     rbx
    339     push        rsi
    340     push        rdi
    341     ; end prolog
    342 
    343         mov         rax,        arg(7) ;vp8_filter
    344         movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
    345 
    346         mov         rdi,        arg(1) ;output_ptr
    347         mov         rsi,        arg(0) ;src_ptr
    348 
    349         sub         rsi,        rdx
    350         sub         rsi,        rdx
    351 
    352         movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
    353         pxor        xmm0,       xmm0                        ; clear xmm0
    354 
    355         movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
    356 %if ABI_IS_32BIT=0
    357         movsxd      r8,         dword ptr arg(2) ; dst_ptich
    358 %endif
    359 
    360 .vp8_filter_block1d8_v6_sse2_loop:
    361         movdqa      xmm1,       XMMWORD PTR [rsi]
    362         pmullw      xmm1,       [rax]
    363 
    364         movdqa      xmm2,       XMMWORD PTR [rsi + rdx]
    365         pmullw      xmm2,       [rax + 16]
    366 
    367         movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]
    368         pmullw      xmm3,       [rax + 32]
    369 
    370         movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]
    371         pmullw      xmm5,       [rax + 64]
    372 
    373         add         rsi,        rdx
    374         movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2]
    375 
    376         pmullw      xmm4,       [rax + 48]
    377         movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4]
    378 
    379         pmullw      xmm6,       [rax + 80]
    380 
    381         paddsw      xmm2,       xmm5
    382         paddsw      xmm2,       xmm3
    383 
    384         paddsw      xmm2,       xmm1
    385         paddsw      xmm2,       xmm4
    386 
    387         paddsw      xmm2,       xmm6
    388         paddsw      xmm2,       xmm7
    389 
    390         psraw       xmm2,       7
    391         packuswb    xmm2,       xmm0              ; pack and saturate
    392 
    393         movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
    394 %if ABI_IS_32BIT
    395         add         rdi,        DWORD PTR arg(2) ;[dst_ptich]
    396 %else
    397         add         rdi,        r8
    398 %endif
    399         dec         rcx         ; decrement count
    400         jnz         .vp8_filter_block1d8_v6_sse2_loop               ; next row
    401 
    402     ; begin epilog
    403     pop rdi
    404     pop rsi
    405     RESTORE_GOT
    406     RESTORE_XMM
    407     UNSHADOW_ARGS
    408     pop         rbp
    409     ret
    410 
    411 
    412 ;void vp8_filter_block1d16_v6_sse2
    413 ;(
    414 ;    unsigned short *src_ptr,
    415 ;    unsigned char *output_ptr,
    416 ;    int dst_ptich,
    417 ;    unsigned int pixels_per_line,
    418 ;    unsigned int pixel_step,
    419 ;    unsigned int output_height,
    420 ;    unsigned int output_width,
    421 ;    const short    *vp8_filter
    422 ;)
    423 ;/************************************************************************************
    424 ; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
    425 ; input pixel array has output_height rows.
    426 ;*************************************************************************************/
    427 global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
    428 sym(vp8_filter_block1d16_v6_sse2):
    429     push        rbp
    430     mov         rbp, rsp
    431     SHADOW_ARGS_TO_STACK 8
    432     SAVE_XMM 7
    433     GET_GOT     rbx
    434     push        rsi
    435     push        rdi
    436     ; end prolog
    437 
    438         mov         rax,        arg(7) ;vp8_filter
    439         movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
    440 
    441         mov         rdi,        arg(1) ;output_ptr
    442         mov         rsi,        arg(0) ;src_ptr
    443 
    444         sub         rsi,        rdx
    445         sub         rsi,        rdx
    446 
    447         movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
    448 %if ABI_IS_32BIT=0
    449         movsxd      r8,         dword ptr arg(2) ; dst_ptich
    450 %endif
    451 
    452 .vp8_filter_block1d16_v6_sse2_loop:
    453 ; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
    454         movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
    455         movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
    456         pmullw      xmm1,       [rax + 16]
    457         pmullw      xmm2,       [rax + 16]
    458 
    459         movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
    460         movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
    461         pmullw      xmm3,       [rax + 64]
    462         pmullw      xmm4,       [rax + 64]
    463 
    464         movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3
    465         movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
    466         pmullw      xmm5,       [rax + 32]
    467         pmullw      xmm6,       [rax + 32]
    468 
    469         movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
    470         movdqa      xmm0,       XMMWORD PTR [rsi + 16]
    471         pmullw      xmm7,       [rax]
    472         pmullw      xmm0,       [rax]
    473 
    474         paddsw      xmm1,       xmm3
    475         paddsw      xmm2,       xmm4
    476         paddsw      xmm1,       xmm5
    477         paddsw      xmm2,       xmm6
    478         paddsw      xmm1,       xmm7
    479         paddsw      xmm2,       xmm0
    480 
    481         add         rsi,        rdx
    482 
    483         movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
    484         movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
    485         pmullw      xmm3,       [rax + 48]
    486         pmullw      xmm4,       [rax + 48]
    487 
    488         movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
    489         movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
    490         pmullw      xmm5,       [rax + 80]
    491         pmullw      xmm6,       [rax + 80]
    492 
    493         movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
    494         pxor        xmm0,       xmm0                        ; clear xmm0
    495 
    496         paddsw      xmm1,       xmm3
    497         paddsw      xmm2,       xmm4
    498         paddsw      xmm1,       xmm5
    499         paddsw      xmm2,       xmm6
    500 
    501         paddsw      xmm1,       xmm7
    502         paddsw      xmm2,       xmm7
    503 
    504         psraw       xmm1,       7
    505         psraw       xmm2,       7
    506 
    507         packuswb    xmm1,       xmm2              ; pack and saturate
    508         movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination
    509 %if ABI_IS_32BIT
    510         add         rdi,        DWORD PTR arg(2) ;[dst_ptich]
    511 %else
    512         add         rdi,        r8
    513 %endif
    514         dec         rcx         ; decrement count
    515         jnz         .vp8_filter_block1d16_v6_sse2_loop              ; next row
    516 
    517     ; begin epilog
    518     pop rdi
    519     pop rsi
    520     RESTORE_GOT
    521     RESTORE_XMM
    522     UNSHADOW_ARGS
    523     pop         rbp
    524     ret
    525 
    526 
    527 ;void vp8_filter_block1d8_h6_only_sse2
    528 ;(
    529 ;    unsigned char  *src_ptr,
    530 ;    unsigned int    src_pixels_per_line,
    531 ;    unsigned char  *output_ptr,
    532 ;    int dst_ptich,
    533 ;    unsigned int    output_height,
    534 ;    const short    *vp8_filter
    535 ;)
    536 ; First-pass filter only when yoffset==0
    537 global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
    538 sym(vp8_filter_block1d8_h6_only_sse2):
    539     push        rbp
    540     mov         rbp, rsp
    541     SHADOW_ARGS_TO_STACK 6
    542     SAVE_XMM 7
    543     GET_GOT     rbx
    544     push        rsi
    545     push        rdi
    546     ; end prolog
    547 
    548         mov         rdx,        arg(5) ;vp8_filter
    549         mov         rsi,        arg(0) ;src_ptr
    550 
    551         mov         rdi,        arg(2) ;output_ptr
    552 
    553         movsxd      rcx,        dword ptr arg(4) ;output_height
    554         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
    555 %if ABI_IS_32BIT=0
    556         movsxd      r8,         dword ptr arg(3) ;dst_ptich
    557 %endif
    558         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
    559 
    560 .filter_block1d8_h6_only_rowloop:
    561         movq        xmm3,       MMWORD PTR [rsi - 2]
    562         movq        xmm1,       MMWORD PTR [rsi + 6]
    563 
    564         prefetcht2  [rsi+rax-2]
    565 
    566         pslldq      xmm1,       8
    567         por         xmm1,       xmm3
    568 
    569         movdqa      xmm4,       xmm1
    570         movdqa      xmm5,       xmm1
    571 
    572         movdqa      xmm6,       xmm1
    573         movdqa      xmm7,       xmm1
    574 
    575         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
    576         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
    577 
    578         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
    579         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
    580 
    581         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    582         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
    583 
    584 
    585         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    586         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
    587 
    588         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
    589 
    590         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    591         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
    592 
    593         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
    594 
    595         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    596         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
    597 
    598 
    599         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
    600 
    601         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    602         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
    603 
    604 
    605         paddsw      xmm4,       xmm7
    606         paddsw      xmm4,       xmm5
    607 
    608         paddsw      xmm4,       xmm3
    609         paddsw      xmm4,       xmm6
    610 
    611         paddsw      xmm4,       xmm1
    612         paddsw      xmm4,       [GLOBAL(rd)]
    613 
    614         psraw       xmm4,       7
    615 
    616         packuswb    xmm4,       xmm0
    617 
    618         movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination
    619         lea         rsi,        [rsi + rax]
    620 
    621 %if ABI_IS_32BIT
    622         add         rdi,        DWORD Ptr arg(3) ;dst_ptich
    623 %else
    624         add         rdi,        r8
    625 %endif
    626         dec         rcx
    627 
    628         jnz         .filter_block1d8_h6_only_rowloop               ; next row
    629 
    630     ; begin epilog
    631     pop rdi
    632     pop rsi
    633     RESTORE_GOT
    634     RESTORE_XMM
    635     UNSHADOW_ARGS
    636     pop         rbp
    637     ret
    638 
    639 
    640 ;void vp8_filter_block1d16_h6_only_sse2
    641 ;(
    642 ;    unsigned char  *src_ptr,
    643 ;    unsigned int    src_pixels_per_line,
    644 ;    unsigned char  *output_ptr,
    645 ;    int dst_ptich,
    646 ;    unsigned int    output_height,
    647 ;    const short    *vp8_filter
    648 ;)
    649 ; First-pass filter only when yoffset==0
    650 global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
    651 sym(vp8_filter_block1d16_h6_only_sse2):
    652     push        rbp
    653     mov         rbp, rsp
    654     SHADOW_ARGS_TO_STACK 6
    655     SAVE_XMM 7
    656     GET_GOT     rbx
    657     push        rsi
    658     push        rdi
    659     ; end prolog
    660 
    661         mov         rdx,        arg(5) ;vp8_filter
    662         mov         rsi,        arg(0) ;src_ptr
    663 
    664         mov         rdi,        arg(2) ;output_ptr
    665 
    666         movsxd      rcx,        dword ptr arg(4) ;output_height
    667         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
    668 %if ABI_IS_32BIT=0
    669         movsxd      r8,         dword ptr arg(3) ;dst_ptich
    670 %endif
    671 
    672         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
    673 
    674 .filter_block1d16_h6_only_sse2_rowloop:
    675         movq        xmm3,       MMWORD PTR [rsi - 2]
    676         movq        xmm1,       MMWORD PTR [rsi + 6]
    677 
    678         movq        xmm2,       MMWORD PTR [rsi +14]
    679         pslldq      xmm2,       8
    680 
    681         por         xmm2,       xmm1
    682         prefetcht2  [rsi+rax-2]
    683 
    684         pslldq      xmm1,       8
    685         por         xmm1,       xmm3
    686 
    687         movdqa      xmm4,       xmm1
    688         movdqa      xmm5,       xmm1
    689 
    690         movdqa      xmm6,       xmm1
    691         movdqa      xmm7,       xmm1
    692 
    693         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
    694         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
    695 
    696         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
    697         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
    698 
    699         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    700         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
    701 
    702         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    703         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
    704 
    705         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
    706 
    707         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    708         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
    709 
    710         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
    711 
    712         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    713         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
    714 
    715         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
    716 
    717         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    718         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
    719 
    720         paddsw      xmm4,       xmm7
    721         paddsw      xmm4,       xmm5
    722 
    723         paddsw      xmm4,       xmm3
    724         paddsw      xmm4,       xmm6
    725 
    726         paddsw      xmm4,       xmm1
    727         paddsw      xmm4,       [GLOBAL(rd)]
    728 
    729         psraw       xmm4,       7
    730 
    731         packuswb    xmm4,       xmm0                        ; lower 8 bytes
    732 
    733         movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination
    734 
    735         movdqa      xmm3,       xmm2
    736         movdqa      xmm4,       xmm2
    737 
    738         movdqa      xmm5,       xmm2
    739         movdqa      xmm6,       xmm2
    740 
    741         movdqa      xmm7,       xmm2
    742 
    743         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
    744         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
    745 
    746         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
    747         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
    748 
    749         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    750         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
    751 
    752         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    753         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
    754 
    755         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
    756 
    757         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    758         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
    759 
    760         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
    761 
    762         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    763         psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
    764 
    765         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
    766 
    767         punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    768         pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
    769 
    770         paddsw      xmm4,       xmm7
    771         paddsw      xmm4,       xmm5
    772 
    773         paddsw      xmm4,       xmm3
    774         paddsw      xmm4,       xmm6
    775 
    776         paddsw      xmm4,       xmm2
    777         paddsw      xmm4,       [GLOBAL(rd)]
    778 
    779         psraw       xmm4,       7
    780 
    781         packuswb    xmm4,       xmm0                        ; higher 8 bytes
    782 
    783         movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination
    784 
    785         lea         rsi,        [rsi + rax]
    786 %if ABI_IS_32BIT
    787         add         rdi,        DWORD Ptr arg(3) ;dst_ptich
    788 %else
    789         add         rdi,        r8
    790 %endif
    791 
    792         dec         rcx
    793         jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row
    794 
    795     ; begin epilog
    796     pop rdi
    797     pop rsi
    798     RESTORE_GOT
    799     RESTORE_XMM
    800     UNSHADOW_ARGS
    801     pop         rbp
    802     ret
    803 
    804 
    805 ;void vp8_filter_block1d8_v6_only_sse2
    806 ;(
    807 ;    unsigned char *src_ptr,               // arg(0): top row of the six-row window used for the first output row
    808 ;    unsigned int    src_pixels_per_line,  // arg(1): source stride in bytes
    809 ;    unsigned char *output_ptr,            // arg(2): destination; 8 bytes are stored per row
    810 ;    int dst_pitch,                        // arg(3): destination stride in bytes
    811 ;    unsigned int output_height,           // arg(4): row count; tested after the loop body, so it must be non-zero
    812 ;    const short    *vp8_filter            // arg(5): six 16-byte vectors of 8 words, one vector per tap
    813 ;)
    814 ; Second-pass filter only when xoffset==0: vertical 6-tap filter over 8 pixel columns, reading source bytes directly
    815 global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
    816 sym(vp8_filter_block1d8_v6_only_sse2):
    817     push        rbp
    818     mov         rbp, rsp
    819     SHADOW_ARGS_TO_STACK 6
    820     SAVE_XMM 7                                              ; xmm0..xmm7 are all used below
    821     GET_GOT     rbx
    822     push        rsi
    823     push        rdi
    824     ; end prolog
    825 
    826         mov         rsi,        arg(0) ;src_ptr
    827         mov         rdi,        arg(2) ;output_ptr
    828 
    829         movsxd      rcx,        dword ptr arg(4) ;output_height (loop counter)
    830         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (row stride for the loads below)
    831 
    832         mov         rax,        arg(5) ;vp8_filter; used as a pmullw memory operand, so it must be 16-byte aligned
    833 
    834         pxor        xmm0,       xmm0                        ; clear xmm0 (zero source for byte->word unpack and for packing)
    835 
    836         movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; xmm7 = rounding term, kept in a register for the whole loop
    837 %if ABI_IS_32BIT=0
    838         movsxd      r8,         dword ptr arg(3) ; dst_pitch
    839 %endif
    840 
    841 .vp8_filter_block1d8_v6_only_sse2_loop:
    842         movq        xmm1,       MMWORD PTR [rsi]            ; row 0 of the window, 8 bytes
    843         movq        xmm2,       MMWORD PTR [rsi + rdx]      ; row 1
    844         movq        xmm3,       MMWORD PTR [rsi + rdx * 2]  ; row 2
    845         movq        xmm5,       MMWORD PTR [rsi + rdx * 4]  ; row 4
    846         add         rsi,        rdx                         ; step one row down; doubles as the per-iteration source advance
    847         movq        xmm4,       MMWORD PTR [rsi + rdx * 2]  ; row 3 (relative to the rsi at loop entry)
    848         movq        xmm6,       MMWORD PTR [rsi + rdx * 4]  ; row 5 (relative to the rsi at loop entry)
    849 
    850         punpcklbw   xmm1,       xmm0                        ; bytes -> words
    851         pmullw      xmm1,       [rax]                       ; row 0 * tap 1
    852 
    853         punpcklbw   xmm2,       xmm0
    854         pmullw      xmm2,       [rax + 16]                  ; row 1 * tap 2
    855 
    856         punpcklbw   xmm3,       xmm0
    857         pmullw      xmm3,       [rax + 32]                  ; row 2 * tap 3
    858 
    859         punpcklbw   xmm5,       xmm0
    860         pmullw      xmm5,       [rax + 64]                  ; row 4 * tap 5
    861 
    862         punpcklbw   xmm4,       xmm0
    863         pmullw      xmm4,       [rax + 48]                  ; row 3 * tap 4
    864 
    865         punpcklbw   xmm6,       xmm0
    866         pmullw      xmm6,       [rax + 80]                  ; row 5 * tap 6
    867 
    868         paddsw      xmm2,       xmm5                        ; accumulate all six products in xmm2 with signed saturation
    869         paddsw      xmm2,       xmm3
    870 
    871         paddsw      xmm2,       xmm1
    872         paddsw      xmm2,       xmm4
    873 
    874         paddsw      xmm2,       xmm6
    875         paddsw      xmm2,       xmm7                        ; add the rounding term
    876 
    877         psraw       xmm2,       7                           ; arithmetic shift right by 7 (same value as VP8_FILTER_SHIFT)
    878         packuswb    xmm2,       xmm0              ; pack and saturate
    879 
    880         movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
    881 %if ABI_IS_32BIT
    882         add         rdi,        DWORD PTR arg(3) ;[dst_pitch]
    883 %else
    884         add         rdi,        r8                          ; next destination row
    885 %endif
    886         dec         rcx         ; decrement count
    887         jnz         .vp8_filter_block1d8_v6_only_sse2_loop              ; next row
    888 
    889     ; begin epilog
    890     pop rdi
    891     pop rsi
    892     RESTORE_GOT
    893     RESTORE_XMM
    894     UNSHADOW_ARGS
    895     pop         rbp
    896     ret
    897 
    898 
    899 ;void vp8_unpack_block1d16_h6_sse2
    900 ;(
    901 ;    unsigned char  *src_ptr,              // arg(0): source; 16 bytes are read per row
    902 ;    unsigned short *output_ptr,           // arg(1): destination; 16 words (32 bytes) are written per row with movdqa
    903 ;    unsigned int    src_pixels_per_line,  // arg(2): source stride in bytes
    904 ;    unsigned int    output_height,        // arg(3): row count; tested after the loop body, so it must be non-zero
    905 ;    unsigned int    output_width          // arg(4): added to the destination pointer after each row, as a byte count
    906 ;)
    907 global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
    908 sym(vp8_unpack_block1d16_h6_sse2):
    909     push        rbp
    910     mov         rbp, rsp
    911     SHADOW_ARGS_TO_STACK 5
    912     GET_GOT     rbx                                         ; NOTE(review): no SAVE_XMM here; only xmm0, xmm1 and xmm3 are touched - presumably intentional, confirm
    913     push        rsi
    914     push        rdi
    915     ; end prolog
    916 
    917         mov         rsi,        arg(0) ;src_ptr
    918         mov         rdi,        arg(1) ;output_ptr
    919 
    920         movsxd      rcx,        dword ptr arg(3) ;output_height
    921         movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
    922 
    923         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
    924 %if ABI_IS_32BIT=0
    925         movsxd      r8,         dword ptr arg(4) ;output_width            ; per-row step for the destination
    926 %endif
    927 
    928 .unpack_block1d16_h6_sse2_rowloop:
    929         movq        xmm1,       MMWORD PTR [rsi]            ; source bytes 0..7 of this row
    930         movq        xmm3,       MMWORD PTR [rsi+8]          ; source bytes 8..15 of this row
    931 
    932         punpcklbw   xmm3,       xmm0                        ; bytes 8..15 zero-extended to words
    933         punpcklbw   xmm1,       xmm0                        ; bytes 0..7 zero-extended to words
    934 
    935         movdqa      XMMWORD Ptr [rdi],         xmm1         ; aligned store: rdi must be 16-byte aligned on every row
    936         movdqa      XMMWORD Ptr [rdi + 16],    xmm3
    937 
    938         lea         rsi,        [rsi + rax]                 ; next source row
    939 %if ABI_IS_32BIT
    940         add         rdi,        DWORD Ptr arg(4) ;[output_width]
    941 %else
    942         add         rdi,        r8                          ; next destination row
    943 %endif
    944         dec         rcx
    945         jnz         .unpack_block1d16_h6_sse2_rowloop               ; next row
    946 
    947     ; begin epilog
    948     pop rdi
    949     pop rsi
    950     RESTORE_GOT
    951     UNSHADOW_ARGS
    952     pop         rbp
    953     ret
    954 
    955 
    956 ;void vp8_bilinear_predict16x16_sse2
    957 ;(
    958 ;    unsigned char  *src_ptr,       // arg(0): source block; read with unaligned loads
    959 ;    int   src_pixels_per_line,     // arg(1): source stride in bytes
    960 ;    int  xoffset,                  // arg(2): horizontal filter index; 0 selects the vertical-only path
    961 ;    int  yoffset,                  // arg(3): vertical filter index; 0 (with xoffset != 0) selects the horizontal-only path
    962 ;    unsigned char *dst_ptr,        // arg(4): destination; every row is stored with movdqa, so rows must be 16-byte aligned
    963 ;    int dst_pitch                  // arg(5): destination stride in bytes; must be non-zero for the end-pointer loops to terminate
    964 ;)
    965 extern sym(vp8_bilinear_filters_x86_8)
    966 global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
    967 sym(vp8_bilinear_predict16x16_sse2):
    968     push        rbp
    969     mov         rbp, rsp
    970     SHADOW_ARGS_TO_STACK 6
    971     SAVE_XMM 7
    972     GET_GOT     rbx
    973     push        rsi
    974     push        rdi
    975     ; end prolog
    976 
    977     ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
    978     ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
    979 
    980         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
    981         movsxd      rax,        dword ptr arg(2) ;xoffset
    982 
    983         cmp         rax,        0      ;skip first_pass filter if xoffset=0
    984         je          .b16x16_sp_only
    985 
    986         shl         rax,        5      ; each table entry is 32 bytes: two 16-byte coefficient vectors
    987         add         rax,        rcx    ;HFilter
    988 
    989         mov         rdi,        arg(4) ;dst_ptr
    990         mov         rsi,        arg(0) ;src_ptr
    991         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
    992 
    993         movdqa      xmm1,       [rax]                       ; xmm1 = HFilter weights applied to pixel x
    994         movdqa      xmm2,       [rax+16]                    ; xmm2 = HFilter weights applied to pixel x+1
    995 
    996         movsxd      rax,        dword ptr arg(3) ;yoffset
    997 
    998         cmp         rax,        0      ;skip second_pass filter if yoffset=0
    999         je          .b16x16_fp_only    ; rdi, rsi, rdx, xmm1 and xmm2 are already set up for that path
   1000 
   1001         shl         rax,        5
   1002         add         rax,        rcx    ;VFilter
   1003 
   1004         lea         rcx,        [rdi+rdx*8]
   1005         lea         rcx,        [rcx+rdx*8]                 ; rcx = dst_ptr + 16*dst_pitch, the loop's end pointer
   1006         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (rdx no longer holds dst_pitch)
   1007 
   1008         pxor        xmm0,       xmm0                        ; zero, used for byte->word unpacking
   1009 
   1010 %if ABI_IS_32BIT=0
   1011         movsxd      r8,         dword ptr arg(5) ;dst_pitch
   1012 %endif
   1013         ; get the first horizontal line done
   1014         movdqu      xmm3,       [rsi]               ; source bytes 0..15 of the row
   1015         movdqa      xmm4,       xmm3                 ; make a copy of current line
   1016 
   1017         punpcklbw   xmm3,       xmm0                 ; bytes 0..7 as words
   1018         punpckhbw   xmm4,       xmm0                 ; bytes 8..15 as words
   1019 
   1020         pmullw      xmm3,       xmm1
   1021         pmullw      xmm4,       xmm1
   1022 
   1023         movdqu      xmm5,       [rsi+1]             ; source bytes 1..16 (the right-hand neighbours)
   1024         movdqa      xmm6,       xmm5
   1025 
   1026         punpcklbw   xmm5,       xmm0
   1027         punpckhbw   xmm6,       xmm0
   1028 
   1029         pmullw      xmm5,       xmm2
   1030         pmullw      xmm6,       xmm2
   1031 
   1032         paddw       xmm3,       xmm5
   1033         paddw       xmm4,       xmm6
   1034 
   1035         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1036         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1037 
   1038         paddw       xmm4,       [GLOBAL(rd)]
   1039         psraw       xmm4,       VP8_FILTER_SHIFT
   1040 
   1041         movdqa      xmm7,       xmm3
   1042         packuswb    xmm7,       xmm4                ; xmm7 = previous horizontally filtered row, packed to 16 bytes
   1043 
   1044         add         rsi,        rdx                 ; next line
   1045 .next_row:
   1046         movdqu      xmm3,       [rsi]               ; source bytes 0..15 of the row
   1047         movdqa      xmm4,       xmm3                 ; make a copy of current line
   1048 
   1049         punpcklbw   xmm3,       xmm0                 ; bytes 0..7 as words
   1050         punpckhbw   xmm4,       xmm0                 ; bytes 8..15 as words
   1051 
   1052         pmullw      xmm3,       xmm1
   1053         pmullw      xmm4,       xmm1
   1054 
   1055         movdqu      xmm5,       [rsi+1]             ; source bytes 1..16
   1056         movdqa      xmm6,       xmm5
   1057 
   1058         punpcklbw   xmm5,       xmm0
   1059         punpckhbw   xmm6,       xmm0
   1060 
   1061         pmullw      xmm5,       xmm2
   1062         pmullw      xmm6,       xmm2
   1063 
   1064         paddw       xmm3,       xmm5
   1065         paddw       xmm4,       xmm6
   1066 
   1067         movdqa      xmm5,       xmm7                ; unpack the previous filtered row for the vertical pass
   1068         movdqa      xmm6,       xmm7
   1069 
   1070         punpcklbw   xmm5,       xmm0
   1071         punpckhbw   xmm6,       xmm0
   1072 
   1073         pmullw      xmm5,       [rax]               ; previous row * first VFilter vector
   1074         pmullw      xmm6,       [rax]
   1075 
   1076         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1077         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1078 
   1079         paddw       xmm4,       [GLOBAL(rd)]
   1080         psraw       xmm4,       VP8_FILTER_SHIFT
   1081 
   1082         movdqa      xmm7,       xmm3
   1083         packuswb    xmm7,       xmm4                ; keep this filtered row as "previous" for the next iteration
   1084 
   1085         pmullw      xmm3,       [rax+16]            ; current row * second VFilter vector
   1086         pmullw      xmm4,       [rax+16]
   1087 
   1088         paddw       xmm3,       xmm5
   1089         paddw       xmm4,       xmm6
   1090 
   1091         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1092         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1093 
   1094         paddw       xmm4,       [GLOBAL(rd)]
   1095         psraw       xmm4,       VP8_FILTER_SHIFT
   1096 
   1097         packuswb    xmm3,       xmm4
   1098         movdqa      [rdi],      xmm3                 ; store the results in the destination
   1099 
   1100         add         rsi,        rdx                 ; next line
   1101 %if ABI_IS_32BIT
   1102         add         rdi,        DWORD PTR arg(5) ;dst_pitch
   1103 %else
   1104         add         rdi,        r8
   1105 %endif
   1106 
   1107         cmp         rdi,        rcx                 ; stop once rdi reaches dst_ptr + 16*dst_pitch
   1108         jne         .next_row
   1109 
   1110         jmp         .done
   1111 
   1112 .b16x16_sp_only:                                    ; xoffset == 0: vertical filter only; rcx still holds the table base
   1113         movsxd      rax,        dword ptr arg(3) ;yoffset
   1114         shl         rax,        5
   1115         add         rax,        rcx    ;VFilter
   1116 
   1117         mov         rdi,        arg(4) ;dst_ptr
   1118         mov         rsi,        arg(0) ;src_ptr
   1119         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
   1120 
   1121         movdqa      xmm1,       [rax]                       ; xmm1 = VFilter weights applied to the upper row
   1122         movdqa      xmm2,       [rax+16]                    ; xmm2 = VFilter weights applied to the lower row
   1123 
   1124         lea         rcx,        [rdi+rdx*8]
   1125         lea         rcx,        [rcx+rdx*8]                 ; rcx = dst_ptr + 16*dst_pitch, the loop's end pointer
   1126         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line (rax is free again after the filter loads)
   1127 
   1128         pxor        xmm0,       xmm0
   1129 
   1130         ; load the first source line; no horizontal filtering on this path
   1131         movdqu      xmm7,       [rsi]               ; source bytes 0..15; xmm7 carries the upper row between iterations
   1132 
   1133         add         rsi,        rax                 ; next line
   1134 .next_row_spo:
   1135         movdqu      xmm3,       [rsi]               ; lower row, source bytes 0..15
   1136 
   1137         movdqa      xmm5,       xmm7                ; upper row, two copies for low/high halves
   1138         movdqa      xmm6,       xmm7
   1139 
   1140         movdqa      xmm4,       xmm3                 ; make a copy of current line
   1141         movdqa      xmm7,       xmm3                 ; current row becomes the upper row of the next iteration
   1142 
   1143         punpcklbw   xmm5,       xmm0
   1144         punpckhbw   xmm6,       xmm0
   1145         punpcklbw   xmm3,       xmm0                 ; bytes 0..7 as words
   1146         punpckhbw   xmm4,       xmm0                 ; bytes 8..15 as words
   1147 
   1148         pmullw      xmm5,       xmm1
   1149         pmullw      xmm6,       xmm1
   1150         pmullw      xmm3,       xmm2
   1151         pmullw      xmm4,       xmm2
   1152 
   1153         paddw       xmm3,       xmm5
   1154         paddw       xmm4,       xmm6
   1155 
   1156         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1157         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1158 
   1159         paddw       xmm4,       [GLOBAL(rd)]
   1160         psraw       xmm4,       VP8_FILTER_SHIFT
   1161 
   1162         packuswb    xmm3,       xmm4
   1163         movdqa      [rdi],      xmm3                 ; store the results in the destination
   1164 
   1165         add         rsi,        rax                 ; next line
   1166         add         rdi,        rdx                 ;dst_pitch
   1167         cmp         rdi,        rcx
   1168         jne         .next_row_spo
   1169 
   1170         jmp         .done
   1171 
   1172 .b16x16_fp_only:                                    ; yoffset == 0: horizontal filter only; xmm1/xmm2 = HFilter, rdx = dst_pitch
   1173         lea         rcx,        [rdi+rdx*8]
   1174         lea         rcx,        [rcx+rdx*8]                 ; rcx = dst_ptr + 16*dst_pitch, the loop's end pointer
   1175         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
   1176         pxor        xmm0,       xmm0
   1177 
   1178 .next_row_fpo:
   1179         movdqu      xmm3,       [rsi]               ; source bytes 0..15 of the row
   1180         movdqa      xmm4,       xmm3                 ; make a copy of current line
   1181 
   1182         punpcklbw   xmm3,       xmm0                 ; bytes 0..7 as words
   1183         punpckhbw   xmm4,       xmm0                 ; bytes 8..15 as words
   1184 
   1185         pmullw      xmm3,       xmm1
   1186         pmullw      xmm4,       xmm1
   1187 
   1188         movdqu      xmm5,       [rsi+1]             ; source bytes 1..16
   1189         movdqa      xmm6,       xmm5
   1190 
   1191         punpcklbw   xmm5,       xmm0
   1192         punpckhbw   xmm6,       xmm0
   1193 
   1194         pmullw      xmm5,       xmm2
   1195         pmullw      xmm6,       xmm2
   1196 
   1197         paddw       xmm3,       xmm5
   1198         paddw       xmm4,       xmm6
   1199 
   1200         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1201         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1202 
   1203         paddw       xmm4,       [GLOBAL(rd)]
   1204         psraw       xmm4,       VP8_FILTER_SHIFT
   1205 
   1206         packuswb    xmm3,       xmm4
   1207         movdqa      [rdi],      xmm3                 ; store the results in the destination
   1208 
   1209         add         rsi,        rax                 ; next line
   1210         add         rdi,        rdx                 ; dst_pitch
   1211         cmp         rdi,        rcx
   1212         jne         .next_row_fpo
   1213 
   1214 .done:
   1215     ; begin epilog
   1216     pop rdi
   1217     pop rsi
   1218     RESTORE_GOT
   1219     RESTORE_XMM
   1220     UNSHADOW_ARGS
   1221     pop         rbp
   1222     ret
   1223 
   1224 
   1225 ;void vp8_bilinear_predict8x8_sse2
   1226 ;(
   1227 ;    unsigned char  *src_ptr,       // arg(0): source block; 9 rows of 16 bytes are loaded, bytes 0..8 of each row are used
   1228 ;    int   src_pixels_per_line,     // arg(1): source stride in bytes
   1229 ;    int  xoffset,                  // arg(2): horizontal filter index into vp8_bilinear_filters_x86_8
   1230 ;    int  yoffset,                  // arg(3): vertical filter index into vp8_bilinear_filters_x86_8
   1231 ;    unsigned char *dst_ptr,        // arg(4): destination; 8 bytes are stored per row
   1232 ;    int dst_pitch                  // arg(5): destination stride in bytes; must be non-zero (see the note at 'pop rsp')
   1233 ;)
   1234 global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
   1235 sym(vp8_bilinear_predict8x8_sse2):
   1236     push        rbp
   1237     mov         rbp, rsp
   1238     SHADOW_ARGS_TO_STACK 6
   1239     SAVE_XMM 7
   1240     GET_GOT     rbx
   1241     push        rsi
   1242     push        rdi
   1243     ; end prolog
   1244 
   1245     ALIGN_STACK 16, rax
   1246     sub         rsp, 144                         ; reserve 144 bytes = 9 rows * 16 bytes
   1247 
   1248     ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
   1249     ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
   1250         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
   1251 
   1252         mov         rsi,        arg(0) ;src_ptr
   1253         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
   1254 
   1255     ;Read 9-line unaligned data in and put them on stack. This gives a big
   1256     ;performance boost.
   1257         movdqu      xmm0,       [rsi]                       ; row 0
   1258         lea         rax,        [rdx + rdx*2]               ; rax = 3 * stride
   1259         movdqu      xmm1,       [rsi+rdx]                   ; row 1
   1260         movdqu      xmm2,       [rsi+rdx*2]                 ; row 2
   1261         add         rsi,        rax
   1262         movdqu      xmm3,       [rsi]                       ; row 3
   1263         movdqu      xmm4,       [rsi+rdx]                   ; row 4
   1264         movdqu      xmm5,       [rsi+rdx*2]                 ; row 5
   1265         add         rsi,        rax
   1266         movdqu      xmm6,       [rsi]                       ; row 6
   1267         movdqu      xmm7,       [rsi+rdx]                   ; row 7
   1268 
   1269         movdqa      XMMWORD PTR [rsp],            xmm0      ; spill row 0 first so xmm0 can take the ninth row
   1270 
   1271         movdqu      xmm0,       [rsi+rdx*2]                 ; row 8
   1272 
   1273         movdqa      XMMWORD PTR [rsp+16],         xmm1
   1274         movdqa      XMMWORD PTR [rsp+32],         xmm2
   1275         movdqa      XMMWORD PTR [rsp+48],         xmm3
   1276         movdqa      XMMWORD PTR [rsp+64],         xmm4
   1277         movdqa      XMMWORD PTR [rsp+80],         xmm5
   1278         movdqa      XMMWORD PTR [rsp+96],         xmm6
   1279         movdqa      XMMWORD PTR [rsp+112],        xmm7
   1280         movdqa      XMMWORD PTR [rsp+128],        xmm0
   1281 
   1282         movsxd      rax,        dword ptr arg(2) ;xoffset
   1283         shl         rax,        5      ; each table entry is 32 bytes: two 16-byte coefficient vectors
   1284         add         rax,        rcx    ;HFilter
   1285 
   1286         mov         rdi,        arg(4) ;dst_ptr
   1287         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
   1288 
   1289         movdqa      xmm1,       [rax]                       ; xmm1 = HFilter weights applied to pixel x
   1290         movdqa      xmm2,       [rax+16]                    ; xmm2 = HFilter weights applied to pixel x+1
   1291 
   1292         movsxd      rax,        dword ptr arg(3) ;yoffset
   1293         shl         rax,        5
   1294         add         rax,        rcx    ;VFilter
   1295 
   1296         lea         rcx,        [rdi+rdx*8]                 ; rcx = dst_ptr + 8*dst_pitch, the loop's end pointer
   1297 
   1298         movdqa      xmm5,       [rax]                       ; xmm5 = VFilter weights applied to the upper row
   1299         movdqa      xmm6,       [rax+16]                    ; xmm6 = VFilter weights applied to the lower row
   1300 
   1301         pxor        xmm0,       xmm0                        ; zero, used for unpacking and packing
   1302 
   1303         ; get the first horizontal line done
   1304         movdqa      xmm3,       XMMWORD PTR [rsp]
   1305         movdqa      xmm4,       xmm3                 ; make a copy of current line
   1306         psrldq      xmm4,       1                    ; shift by one byte so lane i holds pixel i+1
   1307 
   1308         punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
   1309         punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
   1310 
   1311         pmullw      xmm3,       xmm1
   1312         pmullw      xmm4,       xmm2
   1313 
   1314         paddw       xmm3,       xmm4
   1315 
   1316         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1317         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1318 
   1319         movdqa      xmm7,       xmm3                ; xmm7 = previous horizontally filtered row, kept as words
   1320         add         rsp,        16                 ; next line (rsp itself walks through the spilled rows)
   1321 .next_row8x8:
   1322         movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
   1323         movdqa      xmm4,       xmm3                 ; make a copy of current line
   1324         psrldq      xmm4,       1
   1325 
   1326         punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
   1327         punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
   1328 
   1329         pmullw      xmm3,       xmm1
   1330         pmullw      xmm4,       xmm2
   1331 
   1332         paddw       xmm3,       xmm4
   1333         pmullw      xmm7,       xmm5                ; previous filtered row * upper-row VFilter weights
   1334 
   1335         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1336         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1337 
   1338         movdqa      xmm4,       xmm3                ; save the filtered current row before it is scaled
   1339 
   1340         pmullw      xmm3,       xmm6                ; current filtered row * lower-row VFilter weights
   1341         paddw       xmm3,       xmm7
   1342 
   1343         movdqa      xmm7,       xmm4                ; current row becomes "previous" for the next iteration
   1344 
   1345         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1346         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1347 
   1348         packuswb    xmm3,       xmm0
   1349         movq        [rdi],      xmm3                 ; store the results in the destination
   1350 
   1351         add         rsp,        16                 ; next line
   1352         add         rdi,        rdx
   1353 
   1354         cmp         rdi,        rcx                 ; 8 iterations when dst_pitch is non-zero
   1355         jne         .next_row8x8
   1356 
   1357     ;add rsp, 144  (not needed: rsp was advanced 16 bytes once before the loop and 8 times inside it = 144)
   1358     pop rsp         ; presumably reloads the rsp value saved by ALIGN_STACK - confirm against x86_abi_support.asm
   1359     ; begin epilog
   1360     pop rdi
   1361     pop rsi
   1362     RESTORE_GOT
   1363     RESTORE_XMM
   1364     UNSHADOW_ARGS
   1365     pop         rbp
   1366     ret
   1367 
   1368 
   1369 SECTION_RODATA
   1370 align 16                    ; rd is read with movdqa and as an SSE memory operand, both of which need 16-byte alignment
   1371 rd:                         ; rounding term added before the shift by VP8_FILTER_SHIFT (7)
   1372     times 8 dw 0x40         ; eight words of 64 = 1 << (VP8_FILTER_SHIFT - 1)
   1373