Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 extern sym(vp8_bilinear_filters_x86_8)
     14 
     15 %define BLOCK_HEIGHT_WIDTH 4
     16 %define VP8_FILTER_WEIGHT 128
     17 %define VP8_FILTER_SHIFT  7
     18 
     19 SECTION .text
     20 
     21 ;/************************************************************************************
     22 ; Notes: vp8_filter_block1d8_h6_sse2 applies a 6 tap filter horizontally to 8 pixels of one
     23 ; row per loop iteration, for output_height rows (must be non-zero: the loop is bottom-tested).
     24 ; Each sum has rd added, is shifted right by 7, clamped to 0..255 and stored as a 16-bit word.
     25 ; An earlier note required an even output_height; nothing in the code below depends on that.
     26 ;*************************************************************************************/
     27 ;void vp8_filter_block1d8_h6_sse2
     28 ;(
     29 ;    unsigned char  *src_ptr,             ; arg(0): pixel 0 of the first row; bytes -2..13 of each row are loaded
     30 ;    unsigned short *output_ptr,          ; arg(1): 8 words per row, stored with movdqa (16-byte aligned)
     31 ;    unsigned int    src_pixels_per_line, ; arg(2): added to src_ptr after each row (stride in bytes)
     32 ;    unsigned int    pixel_step,          ; arg(3): not read by this routine
     33 ;    unsigned int    output_height,       ; arg(4): number of rows to produce
     34 ;    unsigned int    output_width,        ; arg(5): added to output_ptr after each row (stride in bytes)
     35 ;    short           *vp8_filter          ; arg(6): six 8-word coefficient vectors at byte offsets 0,16,...,80
     36 ;)
     37 global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
     38 sym(vp8_filter_block1d8_h6_sse2):
     39     push        rbp
     40     mov         rbp, rsp
     41     SHADOW_ARGS_TO_STACK 7
     42     SAVE_XMM 7
     43     GET_GOT     rbx
     44     push        rsi
     45     push        rdi
     46     ; end prolog (the three macros above come from x86_abi_support.asm, not visible here)
     47 
     48         mov         rdx,        arg(6) ;vp8_filter
     49         mov         rsi,        arg(0) ;src_ptr
     50 
     51         mov         rdi,        arg(1) ;output_ptr
     52 
     53         movsxd      rcx,        dword ptr arg(4) ;output_height, used as the row counter
     54         movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
     55 %if ABI_IS_32BIT=0
     56         movsxd      r8,         dword ptr arg(5) ;output_width, used as the output stride in bytes
     57 %endif
     58         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
     59 
     60 .filter_block1d8_h6_rowloop:
     61         movq        xmm3,       MMWORD PTR [rsi - 2]        ; xmm3 = src[-2..5]
     62         movq        xmm1,       MMWORD PTR [rsi + 6]        ; xmm1 = src[6..13]
     63 
     64         prefetcht2  [rsi+rax-2]                             ; prefetch hint for the start of the next source row
     65 
     66         pslldq      xmm1,       8
     67         por         xmm1,       xmm3                        ; xmm1 = src[-2..13] as 16 contiguous bytes
     68 
     69         movdqa      xmm4,       xmm1                        ; copies to be shifted by 1..4 bytes, one per tap
     70         movdqa      xmm5,       xmm1
     71 
     72         movdqa      xmm6,       xmm1
     73         movdqa      xmm7,       xmm1
     74 
     75         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
     76         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
     77 
     78         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
     79         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
     80 
     81         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
     82         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
     83 
     84 
     85         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
     86         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
     87 
     88         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
     89 
     90         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
     91         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
     92 
     93         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
     94 
     95         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
     96         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
     97 
     98 
     99         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
    100 
    101         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    102         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
    103 
    104         ; saturating adds are order-sensitive: products are summed as taps 2, 5, 3, 1, 4, 6
    105         paddsw      xmm4,       xmm7
    106         paddsw      xmm4,       xmm5
    107 
    108         paddsw      xmm4,       xmm3
    109         paddsw      xmm4,       xmm6
    110 
    111         paddsw      xmm4,       xmm1
    112         paddsw      xmm4,       [GLOBAL(rd)]                ; rd is defined outside this view; presumably the rounding term - confirm
    113 
    114         psraw       xmm4,       7                           ; arithmetic shift; same value as VP8_FILTER_SHIFT defined at the top of the file
    115 
    116         packuswb    xmm4,       xmm0                        ; clamp each word to 0..255 (unsigned saturation)
    117         punpcklbw   xmm4,       xmm0                        ; widen the 8 clamped bytes back to 16-bit words
    118 
    119         movdqa      XMMWORD Ptr [rdi],         xmm4         ; aligned 16-byte store of 8 words
    120         lea         rsi,        [rsi + rax]                 ; next source row
    121 
    122 %if ABI_IS_32BIT
    123         add         rdi,        DWORD Ptr arg(5) ;[output_width]
    124 %else
    125         add         rdi,        r8
    126 %endif
    127         dec         rcx
    128 
    129         jnz         .filter_block1d8_h6_rowloop                ; next row
    130 
    131     ; begin epilog
    132     pop rdi
    133     pop rsi
    134     RESTORE_GOT
    135     RESTORE_XMM
    136     UNSHADOW_ARGS
    137     pop         rbp
    138     ret
    139 
    140 
    141 ;void vp8_filter_block1d16_h6_sse2
    142 ;(
    143 ;    unsigned char  *src_ptr,             ; arg(0): pixel 0 of the first row; bytes -2..18 of each row are loaded
    144 ;    unsigned short *output_ptr,          ; arg(1): 16 words per row, stored with two movdqa (16-byte aligned)
    145 ;    unsigned int    src_pixels_per_line, ; arg(2): added to src_ptr after each row (stride in bytes)
    146 ;    unsigned int    pixel_step,          ; arg(3): not read by this routine
    147 ;    unsigned int    output_height,       ; arg(4): number of rows to produce
    148 ;    unsigned int    output_width,        ; arg(5): added to output_ptr after each row (stride in bytes)
    149 ;    short           *vp8_filter          ; arg(6): six 8-word coefficient vectors at byte offsets 0,16,...,80
    150 ;)
    151 ;/************************************************************************************
    152 ; Notes: vp8_filter_block1d16_h6_sse2 applies a 6 tap filter horizontally to 16 pixels of one
    153 ; row per loop iteration, as two 8-pixel halves, for output_height rows (must be non-zero: the
    154 ; loop is bottom-tested). Each sum has rd added, is shifted right by 7, clamped to 0..255 and
    155 ; stored as a 16-bit word. Nothing in the code below requires output_height to be even.
    156 ;*************************************************************************************/
    157 global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
    158 sym(vp8_filter_block1d16_h6_sse2):
    159     push        rbp
    160     mov         rbp, rsp
    161     SHADOW_ARGS_TO_STACK 7
    162     SAVE_XMM 7
    163     GET_GOT     rbx
    164     push        rsi
    165     push        rdi
    166     ; end prolog (the three macros above come from x86_abi_support.asm, not visible here)
    167 
    168         mov         rdx,        arg(6) ;vp8_filter
    169         mov         rsi,        arg(0) ;src_ptr
    170 
    171         mov         rdi,        arg(1) ;output_ptr
    172 
    173         movsxd      rcx,        dword ptr arg(4) ;output_height, used as the row counter
    174         movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
    175 %if ABI_IS_32BIT=0
    176         movsxd      r8,         dword ptr arg(5) ;output_width, used as the output stride in bytes
    177 %endif
    178 
    179         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
    180 
    181 .filter_block1d16_h6_sse2_rowloop:
    182         movq        xmm3,       MMWORD PTR [rsi - 2]        ; xmm3 = src[-2..5]
    183         movq        xmm1,       MMWORD PTR [rsi + 6]        ; xmm1 = src[6..13]
    184 
    185         ; Load from 11 (bytes 11..18) so that nothing past src[18] is read.
    186         movq        xmm2,       MMWORD PTR [rsi +11]
    187         ; The lower bits are not cleared before 'or'ing with xmm1,
    188         ; but that is OK because the values in the overlapping positions
    189         ; (src[11..13]) are already equal to the ones in xmm1.
    190         pslldq      xmm2,       5
    191 
    192         por         xmm2,       xmm1                        ; xmm2 = src[6..18] followed by 3 zero bytes
    193         prefetcht2  [rsi+rax-2]                             ; prefetch hint for the start of the next source row
    194 
    195         pslldq      xmm1,       8
    196         por         xmm1,       xmm3                        ; xmm1 = src[-2..13] as 16 contiguous bytes
    197 
    198         movdqa      xmm4,       xmm1                        ; first half (pixels 0..7): copies shifted by 1..4 bytes
    199         movdqa      xmm5,       xmm1
    200 
    201         movdqa      xmm6,       xmm1
    202         movdqa      xmm7,       xmm1
    203 
    204         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    205         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
    206 
    207         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
    208         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
    209 
    210         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    211         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
    212 
    213 
    214         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    215         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
    216 
    217         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
    218 
    219         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    220         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
    221 
    222         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
    223 
    224         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    225         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
    226 
    227 
    228         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
    229 
    230         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    231         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
    232         ; saturating adds are order-sensitive: products are summed as taps 2, 5, 3, 1, 4, 6
    233         paddsw      xmm4,       xmm7
    234         paddsw      xmm4,       xmm5
    235 
    236         paddsw      xmm4,       xmm3
    237         paddsw      xmm4,       xmm6
    238 
    239         paddsw      xmm4,       xmm1
    240         paddsw      xmm4,       [GLOBAL(rd)]                ; rd is defined outside this view; presumably the rounding term - confirm
    241 
    242         psraw       xmm4,       7                           ; arithmetic shift; same value as VP8_FILTER_SHIFT defined at the top of the file
    243 
    244         packuswb    xmm4,       xmm0                        ; clamp each word to 0..255 (unsigned saturation)
    245         punpcklbw   xmm4,       xmm0                        ; widen the 8 clamped bytes back to 16-bit words
    246 
    247         movdqa      XMMWORD Ptr [rdi],         xmm4         ; store output words 0..7
    248         ; second half (pixels 8..15) from xmm2; the lane labels below are relative to pixel 8
    249         movdqa      xmm3,       xmm2
    250         movdqa      xmm4,       xmm2
    251 
    252         movdqa      xmm5,       xmm2
    253         movdqa      xmm6,       xmm2
    254 
    255         movdqa      xmm7,       xmm2
    256 
    257         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    258         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
    259 
    260         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
    261         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
    262 
    263         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    264         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
    265 
    266 
    267         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    268         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
    269 
    270         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
    271 
    272         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    273         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
    274 
    275         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
    276 
    277         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    278         psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
    279 
    280         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
    281 
    282         punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    283         pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
    284 
    285         ; same summation order as the first half: taps 2, 5, 3, 1, 4, 6
    286         paddsw      xmm4,       xmm7
    287         paddsw      xmm4,       xmm5
    288 
    289         paddsw      xmm4,       xmm3
    290         paddsw      xmm4,       xmm6
    291 
    292         paddsw      xmm4,       xmm2
    293         paddsw      xmm4,       [GLOBAL(rd)]
    294 
    295         psraw       xmm4,       7
    296 
    297         packuswb    xmm4,       xmm0                        ; clamp each word to 0..255 (unsigned saturation)
    298         punpcklbw   xmm4,       xmm0                        ; widen the 8 clamped bytes back to 16-bit words
    299 
    300         movdqa      XMMWORD Ptr [rdi+16],      xmm4         ; store output words 8..15
    301 
    302         lea         rsi,        [rsi + rax]                 ; next source row
    303 %if ABI_IS_32BIT
    304         add         rdi,        DWORD Ptr arg(5) ;[output_width]
    305 %else
    306         add         rdi,        r8
    307 %endif
    308 
    309         dec         rcx
    310         jnz         .filter_block1d16_h6_sse2_rowloop                ; next row
    311 
    312     ; begin epilog
    313     pop rdi
    314     pop rsi
    315     RESTORE_GOT
    316     RESTORE_XMM
    317     UNSHADOW_ARGS
    318     pop         rbp
    319     ret
    320 
    321 
    322 ;void vp8_filter_block1d8_v6_sse2
    323 ;(
    324 ;    short *src_ptr,                ; arg(0): 8 words per row, loaded with movdqa (16-byte aligned)
    325 ;    unsigned char *output_ptr,     ; arg(1): 8 bytes are written per row
    326 ;    int dst_pitch,                 ; arg(2): added to output_ptr after each row (stride in bytes)
    327 ;    unsigned int pixels_per_line,  ; arg(3): added to src_ptr as a byte offset between rows
    328 ;    unsigned int pixel_step,       ; arg(4): not read by this routine
    329 ;    unsigned int output_height,    ; arg(5): number of rows to produce
    330 ;    unsigned int output_width,     ; arg(6): not read by this routine
    331 ;    short * vp8_filter             ; arg(7): six 8-word coefficient vectors at byte offsets 0,16,...,80
    332 ;)
    333 ;/************************************************************************************
    334 ; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to 8 columns of 16-bit input.
    335 ; Each output row uses the six input rows from 2 rows above src_ptr to 3 rows below it; output_height rows are written (must be non-zero).
    336 ;*************************************************************************************/
    337 global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
    338 sym(vp8_filter_block1d8_v6_sse2):
    339     push        rbp
    340     mov         rbp, rsp
    341     SHADOW_ARGS_TO_STACK 8
    342     SAVE_XMM 7
    343     GET_GOT     rbx
    344     push        rsi
    345     push        rdi
    346     ; end prolog (the three macros above come from x86_abi_support.asm, not visible here)
    347 
    348         mov         rax,        arg(7) ;vp8_filter
    349         movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
    350 
    351         mov         rdi,        arg(1) ;output_ptr
    352         mov         rsi,        arg(0) ;src_ptr
    353 
    354         sub         rsi,        rdx                         ; back up two rows so rsi points at the
    355         sub         rsi,        rdx                         ; first row of the 6-row window
    356 
    357         movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
    358         pxor        xmm0,       xmm0                        ; clear xmm0
    359 
    360         movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; rd is defined outside this view; presumably the rounding term - confirm
    361 %if ABI_IS_32BIT=0
    362         movsxd      r8,         dword ptr arg(2) ; dst_pitch
    363 %endif
    364 
    365 .vp8_filter_block1d8_v6_sse2_loop:
    366         movdqa      xmm1,       XMMWORD PTR [rsi]                   ; window row 1
    367         pmullw      xmm1,       [rax]                               ; * tap 1
    368 
    369         movdqa      xmm2,       XMMWORD PTR [rsi + rdx]             ; window row 2
    370         pmullw      xmm2,       [rax + 16]                          ; * tap 2
    371 
    372         movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]         ; window row 3
    373         pmullw      xmm3,       [rax + 32]                          ; * tap 3
    374 
    375         movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]         ; window row 5
    376         pmullw      xmm5,       [rax + 64]                          ; * tap 5
    377 
    378         add         rsi,        rdx                                 ; advance one row; also the per-iteration source step
    379         movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2]         ; window row 4 (rsi already advanced)
    380 
    381         pmullw      xmm4,       [rax + 48]                          ; * tap 4
    382         movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4]         ; window row 6 (rsi already advanced)
    383 
    384         pmullw      xmm6,       [rax + 80]                          ; * tap 6
    385         ; saturating adds are order-sensitive: products are summed as taps 2, 5, 3, 1, 4, 6
    386         paddsw      xmm2,       xmm5
    387         paddsw      xmm2,       xmm3
    388 
    389         paddsw      xmm2,       xmm1
    390         paddsw      xmm2,       xmm4
    391 
    392         paddsw      xmm2,       xmm6
    393         paddsw      xmm2,       xmm7                        ; add the rd constant loaded before the loop
    394 
    395         psraw       xmm2,       7                           ; arithmetic shift; same value as VP8_FILTER_SHIFT defined at the top of the file
    396         packuswb    xmm2,       xmm0              ; pack and saturate
    397 
    398         movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
    399 %if ABI_IS_32BIT
    400         add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
    401 %else
    402         add         rdi,        r8
    403 %endif
    404         dec         rcx         ; decrement count
    405         jnz         .vp8_filter_block1d8_v6_sse2_loop               ; next row
    406 
    407     ; begin epilog
    408     pop rdi
    409     pop rsi
    410     RESTORE_GOT
    411     RESTORE_XMM
    412     UNSHADOW_ARGS
    413     pop         rbp
    414     ret
    415 
    416 
    417 ;void vp8_filter_block1d16_v6_sse2
    418 ;(
    419 ;    unsigned short *src_ptr,       ; arg(0): 16 words per row, loaded with movdqa (16-byte aligned)
    420 ;    unsigned char *output_ptr,     ; arg(1): 16 bytes per row, stored with movdqa (16-byte aligned)
    421 ;    int dst_pitch,                 ; arg(2): added to output_ptr after each row (stride in bytes)
    422 ;    unsigned int pixels_per_line,  ; arg(3): added to src_ptr as a byte offset between rows
    423 ;    unsigned int pixel_step,       ; arg(4): not read by this routine
    424 ;    unsigned int output_height,    ; arg(5): number of rows to produce
    425 ;    unsigned int output_width,     ; arg(6): not read by this routine
    426 ;    const short    *vp8_filter     ; arg(7): six 8-word coefficient vectors at byte offsets 0,16,...,80
    427 ;)
    428 ;/************************************************************************************
    429 ; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to 16 columns of 16-bit input.
    430 ; Each output row uses the six input rows from 2 rows above src_ptr to 3 rows below it; output_height rows are written (must be non-zero).
    431 ;*************************************************************************************/
    432 global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
    433 sym(vp8_filter_block1d16_v6_sse2):
    434     push        rbp
    435     mov         rbp, rsp
    436     SHADOW_ARGS_TO_STACK 8
    437     SAVE_XMM 7
    438     GET_GOT     rbx
    439     push        rsi
    440     push        rdi
    441     ; end prolog (the three macros above come from x86_abi_support.asm, not visible here)
    442 
    443         mov         rax,        arg(7) ;vp8_filter
    444         movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
    445 
    446         mov         rdi,        arg(1) ;output_ptr
    447         mov         rsi,        arg(0) ;src_ptr
    448 
    449         sub         rsi,        rdx                         ; back up two rows so rsi points at the
    450         sub         rsi,        rdx                         ; first row of the 6-row window
    451 
    452         movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
    453 %if ABI_IS_32BIT=0
    454         movsxd      r8,         dword ptr arg(2) ; dst_pitch
    455 %endif
    456 
    457 .vp8_filter_block1d16_v6_sse2_loop:
    458 ; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
    459         movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
    460         movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
    461         pmullw      xmm1,       [rax + 16]
    462         pmullw      xmm2,       [rax + 16]
    463 
    464         movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
    465         movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
    466         pmullw      xmm3,       [rax + 64]
    467         pmullw      xmm4,       [rax + 64]
    468 
    469         movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3
    470         movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
    471         pmullw      xmm5,       [rax + 32]
    472         pmullw      xmm6,       [rax + 32]
    473 
    474         movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
    475         movdqa      xmm0,       XMMWORD PTR [rsi + 16]
    476         pmullw      xmm7,       [rax]
    477         pmullw      xmm0,       [rax]
    478         ; xmm1 accumulates columns 0..7, xmm2 accumulates columns 8..15
    479         paddsw      xmm1,       xmm3
    480         paddsw      xmm2,       xmm4
    481         paddsw      xmm1,       xmm5
    482         paddsw      xmm2,       xmm6
    483         paddsw      xmm1,       xmm7
    484         paddsw      xmm2,       xmm0
    485 
    486         add         rsi,        rdx                         ; advance one row; also the per-iteration source step
    487 
    488         movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
    489         movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
    490         pmullw      xmm3,       [rax + 48]
    491         pmullw      xmm4,       [rax + 48]
    492 
    493         movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
    494         movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
    495         pmullw      xmm5,       [rax + 80]
    496         pmullw      xmm6,       [rax + 80]
    497 
    498         movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; reloaded every iteration because xmm7 held line 1 above; rd is defined outside this view
    499         pxor        xmm0,       xmm0                        ; clear xmm0 (not read again in this loop before it is reloaded with data)
    500 
    501         paddsw      xmm1,       xmm3
    502         paddsw      xmm2,       xmm4
    503         paddsw      xmm1,       xmm5
    504         paddsw      xmm2,       xmm6
    505 
    506         paddsw      xmm1,       xmm7
    507         paddsw      xmm2,       xmm7
    508 
    509         psraw       xmm1,       7                           ; arithmetic shift; same value as VP8_FILTER_SHIFT defined at the top of the file
    510         psraw       xmm2,       7
    511 
    512         packuswb    xmm1,       xmm2              ; pack and saturate
    513         movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination
    514 %if ABI_IS_32BIT
    515         add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
    516 %else
    517         add         rdi,        r8
    518 %endif
    519         dec         rcx         ; decrement count
    520         jnz         .vp8_filter_block1d16_v6_sse2_loop              ; next row
    521 
    522     ; begin epilog
    523     pop rdi
    524     pop rsi
    525     RESTORE_GOT
    526     RESTORE_XMM
    527     UNSHADOW_ARGS
    528     pop         rbp
    529     ret
    530 
    531 
    532 ;void vp8_filter_block1d8_h6_only_sse2
    533 ;(
    534 ;    unsigned char  *src_ptr,             ; arg(0): pixel 0 of the first row; bytes -2..13 of each row are loaded
    535 ;    unsigned int    src_pixels_per_line, ; arg(1): added to src_ptr after each row (stride in bytes)
    536 ;    unsigned char  *output_ptr,          ; arg(2): 8 bytes are written per row
    537 ;    int dst_pitch,                       ; arg(3): added to output_ptr after each row (stride in bytes)
    538 ;    unsigned int    output_height,       ; arg(4): number of rows to produce (must be non-zero)
    539 ;    const short    *vp8_filter           ; arg(5): six 8-word coefficient vectors at byte offsets 0,16,...,80
    540 ;)
    541 ; First-pass filter only when yoffset==0; the clamped results are stored directly as bytes.
    542 global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
    543 sym(vp8_filter_block1d8_h6_only_sse2):
    544     push        rbp
    545     mov         rbp, rsp
    546     SHADOW_ARGS_TO_STACK 6
    547     SAVE_XMM 7
    548     GET_GOT     rbx
    549     push        rsi
    550     push        rdi
    551     ; end prolog (the three macros above come from x86_abi_support.asm, not visible here)
    552 
    553         mov         rdx,        arg(5) ;vp8_filter
    554         mov         rsi,        arg(0) ;src_ptr
    555 
    556         mov         rdi,        arg(2) ;output_ptr
    557 
    558         movsxd      rcx,        dword ptr arg(4) ;output_height, used as the row counter
    559         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
    560 %if ABI_IS_32BIT=0
    561         movsxd      r8,         dword ptr arg(3) ;dst_pitch
    562 %endif
    563         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
    564 
    565 .filter_block1d8_h6_only_rowloop:
    566         movq        xmm3,       MMWORD PTR [rsi - 2]        ; xmm3 = src[-2..5]
    567         movq        xmm1,       MMWORD PTR [rsi + 6]        ; xmm1 = src[6..13]
    568 
    569         prefetcht2  [rsi+rax-2]                             ; prefetch hint for the start of the next source row
    570 
    571         pslldq      xmm1,       8
    572         por         xmm1,       xmm3                        ; xmm1 = src[-2..13] as 16 contiguous bytes
    573 
    574         movdqa      xmm4,       xmm1                        ; copies to be shifted by 1..4 bytes, one per tap
    575         movdqa      xmm5,       xmm1
    576 
    577         movdqa      xmm6,       xmm1
    578         movdqa      xmm7,       xmm1
    579 
    580         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    581         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
    582 
    583         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
    584         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
    585 
    586         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    587         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
    588 
    589 
    590         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    591         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
    592 
    593         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
    594 
    595         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    596         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
    597 
    598         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
    599 
    600         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    601         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
    602 
    603 
    604         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
    605 
    606         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    607         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
    608 
    609         ; saturating adds are order-sensitive: products are summed as taps 2, 5, 3, 1, 4, 6
    610         paddsw      xmm4,       xmm7
    611         paddsw      xmm4,       xmm5
    612 
    613         paddsw      xmm4,       xmm3
    614         paddsw      xmm4,       xmm6
    615 
    616         paddsw      xmm4,       xmm1
    617         paddsw      xmm4,       [GLOBAL(rd)]                ; rd is defined outside this view; presumably the rounding term - confirm
    618 
    619         psraw       xmm4,       7                           ; arithmetic shift; same value as VP8_FILTER_SHIFT defined at the top of the file
    620 
    621         packuswb    xmm4,       xmm0                        ; clamp each word to 0..255; low 8 bytes hold the row
    622 
    623         movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination
    624         lea         rsi,        [rsi + rax]                 ; next source row
    625 
    626 %if ABI_IS_32BIT
    627         add         rdi,        DWORD Ptr arg(3) ;dst_pitch
    628 %else
    629         add         rdi,        r8
    630 %endif
    631         dec         rcx
    632 
    633         jnz         .filter_block1d8_h6_only_rowloop               ; next row
    634 
    635     ; begin epilog
    636     pop rdi
    637     pop rsi
    638     RESTORE_GOT
    639     RESTORE_XMM
    640     UNSHADOW_ARGS
    641     pop         rbp
    642     ret
    643 
    644 
    645 ;void vp8_filter_block1d16_h6_only_sse2
    646 ;(
    647 ;    unsigned char  *src_ptr,
    648 ;    unsigned int    src_pixels_per_line,
    649 ;    unsigned char  *output_ptr,
    650 ;    int dst_ptich,
    651 ;    unsigned int    output_height,
    652 ;    const short    *vp8_filter
    653 ;)
    654 ; First-pass filter only when yoffset==0
    655 global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
    656 sym(vp8_filter_block1d16_h6_only_sse2):
    657     push        rbp
    658     mov         rbp, rsp
    659     SHADOW_ARGS_TO_STACK 6
    660     SAVE_XMM 7
    661     GET_GOT     rbx
    662     push        rsi
    663     push        rdi
    664     ; end prolog
    665 
    666         mov         rdx,        arg(5) ;vp8_filter
    667         mov         rsi,        arg(0) ;src_ptr
    668 
    669         mov         rdi,        arg(2) ;output_ptr
    670 
    671         movsxd      rcx,        dword ptr arg(4) ;output_height
    672         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
    673 %if ABI_IS_32BIT=0
    674         movsxd      r8,         dword ptr arg(3) ;dst_ptich
    675 %endif
    676 
    677         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
    678 
    679 .filter_block1d16_h6_only_sse2_rowloop:
    680         movq        xmm3,       MMWORD PTR [rsi - 2]
    681         movq        xmm1,       MMWORD PTR [rsi + 6]
    682 
    683         movq        xmm2,       MMWORD PTR [rsi +14]
    684         pslldq      xmm2,       8
    685 
    686         por         xmm2,       xmm1
    687         prefetcht2  [rsi+rax-2]
    688 
    689         pslldq      xmm1,       8
    690         por         xmm1,       xmm3
    691 
    692         movdqa      xmm4,       xmm1
    693         movdqa      xmm5,       xmm1
    694 
    695         movdqa      xmm6,       xmm1
    696         movdqa      xmm7,       xmm1
    697 
    698         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
    699         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
    700 
    701         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
    702         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
    703 
    704         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    705         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
    706 
    707         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    708         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
    709 
    710         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
    711 
    712         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    713         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
    714 
    715         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
    716 
    717         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    718         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
    719 
    720         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
    721 
    722         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    723         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
    724 
    725         paddsw      xmm4,       xmm7
    726         paddsw      xmm4,       xmm5
    727 
    728         paddsw      xmm4,       xmm3
    729         paddsw      xmm4,       xmm6
    730 
    731         paddsw      xmm4,       xmm1
    732         paddsw      xmm4,       [GLOBAL(rd)]
    733 
    734         psraw       xmm4,       7
    735 
    736         packuswb    xmm4,       xmm0                        ; lower 8 bytes
    737 
    738         movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination
    739 
    740         movdqa      xmm3,       xmm2
    741         movdqa      xmm4,       xmm2
    742 
    743         movdqa      xmm5,       xmm2
    744         movdqa      xmm6,       xmm2
    745 
    746         movdqa      xmm7,       xmm2
    747 
    748         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
    749         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
    750 
    751         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
    752         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
    753 
    754         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    755         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
    756 
    757         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    758         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
    759 
    760         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
    761 
    762         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    763         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
    764 
    765         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
    766 
    767         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    768         psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
    769 
    770         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
    771 
    772         punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    773         pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
    774 
    775         paddsw      xmm4,       xmm7
    776         paddsw      xmm4,       xmm5
    777 
    778         paddsw      xmm4,       xmm3
    779         paddsw      xmm4,       xmm6
    780 
    781         paddsw      xmm4,       xmm2
    782         paddsw      xmm4,       [GLOBAL(rd)]
    783 
    784         psraw       xmm4,       7
    785 
    786         packuswb    xmm4,       xmm0                        ; higher 8 bytes
    787 
    788         movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination
    789 
    790         lea         rsi,        [rsi + rax]
    791 %if ABI_IS_32BIT
    792         add         rdi,        DWORD Ptr arg(3) ;dst_ptich
    793 %else
    794         add         rdi,        r8
    795 %endif
    796 
    797         dec         rcx
    798         jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row
    799 
    800     ; begin epilog
    801     pop rdi
    802     pop rsi
    803     RESTORE_GOT
    804     RESTORE_XMM
    805     UNSHADOW_ARGS
    806     pop         rbp
    807     ret
    808 
    809 
    810 ;void vp8_filter_block1d8_v6_only_sse2
    811 ;(
    812 ;    unsigned char *src_ptr,              ; first of the six consecutive source rows read for the first output row
    813 ;    unsigned int    src_pixels_per_line, ; source stride in bytes
    814 ;    unsigned char *output_ptr,           ; destination; 8 bytes are stored per row
    815 ;    int dst_pitch,                       ; destination stride in bytes
    816 ;    unsigned int output_height,          ; number of rows to produce; the count is only tested after a row is written, so it must be non-zero
    817 ;    const short    *vp8_filter           ; six taps, one 16-byte vector of words per tap, at byte offsets 0, 16, ..., 80
    818 ;)
    819 ; Second-pass filter only when xoffset==0: applies the 6-tap filter vertically to 8 pixels per row, one output row per loop iteration.
    820 global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
    821 sym(vp8_filter_block1d8_v6_only_sse2):
    822     push        rbp
    823     mov         rbp, rsp
    824     SHADOW_ARGS_TO_STACK 6
    825     SAVE_XMM 7
    826     GET_GOT     rbx
    827     push        rsi
    828     push        rdi
    829     ; end prolog
    830 
    831         mov         rsi,        arg(0) ;src_ptr
    832         mov         rdi,        arg(2) ;output_ptr
    833 
    834         movsxd      rcx,        dword ptr arg(4) ;output_height (row counter)
    835         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (source stride)
    836 
    837         mov         rax,        arg(5) ;vp8_filter
    838 
    839         pxor        xmm0,       xmm0                        ; xmm0 = 0: zero-extends bytes to words and supplies the unused half when packing
    840 
    841         movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; xmm7 = rounding constant, kept in a register for the whole loop
    842 %if ABI_IS_32BIT=0
    843         movsxd      r8,         dword ptr arg(3) ; dst_pitch, cached in r8 on 64-bit builds
    844 %endif
    845 
    846 .vp8_filter_block1d8_v6_only_sse2_loop:
    847         movq        xmm1,       MMWORD PTR [rsi]                ; row 0 (8 bytes)
    848         movq        xmm2,       MMWORD PTR [rsi + rdx]          ; row 1
    849         movq        xmm3,       MMWORD PTR [rsi + rdx * 2]      ; row 2
    850         movq        xmm5,       MMWORD PTR [rsi + rdx * 4]      ; row 4
    851         add         rsi,        rdx                             ; advance one row; this is also the start row of the next iteration
    852         movq        xmm4,       MMWORD PTR [rsi + rdx * 2]      ; row 3 (counted from the row at loop entry)
    853         movq        xmm6,       MMWORD PTR [rsi + rdx * 4]      ; row 5
    854 
    855         punpcklbw   xmm1,       xmm0                            ; bytes -> words
    856         pmullw      xmm1,       [rax]                           ; row 0 * tap 0
    857 
    858         punpcklbw   xmm2,       xmm0
    859         pmullw      xmm2,       [rax + 16]                      ; row 1 * tap 1
    860 
    861         punpcklbw   xmm3,       xmm0
    862         pmullw      xmm3,       [rax + 32]                      ; row 2 * tap 2
    863 
    864         punpcklbw   xmm5,       xmm0
    865         pmullw      xmm5,       [rax + 64]                      ; row 4 * tap 4
    866 
    867         punpcklbw   xmm4,       xmm0
    868         pmullw      xmm4,       [rax + 48]                      ; row 3 * tap 3
    869 
    870         punpcklbw   xmm6,       xmm0
    871         pmullw      xmm6,       [rax + 80]                      ; row 5 * tap 5
    872 
    873         paddsw      xmm2,       xmm5                            ; accumulate the six products in xmm2 with signed saturation
    874         paddsw      xmm2,       xmm3
    875 
    876         paddsw      xmm2,       xmm1
    877         paddsw      xmm2,       xmm4
    878 
    879         paddsw      xmm2,       xmm6
    880         paddsw      xmm2,       xmm7                            ; add the rounding constant
    881 
    882         psraw       xmm2,       7                               ; arithmetic shift by 7 (same value as VP8_FILTER_SHIFT)
    883         packuswb    xmm2,       xmm0              ; pack words to bytes with unsigned saturation; results are in the low 8 bytes
    884 
    885         movq        QWORD PTR [rdi], xmm2         ; store the 8 result bytes in the destination
    886 %if ABI_IS_32BIT
    887         add         rdi,        DWORD PTR arg(3) ;[dst_pitch]
    888 %else
    889         add         rdi,        r8
    890 %endif
    891         dec         rcx         ; one fewer row to produce
    892         jnz         .vp8_filter_block1d8_v6_only_sse2_loop              ; next row
    893 
    894     ; begin epilog
    895     pop rdi
    896     pop rsi
    897     RESTORE_GOT
    898     RESTORE_XMM
    899     UNSHADOW_ARGS
    900     pop         rbp
    901     ret
    902 
    903 
    904 ;void vp8_unpack_block1d16_h6_sse2
    905 ;(
    906 ;    unsigned char  *src_ptr,             ; source; 16 bytes are read per row
    907 ;    unsigned short *output_ptr,          ; destination; 32 bytes are written per row with aligned stores, so it must be 16-byte aligned
    908 ;    unsigned int    src_pixels_per_line, ; source stride in bytes
    909 ;    unsigned int    output_height,       ; number of rows; the count is only tested after a row is written, so it must be non-zero
    910 ;    unsigned int    output_width         ; added to the destination pointer after each row, i.e. the destination stride in bytes
    911 ;) ; Widens each row of 16 source bytes to 16-bit words and stores them; no filter taps are applied.
    912 global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
    913 sym(vp8_unpack_block1d16_h6_sse2):
    914     push        rbp
    915     mov         rbp, rsp
    916     SHADOW_ARGS_TO_STACK 5
    917     GET_GOT     rbx
    918     push        rsi
    919     push        rdi
    920     ; end prolog (no SAVE_XMM here; only xmm0, xmm1 and xmm3 are used)
    921 
    922         mov         rsi,        arg(0) ;src_ptr
    923         mov         rdi,        arg(1) ;output_ptr
    924 
    925         movsxd      rcx,        dword ptr arg(3) ;output_height
    926         movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
    927 
    928         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
    929 %if ABI_IS_32BIT=0
    930         movsxd      r8,         dword ptr arg(4) ;output_width            ; Pitch for Destination
    931 %endif
    932 
    933 .unpack_block1d16_h6_sse2_rowloop:
    934         movq        xmm1,       MMWORD PTR [rsi]            ; source bytes 0..7 of this row
    935         movq        xmm3,       MMWORD PTR [rsi+8]          ; source bytes 8..15 of this row
    936 
    937         punpcklbw   xmm3,       xmm0                        ; zero-extend bytes 8..15 to words
    938         punpcklbw   xmm1,       xmm0                        ; zero-extend bytes 0..7 to words
    939 
    940         movdqa      XMMWORD Ptr [rdi],         xmm1         ; words 0..7 (aligned store)
    941         movdqa      XMMWORD Ptr [rdi + 16],    xmm3         ; words 8..15 (aligned store)
    942 
    943         lea         rsi,        [rsi + rax]                 ; next source row
    944 %if ABI_IS_32BIT
    945         add         rdi,        DWORD Ptr arg(4) ;[output_width]
    946 %else
    947         add         rdi,        r8
    948 %endif
    949         dec         rcx
    950         jnz         .unpack_block1d16_h6_sse2_rowloop               ; next row
    951 
    952     ; begin epilog
    953     pop rdi
    954     pop rsi
    955     RESTORE_GOT
    956     UNSHADOW_ARGS
    957     pop         rbp
    958     ret
    959 
    960 
    961 ;void vp8_bilinear_predict16x16_sse2
    962 ;(
    963 ;    unsigned char  *src_ptr,          ; source block; rows are read with unaligned loads
    964 ;    int   src_pixels_per_line,        ; source stride in bytes
    965 ;    int  xoffset,                     ; index of the horizontal filter in vp8_bilinear_filters_x86_8 (32 bytes per entry); 0 skips the first pass
    966 ;    int  yoffset,                     ; index of the vertical filter; 0 skips the second pass when xoffset is non-zero
    967 ;    unsigned char *dst_ptr,           ; destination; each row is written with an aligned 16-byte store, so every row address must be 16-byte aligned
    968 ;    int dst_pitch                     ; destination stride in bytes
    969 ;) ; 2-tap (bilinear) prediction of a 16x16 block: each pass computes (a * tap0 + b * tap1 + round) >> VP8_FILTER_SHIFT.
    970 extern sym(vp8_bilinear_filters_x86_8)
    971 global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
    972 sym(vp8_bilinear_predict16x16_sse2):
    973     push        rbp
    974     mov         rbp, rsp
    975     SHADOW_ARGS_TO_STACK 6
    976     SAVE_XMM 7
    977     GET_GOT     rbx
    978     push        rsi
    979     push        rdi
    980     ; end prolog
    981 
    982     ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
    983     ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
    984 
    985         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))] ; rcx = filter table base (still live at .b16x16_sp_only)
    986         movsxd      rax,        dword ptr arg(2) ;xoffset
    987 
    988         cmp         rax,        0      ;skip first_pass filter if xoffset=0
    989         je          .b16x16_sp_only
    990 
    991         shl         rax,        5      ; each table entry is 32 bytes: two 16-byte tap vectors
    992         add         rax,        rcx    ;HFilter
    993 
    994         mov         rdi,        arg(4) ;dst_ptr
    995         mov         rsi,        arg(0) ;src_ptr
    996         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
    997 
    998         movdqa      xmm1,       [rax]              ; xmm1 = horizontal tap 0
    999         movdqa      xmm2,       [rax+16]           ; xmm2 = horizontal tap 1
   1000 
   1001         movsxd      rax,        dword ptr arg(3) ;yoffset
   1002 
   1003         cmp         rax,        0      ;skip second_pass filter if yoffset=0
   1004         je          .b16x16_fp_only
   1005 
   1006         shl         rax,        5
   1007         add         rax,        rcx    ;VFilter (taps are read from [rax] and [rax+16] inside the loop)
   1008 
   1009         lea         rcx,        [rdi+rdx*8]
   1010         lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16 * dst_pitch, the loop's end pointer
   1011         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (rdx is the source stride from here on)
   1012 
   1013         pxor        xmm0,       xmm0                ; xmm0 = 0 for byte -> word unpacking
   1014 
   1015 %if ABI_IS_32BIT=0
   1016         movsxd      r8,         dword ptr arg(5) ;dst_pitch
   1017 %endif
   1018         ; get the first horizontal line done (this primes xmm7 for the vertical pass)
   1019         movdqu      xmm3,       [rsi]               ; source bytes 0..15 of the row
   1020         movdqa      xmm4,       xmm3                 ; make a copy of current line
   1021 
   1022         punpcklbw   xmm3,       xmm0                 ; bytes 0..7 as words
   1023         punpckhbw   xmm4,       xmm0                 ; bytes 8..15 as words
   1024 
   1025         pmullw      xmm3,       xmm1
   1026         pmullw      xmm4,       xmm1
   1027 
   1028         movdqu      xmm5,       [rsi+1]              ; source bytes 1..16: the right-hand neighbours
   1029         movdqa      xmm6,       xmm5
   1030 
   1031         punpcklbw   xmm5,       xmm0
   1032         punpckhbw   xmm6,       xmm0
   1033 
   1034         pmullw      xmm5,       xmm2
   1035         pmullw      xmm6,       xmm2
   1036 
   1037         paddw       xmm3,       xmm5
   1038         paddw       xmm4,       xmm6
   1039 
   1040         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1041         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1042 
   1043         paddw       xmm4,       [GLOBAL(rd)]
   1044         psraw       xmm4,       VP8_FILTER_SHIFT
   1045 
   1046         movdqa      xmm7,       xmm3
   1047         packuswb    xmm7,       xmm4                 ; xmm7 = first-pass result of this row packed to 16 bytes
   1048 
   1049         add         rsi,        rdx                 ; next line
   1050 .next_row:
   1051         movdqu      xmm3,       [rsi]               ; source bytes 0..15 of the current row
   1052         movdqa      xmm4,       xmm3                 ; make a copy of current line
   1053 
   1054         punpcklbw   xmm3,       xmm0                 ; bytes 0..7 as words
   1055         punpckhbw   xmm4,       xmm0                 ; bytes 8..15 as words
   1056 
   1057         pmullw      xmm3,       xmm1
   1058         pmullw      xmm4,       xmm1
   1059 
   1060         movdqu      xmm5,       [rsi+1]              ; source bytes 1..16
   1061         movdqa      xmm6,       xmm5
   1062 
   1063         punpcklbw   xmm5,       xmm0
   1064         punpckhbw   xmm6,       xmm0
   1065 
   1066         pmullw      xmm5,       xmm2
   1067         pmullw      xmm6,       xmm2
   1068 
   1069         paddw       xmm3,       xmm5
   1070         paddw       xmm4,       xmm6
   1071 
   1072         movdqa      xmm5,       xmm7                 ; previous row's first-pass result (packed bytes)
   1073         movdqa      xmm6,       xmm7
   1074 
   1075         punpcklbw   xmm5,       xmm0
   1076         punpckhbw   xmm6,       xmm0
   1077 
   1078         pmullw      xmm5,       [rax]                ; previous row * vertical tap 0
   1079         pmullw      xmm6,       [rax]
   1080 
   1081         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1082         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1083 
   1084         paddw       xmm4,       [GLOBAL(rd)]
   1085         psraw       xmm4,       VP8_FILTER_SHIFT
   1086 
   1087         movdqa      xmm7,       xmm3
   1088         packuswb    xmm7,       xmm4                 ; keep this row's first-pass result for the next iteration
   1089 
   1090         pmullw      xmm3,       [rax+16]             ; current row * vertical tap 1
   1091         pmullw      xmm4,       [rax+16]
   1092 
   1093         paddw       xmm3,       xmm5
   1094         paddw       xmm4,       xmm6
   1095 
   1096         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1097         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1098 
   1099         paddw       xmm4,       [GLOBAL(rd)]
   1100         psraw       xmm4,       VP8_FILTER_SHIFT
   1101 
   1102         packuswb    xmm3,       xmm4
   1103         movdqa      [rdi],      xmm3                 ; store the results in the destination (aligned store)
   1104 
   1105         add         rsi,        rdx                 ; next line
   1106 %if ABI_IS_32BIT
   1107         add         rdi,        DWORD PTR arg(5) ;dst_pitch
   1108 %else
   1109         add         rdi,        r8
   1110 %endif
   1111 
   1112         cmp         rdi,        rcx                  ; stop once 16 destination rows have been written
   1113         jne         .next_row
   1114 
   1115         jmp         .done
   1116 
   1117 .b16x16_sp_only:                                     ; xoffset == 0: vertical filter only
   1118         movsxd      rax,        dword ptr arg(3) ;yoffset
   1119         shl         rax,        5
   1120         add         rax,        rcx    ;VFilter
   1121 
   1122         mov         rdi,        arg(4) ;dst_ptr
   1123         mov         rsi,        arg(0) ;src_ptr
   1124         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
   1125 
   1126         movdqa      xmm1,       [rax]              ; xmm1 = vertical tap 0
   1127         movdqa      xmm2,       [rax+16]           ; xmm2 = vertical tap 1
   1128 
   1129         lea         rcx,        [rdi+rdx*8]
   1130         lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16 * dst_pitch, the loop's end pointer
   1131         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
   1132 
   1133         pxor        xmm0,       xmm0
   1134 
   1135         ; load the first source row into xmm7 (no horizontal filtering in this path)
   1136         movdqu      xmm7,       [rsi]               ; source bytes 0..15 of the first row
   1137 
   1138         add         rsi,        rax                 ; next line
   1139 .next_row_spo:
   1140         movdqu      xmm3,       [rsi]               ; source bytes 0..15 of the current row
   1141 
   1142         movdqa      xmm5,       xmm7                 ; previous row
   1143         movdqa      xmm6,       xmm7
   1144 
   1145         movdqa      xmm4,       xmm3                 ; make a copy of current line
   1146         movdqa      xmm7,       xmm3                 ; current row becomes the previous row of the next iteration
   1147 
   1148         punpcklbw   xmm5,       xmm0
   1149         punpckhbw   xmm6,       xmm0
   1150         punpcklbw   xmm3,       xmm0                 ; bytes 0..7 as words
   1151         punpckhbw   xmm4,       xmm0
   1152 
   1153         pmullw      xmm5,       xmm1                 ; previous row * tap 0
   1154         pmullw      xmm6,       xmm1
   1155         pmullw      xmm3,       xmm2                 ; current row * tap 1
   1156         pmullw      xmm4,       xmm2
   1157 
   1158         paddw       xmm3,       xmm5
   1159         paddw       xmm4,       xmm6
   1160 
   1161         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1162         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1163 
   1164         paddw       xmm4,       [GLOBAL(rd)]
   1165         psraw       xmm4,       VP8_FILTER_SHIFT
   1166 
   1167         packuswb    xmm3,       xmm4
   1168         movdqa      [rdi],      xmm3                 ; store the results in the destination (aligned store)
   1169 
   1170         add         rsi,        rax                 ; next line
   1171         add         rdi,        rdx                 ;dst_pitch
   1172         cmp         rdi,        rcx
   1173         jne         .next_row_spo
   1174 
   1175         jmp         .done
   1176 
   1177 .b16x16_fp_only:                                     ; yoffset == 0: horizontal filter only; rdi, rsi, rdx, xmm1 and xmm2 were set before the jump here
   1178         lea         rcx,        [rdi+rdx*8]
   1179         lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16 * dst_pitch, the loop's end pointer
   1180         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
   1181         pxor        xmm0,       xmm0
   1182 
   1183 .next_row_fpo:
   1184         movdqu      xmm3,       [rsi]               ; source bytes 0..15 of the current row
   1185         movdqa      xmm4,       xmm3                 ; make a copy of current line
   1186 
   1187         punpcklbw   xmm3,       xmm0                 ; bytes 0..7 as words
   1188         punpckhbw   xmm4,       xmm0
   1189 
   1190         pmullw      xmm3,       xmm1
   1191         pmullw      xmm4,       xmm1
   1192 
   1193         movdqu      xmm5,       [rsi+1]              ; source bytes 1..16
   1194         movdqa      xmm6,       xmm5
   1195 
   1196         punpcklbw   xmm5,       xmm0
   1197         punpckhbw   xmm6,       xmm0
   1198 
   1199         pmullw      xmm5,       xmm2
   1200         pmullw      xmm6,       xmm2
   1201 
   1202         paddw       xmm3,       xmm5
   1203         paddw       xmm4,       xmm6
   1204 
   1205         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1206         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1207 
   1208         paddw       xmm4,       [GLOBAL(rd)]
   1209         psraw       xmm4,       VP8_FILTER_SHIFT
   1210 
   1211         packuswb    xmm3,       xmm4
   1212         movdqa      [rdi],      xmm3                 ; store the results in the destination (aligned store)
   1213 
   1214         add         rsi,        rax                 ; next line
   1215         add         rdi,        rdx                 ; dst_pitch
   1216         cmp         rdi,        rcx
   1217         jne         .next_row_fpo
   1218 
   1219 .done:
   1220     ; begin epilog (shared by all three paths)
   1221     pop rdi
   1222     pop rsi
   1223     RESTORE_GOT
   1224     RESTORE_XMM
   1225     UNSHADOW_ARGS
   1226     pop         rbp
   1227     ret
   1228 
   1229 
   1230 ;void vp8_bilinear_predict8x8_sse2
   1231 ;(
   1232 ;    unsigned char  *src_ptr,          ; source block; nine rows are read, 16 bytes from each
   1233 ;    int   src_pixels_per_line,        ; source stride in bytes
   1234 ;    int  xoffset,                     ; index of the horizontal filter in vp8_bilinear_filters_x86_8 (32 bytes per entry)
   1235 ;    int  yoffset,                     ; index of the vertical filter in the same table
   1236 ;    unsigned char *dst_ptr,           ; destination; 8 bytes are stored per row
   1237 ;    int dst_pitch                     ; destination stride in bytes
   1238 ;) ; 2-tap (bilinear) prediction of an 8x8 block; both passes always run, there is no shortcut for a zero offset.
   1239 global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
   1240 sym(vp8_bilinear_predict8x8_sse2):
   1241     push        rbp
   1242     mov         rbp, rsp
   1243     SHADOW_ARGS_TO_STACK 6
   1244     SAVE_XMM 7
   1245     GET_GOT     rbx
   1246     push        rsi
   1247     push        rdi
   1248     ; end prolog
   1249 
   1250     ALIGN_STACK 16, rax
   1251     sub         rsp, 144                         ; reserve 144 bytes = 9 rows * 16 bytes
   1252 
   1253     ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
   1254     ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
   1255         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
   1256 
   1257         mov         rsi,        arg(0) ;src_ptr
   1258         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
   1259 
   1260     ;Read 9-line unaligned data in and put them on stack. This gives a big
   1261     ;performance boost. Each load fetches 16 bytes although only bytes 0..8 of a row are used below.
   1262         movdqu      xmm0,       [rsi]                ; row 0
   1263         lea         rax,        [rdx + rdx*2]        ; rax = 3 * source stride
   1264         movdqu      xmm1,       [rsi+rdx]            ; row 1
   1265         movdqu      xmm2,       [rsi+rdx*2]          ; row 2
   1266         add         rsi,        rax
   1267         movdqu      xmm3,       [rsi]                ; row 3
   1268         movdqu      xmm4,       [rsi+rdx]            ; row 4
   1269         movdqu      xmm5,       [rsi+rdx*2]          ; row 5
   1270         add         rsi,        rax
   1271         movdqu      xmm6,       [rsi]                ; row 6
   1272         movdqu      xmm7,       [rsi+rdx]            ; row 7
   1273 
   1274         movdqa      XMMWORD PTR [rsp],            xmm0 ; spill row 0 first so xmm0 can be reused for row 8
   1275 
   1276         movdqu      xmm0,       [rsi+rdx*2]          ; row 8
   1277 
   1278         movdqa      XMMWORD PTR [rsp+16],         xmm1
   1279         movdqa      XMMWORD PTR [rsp+32],         xmm2
   1280         movdqa      XMMWORD PTR [rsp+48],         xmm3
   1281         movdqa      XMMWORD PTR [rsp+64],         xmm4
   1282         movdqa      XMMWORD PTR [rsp+80],         xmm5
   1283         movdqa      XMMWORD PTR [rsp+96],         xmm6
   1284         movdqa      XMMWORD PTR [rsp+112],        xmm7
   1285         movdqa      XMMWORD PTR [rsp+128],        xmm0
   1286 
   1287         movsxd      rax,        dword ptr arg(2) ;xoffset
   1288         shl         rax,        5      ; each table entry is 32 bytes: two 16-byte tap vectors
   1289         add         rax,        rcx    ;HFilter
   1290 
   1291         mov         rdi,        arg(4) ;dst_ptr
   1292         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
   1293 
   1294         movdqa      xmm1,       [rax]              ; xmm1 = horizontal tap 0
   1295         movdqa      xmm2,       [rax+16]           ; xmm2 = horizontal tap 1
   1296 
   1297         movsxd      rax,        dword ptr arg(3) ;yoffset
   1298         shl         rax,        5
   1299         add         rax,        rcx    ;VFilter
   1300 
   1301         lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8 * dst_pitch, the loop's end pointer
   1302 
   1303         movdqa      xmm5,       [rax]              ; xmm5 = vertical tap 0
   1304         movdqa      xmm6,       [rax+16]           ; xmm6 = vertical tap 1
   1305 
   1306         pxor        xmm0,       xmm0                ; xmm0 = 0 for byte -> word unpacking
   1307 
   1308         ; get the first horizontal line done (this primes xmm7 for the vertical pass)
   1309         movdqa      xmm3,       XMMWORD PTR [rsp]
   1310         movdqa      xmm4,       xmm3                 ; make a copy of current line
   1311         psrldq      xmm4,       1                    ; shift right one byte so each pixel lines up with its right-hand neighbour
   1312 
   1313         punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
   1314         punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
   1315 
   1316         pmullw      xmm3,       xmm1
   1317         pmullw      xmm4,       xmm2
   1318 
   1319         paddw       xmm3,       xmm4
   1320 
   1321         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1322         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1323 
   1324         movdqa      xmm7,       xmm3                 ; xmm7 = first-pass result of the previous row, kept as words
   1325         add         rsp,        16                 ; next line (rsp itself is the cursor into the spilled rows)
   1326 .next_row8x8:
   1327         movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
   1328         movdqa      xmm4,       xmm3                 ; make a copy of current line
   1329         psrldq      xmm4,       1
   1330 
   1331         punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
   1332         punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
   1333 
   1334         pmullw      xmm3,       xmm1
   1335         pmullw      xmm4,       xmm2
   1336 
   1337         paddw       xmm3,       xmm4
   1338         pmullw      xmm7,       xmm5                 ; previous row * vertical tap 0
   1339 
   1340         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1341         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1342 
   1343         movdqa      xmm4,       xmm3                 ; save this row's first-pass result
   1344 
   1345         pmullw      xmm3,       xmm6                 ; current row * vertical tap 1
   1346         paddw       xmm3,       xmm7
   1347 
   1348         movdqa      xmm7,       xmm4                 ; it becomes the previous row of the next iteration
   1349 
   1350         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1351         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1352 
   1353         packuswb    xmm3,       xmm0
   1354         movq        [rdi],      xmm3                 ; store the 8 result bytes in the destination
   1355 
   1356         add         rsp,        16                 ; next line
   1357         add         rdi,        rdx
   1358 
   1359         cmp         rdi,        rcx                  ; stop once 8 destination rows have been written
   1360         jne         .next_row8x8
   1361 
   1362     ;add rsp, 144 -- not needed: rsp has already advanced 9 * 16 = 144 bytes while walking the rows
   1363     pop rsp                                          ; presumably restores the rsp saved by ALIGN_STACK (macro body not visible here; confirm)
   1364     ; begin epilog
   1365     pop rdi
   1366     pop rsi
   1367     RESTORE_GOT
   1368     RESTORE_XMM
   1369     UNSHADOW_ARGS
   1370     pop         rbp
   1371     ret
   1372 
   1373 
   1374 SECTION_RODATA
   1375 align 16                ; rd is used as a 16-byte memory operand by movdqa/paddw/paddsw
   1376 rd:
   1377     times 8 dw 0x40     ; rounding term: eight words of 64 (half of VP8_FILTER_WEIGHT), added before the shift right by 7
   1378