Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 %define BLOCK_HEIGHT_WIDTH 4
     15 %define VP8_FILTER_WEIGHT 128
     16 %define VP8_FILTER_SHIFT  7
     17 
     18 
     19 ;/************************************************************************************
     20 ; Notes: vp8_filter_block1d8_h6_sse2 applies a 6 tap filter horizontally to 8 pixels of
     21 ; every source row and stores the 8 clamped results as 16-bit words, one 128-bit store
     22 ; per row. ONE row is produced per loop pass and the row counter is only tested at the
     23 ; bottom of the loop, so output_height must be at least 1. A pass reads src_ptr[-2..13].
     24 ;*************************************************************************************/
     25 ;void vp8_filter_block1d8_h6_sse2
     26 ;(
     27 ;    unsigned char  *src_ptr,              // first pixel of the first row to filter
     28 ;    unsigned short *output_ptr,           // destination; stored with movdqa, so 16-byte aligned
     29 ;    unsigned int    src_pixels_per_line,  // byte distance between source rows
     30 ;    unsigned int    pixel_step,           // not read by this routine
     31 ;    unsigned int    output_height,        // number of rows to produce, >= 1
     32 ;    unsigned int    output_width,         // added to output_ptr (as a byte count) after each row
     33 ;    short           *vp8_filter           // six 16-byte blocks of 8 words, one block per tap
     34 ;)
     35 global sym(vp8_filter_block1d8_h6_sse2)
     36 sym(vp8_filter_block1d8_h6_sse2):
     37     push        rbp
     38     mov         rbp, rsp
     39     SHADOW_ARGS_TO_STACK 7
     40     SAVE_XMM
     41     GET_GOT     rbx
     42     push        rsi
     43     push        rdi
     44     ; end prolog
     45     ; rsi = source row, rdi = output row, rdx = filter table, rcx = rows left, rax = source pitch
     46         mov         rdx,        arg(6) ;vp8_filter
     47         mov         rsi,        arg(0) ;src_ptr
     48 
     49         mov         rdi,        arg(1) ;output_ptr
     50 
     51         movsxd      rcx,        dword ptr arg(4) ;output_height
     52         movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
     53 %if ABI_IS_32BIT=0
     54         movsxd      r8,         dword ptr arg(5) ;output_width (held in r8 on 64-bit builds only)
     55 %endif
     56         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
     57 
     58 filter_block1d8_h6_rowloop:
     59         movq        xmm3,       MMWORD PTR [rsi - 2]        ; low 8 bytes = src[-2 .. 5]
     60         movq        xmm1,       MMWORD PTR [rsi + 6]        ; low 8 bytes = src[ 6 .. 13]
     61 
     62         prefetcht2  [rsi+rax-2]                             ; prefetch hint for the next source row
     63 
     64         pslldq      xmm1,       8
     65         por         xmm1,       xmm3                        ; xmm1 = 16 bytes src[-2 .. 13]
     66 
     67         movdqa      xmm4,       xmm1
     68         movdqa      xmm5,       xmm1
     69 
     70         movdqa      xmm6,       xmm1
     71         movdqa      xmm7,       xmm1
     72     ; each copy is byte-shifted so that its low 8 bytes line up with one tap, then widened to words
     73         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
     74         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
     75 
     76         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
     77         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
     78 
     79         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
     80         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
     81 
     82 
     83         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
     84         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
     85 
     86         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
     87 
     88         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
     89         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
     90 
     91         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
     92 
     93         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
     94         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
     95 
     96 
     97         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
     98 
     99         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    100         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
    101 
    102     ; accumulate the six products in xmm4 with signed saturation
    103         paddsw      xmm4,       xmm7
    104         paddsw      xmm4,       xmm5
    105 
    106         paddsw      xmm4,       xmm3
    107         paddsw      xmm4,       xmm6
    108 
    109         paddsw      xmm4,       xmm1
    110         paddsw      xmm4,       [GLOBAL(rd)]                ; rd is defined outside this view; presumably the rounding term
    111 
    112         psraw       xmm4,       VP8_FILTER_SHIFT            ; scale down by the file-level filter shift
    113 
    114         packuswb    xmm4,       xmm0                        ; saturate each word to 0..255
    115         punpcklbw   xmm4,       xmm0                        ; widen the 8 bytes back to words
    116 
    117         movdqa      XMMWORD Ptr [rdi],         xmm4         ; aligned store of the 8 result words
    118         lea         rsi,        [rsi + rax]                 ; next source row
    119 
    120 %if ABI_IS_32BIT
    121         add         rdi,        DWORD Ptr arg(5) ;[output_width]
    122 %else
    123         add         rdi,        r8
    124 %endif
    125         dec         rcx
    126 
    127         jnz         filter_block1d8_h6_rowloop                ; next row
    128 
    129     ; begin epilog
    130     pop rdi
    131     pop rsi
    132     RESTORE_GOT
    133     RESTORE_XMM
    134     UNSHADOW_ARGS
    135     pop         rbp
    136     ret
    137 
    138 
    139 ;void vp8_filter_block1d16_h6_sse2
    140 ;(
    141 ;    unsigned char  *src_ptr,              // first pixel of the first row to filter
    142 ;    unsigned short *output_ptr,           // destination; stored with movdqa, so 16-byte aligned
    143 ;    unsigned int    src_pixels_per_line,  // byte distance between source rows
    144 ;    unsigned int    pixel_step,           // not read by this routine
    145 ;    unsigned int    output_height,        // number of rows to produce, >= 1
    146 ;    unsigned int    output_width,         // added to output_ptr (as a byte count) after each row
    147 ;    short           *vp8_filter           // six 16-byte blocks of 8 words, one block per tap
    148 ;)
    149 ;/************************************************************************************
    150 ; Notes: vp8_filter_block1d16_h6_sse2 applies a 6 tap filter horizontally to 16 pixels of
    151 ; every source row and stores the 16 clamped results as 16-bit words (two 128-bit stores
    152 ; per row). ONE row is produced per loop pass and the row counter is only tested at the
    153 ; bottom of the loop, so output_height must be at least 1. A pass reads src_ptr[-2..21].
    154 ;*************************************************************************************/
    155 global sym(vp8_filter_block1d16_h6_sse2)
    156 sym(vp8_filter_block1d16_h6_sse2):
    157     push        rbp
    158     mov         rbp, rsp
    159     SHADOW_ARGS_TO_STACK 7
    160     SAVE_XMM
    161     GET_GOT     rbx
    162     push        rsi
    163     push        rdi
    164     ; end prolog
    165     ; rsi = source row, rdi = output row, rdx = filter table, rcx = rows left, rax = source pitch
    166         mov         rdx,        arg(6) ;vp8_filter
    167         mov         rsi,        arg(0) ;src_ptr
    168 
    169         mov         rdi,        arg(1) ;output_ptr
    170 
    171         movsxd      rcx,        dword ptr arg(4) ;output_height
    172         movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
    173 %if ABI_IS_32BIT=0
    174         movsxd      r8,         dword ptr arg(5) ;output_width (held in r8 on 64-bit builds only)
    175 %endif
    176 
    177         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
    178 
    179 filter_block1d16_h6_sse2_rowloop:
    180         movq        xmm3,       MMWORD PTR [rsi - 2]        ; low 8 bytes = src[-2 .. 5]
    181         movq        xmm1,       MMWORD PTR [rsi + 6]        ; low 8 bytes = src[ 6 .. 13]
    182 
    183         movq        xmm2,       MMWORD PTR [rsi +14]        ; low 8 bytes = src[14 .. 21]
    184         pslldq      xmm2,       8
    185 
    186         por         xmm2,       xmm1                        ; xmm2 = 16 bytes src[6 .. 21], input for pixels 8..15
    187         prefetcht2  [rsi+rax-2]                             ; prefetch hint for the next source row
    188 
    189         pslldq      xmm1,       8
    190         por         xmm1,       xmm3                        ; xmm1 = 16 bytes src[-2 .. 13], input for pixels 0..7
    191 
    192         movdqa      xmm4,       xmm1
    193         movdqa      xmm5,       xmm1
    194 
    195         movdqa      xmm6,       xmm1
    196         movdqa      xmm7,       xmm1
    197     ; first half: output pixels 0..7
    198         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    199         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
    200 
    201         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
    202         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
    203 
    204         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    205         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
    206 
    207 
    208         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    209         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
    210 
    211         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
    212 
    213         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    214         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
    215 
    216         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
    217 
    218         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    219         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
    220 
    221 
    222         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
    223 
    224         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    225         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
    226     ; accumulate the six products in xmm4 with signed saturation
    227         paddsw      xmm4,       xmm7
    228         paddsw      xmm4,       xmm5
    229 
    230         paddsw      xmm4,       xmm3
    231         paddsw      xmm4,       xmm6
    232 
    233         paddsw      xmm4,       xmm1
    234         paddsw      xmm4,       [GLOBAL(rd)]                ; rd is defined outside this view; presumably the rounding term
    235 
    236         psraw       xmm4,       VP8_FILTER_SHIFT            ; scale down by the file-level filter shift
    237 
    238         packuswb    xmm4,       xmm0                        ; saturate each word to 0..255
    239         punpcklbw   xmm4,       xmm0                        ; widen the 8 bytes back to words
    240 
    241         movdqa      XMMWORD Ptr [rdi],         xmm4         ; aligned store of result words 0..7
    242     ; second half: output pixels 8..15, built from xmm2 = src[6 .. 21] (byte labels below are hex)
    243         movdqa      xmm3,       xmm2
    244         movdqa      xmm4,       xmm2
    245 
    246         movdqa      xmm5,       xmm2
    247         movdqa      xmm6,       xmm2
    248 
    249         movdqa      xmm7,       xmm2
    250 
    251         punpcklbw   xmm3,       xmm0                        ; xx0d xx0c xx0b xx0a xx09 xx08 xx07 xx06
    252         psrldq      xmm4,       1                           ; xx 15 14 13 12 11 10 0f 0e 0d 0c 0b 0a 09 08 07
    253 
    254         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
    255         punpcklbw   xmm4,       xmm0                        ; xx0e xx0d xx0c xx0b xx0a xx09 xx08 xx07
    256 
    257         psrldq      xmm5,       2                           ; xx xx 15 14 13 12 11 10 0f 0e 0d 0c 0b 0a 09 08
    258         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
    259 
    260 
    261         punpcklbw   xmm5,       xmm0                        ; xx0f xx0e xx0d xx0c xx0b xx0a xx09 xx08
    262         psrldq      xmm6,       3                           ; xx xx xx 15 14 13 12 11 10 0f 0e 0d 0c 0b 0a 09
    263 
    264         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
    265 
    266         punpcklbw   xmm6,       xmm0                        ; xx10 xx0f xx0e xx0d xx0c xx0b xx0a xx09
    267         psrldq      xmm7,       4                           ; xx xx xx xx 15 14 13 12 11 10 0f 0e 0d 0c 0b 0a
    268 
    269         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
    270 
    271         punpcklbw   xmm7,       xmm0                        ; xx11 xx10 xx0f xx0e xx0d xx0c xx0b xx0a
    272         psrldq      xmm2,       5                           ; xx xx xx xx xx 15 14 13 12 11 10 0f 0e 0d 0c 0b
    273 
    274         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
    275 
    276         punpcklbw   xmm2,       xmm0                        ; xx12 xx11 xx10 xx0f xx0e xx0d xx0c xx0b
    277         pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
    278 
    279     ; accumulate the six products in xmm4 with signed saturation
    280         paddsw      xmm4,       xmm7
    281         paddsw      xmm4,       xmm5
    282 
    283         paddsw      xmm4,       xmm3
    284         paddsw      xmm4,       xmm6
    285 
    286         paddsw      xmm4,       xmm2
    287         paddsw      xmm4,       [GLOBAL(rd)]                ; rd is defined outside this view; presumably the rounding term
    288 
    289         psraw       xmm4,       VP8_FILTER_SHIFT            ; scale down by the file-level filter shift
    290 
    291         packuswb    xmm4,       xmm0                        ; saturate each word to 0..255
    292         punpcklbw   xmm4,       xmm0                        ; widen the 8 bytes back to words
    293 
    294         movdqa      XMMWORD Ptr [rdi+16],      xmm4         ; aligned store of result words 8..15
    295 
    296         lea         rsi,        [rsi + rax]                 ; next source row
    297 %if ABI_IS_32BIT
    298         add         rdi,        DWORD Ptr arg(5) ;[output_width]
    299 %else
    300         add         rdi,        r8
    301 %endif
    302 
    303         dec         rcx
    304         jnz         filter_block1d16_h6_sse2_rowloop                ; next row
    305 
    306     ; begin epilog
    307     pop rdi
    308     pop rsi
    309     RESTORE_GOT
    310     RESTORE_XMM
    311     UNSHADOW_ARGS
    312     pop         rbp
    313     ret
    314 
    315 
    316 ;void vp8_filter_block1d8_v6_sse2
    317 ;(
    318 ;    short *src_ptr,                 // row of 8 words that the third tap lands on; rows -2..+3 are read
    319 ;    unsigned char *output_ptr,      // destination; 8 bytes are written per row
    320 ;    int dst_pitch,                  // added to output_ptr after each row
    321 ;    unsigned int pixels_per_line,   // used directly as the byte distance between source rows
    322 ;    unsigned int pixel_step,        // not read by this routine
    323 ;    unsigned int output_height,     // number of rows to produce, >= 1
    324 ;    unsigned int output_width,      // not read by this routine
    325 ;    short * vp8_filter              // six 16-byte blocks of 8 words, one block per tap
    326 ;)
    327 ;/************************************************************************************
    328 ; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to rows of 8 words and writes
    329 ; 8 clamped bytes per output row. Source rows are loaded with movdqa (16-byte aligned).
    330 ;*************************************************************************************/
    331 global sym(vp8_filter_block1d8_v6_sse2)
    332 sym(vp8_filter_block1d8_v6_sse2):
    333     push        rbp
    334     mov         rbp, rsp
    335     SHADOW_ARGS_TO_STACK 8
    336     SAVE_XMM
    337     GET_GOT     rbx
    338     push        rsi
    339     push        rdi
    340     ; end prolog
    341     ; rsi = first tap row, rdi = output row, rax = filter table, rdx = row stride, rcx = rows left
    342         mov         rax,        arg(7) ;vp8_filter
    343         movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
    344 
    345         mov         rdi,        arg(1) ;output_ptr
    346         mov         rsi,        arg(0) ;src_ptr
    347 
    348         sub         rsi,        rdx
    349         sub         rsi,        rdx                         ; rsi = src_ptr - 2 rows, the row used by tap 1
    350 
    351         movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
    352         pxor        xmm0,       xmm0                        ; clear xmm0
    353 
    354         movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; rd stays in xmm7 for the whole loop
    355 %if ABI_IS_32BIT=0
    356         movsxd      r8,         dword ptr arg(2) ; dst_pitch
    357 %endif
    358     ; do-while loop: one output row per pass; the row counter is only tested at the bottom
    359 vp8_filter_block1d8_v6_sse2_loop:
    360         movdqa      xmm1,       XMMWORD PTR [rsi]           ; row for tap 1
    361         pmullw      xmm1,       [rax]
    362 
    363         movdqa      xmm2,       XMMWORD PTR [rsi + rdx]     ; row for tap 2
    364         pmullw      xmm2,       [rax + 16]
    365 
    366         movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2] ; row for tap 3
    367         pmullw      xmm3,       [rax + 32]
    368 
    369         movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4] ; row for tap 5
    370         pmullw      xmm5,       [rax + 64]
    371 
    372         add         rsi,        rdx                         ; step one row: reaches rows 4 and 6, and advances the loop
    373         movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2] ; row for tap 4
    374 
    375         pmullw      xmm4,       [rax + 48]
    376         movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4] ; row for tap 6
    377 
    378         pmullw      xmm6,       [rax + 80]
    379     ; accumulate the six products in xmm2 with signed saturation
    380         paddsw      xmm2,       xmm5
    381         paddsw      xmm2,       xmm3
    382 
    383         paddsw      xmm2,       xmm1
    384         paddsw      xmm2,       xmm4
    385 
    386         paddsw      xmm2,       xmm6
    387         paddsw      xmm2,       xmm7                        ; add rd (defined outside this view; presumably the rounding term)
    388 
    389         psraw       xmm2,       VP8_FILTER_SHIFT            ; scale down by the file-level filter shift
    390         packuswb    xmm2,       xmm0              ; pack and saturate
    391 
    392         movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
    393 %if ABI_IS_32BIT
    394         add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
    395 %else
    396         add         rdi,        r8
    397 %endif
    398         dec         rcx         ; decrement count
    399         jnz         vp8_filter_block1d8_v6_sse2_loop               ; next row
    400 
    401     ; begin epilog
    402     pop rdi
    403     pop rsi
    404     RESTORE_GOT
    405     RESTORE_XMM
    406     UNSHADOW_ARGS
    407     pop         rbp
    408     ret
    409 
    410 
    411 ;void vp8_filter_block1d16_v6_sse2
    412 ;(
    413 ;    unsigned short *src_ptr,        // row of 16 words that the third tap lands on; rows -2..+3 are read
    414 ;    unsigned char *output_ptr,      // destination; stored with movdqa, so 16-byte aligned
    415 ;    int dst_pitch,                  // added to output_ptr after each row
    416 ;    unsigned int pixels_per_line,   // used directly as the byte distance between source rows
    417 ;    unsigned int pixel_step,        // not read by this routine
    418 ;    unsigned int output_height,     // number of rows to produce, >= 1
    419 ;    unsigned int output_width,      // not read by this routine
    420 ;    const short    *vp8_filter      // six 16-byte blocks of 8 words, one block per tap
    421 ;)
    422 ;/************************************************************************************
    423 ; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to rows of 16 words and writes
    424 ; 16 clamped bytes per output row. Source rows are loaded with movdqa (16-byte aligned).
    425 ;*************************************************************************************/
    426 global sym(vp8_filter_block1d16_v6_sse2)
    427 sym(vp8_filter_block1d16_v6_sse2):
    428     push        rbp
    429     mov         rbp, rsp
    430     SHADOW_ARGS_TO_STACK 8
    431     SAVE_XMM
    432     GET_GOT     rbx
    433     push        rsi
    434     push        rdi
    435     ; end prolog
    436     ; rsi = first tap row, rdi = output row, rax = filter table, rdx = row stride, rcx = rows left
    437         mov         rax,        arg(7) ;vp8_filter
    438         movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
    439 
    440         mov         rdi,        arg(1) ;output_ptr
    441         mov         rsi,        arg(0) ;src_ptr
    442 
    443         sub         rsi,        rdx
    444         sub         rsi,        rdx                         ; rsi = src_ptr - 2 rows, the row used by tap 1
    445 
    446         movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
    447 %if ABI_IS_32BIT=0
    448         movsxd      r8,         dword ptr arg(2) ; dst_pitch
    449 %endif
    450     ; do-while loop: one output row per pass; xmm1 sums words 0..7 and xmm2 sums words 8..15
    451 vp8_filter_block1d16_v6_sse2_loop:
    452 ; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
    453         movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
    454         movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
    455         pmullw      xmm1,       [rax + 16]
    456         pmullw      xmm2,       [rax + 16]
    457 
    458         movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
    459         movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
    460         pmullw      xmm3,       [rax + 64]
    461         pmullw      xmm4,       [rax + 64]
    462 
    463         movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3
    464         movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
    465         pmullw      xmm5,       [rax + 32]
    466         pmullw      xmm6,       [rax + 32]
    467 
    468         movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
    469         movdqa      xmm0,       XMMWORD PTR [rsi + 16]
    470         pmullw      xmm7,       [rax]
    471         pmullw      xmm0,       [rax]
    472 
    473         paddsw      xmm1,       xmm3
    474         paddsw      xmm2,       xmm4
    475         paddsw      xmm1,       xmm5
    476         paddsw      xmm2,       xmm6
    477         paddsw      xmm1,       xmm7
    478         paddsw      xmm2,       xmm0
    479 
    480         add         rsi,        rdx                         ; step one row: reaches lines 4 and 6, and advances the loop
    481 
    482         movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
    483         movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
    484         pmullw      xmm3,       [rax + 48]
    485         pmullw      xmm4,       [rax + 48]
    486 
    487         movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
    488         movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
    489         pmullw      xmm5,       [rax + 80]
    490         pmullw      xmm6,       [rax + 80]
    491 
    492         movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; reloaded every pass because xmm7 held line 1 above
    493         ; xmm0 is not an operand of anything below and is reloaded at the loop top, so it is left as is
    494 
    495         paddsw      xmm1,       xmm3
    496         paddsw      xmm2,       xmm4
    497         paddsw      xmm1,       xmm5
    498         paddsw      xmm2,       xmm6
    499 
    500         paddsw      xmm1,       xmm7                        ; add rd (defined outside this view; presumably the rounding term)
    501         paddsw      xmm2,       xmm7
    502 
    503         psraw       xmm1,       VP8_FILTER_SHIFT            ; scale down by the file-level filter shift
    504         psraw       xmm2,       VP8_FILTER_SHIFT
    505 
    506         packuswb    xmm1,       xmm2              ; pack and saturate
    507         movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination
    508 %if ABI_IS_32BIT
    509         add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
    510 %else
    511         add         rdi,        r8
    512 %endif
    513         dec         rcx         ; decrement count
    514         jnz         vp8_filter_block1d16_v6_sse2_loop               ; next row
    515 
    516     ; begin epilog
    517     pop rdi
    518     pop rsi
    519     RESTORE_GOT
    520     RESTORE_XMM
    521     UNSHADOW_ARGS
    522     pop         rbp
    523     ret
    524 
    525 
    526 ;void vp8_filter_block1d8_h6_only_sse2
    527 ;(
    528 ;    unsigned char  *src_ptr,              // first pixel of the first row; src_ptr[-2..13] is read per row
    529 ;    unsigned int    src_pixels_per_line,  // byte distance between source rows
    530 ;    unsigned char  *output_ptr,           // destination; 8 bytes are written per row
    531 ;    int dst_pitch,                        // added to output_ptr after each row
    532 ;    unsigned int    output_height,        // number of rows to produce, >= 1 (counter is tested after each row)
    533 ;    const short    *vp8_filter            // six 16-byte blocks of 8 words, one block per tap
    534 ;)
    535 ; First-pass filter only when yoffset==0: the clamped horizontal result is stored directly as bytes
    536 global sym(vp8_filter_block1d8_h6_only_sse2)
    537 sym(vp8_filter_block1d8_h6_only_sse2):
    538     push        rbp
    539     mov         rbp, rsp
    540     SHADOW_ARGS_TO_STACK 6
    541     SAVE_XMM
    542     GET_GOT     rbx
    543     push        rsi
    544     push        rdi
    545     ; end prolog
    546     ; rsi = source row, rdi = output row, rdx = filter table, rcx = rows left, rax = source pitch
    547         mov         rdx,        arg(5) ;vp8_filter
    548         mov         rsi,        arg(0) ;src_ptr
    549 
    550         mov         rdi,        arg(2) ;output_ptr
    551 
    552         movsxd      rcx,        dword ptr arg(4) ;output_height
    553         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
    554 %if ABI_IS_32BIT=0
    555         movsxd      r8,         dword ptr arg(3) ;dst_pitch (held in r8 on 64-bit builds only)
    556 %endif
    557         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
    558 
    559 filter_block1d8_h6_only_rowloop:
    560         movq        xmm3,       MMWORD PTR [rsi - 2]        ; low 8 bytes = src[-2 .. 5]
    561         movq        xmm1,       MMWORD PTR [rsi + 6]        ; low 8 bytes = src[ 6 .. 13]
    562 
    563         prefetcht2  [rsi+rax-2]                             ; prefetch hint for the next source row
    564 
    565         pslldq      xmm1,       8
    566         por         xmm1,       xmm3                        ; xmm1 = 16 bytes src[-2 .. 13]
    567 
    568         movdqa      xmm4,       xmm1
    569         movdqa      xmm5,       xmm1
    570 
    571         movdqa      xmm6,       xmm1
    572         movdqa      xmm7,       xmm1
    573     ; each copy is byte-shifted so that its low 8 bytes line up with one tap, then widened to words
    574         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    575         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
    576 
    577         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
    578         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
    579 
    580         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    581         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
    582 
    583 
    584         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    585         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
    586 
    587         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
    588 
    589         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    590         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
    591 
    592         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
    593 
    594         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    595         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
    596 
    597 
    598         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
    599 
    600         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    601         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
    602 
    603     ; accumulate the six products in xmm4 with signed saturation
    604         paddsw      xmm4,       xmm7
    605         paddsw      xmm4,       xmm5
    606 
    607         paddsw      xmm4,       xmm3
    608         paddsw      xmm4,       xmm6
    609 
    610         paddsw      xmm4,       xmm1
    611         paddsw      xmm4,       [GLOBAL(rd)]                ; rd is defined outside this view; presumably the rounding term
    612 
    613         psraw       xmm4,       VP8_FILTER_SHIFT            ; scale down by the file-level filter shift
    614 
    615         packuswb    xmm4,       xmm0                        ; saturate each word to 0..255
    616 
    617         movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination
    618         lea         rsi,        [rsi + rax]                 ; next source row
    619 
    620 %if ABI_IS_32BIT
    621         add         rdi,        DWORD Ptr arg(3) ;dst_pitch
    622 %else
    623         add         rdi,        r8
    624 %endif
    625         dec         rcx
    626 
    627         jnz         filter_block1d8_h6_only_rowloop                ; next row
    628 
    629     ; begin epilog
    630     pop rdi
    631     pop rsi
    632     RESTORE_GOT
    633     RESTORE_XMM
    634     UNSHADOW_ARGS
    635     pop         rbp
    636     ret
    637 
    638 
    639 ;void vp8_filter_block1d16_h6_only_sse2
    640 ;(
    641 ;    unsigned char  *src_ptr,
    642 ;    unsigned int    src_pixels_per_line,
    643 ;    unsigned char  *output_ptr,
    644 ;    int dst_ptich,
    645 ;    unsigned int    output_height,
    646 ;    const short    *vp8_filter
    647 ;)
    648 ; First-pass filter only when yoffset==0
    649 global sym(vp8_filter_block1d16_h6_only_sse2)
    650 sym(vp8_filter_block1d16_h6_only_sse2):
    651     push        rbp
    652     mov         rbp, rsp
    653     SHADOW_ARGS_TO_STACK 6
    654     SAVE_XMM
    655     GET_GOT     rbx
    656     push        rsi
    657     push        rdi
    658     ; end prolog
    659 
    660         mov         rdx,        arg(5) ;vp8_filter
    661         mov         rsi,        arg(0) ;src_ptr
    662 
    663         mov         rdi,        arg(2) ;output_ptr
    664 
    665         movsxd      rcx,        dword ptr arg(4) ;output_height
    666         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
    667 %if ABI_IS_32BIT=0
    668         movsxd      r8,         dword ptr arg(3) ;dst_ptich
    669 %endif
    670 
    671         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
    672 
    673 filter_block1d16_h6_only_sse2_rowloop:
    674         movq        xmm3,       MMWORD PTR [rsi - 2]
    675         movq        xmm1,       MMWORD PTR [rsi + 6]
    676 
    677         movq        xmm2,       MMWORD PTR [rsi +14]
    678         pslldq      xmm2,       8
    679 
    680         por         xmm2,       xmm1
    681         prefetcht2  [rsi+rax-2]
    682 
    683         pslldq      xmm1,       8
    684         por         xmm1,       xmm3
    685 
    686         movdqa      xmm4,       xmm1
    687         movdqa      xmm5,       xmm1
    688 
    689         movdqa      xmm6,       xmm1
    690         movdqa      xmm7,       xmm1
    691 
    692         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
    693         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
    694 
    695         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
    696         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
    697 
    698         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    699         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
    700 
    701         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    702         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
    703 
    704         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
    705 
    706         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    707         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
    708 
    709         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
    710 
    711         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    712         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
    713 
    714         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
    715 
    716         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    717         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
    718 
    719         paddsw      xmm4,       xmm7
    720         paddsw      xmm4,       xmm5
    721 
    722         paddsw      xmm4,       xmm3
    723         paddsw      xmm4,       xmm6
    724 
    725         paddsw      xmm4,       xmm1
    726         paddsw      xmm4,       [GLOBAL(rd)]
    727 
    728         psraw       xmm4,       7
    729 
    730         packuswb    xmm4,       xmm0                        ; lower 8 bytes
    731 
    732         movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination
    733 
    734         movdqa      xmm3,       xmm2
    735         movdqa      xmm4,       xmm2
    736 
    737         movdqa      xmm5,       xmm2
    738         movdqa      xmm6,       xmm2
    739 
    740         movdqa      xmm7,       xmm2
    741 
    742         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
    743         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
    744 
    745         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
    746         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
    747 
    748         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    749         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
    750 
    751         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    752         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
    753 
    754         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
    755 
    756         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    757         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
    758 
    759         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
    760 
    761         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    762         psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
    763 
    764         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
    765 
    766         punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    767         pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
    768 
    769         paddsw      xmm4,       xmm7
    770         paddsw      xmm4,       xmm5
    771 
    772         paddsw      xmm4,       xmm3
    773         paddsw      xmm4,       xmm6
    774 
    775         paddsw      xmm4,       xmm2
    776         paddsw      xmm4,       [GLOBAL(rd)]
    777 
    778         psraw       xmm4,       7
    779 
    780         packuswb    xmm4,       xmm0                        ; higher 8 bytes
    781 
    782         movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination
    783 
    784         lea         rsi,        [rsi + rax]
    785 %if ABI_IS_32BIT
    786         add         rdi,        DWORD Ptr arg(3) ;dst_ptich
    787 %else
    788         add         rdi,        r8
    789 %endif
    790 
    791         dec         rcx
    792         jnz         filter_block1d16_h6_only_sse2_rowloop                ; next row
    793 
    794     ; begin epilog
    795     pop rdi
    796     pop rsi
    797     RESTORE_GOT
    798     RESTORE_XMM
    799     UNSHADOW_ARGS
    800     pop         rbp
    801     ret
    802 
    803 
    804 ;void vp8_filter_block1d8_v6_only_sse2
    805 ;(
    806 ;    unsigned char *src_ptr,
    807 ;    unsigned int    src_pixels_per_line,
    808 ;    unsigned char *output_ptr,
    809 ;    int dst_ptich,
    810 ;    unsigned int output_height,
    811 ;    const short    *vp8_filter
    812 ;)
    813 ; Second-pass (vertical) 6-tap filter only, used when xoffset==0. Each loop iteration reads 8 bytes from each of six consecutive source rows and writes one 8-byte output row.
    814 global sym(vp8_filter_block1d8_v6_only_sse2)
    815 sym(vp8_filter_block1d8_v6_only_sse2):
    816     push        rbp
    817     mov         rbp, rsp
    818     SHADOW_ARGS_TO_STACK 6
    819     SAVE_XMM
    820     GET_GOT     rbx
    821     push        rsi
    822     push        rdi
    823     ; end prolog
    824         ; register roles: rsi = source row pointer, rdi = destination row pointer, rcx = rows left, rdx = source stride, rax = filter taps
    825         mov         rsi,        arg(0) ;src_ptr
    826         mov         rdi,        arg(2) ;output_ptr
    827 
    828         movsxd      rcx,        dword ptr arg(4) ;output_height (the loop tests the count after the body, so it must be >= 1)
    829         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
    830 
    831         mov         rax,        arg(5) ;vp8_filter (six 16-byte tap vectors, read at rax+0 .. rax+80)
    832 
    833         pxor        xmm0,       xmm0                        ; xmm0 = 0: zero source for the byte->word unpacks and for the final pack
    834 
    835         movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; xmm7 = rounding constant (rd: eight words of 0x40)
    836 %if ABI_IS_32BIT=0
    837         movsxd      r8,         dword ptr arg(3) ; dst_ptich (destination pitch), kept in r8 on the 64-bit ABI only
    838 %endif
    839 
    840 vp8_filter_block1d8_v6_only_sse2_loop:
    841         movq        xmm1,       MMWORD PTR [rsi]            ; source row 0 (8 bytes)
    842         movq        xmm2,       MMWORD PTR [rsi + rdx]      ; source row 1
    843         movq        xmm3,       MMWORD PTR [rsi + rdx * 2]  ; source row 2
    844         movq        xmm5,       MMWORD PTR [rsi + rdx * 4]  ; source row 4
    845         add         rsi,        rdx                         ; step one row; this is also the only per-iteration advance of rsi
    846         movq        xmm4,       MMWORD PTR [rsi + rdx * 2]  ; source row 3 (relative to rsi at loop entry)
    847         movq        xmm6,       MMWORD PTR [rsi + rdx * 4]  ; source row 5 (relative to rsi at loop entry)
    848 
    849         punpcklbw   xmm1,       xmm0                        ; zero-extend the 8 bytes to 16-bit words
    850         pmullw      xmm1,       [rax]                       ; row 0 * tap at rax+0
    851 
    852         punpcklbw   xmm2,       xmm0
    853         pmullw      xmm2,       [rax + 16]                  ; row 1 * tap at rax+16
    854 
    855         punpcklbw   xmm3,       xmm0
    856         pmullw      xmm3,       [rax + 32]                  ; row 2 * tap at rax+32
    857 
    858         punpcklbw   xmm5,       xmm0
    859         pmullw      xmm5,       [rax + 64]                  ; row 4 * tap at rax+64
    860 
    861         punpcklbw   xmm4,       xmm0
    862         pmullw      xmm4,       [rax + 48]                  ; row 3 * tap at rax+48
    863 
    864         punpcklbw   xmm6,       xmm0
    865         pmullw      xmm6,       [rax + 80]                  ; row 5 * tap at rax+80
    866         ; accumulate the six products in xmm2 with signed saturating adds
    867         paddsw      xmm2,       xmm5
    868         paddsw      xmm2,       xmm3
    869 
    870         paddsw      xmm2,       xmm1
    871         paddsw      xmm2,       xmm4
    872 
    873         paddsw      xmm2,       xmm6
    874         paddsw      xmm2,       xmm7                        ; add the rounding constant
    875 
    876         psraw       xmm2,       7                           ; arithmetic shift right by 7 (same value as VP8_FILTER_SHIFT)
    877         packuswb    xmm2,       xmm0              ; pack words to bytes with unsigned saturation; the 8 results land in the low qword
    878 
    879         movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
    880 %if ABI_IS_32BIT
    881         add         rdi,        DWORD PTR arg(3) ;[dst_ptich]
    882 %else
    883         add         rdi,        r8
    884 %endif
    885         dec         rcx         ; decrement count
    886         jnz         vp8_filter_block1d8_v6_only_sse2_loop               ; next row
    887 
    888     ; begin epilog
    889     pop rdi
    890     pop rsi
    891     RESTORE_GOT
    892     RESTORE_XMM
    893     UNSHADOW_ARGS
    894     pop         rbp
    895     ret
    896 
    897 ; Widens 16 source bytes per row to sixteen 16-bit words (zero-extended) and stores them to output_ptr; no filtering is applied.
    898 ;void vp8_unpack_block1d16_h6_sse2
    899 ;(
    900 ;    unsigned char  *src_ptr,
    901 ;    unsigned short *output_ptr,
    902 ;    unsigned int    src_pixels_per_line,
    903 ;    unsigned int    output_height,
    904 ;    unsigned int    output_width
    905 ;)
    906 global sym(vp8_unpack_block1d16_h6_sse2)
    907 sym(vp8_unpack_block1d16_h6_sse2):
    908     push        rbp
    909     mov         rbp, rsp
    910     SHADOW_ARGS_TO_STACK 5
    911     ;SAVE_XMM                          ;xmm6, xmm7 are not used here.
    912     GET_GOT     rbx
    913     push        rsi
    914     push        rdi
    915     ; end prolog
    916         ; register roles: rsi = source row pointer, rdi = destination row pointer, rcx = rows left, rax = source stride
    917         mov         rsi,        arg(0) ;src_ptr
    918         mov         rdi,        arg(1) ;output_ptr
    919 
    920         movsxd      rcx,        dword ptr arg(3) ;output_height (the loop tests the count after the body, so it must be >= 1)
    921         movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
    922 
    923         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
    924 %if ABI_IS_32BIT=0
    925         movsxd      r8,         dword ptr arg(4) ;output_width            ; step added to rdi (destination) after each row
    926 %endif
    927         ; NOTE(review): output_width is added to rdi unscaled, i.e. it is used as a byte step although output_ptr is unsigned short* - confirm against the caller
    928 unpack_block1d16_h6_sse2_rowloop:
    929         movq        xmm1,       MMWORD PTR [rsi]            ; load source bytes 0..7 of this row
    930         movq        xmm3,       MMWORD PTR [rsi+8]          ; load source bytes 8..15 of this row
    931 
    932         punpcklbw   xmm3,       xmm0                        ; zero-extend bytes 8..15 to eight 16-bit words
    933         punpcklbw   xmm1,       xmm0                        ; zero-extend bytes 0..7 to eight 16-bit words
    934         ; movdqa is an aligned store, so rdi must be 16-byte aligned on every row
    935         movdqa      XMMWORD Ptr [rdi],         xmm1
    936         movdqa      XMMWORD Ptr [rdi + 16],    xmm3
    937 
    938         lea         rsi,        [rsi + rax]                 ; next source row
    939 %if ABI_IS_32BIT
    940         add         rdi,        DWORD Ptr arg(4) ;[output_width]
    941 %else
    942         add         rdi,        r8
    943 %endif
    944         dec         rcx
    945         jnz         unpack_block1d16_h6_sse2_rowloop                ; next row
    946 
    947     ; begin epilog
    948     pop rdi
    949     pop rsi
    950     RESTORE_GOT
    951     ;RESTORE_XMM
    952     UNSHADOW_ARGS
    953     pop         rbp
    954     ret
    955 
    956 ; Bilinear prediction of a 16x16 block: a horizontal 2-tap pass followed by a vertical 2-tap pass. The horizontal pass is skipped when xoffset==0 (b16x16_sp_only) and the vertical pass when yoffset==0 (b16x16_fp_only).
    957 ;void vp8_bilinear_predict16x16_sse2
    958 ;(
    959 ;    unsigned char  *src_ptr,
    960 ;    int   src_pixels_per_line,
    961 ;    int  xoffset,
    962 ;    int  yoffset,
    963 ;    unsigned char *dst_ptr,
    964 ;    int dst_pitch
    965 ;)
    966 extern sym(vp8_bilinear_filters_mmx)
    967 global sym(vp8_bilinear_predict16x16_sse2)
    968 sym(vp8_bilinear_predict16x16_sse2):
    969     push        rbp
    970     mov         rbp, rsp
    971     SHADOW_ARGS_TO_STACK 6
    972     SAVE_XMM
    973     GET_GOT     rbx
    974     push        rsi
    975     push        rdi
    976     ; end prolog
    977 
    978     ;const short *HFilter = bilinear_filters_mmx[xoffset]
    979     ;const short *VFilter = bilinear_filters_mmx[yoffset]
    980         ; rcx = base of the filter table; an offset selects a 32-byte entry (shl 5) holding two 16-byte tap vectors
    981         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]
    982         movsxd      rax,        dword ptr arg(2) ;xoffset
    983 
    984         cmp         rax,        0      ;skip first_pass filter if xoffset=0
    985         je          b16x16_sp_only
    986 
    987         shl         rax,        5
    988         add         rax,        rcx    ;HFilter
    989 
    990         mov         rdi,        arg(4) ;dst_ptr
    991         mov         rsi,        arg(0) ;src_ptr
    992         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
    993         ; xmm1 / xmm2 = the two horizontal taps (applied to the pixel at x and at x+1)
    994         movdqa      xmm1,       [rax]
    995         movdqa      xmm2,       [rax+16]
    996 
    997         movsxd      rax,        dword ptr arg(3) ;yoffset
    998 
    999         cmp         rax,        0      ;skip second_pass filter if yoffset=0
   1000         je          b16x16_fp_only
   1001 
   1002         shl         rax,        5
   1003         add         rax,        rcx    ;VFilter
   1004         ; rcx = dst_ptr + 16*dst_pitch: destination end pointer, used as the loop bound
   1005         lea         rcx,        [rdi+rdx*8]
   1006         lea         rcx,        [rcx+rdx*8]
   1007         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (rdx is reused as the source stride from here on)
   1008 
   1009         pxor        xmm0,       xmm0
   1010 
   1011 %if ABI_IS_32BIT=0
   1012         movsxd      r8,         dword ptr arg(5) ;dst_pitch
   1013 %endif
   1014         ; get the first horizontal line done
   1015         movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
   1016         movdqa      xmm4,       xmm3                 ; make a copy of current line
   1017 
   1018         punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
   1019         punpckhbw   xmm4,       xmm0
   1020 
   1021         pmullw      xmm3,       xmm1
   1022         pmullw      xmm4,       xmm1
   1023 
   1024         movdqu      xmm5,       [rsi+1]              ; same line shifted by one pixel, for the second horizontal tap
   1025         movdqa      xmm6,       xmm5
   1026 
   1027         punpcklbw   xmm5,       xmm0
   1028         punpckhbw   xmm6,       xmm0
   1029 
   1030         pmullw      xmm5,       xmm2
   1031         pmullw      xmm6,       xmm2
   1032 
   1033         paddw       xmm3,       xmm5
   1034         paddw       xmm4,       xmm6
   1035 
   1036         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1037         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1038 
   1039         paddw       xmm4,       [GLOBAL(rd)]
   1040         psraw       xmm4,       VP8_FILTER_SHIFT
   1041         ; xmm7 = first-pass result of this line packed to 16 bytes; the loop below uses it as the "previous line"
   1042         movdqa      xmm7,       xmm3
   1043         packuswb    xmm7,       xmm4
   1044 
   1045         add         rsi,        rdx                 ; next line
   1046 next_row:
   1047         movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
   1048         movdqa      xmm4,       xmm3                 ; make a copy of current line
   1049 
   1050         punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
   1051         punpckhbw   xmm4,       xmm0
   1052 
   1053         pmullw      xmm3,       xmm1
   1054         pmullw      xmm4,       xmm1
   1055 
   1056         movdqu      xmm5,       [rsi+1]
   1057         movdqa      xmm6,       xmm5
   1058 
   1059         punpcklbw   xmm5,       xmm0
   1060         punpckhbw   xmm6,       xmm0
   1061 
   1062         pmullw      xmm5,       xmm2
   1063         pmullw      xmm6,       xmm2
   1064 
   1065         paddw       xmm3,       xmm5
   1066         paddw       xmm4,       xmm6
   1067         ; unpack the previous line's first-pass result (xmm7) and scale it by the first vertical tap
   1068         movdqa      xmm5,       xmm7
   1069         movdqa      xmm6,       xmm7
   1070 
   1071         punpcklbw   xmm5,       xmm0
   1072         punpckhbw   xmm6,       xmm0
   1073 
   1074         pmullw      xmm5,       [rax]
   1075         pmullw      xmm6,       [rax]
   1076 
   1077         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1078         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1079 
   1080         paddw       xmm4,       [GLOBAL(rd)]
   1081         psraw       xmm4,       VP8_FILTER_SHIFT
   1082         ; save this line's first-pass result in xmm7 for the next iteration
   1083         movdqa      xmm7,       xmm3
   1084         packuswb    xmm7,       xmm4
   1085         ; current line * second vertical tap, then add the scaled previous line
   1086         pmullw      xmm3,       [rax+16]
   1087         pmullw      xmm4,       [rax+16]
   1088 
   1089         paddw       xmm3,       xmm5
   1090         paddw       xmm4,       xmm6
   1091 
   1092         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1093         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1094 
   1095         paddw       xmm4,       [GLOBAL(rd)]
   1096         psraw       xmm4,       VP8_FILTER_SHIFT
   1097 
   1098         packuswb    xmm3,       xmm4
   1099         movdqa      [rdi],      xmm3                 ; store the results in the destination (aligned store: rdi must be 16-byte aligned)
   1100 
   1101         add         rsi,        rdx                 ; next line
   1102 %if ABI_IS_32BIT
   1103         add         rdi,        DWORD PTR arg(5) ;dst_pitch
   1104 %else
   1105         add         rdi,        r8
   1106 %endif
   1107 
   1108         cmp         rdi,        rcx                 ; stop once rdi reaches dst_ptr + 16*dst_pitch
   1109         jne         next_row
   1110 
   1111         jmp         done
   1112         ; xoffset==0: vertical pass only. NOTE(review): yoffset is not tested for zero on this path; the table entry for yoffset is used as-is.
   1113 b16x16_sp_only:
   1114         movsxd      rax,        dword ptr arg(3) ;yoffset
   1115         shl         rax,        5
   1116         add         rax,        rcx    ;VFilter
   1117 
   1118         mov         rdi,        arg(4) ;dst_ptr
   1119         mov         rsi,        arg(0) ;src_ptr
   1120         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
   1121         ; xmm1 / xmm2 = the two vertical taps (applied to the previous and the current line)
   1122         movdqa      xmm1,       [rax]
   1123         movdqa      xmm2,       [rax+16]
   1124         ; rcx = dst_ptr + 16*dst_pitch (loop end pointer)
   1125         lea         rcx,        [rdi+rdx*8]
   1126         lea         rcx,        [rcx+rdx*8]
   1127         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line (rax is reused as the source stride; the taps are already in xmm1/xmm2)
   1128 
   1129         pxor        xmm0,       xmm0
   1130 
   1131         ; get the first horizontal line done
   1132         movdqu      xmm7,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
   1133 
   1134         add         rsi,        rax                 ; next line
   1135 next_row_spo:
   1136         movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
   1137         ; xmm5/xmm6 = previous line (from xmm7); xmm3/xmm4 = current line, which also becomes the new xmm7
   1138         movdqa      xmm5,       xmm7
   1139         movdqa      xmm6,       xmm7
   1140 
   1141         movdqa      xmm4,       xmm3                 ; make a copy of current line
   1142         movdqa      xmm7,       xmm3
   1143 
   1144         punpcklbw   xmm5,       xmm0
   1145         punpckhbw   xmm6,       xmm0
   1146         punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
   1147         punpckhbw   xmm4,       xmm0
   1148 
   1149         pmullw      xmm5,       xmm1
   1150         pmullw      xmm6,       xmm1
   1151         pmullw      xmm3,       xmm2
   1152         pmullw      xmm4,       xmm2
   1153 
   1154         paddw       xmm3,       xmm5
   1155         paddw       xmm4,       xmm6
   1156 
   1157         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1158         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1159 
   1160         paddw       xmm4,       [GLOBAL(rd)]
   1161         psraw       xmm4,       VP8_FILTER_SHIFT
   1162 
   1163         packuswb    xmm3,       xmm4
   1164         movdqa      [rdi],      xmm3                 ; store the results in the destination
   1165 
   1166         add         rsi,        rax                 ; next line
   1167         add         rdi,        rdx                 ;dst_pitch
   1168         cmp         rdi,        rcx
   1169         jne         next_row_spo
   1170 
   1171         jmp         done
   1172         ; yoffset==0: horizontal pass only. rdi, rsi, rdx (dst_pitch), xmm1 and xmm2 were already set up before the jump here.
   1173 b16x16_fp_only:
   1174         lea         rcx,        [rdi+rdx*8]
   1175         lea         rcx,        [rcx+rdx*8]
   1176         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
   1177         pxor        xmm0,       xmm0
   1178 
   1179 next_row_fpo:
   1180         movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
   1181         movdqa      xmm4,       xmm3                 ; make a copy of current line
   1182 
   1183         punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
   1184         punpckhbw   xmm4,       xmm0
   1185 
   1186         pmullw      xmm3,       xmm1
   1187         pmullw      xmm4,       xmm1
   1188 
   1189         movdqu      xmm5,       [rsi+1]
   1190         movdqa      xmm6,       xmm5
   1191 
   1192         punpcklbw   xmm5,       xmm0
   1193         punpckhbw   xmm6,       xmm0
   1194 
   1195         pmullw      xmm5,       xmm2
   1196         pmullw      xmm6,       xmm2
   1197 
   1198         paddw       xmm3,       xmm5
   1199         paddw       xmm4,       xmm6
   1200 
   1201         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1202         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1203 
   1204         paddw       xmm4,       [GLOBAL(rd)]
   1205         psraw       xmm4,       VP8_FILTER_SHIFT
   1206 
   1207         packuswb    xmm3,       xmm4
   1208         movdqa      [rdi],      xmm3                 ; store the results in the destination
   1209 
   1210         add         rsi,        rax                 ; next line
   1211         add         rdi,        rdx                 ; dst_pitch
   1212         cmp         rdi,        rcx
   1213         jne         next_row_fpo
   1214         ; all three paths end here; note that "done" and "next_row" are plain (non-local) labels, so their names are file-wide
   1215 done:
   1216     ; begin epilog
   1217     pop rdi
   1218     pop rsi
   1219     RESTORE_GOT
   1220     RESTORE_XMM
   1221     UNSHADOW_ARGS
   1222     pop         rbp
   1223     ret
   1224 
   1225 ; Bilinear prediction of an 8x8 block: nine 16-byte source rows are copied to an aligned stack buffer, then a horizontal and a vertical 2-tap pass are applied to produce eight 8-byte output rows. Both passes always run (no xoffset/yoffset==0 shortcut here).
   1226 ;void vp8_bilinear_predict8x8_sse2
   1227 ;(
   1228 ;    unsigned char  *src_ptr,
   1229 ;    int   src_pixels_per_line,
   1230 ;    int  xoffset,
   1231 ;    int  yoffset,
   1232 ;    unsigned char *dst_ptr,
   1233 ;    int dst_pitch
   1234 ;)
   1235 extern sym(vp8_bilinear_filters_mmx)
   1236 global sym(vp8_bilinear_predict8x8_sse2)
   1237 sym(vp8_bilinear_predict8x8_sse2):
   1238     push        rbp
   1239     mov         rbp, rsp
   1240     SHADOW_ARGS_TO_STACK 6
   1241     SAVE_XMM
   1242     GET_GOT     rbx
   1243     push        rsi
   1244     push        rdi
   1245     ; end prolog
   1246     ; ALIGN_STACK comes from the included x86_abi_support.asm; presumably it aligns rsp to 16 and pushes the old rsp (matched by "pop rsp" below) - confirm
   1247     ALIGN_STACK 16, rax
   1248     sub         rsp, 144                         ; reserve 144 bytes
   1249     ; 144 bytes = 9 rows * 16 bytes; the rows are stored with movdqa, so this buffer has to be 16-byte aligned
   1250     ;const short *HFilter = bilinear_filters_mmx[xoffset]
   1251     ;const short *VFilter = bilinear_filters_mmx[yoffset]
   1252         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]
   1253 
   1254         mov         rsi,        arg(0) ;src_ptr
   1255         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
   1256 
   1257     ;Read 9-line unaligned data in and put them on stack. This gives a big
   1258     ;performance boost.
   1259         movdqu      xmm0,       [rsi]
   1260         lea         rax,        [rdx + rdx*2]        ; rax = 3 * source stride
   1261         movdqu      xmm1,       [rsi+rdx]
   1262         movdqu      xmm2,       [rsi+rdx*2]
   1263         add         rsi,        rax
   1264         movdqu      xmm3,       [rsi]
   1265         movdqu      xmm4,       [rsi+rdx]
   1266         movdqu      xmm5,       [rsi+rdx*2]
   1267         add         rsi,        rax
   1268         movdqu      xmm6,       [rsi]
   1269         movdqu      xmm7,       [rsi+rdx]
   1270         ; row 0 is written out first so that xmm0 can be reused for row 8
   1271         movdqa      XMMWORD PTR [rsp],            xmm0
   1272 
   1273         movdqu      xmm0,       [rsi+rdx*2]
   1274 
   1275         movdqa      XMMWORD PTR [rsp+16],         xmm1
   1276         movdqa      XMMWORD PTR [rsp+32],         xmm2
   1277         movdqa      XMMWORD PTR [rsp+48],         xmm3
   1278         movdqa      XMMWORD PTR [rsp+64],         xmm4
   1279         movdqa      XMMWORD PTR [rsp+80],         xmm5
   1280         movdqa      XMMWORD PTR [rsp+96],         xmm6
   1281         movdqa      XMMWORD PTR [rsp+112],        xmm7
   1282         movdqa      XMMWORD PTR [rsp+128],        xmm0
   1283         ; each offset selects a 32-byte table entry (shl 5) holding two 16-byte tap vectors
   1284         movsxd      rax,        dword ptr arg(2) ;xoffset
   1285         shl         rax,        5
   1286         add         rax,        rcx    ;HFilter
   1287 
   1288         mov         rdi,        arg(4) ;dst_ptr
   1289         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
   1290         ; xmm1 / xmm2 = the two horizontal taps
   1291         movdqa      xmm1,       [rax]
   1292         movdqa      xmm2,       [rax+16]
   1293 
   1294         movsxd      rax,        dword ptr arg(3) ;yoffset
   1295         shl         rax,        5
   1296         add         rax,        rcx    ;VFilter
   1297 
   1298         lea         rcx,        [rdi+rdx*8]          ; rcx = dst_ptr + 8*dst_pitch (loop end pointer)
   1299         ; xmm5 / xmm6 = the two vertical taps
   1300         movdqa      xmm5,       [rax]
   1301         movdqa      xmm6,       [rax+16]
   1302 
   1303         pxor        xmm0,       xmm0
   1304 
   1305         ; get the first horizontal line done
   1306         movdqa      xmm3,       XMMWORD PTR [rsp]
   1307         movdqa      xmm4,       xmm3                 ; make a copy of current line
   1308         psrldq      xmm4,       1                    ; shift right by one byte: pixels at x+1
   1309 
   1310         punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
   1311         punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
   1312 
   1313         pmullw      xmm3,       xmm1
   1314         pmullw      xmm4,       xmm2
   1315 
   1316         paddw       xmm3,       xmm4
   1317 
   1318         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1319         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1320         ; xmm7 = first-pass result of the previous line, kept as eight words; rsp itself is used as the row pointer
   1321         movdqa      xmm7,       xmm3
   1322         add         rsp,        16                 ; next line
   1323 next_row8x8:
   1324         movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
   1325         movdqa      xmm4,       xmm3                 ; make a copy of current line
   1326         psrldq      xmm4,       1
   1327 
   1328         punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
   1329         punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
   1330 
   1331         pmullw      xmm3,       xmm1
   1332         pmullw      xmm4,       xmm2
   1333 
   1334         paddw       xmm3,       xmm4
   1335         pmullw      xmm7,       xmm5                 ; previous line * first vertical tap
   1336 
   1337         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1338         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1339 
   1340         movdqa      xmm4,       xmm3                 ; keep this line's first-pass result for the next iteration
   1341 
   1342         pmullw      xmm3,       xmm6                 ; current line * second vertical tap
   1343         paddw       xmm3,       xmm7
   1344 
   1345         movdqa      xmm7,       xmm4
   1346 
   1347         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1348         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
   1349 
   1350         packuswb    xmm3,       xmm0
   1351         movq        [rdi],      xmm3                 ; store the results in the destination
   1352 
   1353         add         rsp,        16                 ; next line
   1354         add         rdi,        rdx
   1355 
   1356         cmp         rdi,        rcx
   1357         jne         next_row8x8
   1358     ; rsp has now advanced 16 (first line) + 8*16 (loop) = 144 bytes, i.e. back to its value before "sub rsp, 144", so no explicit add is needed
   1359     ;add rsp, 144
   1360     pop rsp
   1361     ; begin epilog
   1362     pop rdi
   1363     pop rsi
   1364     RESTORE_GOT
   1365     RESTORE_XMM
   1366     UNSHADOW_ARGS
   1367     pop         rbp
   1368     ret
   1369 
   1370 
   1371 SECTION_RODATA
   1372 align 16
   1373 rd:                       ; rounding constant added before the arithmetic shift by VP8_FILTER_SHIFT (7)
   1374     times 8 dw 0x40       ; eight words of 0x40 (64 = half of 128), one per 16-bit lane; 16-byte aligned for movdqa / memory-operand use
   1375