Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 ; VP8_FILTER_SHIFT is the arithmetic right shift applied after the rounding constant rd is added.
     14 ; BLOCK_HEIGHT_WIDTH and vp8_filter_weight are not referenced by the code visible in this file.
     15 %define BLOCK_HEIGHT_WIDTH 4
     16 %define vp8_filter_weight 128
     17 %define VP8_FILTER_SHIFT  7
     18 ; Horizontal 6-tap filter pass: each loop iteration reads source bytes p-2..p6 of one row
     19 ; (pN = byte at src_ptr[N]) and stores 4 filtered values to output_ptr as 16-bit words.
     20 ;void vp8_filter_block1d_h6_mmx
     21 ;(
     22 ;    unsigned char   *src_ptr,              ; first source pixel of the row; bytes -2..+6 are read
     23 ;    unsigned short  *output_ptr,           ; receives 4 words per row, each clamped to 0..255
     24 ;    unsigned int    src_pixels_per_line,   ; added to the source pointer after each row
     25 ;    unsigned int    pixel_step,            ; not read by this routine
     26 ;    unsigned int    output_height,         ; row count; tested only after a row is done, so it must be nonzero
     27 ;    unsigned int    output_width,          ; added to the output pointer, as a byte offset, after each row
     28 ;    short           * vp8_filter           ; six taps spaced 16 bytes apart; 4 words are used from each
     29 ;)
     30 global sym(vp8_filter_block1d_h6_mmx)
     31 sym(vp8_filter_block1d_h6_mmx):
     32     push        rbp
     33     mov         rbp, rsp
     34     SHADOW_ARGS_TO_STACK 7
     35     GET_GOT     rbx
     36     push        rsi
     37     push        rdi
     38     ; end prolog
     39 ; Register roles: rsi = source, rdi = output, rcx = rows left, rax = output step, rdx = filter, mm0 = 0.
     40         mov         rdx,    arg(6) ;rdx = vp8_filter; tap k lives at [rdx + 16*k]
     41 
     42         movq        mm1,    [rdx + 16]             ; mm1 = tap 1 (taps 1 and 4 are accumulated first)
     43         movq        mm2,    [rdx + 32]         ; mm2 = tap 2
     44         movq        mm6,    [rdx + 48]        ; mm6 = tap 3
     45         movq        mm7,    [rdx + 64]        ; mm7 = tap 4 (taps 0 and 5 are read from memory inside the loop)
     46 
     47         mov         rdi,    arg(1) ;output_ptr
     48         mov         rsi,    arg(0) ;src_ptr
     49         movsxd      rcx,    dword ptr arg(4) ;output_height, used as the row counter
     50         movsxd      rax,    dword ptr arg(5) ;output_width, used below as the per-row byte step of rdi
     51         pxor        mm0,    mm0              ; mm0 = 0, used to widen bytes to words
     52 
     53 nextrow:
     54         movq        mm3,    [rsi-2]          ; mm3 = p-2..p5 (8 bytes)
     55         movq        mm4,    mm3              ; mm4 = p-2..p5
     56         psrlq       mm3,    8                ; mm3 = p-1..p5
     57         punpcklbw   mm3,    mm0              ; mm3 = p-1..p2 as words
     58         pmullw      mm3,    mm1              ; mm3 *= tap 1
     59 
     60         movq        mm5,    mm4              ; mm5 = p-2..p5
     61         punpckhbw   mm4,    mm0              ; mm4 = p2..p5 as words
     62         pmullw      mm4,    mm7              ; mm4 *= tap 4
     63         paddsw      mm3,    mm4              ; mm3 += mm4 (signed saturating add)
     64 
     65         movq        mm4,    mm5              ; mm4 = p-2..p5
     66         psrlq       mm5,    16               ; mm5 = p0..p5
     67         punpcklbw   mm5,    mm0              ; mm5 = p0..p3 as words
     68         pmullw      mm5,    mm2              ; mm5 *= tap 2
     69         paddsw      mm3,    mm5              ; mm3 += mm5
     70 
     71         movq        mm5,    mm4              ; mm5 = p-2..p5 (unshifted copy kept for tap 0)
     72         psrlq       mm4,    24               ; mm4 = p1..p5
     73         punpcklbw   mm4,    mm0              ; mm4 = p1..p4 as words
     74         pmullw      mm4,    mm6              ; mm4 *= tap 3
     75         paddsw      mm3,    mm4              ; mm3 += mm4
     76 
     77         ; outer taps: tap 5 on p3..p6, then tap 0 on p-2..p1
     78         movd        mm4,    [rsi+3]          ; mm4 = p3..p6 (4 bytes)
     79         punpcklbw   mm4,    mm0              ; mm4 = p3..p6 as words
     80         pmullw      mm4,    [rdx+80]         ; mm4 *= tap 5
     81         paddsw      mm3,    mm4              ; mm3 += mm4
     82 
     83         punpcklbw   mm5,    mm0              ; mm5 = p-2..p1 as words
     84         pmullw      mm5,    [rdx]            ; mm5 *= tap 0
     85         paddsw      mm3,    mm5              ; mm3 += mm5
     86 
     87         paddsw      mm3,    [GLOBAL(rd)]              ; mm3 += rounding constant rd
     88         psraw       mm3,    VP8_FILTER_SHIFT     ; mm3 >>= VP8_FILTER_SHIFT (arithmetic shift)
     89         packuswb    mm3,    mm0              ; clamp each word to 0..255 by packing to bytes ...
     90         punpcklbw   mm3,    mm0              ; ... then widen the 4 results back to words
     91 
     92         movq        [rdi],  mm3              ; store 4 words to the output
     93 
     94 %if ABI_IS_32BIT
     95         add         rsi,    dword ptr arg(2) ;rsi += src_pixels_per_line (next source row)
     96         add         rdi,    rax; next output row
     97 %else
     98         movsxd      r8,     dword ptr arg(2) ;r8 = src_pixels_per_line, reloaded on every iteration
     99         add         rdi,    rax; next output row
    100 
    101         add         rsi,    r8               ; next source row
    102 %endif
    103 
    104         dec         rcx                      ; one row done
    105         jnz         nextrow                  ; loop while rows remain
    106 
    107     ; begin epilog
    108     pop rdi
    109     pop rsi
    110     RESTORE_GOT
    111     UNSHADOW_ARGS
    112     pop         rbp
    113     ret
    114 ; Vertical 6-tap filter pass: each loop iteration combines 4 words from each of six
    115 ; source rows (row -2 .. row +3 relative to the output row) and stores 4 bytes to output_ptr.
    116 ;void vp8_filter_block1dc_v6_mmx
    117 ;(
    118 ;   short *src_ptr,                ; row 0 of the 16-bit input; rows -2..+3 are read
    119 ;   unsigned char *output_ptr,     ; receives 4 bytes per row
    120 ;    int output_pitch,             ; added to the output pointer after each row
    121 ;   unsigned int pixels_per_line,  ; added to the source address as the byte distance between rows
    122 ;   unsigned int pixel_step,       ; not read by this routine
    123 ;   unsigned int output_height,    ; row count; tested only after a row is done, so it must be nonzero
    124 ;   unsigned int output_width,     ; not read by this routine
    125 ;   short * vp8_filter             ; six taps spaced 16 bytes apart; 4 words are used from each
    126 ;)
    127 global sym(vp8_filter_block1dc_v6_mmx)
    128 sym(vp8_filter_block1dc_v6_mmx):
    129     push        rbp
    130     mov         rbp, rsp
    131     SHADOW_ARGS_TO_STACK 8
    132     GET_GOT     rbx
    133     push        rsi
    134     push        rdi
    135     ; end prolog
    136 ; NOTE(review): rd is loaded before rbx is repurposed, presumably because GLOBAL() can rely on the register set up by GET_GOT -- confirm in x86_abi_support.asm
    137         movq      mm5, [GLOBAL(rd)]       ; mm5 = rounding constant rd
    138         push        rbx                   ; save rbx; it is reused as the filter pointer until the pop below
    139         mov         rbx, arg(7) ;rbx = vp8_filter; tap k lives at [rbx + 16*k]
    140         movq      mm1, [rbx + 16]             ; mm1 = tap 1
    141         movq      mm2, [rbx + 32]         ; mm2 = tap 2
    142         movq      mm6, [rbx + 48]        ; mm6 = tap 3
    143         movq      mm7, [rbx + 64]        ; mm7 = tap 4 (taps 0 and 5 are read from memory inside the loop)
    144 
    145         movsxd      rdx, dword ptr arg(3) ;pixels_per_line (row stride in bytes)
    146         mov         rdi, arg(1) ;output_ptr
    147         mov         rsi, arg(0) ;src_ptr
    148         sub         rsi, rdx              ; back up two rows ...
    149         sub         rsi, rdx              ; ... so rsi points at row -2
    150         movsxd      rcx, DWORD PTR arg(5) ;output_height, used as the row counter
    151         movsxd      rax, DWORD PTR arg(2) ;output_pitch, the per-row step of rdi
    152         pxor        mm0, mm0              ; mm0 = 0
    153 
    154 
    155 nextrow_cv:
    156         movq        mm3, [rsi+rdx]        ; mm3 = 4 words of row -1
    157         pmullw      mm3, mm1              ; mm3 *= tap 1
    158 
    159 
    160         movq        mm4, [rsi + 4*rdx]      ; mm4 = 4 words of row 2
    161         pmullw      mm4, mm7              ; mm4 *= tap 4
    162         paddsw      mm3, mm4              ; mm3 += mm4 (signed saturating add)
    163 
    164         movq        mm4, [rsi + 2*rdx]           ; mm4 = 4 words of row 0
    165         pmullw      mm4, mm2              ; mm4 *= tap 2
    166         paddsw      mm3, mm4              ; mm3 += mm4
    167 
    168         movq        mm4, [rsi]            ; mm4 = 4 words of row -2
    169         pmullw      mm4, [rbx]            ; mm4 *= tap 0
    170         paddsw      mm3, mm4              ; mm3 += mm4
    171 
    172 
    173         add         rsi, rdx              ; advance one row: avoids a 3*pitch address and is the loop's only source advance
    174         movq        mm4, [rsi + 2*rdx]     ; mm4 = 4 words of row 1 (relative to this iteration's output row)
    175         pmullw      mm4, mm6              ; mm4 *= tap 3
    176         paddsw      mm3, mm4              ; mm3 += mm4
    177 
    178         movq        mm4, [rsi + 4*rdx]    ; mm4 = 4 words of row 3
    179         pmullw      mm4, [rbx +80]        ; mm4 *= tap 5
    180         paddsw      mm3, mm4              ; mm3 += mm4
    181 
    182 
    183         paddsw      mm3, mm5               ; mm3 += rounding constant rd
    184         psraw       mm3, VP8_FILTER_SHIFT     ; mm3 >>= VP8_FILTER_SHIFT (arithmetic shift)
    185         packuswb    mm3, mm0              ; pack to bytes with unsigned saturation
    186 
    187         movd        [rdi],mm3             ; store 4 result bytes
    188         ; Each iteration re-reads five of the six source rows read by the previous
    189         ; one.  The original author notes the data should already be in cache, so
    190         ; this should not cost much, but the repeated reads are avoidable.
    191         lea         rdi,  [rdi+rax] ; next output row
    192         dec         rcx                   ; one row done
    193         jnz         nextrow_cv             ; loop while rows remain
    194 
    195         pop         rbx                   ; restore the rbx value saved before the filter pointer was loaded
    196 
    197     ; begin epilog
    198     pop rdi
    199     pop rsi
    200     RESTORE_GOT
    201     UNSHADOW_ARGS
    202     pop         rbp
    203     ret
    204 ; Bilinear prediction, 8 pixels wide by 8 rows: a horizontal 2-tap pass over each source
    205 ; row, then a vertical 2-tap pass between each pair of consecutive filtered rows.
    206 ;void vp8_bilinear_predict8x8_mmx
    207 ;(
    208 ;    unsigned char  *src_ptr,        ; bytes 0..8 of each of 9 consecutive source rows are read
    209 ;    int   src_pixels_per_line,      ; added to the source pointer after each row
    210 ;    int  xoffset,                   ; index into vp8_bilinear_filters_mmx for the horizontal taps
    211 ;    int  yoffset,                   ; index into vp8_bilinear_filters_mmx for the vertical taps
    212 ;   unsigned char *dst_ptr,          ; 8 bytes are stored per output row
    213 ;    int dst_pitch                   ; added to the destination pointer after each row
    214 ;)
    215 global sym(vp8_bilinear_predict8x8_mmx)
    216 sym(vp8_bilinear_predict8x8_mmx):
    217     push        rbp
    218     mov         rbp, rsp
    219     SHADOW_ARGS_TO_STACK 6
    220     GET_GOT     rbx
    221     push        rsi
    222     push        rdi
    223     ; end prolog
    224 
    225     ;const short *HFilter = vp8_bilinear_filters_mmx[xoffset];
    226     ;const short *VFilter = vp8_bilinear_filters_mmx[yoffset];
    227 
    228         movsxd      rax,        dword ptr arg(2) ;xoffset
    229         mov         rdi,        arg(4) ;dst_ptr
    230 
    231         shl         rax,        5 ; xoffset * 32 (each table entry is 32 bytes)
    232         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]
    233 
    234         add         rax,        rcx ; rax = HFilter
    235         mov         rsi,        arg(0) ;src_ptr
    236 
    237         movsxd      rdx,        dword ptr arg(5) ;dst_pitch (only needed here to compute the end pointer)
    238         movq        mm1,        [rax]               ; mm1 = HFilter tap 0
    239 
    240         movq        mm2,        [rax+16]            ; mm2 = HFilter tap 1
    241         movsxd      rax,        dword ptr arg(3) ;yoffset
    242 
    243         pxor        mm0,        mm0                 ; mm0 = 0, used to widen bytes to words
    244 
    245         shl         rax,        5 ; yoffset * 32
    246         add         rax,        rcx ; rax = VFilter
    247 
    248         lea         rcx,        [rdi+rdx*8]          ; rcx = dst_ptr + 8*dst_pitch, the loop's end pointer
    249         movsxd      rdx,        dword ptr arg(1) ;rdx = src_pixels_per_line from here on
    250 
    251 
    252 
    253         ; horizontal pass on the first source row (it has no vertical partner yet)
    254         movq        mm3,        [rsi]               ; mm3 = source bytes 0..7
    255         movq        mm4,        mm3                 ; make a copy of current line
    256 
    257         punpcklbw   mm3,        mm0                 ; mm3 = bytes 0..3 as words
    258         punpckhbw   mm4,        mm0                 ; mm4 = bytes 4..7 as words
    259 
    260         pmullw      mm3,        mm1                 ; *= HFilter tap 0
    261         pmullw      mm4,        mm1                 ;
    262 
    263         movq        mm5,        [rsi+1]             ; mm5 = source bytes 1..8
    264         movq        mm6,        mm5                 ;
    265 
    266         punpcklbw   mm5,        mm0                 ; mm5 = bytes 1..4 as words
    267         punpckhbw   mm6,        mm0                 ; mm6 = bytes 5..8 as words
    268 
    269         pmullw      mm5,        mm2                 ; *= HFilter tap 1
    270         pmullw      mm6,        mm2                 ;
    271 
    272         paddw       mm3,        mm5                 ; sum of the two horizontal taps
    273         paddw       mm4,        mm6                 ;
    274 
    275         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += rounding constant rd
    276         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT
    277 
    278         paddw       mm4,        [GLOBAL(rd)]                 ; same for the high half
    279         psraw       mm4,        VP8_FILTER_SHIFT        ;
    280 
    281         movq        mm7,        mm3                 ; mm7 = filtered row packed to 8 bytes;
    282         packuswb    mm7,        mm4                 ; it is the "previous row" for the vertical pass
    283 
    284         add         rsi,        rdx                 ; next line
    285 next_row_8x8:
    286         movq        mm3,        [rsi]               ; mm3 = source bytes 0..7 of the current row
    287         movq        mm4,        mm3                 ; make a copy of current line
    288 
    289         punpcklbw   mm3,        mm0                 ; mm3 = bytes 0..3 as words
    290         punpckhbw   mm4,        mm0                 ; mm4 = bytes 4..7 as words
    291 
    292         pmullw      mm3,        mm1                 ; *= HFilter tap 0
    293         pmullw      mm4,        mm1                 ;
    294 
    295         movq        mm5,        [rsi+1]             ; mm5 = source bytes 1..8
    296         movq        mm6,        mm5                 ;
    297 
    298         punpcklbw   mm5,        mm0                 ; mm5 = bytes 1..4 as words
    299         punpckhbw   mm6,        mm0                 ; mm6 = bytes 5..8 as words
    300 
    301         pmullw      mm5,        mm2                 ; *= HFilter tap 1
    302         pmullw      mm6,        mm2                 ;
    303 
    304         paddw       mm3,        mm5                 ; sum of the two horizontal taps
    305         paddw       mm4,        mm6                 ;
    306 
    307         movq        mm5,        mm7                 ; fetch the previous filtered row (packed bytes)
    308         movq        mm6,        mm7                 ;
    309 
    310         punpcklbw   mm5,        mm0                 ; widen it to words: low half in mm5,
    311         punpckhbw   mm6,        mm0                 ; high half in mm6
    312 
    313         pmullw      mm5,        [rax]               ; previous row *= VFilter tap 0
    314         pmullw      mm6,        [rax]               ;
    315 
    316         paddw       mm3,        [GLOBAL(rd)]                 ; round the current row's horizontal result
    317         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT
    318 
    319         paddw       mm4,        [GLOBAL(rd)]                 ;
    320         psraw       mm4,        VP8_FILTER_SHIFT        ;
    321 
    322         movq        mm7,        mm3                 ; save the current filtered row, packed to bytes,
    323         packuswb    mm7,        mm4                 ; as the previous row of the next iteration
    324 
    325 
    326         pmullw      mm3,        [rax+16]            ; current row *= VFilter tap 1
    327         pmullw      mm4,        [rax+16]            ;
    328 
    329         paddw       mm3,        mm5                 ; sum of the two vertical taps
    330         paddw       mm4,        mm6                 ;
    331 
    332 
    333         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += rounding constant rd
    334         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT
    335 
    336         paddw       mm4,        [GLOBAL(rd)]                 ;
    337         psraw       mm4,        VP8_FILTER_SHIFT        ;
    338 
    339         packuswb    mm3,        mm4                 ; pack both halves to 8 bytes with unsigned saturation
    340 
    341         movq        [rdi],      mm3                 ; store 8 result bytes
    342 
    343 %if ABI_IS_32BIT
    344         add         rsi,        rdx                 ; next line
    345         add         rdi,        dword ptr arg(5) ;rdi += dst_pitch
    346 %else
    347         movsxd      r8,         dword ptr arg(5) ;r8 = dst_pitch, reloaded on every iteration
    348         add         rsi,        rdx                 ; next line
    349         add         rdi,        r8                  ;rdi += dst_pitch
    350 %endif
    351         cmp         rdi,        rcx                 ; stop when rdi reaches dst_ptr + 8*dst_pitch
    352         jne         next_row_8x8
    353 
    354     ; begin epilog
    355     pop rdi
    356     pop rsi
    357     RESTORE_GOT
    358     UNSHADOW_ARGS
    359     pop         rbp
    360     ret
    361 ; Bilinear prediction, 8 pixels wide by 4 rows: a horizontal 2-tap pass over each source
    362 ; row, then a vertical 2-tap pass between each pair of consecutive filtered rows.
    363 ;void vp8_bilinear_predict8x4_mmx
    364 ;(
    365 ;    unsigned char  *src_ptr,        ; bytes 0..8 of each of 5 consecutive source rows are read
    366 ;    int   src_pixels_per_line,      ; added to the source pointer after each row
    367 ;    int  xoffset,                   ; index into vp8_bilinear_filters_mmx for the horizontal taps
    368 ;    int  yoffset,                   ; index into vp8_bilinear_filters_mmx for the vertical taps
    369 ;    unsigned char *dst_ptr,         ; 8 bytes are stored per output row
    370 ;    int dst_pitch                   ; added to the destination pointer after each row
    371 ;)
    372 global sym(vp8_bilinear_predict8x4_mmx)
    373 sym(vp8_bilinear_predict8x4_mmx):
    374     push        rbp
    375     mov         rbp, rsp
    376     SHADOW_ARGS_TO_STACK 6
    377     GET_GOT     rbx
    378     push        rsi
    379     push        rdi
    380     ; end prolog
    381 
    382     ;const short *HFilter = vp8_bilinear_filters_mmx[xoffset];
    383     ;const short *VFilter = vp8_bilinear_filters_mmx[yoffset];
    384 
    385         movsxd      rax,        dword ptr arg(2) ;xoffset
    386         mov         rdi,        arg(4) ;dst_ptr
    387 
    388         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]
    389         shl         rax,        5                   ; xoffset * 32 (each table entry is 32 bytes)
    390 
    391         mov         rsi,        arg(0) ;src_ptr
    392         add         rax,        rcx                 ; rax = HFilter
    393 
    394         movsxd      rdx,        dword ptr arg(5) ;dst_pitch (only needed here to compute the end pointer)
    395         movq        mm1,        [rax]               ; mm1 = HFilter tap 0
    396 
    397         movq        mm2,        [rax+16]            ; mm2 = HFilter tap 1
    398         movsxd      rax,        dword ptr arg(3) ;yoffset
    399 
    400         pxor        mm0,        mm0                 ; mm0 = 0, used to widen bytes to words
    401         shl         rax,        5                   ; yoffset * 32
    402 
    403         add         rax,        rcx                 ; rax = VFilter
    404         lea         rcx,        [rdi+rdx*4]          ; rcx = dst_ptr + 4*dst_pitch, the loop's end pointer
    405 
    406         movsxd      rdx,        dword ptr arg(1) ;rdx = src_pixels_per_line from here on
    407 
    408         ; horizontal pass on the first source row (it has no vertical partner yet)
    409         movq        mm3,        [rsi]               ; mm3 = source bytes 0..7
    410         movq        mm4,        mm3                 ; make a copy of current line
    411 
    412         punpcklbw   mm3,        mm0                 ; mm3 = bytes 0..3 as words
    413         punpckhbw   mm4,        mm0                 ; mm4 = bytes 4..7 as words
    414 
    415         pmullw      mm3,        mm1                 ; *= HFilter tap 0
    416         pmullw      mm4,        mm1                 ;
    417 
    418         movq        mm5,        [rsi+1]             ; mm5 = source bytes 1..8
    419         movq        mm6,        mm5                 ;
    420 
    421         punpcklbw   mm5,        mm0                 ; mm5 = bytes 1..4 as words
    422         punpckhbw   mm6,        mm0                 ; mm6 = bytes 5..8 as words
    423 
    424         pmullw      mm5,        mm2                 ; *= HFilter tap 1
    425         pmullw      mm6,        mm2                 ;
    426 
    427         paddw       mm3,        mm5                 ; sum of the two horizontal taps
    428         paddw       mm4,        mm6                 ;
    429 
    430         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += rounding constant rd
    431         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT
    432 
    433         paddw       mm4,        [GLOBAL(rd)]                 ; same for the high half
    434         psraw       mm4,        VP8_FILTER_SHIFT        ;
    435 
    436         movq        mm7,        mm3                 ; mm7 = filtered row packed to 8 bytes;
    437         packuswb    mm7,        mm4                 ; it is the "previous row" for the vertical pass
    438 
    439         add         rsi,        rdx                 ; next line
    440 next_row_8x4:
    441         movq        mm3,        [rsi]               ; mm3 = source bytes 0..7 of the current row
    442         movq        mm4,        mm3                 ; make a copy of current line
    443 
    444         punpcklbw   mm3,        mm0                 ; mm3 = bytes 0..3 as words
    445         punpckhbw   mm4,        mm0                 ; mm4 = bytes 4..7 as words
    446 
    447         pmullw      mm3,        mm1                 ; *= HFilter tap 0
    448         pmullw      mm4,        mm1                 ;
    449 
    450         movq        mm5,        [rsi+1]             ; mm5 = source bytes 1..8
    451         movq        mm6,        mm5                 ;
    452 
    453         punpcklbw   mm5,        mm0                 ; mm5 = bytes 1..4 as words
    454         punpckhbw   mm6,        mm0                 ; mm6 = bytes 5..8 as words
    455 
    456         pmullw      mm5,        mm2                 ; *= HFilter tap 1
    457         pmullw      mm6,        mm2                 ;
    458 
    459         paddw       mm3,        mm5                 ; sum of the two horizontal taps
    460         paddw       mm4,        mm6                 ;
    461 
    462         movq        mm5,        mm7                 ; fetch the previous filtered row (packed bytes)
    463         movq        mm6,        mm7                 ;
    464 
    465         punpcklbw   mm5,        mm0                 ; widen it to words: low half in mm5,
    466         punpckhbw   mm6,        mm0                 ; high half in mm6
    467 
    468         pmullw      mm5,        [rax]               ; previous row *= VFilter tap 0
    469         pmullw      mm6,        [rax]               ;
    470 
    471         paddw       mm3,        [GLOBAL(rd)]                 ; round the current row's horizontal result
    472         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT
    473 
    474         paddw       mm4,        [GLOBAL(rd)]                 ;
    475         psraw       mm4,        VP8_FILTER_SHIFT        ;
    476 
    477         movq        mm7,        mm3                 ; save the current filtered row, packed to bytes,
    478         packuswb    mm7,        mm4                 ; as the previous row of the next iteration
    479 
    480 
    481         pmullw      mm3,        [rax+16]            ; current row *= VFilter tap 1
    482         pmullw      mm4,        [rax+16]            ;
    483 
    484         paddw       mm3,        mm5                 ; sum of the two vertical taps
    485         paddw       mm4,        mm6                 ;
    486 
    487 
    488         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += rounding constant rd
    489         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT
    490 
    491         paddw       mm4,        [GLOBAL(rd)]                 ;
    492         psraw       mm4,        VP8_FILTER_SHIFT        ;
    493 
    494         packuswb    mm3,        mm4                 ; pack both halves to 8 bytes with unsigned saturation
    495 
    496         movq        [rdi],      mm3                 ; store 8 result bytes
    497 
    498 %if ABI_IS_32BIT
    499         add         rsi,        rdx                 ; next line
    500         add         rdi,        dword ptr arg(5) ;rdi += dst_pitch
    501 %else
    502         movsxd      r8,         dword ptr arg(5) ;r8 = dst_pitch, reloaded on every iteration
    503         add         rsi,        rdx                 ; next line
    504         add         rdi,        r8                  ; rdi += dst_pitch
    505 %endif
    506         cmp         rdi,        rcx                 ; stop when rdi reaches dst_ptr + 4*dst_pitch
    507         jne         next_row_8x4
    508 
    509     ; begin epilog
    510     pop rdi
    511     pop rsi
    512     RESTORE_GOT
    513     UNSHADOW_ARGS
    514     pop         rbp
    515     ret
    516 ; Bilinear prediction, 4 pixels wide by 4 rows: a horizontal 2-tap pass over each source
    517 ; row, then a vertical 2-tap pass between each pair of consecutive filtered rows.
    518 ;void vp8_bilinear_predict4x4_mmx
    519 ;(
    520 ;    unsigned char  *src_ptr,        ; bytes 0..4 of each of 5 consecutive source rows are read
    521 ;    int   src_pixels_per_line,      ; added to the source pointer after each row
    522 ;    int  xoffset,                   ; index into vp8_bilinear_filters_mmx for the horizontal taps
    523 ;    int  yoffset,                   ; index into vp8_bilinear_filters_mmx for the vertical taps
    524 ;    unsigned char *dst_ptr,         ; 4 bytes are stored per output row
    525 ;    int dst_pitch                   ; added to the destination pointer after each row
    526 ;)
    527 global sym(vp8_bilinear_predict4x4_mmx)
    528 sym(vp8_bilinear_predict4x4_mmx):
    529     push        rbp
    530     mov         rbp, rsp
    531     SHADOW_ARGS_TO_STACK 6
    532     GET_GOT     rbx
    533     push        rsi
    534     push        rdi
    535     ; end prolog
    536 
    537     ;const short *HFilter = vp8_bilinear_filters_mmx[xoffset];
    538     ;const short *VFilter = vp8_bilinear_filters_mmx[yoffset];
    539 
    540         movsxd      rax,        dword ptr arg(2) ;xoffset
    541         mov         rdi,        arg(4) ;dst_ptr
    542 
    543         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]
    544         shl         rax,        5                   ; xoffset * 32 (each table entry is 32 bytes)
    545 
    546         add         rax,        rcx ; rax = HFilter
    547         mov         rsi,        arg(0) ;src_ptr
    548 
    549         movsxd      rdx,        dword ptr arg(5) ;dst_pitch (only needed here to compute the end pointer)
    550         movq        mm1,        [rax]               ; mm1 = HFilter tap 0
    551 
    552         movq        mm2,        [rax+16]            ; mm2 = HFilter tap 1
    553         movsxd      rax,        dword ptr arg(3) ;yoffset
    554 
    555         pxor        mm0,        mm0                 ; mm0 = 0, used to widen bytes to words
    556         shl         rax,        5                   ; yoffset * 32
    557 
    558         add         rax,        rcx                 ; rax = VFilter
    559         lea         rcx,        [rdi+rdx*4]          ; rcx = dst_ptr + 4*dst_pitch, the loop's end pointer
    560 
    561         movsxd      rdx,        dword ptr arg(1) ;rdx = src_pixels_per_line from here on
    562 
    563         ; horizontal pass on the first source row (it has no vertical partner yet)
    564         movd        mm3,        [rsi]               ; mm3 = source bytes 0..3
    565         punpcklbw   mm3,        mm0                 ; widen to words
    566 
    567         pmullw      mm3,        mm1                 ; *= HFilter tap 0
    568         movd        mm5,        [rsi+1]             ; mm5 = source bytes 1..4
    569 
    570         punpcklbw   mm5,        mm0                 ; widen to words
    571         pmullw      mm5,        mm2                 ; *= HFilter tap 1
    572 
    573         paddw       mm3,        mm5                 ; sum of the two horizontal taps
    574         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += rounding constant rd
    575 
    576         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT
    577 
    578         movq        mm7,        mm3                 ; mm7 = filtered row packed to bytes;
    579         packuswb    mm7,        mm0                 ; it is the "previous row" for the vertical pass
    580 
    581         add         rsi,        rdx                 ; next line
    582 next_row_4x4:
    583         movd        mm3,        [rsi]               ; mm3 = source bytes 0..3 of the current row
    584         punpcklbw   mm3,        mm0                 ; widen to words
    585 
    586         pmullw      mm3,        mm1                 ; *= HFilter tap 0
    587         movd        mm5,        [rsi+1]             ; mm5 = source bytes 1..4
    588 
    589         punpcklbw   mm5,        mm0                 ; widen to words
    590         pmullw      mm5,        mm2                 ; *= HFilter tap 1
    591 
    592         paddw       mm3,        mm5                 ; sum of the two horizontal taps
    593 
    594         movq        mm5,        mm7                 ; fetch the previous filtered row (packed bytes)
    595         punpcklbw   mm5,        mm0                 ; widen it to words
    596 
    597         pmullw      mm5,        [rax]               ; previous row *= VFilter tap 0
    598         paddw       mm3,        [GLOBAL(rd)]                 ; round the current row's horizontal result
    599 
    600         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT
    601         movq        mm7,        mm3                 ; save the current filtered row ...
    602 
    603         packuswb    mm7,        mm0                 ; ... packed to bytes, as the previous row of the next iteration
    604 
    605         pmullw      mm3,        [rax+16]            ; current row *= VFilter tap 1
    606         paddw       mm3,        mm5                 ; sum of the two vertical taps
    607 
    608 
    609         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += rounding constant rd
    610         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT
    611 
    612         packuswb    mm3,        mm0                 ; pack to bytes with unsigned saturation
    613         movd        [rdi],      mm3                 ; store 4 result bytes
    614 
    615 %if ABI_IS_32BIT
    616         add         rsi,        rdx                 ; next line
    617         add         rdi,        dword ptr arg(5) ;rdi += dst_pitch
    618 %else
    619         movsxd      r8,         dword ptr arg(5) ;r8 = dst_pitch, reloaded on every iteration
    620         add         rsi,        rdx                 ; next line
    621         add         rdi,        r8                  ; rdi += dst_pitch
    622 %endif
    623 
    624         cmp         rdi,        rcx                 ; stop when rdi reaches dst_ptr + 4*dst_pitch
    625         jne         next_row_4x4
    626 
    627     ; begin epilog
    628     pop rdi
    629     pop rsi
    630     RESTORE_GOT
    631     UNSHADOW_ARGS
    632     pop         rbp
    633     ret
    634 
    635 
    636 
    637 SECTION_RODATA
    638 align 16
    639 rd: ; rounding constant: the filters add it before shifting right by VP8_FILTER_SHIFT
    640     times 4 dw 0x40 ; four 16-bit words of 0x40 (64), one per MMX word lane
    641 ; Six-tap coefficient table: 8 entries of 6 taps, each tap replicated into 8 words (16 bytes).
    642 align 16
    643 global HIDDEN_DATA(sym(vp8_six_tap_mmx))
    644 sym(vp8_six_tap_mmx):
    645     times 8 dw 0 ; entry 0, tap 0
    646     times 8 dw 0 ; tap 1
    647     times 8 dw 128 ; tap 2
    648     times 8 dw 0 ; tap 3
    649     times 8 dw 0 ; tap 4
    650     times 8 dw 0 ; tap 5
    651 ; The 16-byte spacing of taps matches the [filter + 16*k] loads in the 6-tap routines of this file.
    652     times 8 dw 0 ; entry 1
    653     times 8 dw -6
    654     times 8 dw 123
    655     times 8 dw 12
    656     times 8 dw -1
    657     times 8 dw 0
    658 
    659     times 8 dw 2 ; entry 2
    660     times 8 dw -11
    661     times 8 dw 108
    662     times 8 dw 36
    663     times 8 dw -8
    664     times 8 dw 1
    665 
    666     times 8 dw 0 ; entry 3
    667     times 8 dw -9
    668     times 8 dw 93
    669     times 8 dw 50
    670     times 8 dw -6
    671     times 8 dw 0
    672 
    673     times 8 dw 3 ; entry 4
    674     times 8 dw -16
    675     times 8 dw 77
    676     times 8 dw 77
    677     times 8 dw -16
    678     times 8 dw 3
    679 
    680     times 8 dw 0 ; entry 5
    681     times 8 dw -6
    682     times 8 dw 50
    683     times 8 dw 93
    684     times 8 dw -9
    685     times 8 dw 0
    686 
    687     times 8 dw 1 ; entry 6
    688     times 8 dw -8
    689     times 8 dw 36
    690     times 8 dw 108
    691     times 8 dw -11
    692     times 8 dw 2
    693 
    694     times 8 dw 0 ; entry 7
    695     times 8 dw -1
    696     times 8 dw 12
    697     times 8 dw 123
    698     times 8 dw -6
    699     times 8 dw 0
    700 
    701 ; Bilinear coefficient table: 8 entries of 2 taps, each tap replicated into 8 words, so one entry is 32 bytes.
    702 align 16
    703 global HIDDEN_DATA(sym(vp8_bilinear_filters_mmx))
    704 sym(vp8_bilinear_filters_mmx):
    705     times 8 dw 128 ; entry 0, tap 0 (the predict routines index this table with offset << 5)
    706     times 8 dw 0 ; entry 0, tap 1
    707 
    708     times 8 dw 112 ; entry 1
    709     times 8 dw 16
    710 
    711     times 8 dw 96 ; entry 2
    712     times 8 dw 32
    713 
    714     times 8 dw 80 ; entry 3
    715     times 8 dw 48
    716 
    717     times 8 dw 64 ; entry 4
    718     times 8 dw 64
    719 
    720     times 8 dw 48 ; entry 5
    721     times 8 dw 80
    722 
    723     times 8 dw 32 ; entry 6
    724     times 8 dw 96
    725 
    726     times 8 dw 16 ; entry 7
    727     times 8 dw 112
    728