Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 extern sym(vp8_bilinear_filters_x86_8)
     14 
     15 
     16 %define BLOCK_HEIGHT_WIDTH 4
     17 %define vp8_filter_weight 128
     18 %define VP8_FILTER_SHIFT  7
     19 
     20 SECTION .text
     21 
     22 ;void vp8_filter_block1d_h6_mmx
     23 ;(
     24 ;    unsigned char   *src_ptr,
     25 ;    unsigned short  *output_ptr,
     26 ;    unsigned int    src_pixels_per_line,
     27 ;    unsigned int    pixel_step,
     28 ;    unsigned int    output_height,
     29 ;    unsigned int    output_width,
     30 ;    short           * vp8_filter
     31 ;)
     32 global sym(vp8_filter_block1d_h6_mmx) PRIVATE
     33 sym(vp8_filter_block1d_h6_mmx):
     34     push        rbp
     35     mov         rbp, rsp
     36     SHADOW_ARGS_TO_STACK 7
     37     GET_GOT     rbx
     38     push        rsi
     39     push        rdi
     40     ; end prolog
     41 
     42         mov         rdx,    arg(6) ;vp8_filter
     43 
     44         movq        mm1,    [rdx + 16]             ; do both the negative taps first!!!
     45         movq        mm2,    [rdx + 32]         ;
     46         movq        mm6,    [rdx + 48]        ;
     47         movq        mm7,    [rdx + 64]        ;
     48 
     49         mov         rdi,    arg(1) ;output_ptr
     50         mov         rsi,    arg(0) ;src_ptr
     51         movsxd      rcx,    dword ptr arg(4) ;output_height
     52         movsxd      rax,    dword ptr arg(5) ;output_width      ; destination pitch?
     53         pxor        mm0,    mm0              ; mm0 = 00000000
     54 
     55 .nextrow:
     56         movq        mm3,    [rsi-2]          ; mm3 = p-2..p5
     57         movq        mm4,    mm3              ; mm4 = p-2..p5
     58         psrlq       mm3,    8                ; mm3 = p-1..p5
     59         punpcklbw   mm3,    mm0              ; mm3 = p-1..p2
     60         pmullw      mm3,    mm1              ; mm3 *= kernel 1 modifiers.
     61 
     62         movq        mm5,    mm4              ; mm5 = p-2..p5
     63         punpckhbw   mm4,    mm0              ; mm5 = p2..p5
     64         pmullw      mm4,    mm7              ; mm5 *= kernel 4 modifiers
     65         paddsw      mm3,    mm4              ; mm3 += mm5
     66 
     67         movq        mm4,    mm5              ; mm4 = p-2..p5;
     68         psrlq       mm5,    16               ; mm5 = p0..p5;
     69         punpcklbw   mm5,    mm0              ; mm5 = p0..p3
     70         pmullw      mm5,    mm2              ; mm5 *= kernel 2 modifiers
     71         paddsw      mm3,    mm5              ; mm3 += mm5
     72 
     73         movq        mm5,    mm4              ; mm5 = p-2..p5
     74         psrlq       mm4,    24               ; mm4 = p1..p5
     75         punpcklbw   mm4,    mm0              ; mm4 = p1..p4
     76         pmullw      mm4,    mm6              ; mm5 *= kernel 3 modifiers
     77         paddsw      mm3,    mm4              ; mm3 += mm5
     78 
     79         ; do outer positive taps
     80         movd        mm4,    [rsi+3]
     81         punpcklbw   mm4,    mm0              ; mm5 = p3..p6
     82         pmullw      mm4,    [rdx+80]         ; mm5 *= kernel 0 modifiers
     83         paddsw      mm3,    mm4              ; mm3 += mm5
     84 
     85         punpcklbw   mm5,    mm0              ; mm5 = p-2..p1
     86         pmullw      mm5,    [rdx]            ; mm5 *= kernel 5 modifiers
     87         paddsw      mm3,    mm5              ; mm3 += mm5
     88 
     89         paddsw      mm3,    [GLOBAL(rd)]              ; mm3 += round value
     90         psraw       mm3,    VP8_FILTER_SHIFT     ; mm3 /= 128
     91         packuswb    mm3,    mm0              ; pack and unpack to saturate
     92         punpcklbw   mm3,    mm0              ;
     93 
     94         movq        [rdi],  mm3              ; store the results in the destination
     95 
     96 %if ABI_IS_32BIT
     97         add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next line
     98         add         rdi,    rax;
     99 %else
    100         movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line
    101         add         rdi,    rax;
    102 
    103         add         rsi,    r8               ; next line
    104 %endif
    105 
    106         dec         rcx                      ; decrement count
    107         jnz         .nextrow                 ; next row
    108 
    109     ; begin epilog
    110     pop rdi
    111     pop rsi
    112     RESTORE_GOT
    113     UNSHADOW_ARGS
    114     pop         rbp
    115     ret
    116 
    117 
    118 ;void vp8_filter_block1dc_v6_mmx
    119 ;(
    120 ;   short *src_ptr,
    121 ;   unsigned char *output_ptr,
    122 ;    int output_pitch,
    123 ;   unsigned int pixels_per_line,
    124 ;   unsigned int pixel_step,
    125 ;   unsigned int output_height,
    126 ;   unsigned int output_width,
    127 ;   short * vp8_filter
    128 ;)
    129 global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
    130 sym(vp8_filter_block1dc_v6_mmx):
    131     push        rbp
    132     mov         rbp, rsp
    133     SHADOW_ARGS_TO_STACK 8
    134     GET_GOT     rbx
    135     push        rsi
    136     push        rdi
    137     ; end prolog
    138 
    139         movq      mm5, [GLOBAL(rd)]
    140         push        rbx
    141         mov         rbx, arg(7) ;vp8_filter
    142         movq      mm1, [rbx + 16]             ; do both the negative taps first!!!
    143         movq      mm2, [rbx + 32]         ;
    144         movq      mm6, [rbx + 48]        ;
    145         movq      mm7, [rbx + 64]        ;
    146 
    147         movsxd      rdx, dword ptr arg(3) ;pixels_per_line
    148         mov         rdi, arg(1) ;output_ptr
    149         mov         rsi, arg(0) ;src_ptr
    150         sub         rsi, rdx
    151         sub         rsi, rdx
    152         movsxd      rcx, DWORD PTR arg(5) ;output_height
    153         movsxd      rax, DWORD PTR arg(2) ;output_pitch      ; destination pitch?
    154         pxor        mm0, mm0              ; mm0 = 00000000
    155 
    156 
    157 .nextrow_cv:
    158         movq        mm3, [rsi+rdx]        ; mm3 = p0..p8  = row -1
    159         pmullw      mm3, mm1              ; mm3 *= kernel 1 modifiers.
    160 
    161 
    162         movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3  = row 2
    163         pmullw      mm4, mm7              ; mm4 *= kernel 4 modifiers.
    164         paddsw      mm3, mm4              ; mm3 += mm4
    165 
    166         movq        mm4, [rsi + 2*rdx]           ; mm4 = p0..p3  = row 0
    167         pmullw      mm4, mm2              ; mm4 *= kernel 2 modifiers.
    168         paddsw      mm3, mm4              ; mm3 += mm4
    169 
    170         movq        mm4, [rsi]            ; mm4 = p0..p3  = row -2
    171         pmullw      mm4, [rbx]            ; mm4 *= kernel 0 modifiers.
    172         paddsw      mm3, mm4              ; mm3 += mm4
    173 
    174 
    175         add         rsi, rdx              ; move source forward 1 line to avoid 3 * pitch
    176         movq        mm4, [rsi + 2*rdx]     ; mm4 = p0..p3  = row 1
    177         pmullw      mm4, mm6              ; mm4 *= kernel 3 modifiers.
    178         paddsw      mm3, mm4              ; mm3 += mm4
    179 
    180         movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3  = row 3
    181         pmullw      mm4, [rbx +80]        ; mm4 *= kernel 3 modifiers.
    182         paddsw      mm3, mm4              ; mm3 += mm4
    183 
    184 
    185         paddsw      mm3, mm5               ; mm3 += round value
    186         psraw       mm3, VP8_FILTER_SHIFT     ; mm3 /= 128
    187         packuswb    mm3, mm0              ; pack and saturate
    188 
    189         movd        [rdi],mm3             ; store the results in the destination
    190         ; the subsequent iterations repeat 3 out of 4 of these reads.  Since the
    191         ; recon block should be in cache this shouldn't cost much.  Its obviously
    192         ; avoidable!!!.
    193         lea         rdi,  [rdi+rax] ;
    194         dec         rcx                   ; decrement count
    195         jnz         .nextrow_cv           ; next row
    196 
    197         pop         rbx
    198 
    199     ; begin epilog
    200     pop rdi
    201     pop rsi
    202     RESTORE_GOT
    203     UNSHADOW_ARGS
    204     pop         rbp
    205     ret
    206 
    207 
    208 ;void bilinear_predict8x4_mmx
    209 ;(
    210 ;    unsigned char  *src_ptr,
    211 ;    int   src_pixels_per_line,
    212 ;    int  xoffset,
    213 ;    int  yoffset,
    214 ;    unsigned char *dst_ptr,
    215 ;    int dst_pitch
    216 ;)
    217 global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
    218 sym(vp8_bilinear_predict8x4_mmx):
    219     push        rbp
    220     mov         rbp, rsp
    221     SHADOW_ARGS_TO_STACK 6
    222     GET_GOT     rbx
    223     push        rsi
    224     push        rdi
    225     ; end prolog
    226 
    227     ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    228     ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
    229 
    230         movsxd      rax,        dword ptr arg(2) ;xoffset
    231         mov         rdi,        arg(4) ;dst_ptr           ;
    232 
    233         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
    234         shl         rax,        5
    235 
    236         mov         rsi,        arg(0) ;src_ptr              ;
    237         add         rax,        rcx
    238 
    239         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
    240         movq        mm1,        [rax]               ;
    241 
    242         movq        mm2,        [rax+16]            ;
    243         movsxd      rax,        dword ptr arg(3) ;yoffset
    244 
    245         pxor        mm0,        mm0                 ;
    246         shl         rax,        5
    247 
    248         add         rax,        rcx
    249         lea         rcx,        [rdi+rdx*4]          ;
    250 
    251         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
    252 
    253         ; get the first horizontal line done       ;
    254         movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
    255         movq        mm4,        mm3                 ; make a copy of current line
    256 
    257         punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
    258         punpckhbw   mm4,        mm0                 ;
    259 
    260         pmullw      mm3,        mm1                 ;
    261         pmullw      mm4,        mm1                 ;
    262 
    263         movq        mm5,        [rsi+1]             ;
    264         movq        mm6,        mm5                 ;
    265 
    266         punpcklbw   mm5,        mm0                 ;
    267         punpckhbw   mm6,        mm0                 ;
    268 
    269         pmullw      mm5,        mm2                 ;
    270         pmullw      mm6,        mm2                 ;
    271 
    272         paddw       mm3,        mm5                 ;
    273         paddw       mm4,        mm6                 ;
    274 
    275         paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
    276         psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
    277 
    278         paddw       mm4,        [GLOBAL(rd)]                 ;
    279         psraw       mm4,        VP8_FILTER_SHIFT        ;
    280 
    281         movq        mm7,        mm3                 ;
    282         packuswb    mm7,        mm4                 ;
    283 
    284         add         rsi,        rdx                 ; next line
    285 .next_row_8x4:
    286         movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
    287         movq        mm4,        mm3                 ; make a copy of current line
    288 
    289         punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
    290         punpckhbw   mm4,        mm0                 ;
    291 
    292         pmullw      mm3,        mm1                 ;
    293         pmullw      mm4,        mm1                 ;
    294 
    295         movq        mm5,        [rsi+1]             ;
    296         movq        mm6,        mm5                 ;
    297 
    298         punpcklbw   mm5,        mm0                 ;
    299         punpckhbw   mm6,        mm0                 ;
    300 
    301         pmullw      mm5,        mm2                 ;
    302         pmullw      mm6,        mm2                 ;
    303 
    304         paddw       mm3,        mm5                 ;
    305         paddw       mm4,        mm6                 ;
    306 
    307         movq        mm5,        mm7                 ;
    308         movq        mm6,        mm7                 ;
    309 
    310         punpcklbw   mm5,        mm0                 ;
    311         punpckhbw   mm6,        mm0
    312 
    313         pmullw      mm5,        [rax]               ;
    314         pmullw      mm6,        [rax]               ;
    315 
    316         paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
    317         psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
    318 
    319         paddw       mm4,        [GLOBAL(rd)]                 ;
    320         psraw       mm4,        VP8_FILTER_SHIFT        ;
    321 
    322         movq        mm7,        mm3                 ;
    323         packuswb    mm7,        mm4                 ;
    324 
    325 
    326         pmullw      mm3,        [rax+16]            ;
    327         pmullw      mm4,        [rax+16]            ;
    328 
    329         paddw       mm3,        mm5                 ;
    330         paddw       mm4,        mm6                 ;
    331 
    332 
    333         paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
    334         psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
    335 
    336         paddw       mm4,        [GLOBAL(rd)]                 ;
    337         psraw       mm4,        VP8_FILTER_SHIFT        ;
    338 
    339         packuswb    mm3,        mm4
    340 
    341         movq        [rdi],      mm3                 ; store the results in the destination
    342 
    343 %if ABI_IS_32BIT
    344         add         rsi,        rdx                 ; next line
    345         add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
    346 %else
    347         movsxd      r8,         dword ptr arg(5) ;dst_pitch
    348         add         rsi,        rdx                 ; next line
    349         add         rdi,        r8
    350 %endif
    351         cmp         rdi,        rcx                 ;
    352         jne         .next_row_8x4
    353 
    354     ; begin epilog
    355     pop rdi
    356     pop rsi
    357     RESTORE_GOT
    358     UNSHADOW_ARGS
    359     pop         rbp
    360     ret
    361 
    362 
    363 ;void bilinear_predict4x4_mmx
    364 ;(
    365 ;    unsigned char  *src_ptr,
    366 ;    int   src_pixels_per_line,
    367 ;    int  xoffset,
    368 ;    int  yoffset,
    369 ;    unsigned char *dst_ptr,
    370 ;    int dst_pitch
    371 ;)
    372 global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
    373 sym(vp8_bilinear_predict4x4_mmx):
    374     push        rbp
    375     mov         rbp, rsp
    376     SHADOW_ARGS_TO_STACK 6
    377     GET_GOT     rbx
    378     push        rsi
    379     push        rdi
    380     ; end prolog
    381 
    382     ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    383     ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
    384 
    385         movsxd      rax,        dword ptr arg(2) ;xoffset
    386         mov         rdi,        arg(4) ;dst_ptr           ;
    387 
    388         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
    389         shl         rax,        5
    390 
    391         add         rax,        rcx ; HFilter
    392         mov         rsi,        arg(0) ;src_ptr              ;
    393 
    394         movsxd      rdx,        dword ptr arg(5) ;ldst_pitch
    395         movq        mm1,        [rax]               ;
    396 
    397         movq        mm2,        [rax+16]            ;
    398         movsxd      rax,        dword ptr arg(3) ;yoffset
    399 
    400         pxor        mm0,        mm0                 ;
    401         shl         rax,        5
    402 
    403         add         rax,        rcx
    404         lea         rcx,        [rdi+rdx*4]          ;
    405 
    406         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
    407 
    408         ; get the first horizontal line done       ;
    409         movd        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
    410         punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
    411 
    412         pmullw      mm3,        mm1                 ;
    413         movd        mm5,        [rsi+1]             ;
    414 
    415         punpcklbw   mm5,        mm0                 ;
    416         pmullw      mm5,        mm2                 ;
    417 
    418         paddw       mm3,        mm5                 ;
    419         paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
    420 
    421         psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
    422 
    423         movq        mm7,        mm3                 ;
    424         packuswb    mm7,        mm0                 ;
    425 
    426         add         rsi,        rdx                 ; next line
    427 .next_row_4x4:
    428         movd        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
    429         punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
    430 
    431         pmullw      mm3,        mm1                 ;
    432         movd        mm5,        [rsi+1]             ;
    433 
    434         punpcklbw   mm5,        mm0                 ;
    435         pmullw      mm5,        mm2                 ;
    436 
    437         paddw       mm3,        mm5                 ;
    438 
    439         movq        mm5,        mm7                 ;
    440         punpcklbw   mm5,        mm0                 ;
    441 
    442         pmullw      mm5,        [rax]               ;
    443         paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
    444 
    445         psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
    446         movq        mm7,        mm3                 ;
    447 
    448         packuswb    mm7,        mm0                 ;
    449 
    450         pmullw      mm3,        [rax+16]            ;
    451         paddw       mm3,        mm5                 ;
    452 
    453 
    454         paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
    455         psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
    456 
    457         packuswb    mm3,        mm0
    458         movd        [rdi],      mm3                 ; store the results in the destination
    459 
    460 %if ABI_IS_32BIT
    461         add         rsi,        rdx                 ; next line
    462         add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
    463 %else
    464         movsxd      r8,         dword ptr arg(5) ;dst_pitch                   ;
    465         add         rsi,        rdx                 ; next line
    466         add         rdi,        r8
    467 %endif
    468 
    469         cmp         rdi,        rcx                 ;
    470         jne         .next_row_4x4
    471 
    472     ; begin epilog
    473     pop rdi
    474     pop rsi
    475     RESTORE_GOT
    476     UNSHADOW_ARGS
    477     pop         rbp
    478     ret
    479 
    480 
    481 
    482 SECTION_RODATA
    483 align 16
    484 rd:
    485     times 4 dw 0x40
    486 
    487 align 16
    488 global HIDDEN_DATA(sym(vp8_six_tap_x86))
    489 sym(vp8_six_tap_x86):
    490     times 8 dw 0
    491     times 8 dw 0
    492     times 8 dw 128
    493     times 8 dw 0
    494     times 8 dw 0
    495     times 8 dw 0
    496 
    497     times 8 dw 0
    498     times 8 dw -6
    499     times 8 dw 123
    500     times 8 dw 12
    501     times 8 dw -1
    502     times 8 dw 0
    503 
    504     times 8 dw 2
    505     times 8 dw -11
    506     times 8 dw 108
    507     times 8 dw 36
    508     times 8 dw -8
    509     times 8 dw 1
    510 
    511     times 8 dw 0
    512     times 8 dw -9
    513     times 8 dw 93
    514     times 8 dw 50
    515     times 8 dw -6
    516     times 8 dw 0
    517 
    518     times 8 dw 3
    519     times 8 dw -16
    520     times 8 dw 77
    521     times 8 dw 77
    522     times 8 dw -16
    523     times 8 dw 3
    524 
    525     times 8 dw 0
    526     times 8 dw -6
    527     times 8 dw 50
    528     times 8 dw 93
    529     times 8 dw -9
    530     times 8 dw 0
    531 
    532     times 8 dw 1
    533     times 8 dw -8
    534     times 8 dw 36
    535     times 8 dw 108
    536     times 8 dw -11
    537     times 8 dw 2
    538 
    539     times 8 dw 0
    540     times 8 dw -1
    541     times 8 dw 12
    542     times 8 dw 123
    543     times 8 dw -6
    544     times 8 dw 0
    545 
    546 
    547