Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 ;void vp8_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
     15 ;                            short *diff, unsigned char *Predictor,
     16 ;                            int pitch);
     17 global sym(vp8_subtract_b_mmx_impl)
     18 sym(vp8_subtract_b_mmx_impl):
     19     push        rbp
     20     mov         rbp, rsp
     21     SHADOW_ARGS_TO_STACK 5
     22     push rsi
     23     push rdi
     24     ; end prolog
     25 
     26 
     27         mov     rdi,        arg(2) ;diff
     28         mov     rax,        arg(3) ;Predictor
     29         mov     rsi,        arg(0) ;z
     30         movsxd  rdx,        dword ptr arg(1);src_stride;
     31         movsxd  rcx,        dword ptr arg(4);pitch
     32         pxor    mm7,        mm7
     33 
     34         movd    mm0,        [rsi]
     35         movd    mm1,        [rax]
     36         punpcklbw   mm0,    mm7
     37         punpcklbw   mm1,    mm7
     38         psubw   mm0,        mm1
     39         movq    [rdi],      mm0
     40 
     41 
     42         movd    mm0,        [rsi+rdx]
     43         movd    mm1,        [rax+rcx]
     44         punpcklbw   mm0,    mm7
     45         punpcklbw   mm1,    mm7
     46         psubw   mm0,        mm1
     47         movq    [rdi+rcx*2],mm0
     48 
     49 
     50         movd    mm0,        [rsi+rdx*2]
     51         movd    mm1,        [rax+rcx*2]
     52         punpcklbw   mm0,    mm7
     53         punpcklbw   mm1,    mm7
     54         psubw   mm0,        mm1
     55         movq    [rdi+rcx*4],        mm0
     56 
     57         lea     rsi,        [rsi+rdx*2]
     58         lea     rcx,        [rcx+rcx*2]
     59 
     60 
     61 
     62         movd    mm0,        [rsi+rdx]
     63         movd    mm1,        [rax+rcx]
     64         punpcklbw   mm0,    mm7
     65         punpcklbw   mm1,    mm7
     66         psubw   mm0,        mm1
     67         movq    [rdi+rcx*2],        mm0
     68 
     69     ; begin epilog
     70     pop rdi
     71     pop rsi
     72     UNSHADOW_ARGS
     73     pop         rbp
     74     ret
     75 
     76 ;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
     77 global sym(vp8_subtract_mby_mmx)
     78 sym(vp8_subtract_mby_mmx):
     79     push        rbp
     80     mov         rbp, rsp
     81     SHADOW_ARGS_TO_STACK 4
     82     push rsi
     83     push rdi
     84     ; end prolog
     85 
     86 
     87             mov         rsi,            arg(1) ;src
     88             mov         rdi,            arg(0) ;diff
     89 
     90             mov         rax,            arg(2) ;pred
     91             movsxd      rdx,            dword ptr arg(3) ;stride
     92 
     93             mov         rcx,            16
     94             pxor        mm0,            mm0
     95 
     96 submby_loop:
     97 
     98             movq        mm1,            [rsi]
     99             movq        mm3,            [rax]
    100 
    101             movq        mm2,            mm1
    102             movq        mm4,            mm3
    103 
    104             punpcklbw   mm1,            mm0
    105             punpcklbw   mm3,            mm0
    106 
    107             punpckhbw   mm2,            mm0
    108             punpckhbw   mm4,            mm0
    109 
    110             psubw       mm1,            mm3
    111             psubw       mm2,            mm4
    112 
    113             movq        [rdi],          mm1
    114             movq        [rdi+8],        mm2
    115 
    116 
    117             movq        mm1,            [rsi+8]
    118             movq        mm3,            [rax+8]
    119 
    120             movq        mm2,            mm1
    121             movq        mm4,            mm3
    122 
    123             punpcklbw   mm1,            mm0
    124             punpcklbw   mm3,            mm0
    125 
    126             punpckhbw   mm2,            mm0
    127             punpckhbw   mm4,            mm0
    128 
    129             psubw       mm1,            mm3
    130             psubw       mm2,            mm4
    131 
    132             movq        [rdi+16],       mm1
    133             movq        [rdi+24],       mm2
    134 
    135 
    136             add         rdi,            32
    137             add         rax,            16
    138 
    139             lea         rsi,            [rsi+rdx]
    140 
    141             sub         rcx,            1
    142             jnz         submby_loop
    143 
    144     pop rdi
    145     pop rsi
    146     ; begin epilog
    147     UNSHADOW_ARGS
    148     pop         rbp
    149     ret
    150 
    151 
    152 ;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
    153 global sym(vp8_subtract_mbuv_mmx)
    154 sym(vp8_subtract_mbuv_mmx):
    155     push        rbp
    156     mov         rbp, rsp
    157     SHADOW_ARGS_TO_STACK 5
    158     push rsi
    159     push rdi
    160     ; end prolog
    161 
    162     ;short *udiff = diff + 256;
    163     ;short *vdiff = diff + 320;
    164     ;unsigned char *upred = pred + 256;
    165     ;unsigned char *vpred = pred + 320;
    166 
    167         ;unsigned char  *z    = usrc;
    168         ;unsigned short *diff = udiff;
    169         ;unsigned char  *Predictor= upred;
    170 
    171             mov     rdi,        arg(0) ;diff
    172             mov     rax,        arg(3) ;pred
    173             mov     rsi,        arg(1) ;z = usrc
    174             add     rdi,        256*2  ;diff = diff + 256 (shorts)
    175             add     rax,        256    ;Predictor = pred + 256
    176             movsxd  rdx,        dword ptr arg(4) ;stride;
    177             pxor    mm7,        mm7
    178 
    179             movq    mm0,        [rsi]
    180             movq    mm1,        [rax]
    181             movq    mm3,        mm0
    182             movq    mm4,        mm1
    183             punpcklbw   mm0,    mm7
    184             punpcklbw   mm1,    mm7
    185             punpckhbw   mm3,    mm7
    186             punpckhbw   mm4,    mm7
    187             psubw   mm0,        mm1
    188             psubw   mm3,        mm4
    189             movq    [rdi],      mm0
    190             movq    [rdi+8],    mm3
    191 
    192 
    193             movq    mm0,        [rsi+rdx]
    194             movq    mm1,        [rax+8]
    195             movq    mm3,        mm0
    196             movq    mm4,        mm1
    197             punpcklbw   mm0,    mm7
    198             punpcklbw   mm1,    mm7
    199             punpckhbw   mm3,    mm7
    200             punpckhbw   mm4,    mm7
    201             psubw   mm0,        mm1
    202             psubw   mm3,        mm4
    203             movq    [rdi+16],   mm0
    204             movq    [rdi+24],   mm3
    205 
    206             movq    mm0,        [rsi+rdx*2]
    207             movq    mm1,        [rax+16]
    208             movq    mm3,        mm0
    209             movq    mm4,        mm1
    210             punpcklbw   mm0,    mm7
    211             punpcklbw   mm1,    mm7
    212             punpckhbw   mm3,    mm7
    213             punpckhbw   mm4,    mm7
    214             psubw   mm0,        mm1
    215             psubw   mm3,        mm4
    216             movq    [rdi+32],   mm0
    217             movq    [rdi+40],   mm3
    218             lea     rsi,        [rsi+rdx*2]
    219 
    220 
    221             movq    mm0,        [rsi+rdx]
    222             movq    mm1,        [rax+24]
    223             movq    mm3,        mm0
    224             movq    mm4,        mm1
    225             punpcklbw   mm0,    mm7
    226             punpcklbw   mm1,    mm7
    227             punpckhbw   mm3,    mm7
    228             punpckhbw   mm4,    mm7
    229             psubw   mm0,        mm1
    230             psubw   mm3,        mm4
    231 
    232             movq    [rdi+48],   mm0
    233             movq    [rdi+56],   mm3
    234 
    235 
    236             add     rdi,        64
    237             add     rax,        32
    238             lea     rsi,        [rsi+rdx*2]
    239 
    240 
    241             movq    mm0,        [rsi]
    242             movq    mm1,        [rax]
    243             movq    mm3,        mm0
    244             movq    mm4,        mm1
    245             punpcklbw   mm0,    mm7
    246             punpcklbw   mm1,    mm7
    247             punpckhbw   mm3,    mm7
    248             punpckhbw   mm4,    mm7
    249             psubw   mm0,        mm1
    250             psubw   mm3,        mm4
    251             movq    [rdi],      mm0
    252             movq    [rdi+8],    mm3
    253 
    254 
    255             movq    mm0,        [rsi+rdx]
    256             movq    mm1,        [rax+8]
    257             movq    mm3,        mm0
    258             movq    mm4,        mm1
    259             punpcklbw   mm0,    mm7
    260             punpcklbw   mm1,    mm7
    261             punpckhbw   mm3,    mm7
    262             punpckhbw   mm4,    mm7
    263             psubw   mm0,        mm1
    264             psubw   mm3,        mm4
    265             movq    [rdi+16],   mm0
    266             movq    [rdi+24],   mm3
    267 
    268             movq    mm0,        [rsi+rdx*2]
    269             movq    mm1,        [rax+16]
    270             movq    mm3,        mm0
    271             movq    mm4,        mm1
    272             punpcklbw   mm0,    mm7
    273             punpcklbw   mm1,    mm7
    274             punpckhbw   mm3,    mm7
    275             punpckhbw   mm4,    mm7
    276             psubw   mm0,        mm1
    277             psubw   mm3,        mm4
    278             movq    [rdi+32],   mm0
    279             movq    [rdi+40],   mm3
    280             lea     rsi,        [rsi+rdx*2]
    281 
    282 
    283             movq    mm0,        [rsi+rdx]
    284             movq    mm1,        [rax+24]
    285             movq    mm3,        mm0
    286             movq    mm4,        mm1
    287             punpcklbw   mm0,    mm7
    288             punpcklbw   mm1,    mm7
    289             punpckhbw   mm3,    mm7
    290             punpckhbw   mm4,    mm7
    291             psubw   mm0,        mm1
    292             psubw   mm3,        mm4
    293 
    294             movq    [rdi+48],   mm0
    295             movq    [rdi+56],   mm3
    296 
    297         ;unsigned char  *z    = vsrc;
    298         ;unsigned short *diff = vdiff;
    299         ;unsigned char  *Predictor= vpred;
    300 
    301             mov     rdi,        arg(0) ;diff
    302             mov     rax,        arg(3) ;pred
    303             mov     rsi,        arg(2) ;z = usrc
    304             add     rdi,        320*2  ;diff = diff + 320 (shorts)
    305             add     rax,        320    ;Predictor = pred + 320
    306             movsxd  rdx,        dword ptr arg(4) ;stride;
    307             pxor    mm7,        mm7
    308 
    309             movq    mm0,        [rsi]
    310             movq    mm1,        [rax]
    311             movq    mm3,        mm0
    312             movq    mm4,        mm1
    313             punpcklbw   mm0,    mm7
    314             punpcklbw   mm1,    mm7
    315             punpckhbw   mm3,    mm7
    316             punpckhbw   mm4,    mm7
    317             psubw   mm0,        mm1
    318             psubw   mm3,        mm4
    319             movq    [rdi],      mm0
    320             movq    [rdi+8],    mm3
    321 
    322 
    323             movq    mm0,        [rsi+rdx]
    324             movq    mm1,        [rax+8]
    325             movq    mm3,        mm0
    326             movq    mm4,        mm1
    327             punpcklbw   mm0,    mm7
    328             punpcklbw   mm1,    mm7
    329             punpckhbw   mm3,    mm7
    330             punpckhbw   mm4,    mm7
    331             psubw   mm0,        mm1
    332             psubw   mm3,        mm4
    333             movq    [rdi+16],   mm0
    334             movq    [rdi+24],   mm3
    335 
    336             movq    mm0,        [rsi+rdx*2]
    337             movq    mm1,        [rax+16]
    338             movq    mm3,        mm0
    339             movq    mm4,        mm1
    340             punpcklbw   mm0,    mm7
    341             punpcklbw   mm1,    mm7
    342             punpckhbw   mm3,    mm7
    343             punpckhbw   mm4,    mm7
    344             psubw   mm0,        mm1
    345             psubw   mm3,        mm4
    346             movq    [rdi+32],   mm0
    347             movq    [rdi+40],   mm3
    348             lea     rsi,        [rsi+rdx*2]
    349 
    350 
    351             movq    mm0,        [rsi+rdx]
    352             movq    mm1,        [rax+24]
    353             movq    mm3,        mm0
    354             movq    mm4,        mm1
    355             punpcklbw   mm0,    mm7
    356             punpcklbw   mm1,    mm7
    357             punpckhbw   mm3,    mm7
    358             punpckhbw   mm4,    mm7
    359             psubw   mm0,        mm1
    360             psubw   mm3,        mm4
    361 
    362             movq    [rdi+48],   mm0
    363             movq    [rdi+56],   mm3
    364 
    365 
    366             add     rdi,        64
    367             add     rax,        32
    368             lea     rsi,        [rsi+rdx*2]
    369 
    370 
    371             movq    mm0,        [rsi]
    372             movq    mm1,        [rax]
    373             movq    mm3,        mm0
    374             movq    mm4,        mm1
    375             punpcklbw   mm0,    mm7
    376             punpcklbw   mm1,    mm7
    377             punpckhbw   mm3,    mm7
    378             punpckhbw   mm4,    mm7
    379             psubw   mm0,        mm1
    380             psubw   mm3,        mm4
    381             movq    [rdi],      mm0
    382             movq    [rdi+8],    mm3
    383 
    384 
    385             movq    mm0,        [rsi+rdx]
    386             movq    mm1,        [rax+8]
    387             movq    mm3,        mm0
    388             movq    mm4,        mm1
    389             punpcklbw   mm0,    mm7
    390             punpcklbw   mm1,    mm7
    391             punpckhbw   mm3,    mm7
    392             punpckhbw   mm4,    mm7
    393             psubw   mm0,        mm1
    394             psubw   mm3,        mm4
    395             movq    [rdi+16],   mm0
    396             movq    [rdi+24],   mm3
    397 
    398             movq    mm0,        [rsi+rdx*2]
    399             movq    mm1,        [rax+16]
    400             movq    mm3,        mm0
    401             movq    mm4,        mm1
    402             punpcklbw   mm0,    mm7
    403             punpcklbw   mm1,    mm7
    404             punpckhbw   mm3,    mm7
    405             punpckhbw   mm4,    mm7
    406             psubw   mm0,        mm1
    407             psubw   mm3,        mm4
    408             movq    [rdi+32],   mm0
    409             movq    [rdi+40],   mm3
    410             lea     rsi,        [rsi+rdx*2]
    411 
    412 
    413             movq    mm0,        [rsi+rdx]
    414             movq    mm1,        [rax+24]
    415             movq    mm3,        mm0
    416             movq    mm4,        mm1
    417             punpcklbw   mm0,    mm7
    418             punpcklbw   mm1,    mm7
    419             punpckhbw   mm3,    mm7
    420             punpckhbw   mm4,    mm7
    421             psubw   mm0,        mm1
    422             psubw   mm3,        mm4
    423 
    424             movq    [rdi+48],   mm0
    425             movq    [rdi+56],   mm3
    426 
    427     ; begin epilog
    428     pop rdi
    429     pop rsi
    430     UNSHADOW_ARGS
    431     pop         rbp
    432     ret
    433