Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     ;------------------------------------------------------------------------
     ; void vp8_recon_b_mmx(unsigned char *s, short *q, unsigned char *d,
     ;                      int stride)
     ;
     ; Produces a 4x4 block of bytes in d.  For each of the four rows, four
     ; bytes are read from s, widened to 16-bit words, added with signed
     ; saturation to four shorts from q, clamped to 0..255 and stored to d.
     ;
     ;   s       row r is read at byte offset 16*r     (0, 16, 32, 48)
     ;   q       row r is read at byte offset 32*r     (0, 32, 64, 96)
     ;   d       row r is written at d + r*stride
     ;   stride  fetched as a dword and sign-extended
     ;
     ; Registers written: rax, rdx, mm0-mm4, flags.  rbp, rsi and rdi are
     ; pushed on entry and popped before returning.  No emms is issued here,
     ; so the MMX state is still live on return (presumably the caller is
     ; responsible for clearing it -- confirm against the call sites).
     ;------------------------------------------------------------------------

; One output row.  Requires mm0 == 0.
;   %1 = scratch MMX register
;   %2 = memory operand holding 4 input bytes
;   %3 = memory operand holding the 16-bit values to add
;   %4 = memory operand receiving 4 output bytes
%macro VP8_RECON_B_MMX_ROW 4
        movd        %1, %2
        punpcklbw   %1, mm0             ; interleave with zero: bytes -> words
        paddsw      %1, %3              ; signed saturating word add
        packuswb    %1, mm0             ; words -> bytes, clamped to 0..255
        movd        %4, %1
%endmacro

global sym(vp8_recon_b_mmx)
sym(vp8_recon_b_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi, arg(0)                 ; rsi = s
        mov         rdi, arg(2)                 ; rdi = d
        mov         rdx, arg(1)                 ; rdx = q
        movsxd      rax, dword ptr arg(3)       ; rax = stride
        pxor        mm0, mm0                    ; zero used by unpack/pack

        VP8_RECON_B_MMX_ROW mm1, [rsi],    [rdx],    [rdi]          ; row 0
        VP8_RECON_B_MMX_ROW mm2, [rsi+16], [rdx+32], [rdi+rax]      ; row 1
        VP8_RECON_B_MMX_ROW mm3, [rsi+32], [rdx+64], [rdi+2*rax]    ; row 2

        add         rdi, rax                    ; rdi = d + stride, so the
                                                ; next store hits d + 3*stride
        VP8_RECON_B_MMX_ROW mm4, [rsi+48], [rdx+96], [rdi+2*rax]    ; row 3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
     60 
     61 
     ;------------------------------------------------------------------------
     ; void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride,
     ;                          unsigned char *dst, int dst_stride)
     ;
     ; Copies 8 rows of 8 bytes.  Row r is read from src + r*src_stride and
     ; written to dst + r*dst_stride.  Both strides are fetched as dwords and
     ; sign-extended.  Rows move in groups of 3, 3 and 2; within a group all
     ; loads are issued before the first store.
     ;
     ; Registers written: rax, rcx, mm0-mm5, flags.  rbp, rsi and rdi are
     ; pushed on entry and popped before returning.  No emms is issued here.
     ;------------------------------------------------------------------------
global sym(vp8_copy_mem8x8_mmx)
sym(vp8_copy_mem8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        ; fetch all four arguments up front
        mov         rsi, arg(0)                 ; rsi = src
        movsxd      rax, dword ptr arg(1)       ; rax = src_stride
        mov         rdi, arg(2)                 ; rdi = dst
        movsxd      rcx, dword ptr arg(3)       ; rcx = dst_stride

        ; rows 0-2
        movq        mm0, [rsi]
        movq        mm1, [rsi+rax]
        movq        mm2, [rsi+rax*2]
        movq        [rdi],       mm0
        movq        [rdi+rcx],   mm1
        movq        [rdi+rcx*2], mm2

        ; step both pointers forward by three rows
        lea         rsi, [rsi+rax*2]
        add         rsi, rax
        lea         rdi, [rdi+rcx*2]
        add         rdi, rcx

        ; rows 3-5
        movq        mm3, [rsi]
        movq        mm4, [rsi+rax]
        movq        mm5, [rsi+rax*2]
        movq        [rdi],       mm3
        movq        [rdi+rcx],   mm4
        movq        [rdi+rcx*2], mm5

        ; step forward by two rows only: the pointers now sit on row 5, so
        ; rows 6 and 7 are reached with the +stride and +2*stride forms
        lea         rsi, [rsi+rax*2]
        lea         rdi, [rdi+rcx*2]

        ; rows 6-7
        movq        mm0, [rsi+rax]
        movq        mm1, [rsi+rax*2]
        movq        [rdi+rcx],   mm0
        movq        [rdi+rcx*2], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
    123 
    124 
    ;-------------------------------------------------------------------------
    ; void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride,
    ;                          unsigned char *dst, int dst_stride)
    ;
    ; Copies 4 rows of 8 bytes.  Row r is read from src + r*src_stride and
    ; written to dst + r*dst_stride.  Both strides are fetched as dwords and
    ; sign-extended.  Rows 0-2 are loaded and then stored as a group; row 3
    ; follows on its own.
    ;
    ; Registers written: rax, rcx, mm0-mm3.  rbp, rsi and rdi are pushed on
    ; entry and popped before returning.  No emms is issued here.
    ;-------------------------------------------------------------------------
global sym(vp8_copy_mem8x4_mmx)
sym(vp8_copy_mem8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        ; fetch all four arguments up front
        mov         rsi, arg(0)                 ; rsi = src
        movsxd      rax, dword ptr arg(1)       ; rax = src_stride
        mov         rdi, arg(2)                 ; rdi = dst
        movsxd      rcx, dword ptr arg(3)       ; rcx = dst_stride

        ; rows 0-2
        movq        mm0, [rsi]
        movq        mm1, [rsi+rax]
        movq        mm2, [rsi+rax*2]
        movq        [rdi],       mm0
        movq        [rdi+rcx],   mm1
        movq        [rdi+rcx*2], mm2

        ; move both pointers to row 2 so that +stride addresses row 3
        lea         rsi, [rsi+rax*2]
        lea         rdi, [rdi+rcx*2]

        ; row 3
        movq        mm3, [rsi+rax]
        movq        [rdi+rcx],   mm3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
    167 
    168 
    ;-------------------------------------------------------------------------
    ; void vp8_copy_mem16x16_mmx(unsigned char *src, int src_stride,
    ;                            unsigned char *dst, int dst_stride)
    ;
    ; Copies 16 rows of 16 bytes.  Row r is read from src + r*src_stride and
    ; written to dst + r*dst_stride, each row as two 8-byte halves.  Both
    ; strides are fetched as dwords and sign-extended.  Fifteen rows are moved
    ; as five groups of three (all six loads of a group precede its stores);
    ; the sixteenth row is handled separately at the end.
    ;
    ; Registers written: rax, rcx, mm0-mm5, flags.  rbp, rsi and rdi are
    ; pushed on entry and popped before returning.  No emms is issued here.
    ;-------------------------------------------------------------------------
global sym(vp8_copy_mem16x16_mmx)
sym(vp8_copy_mem16x16_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi, arg(0)                 ; rsi = src
        movsxd      rax, dword ptr arg(1)       ; rax = src_stride

        mov         rdi, arg(2)                 ; rdi = dst
        movsxd      rcx, dword ptr arg(3)       ; rcx = dst_stride

; Rows 0-14: the assembler expands this body five times, three rows each.
%rep 5
        ; load three rows; mm0/mm1/mm2 take bytes 0-7, mm3/mm4/mm5 bytes 8-15
        movq        mm0, [rsi]
        movq        mm3, [rsi+8]
        movq        mm1, [rsi+rax]
        movq        mm4, [rsi+rax+8]
        movq        mm2, [rsi+rax*2]
        movq        mm5, [rsi+rax*2+8]

        lea         rsi, [rsi+rax*2]
        add         rsi, rax                    ; src += 3 * src_stride

        ; store the same three rows
        movq        [rdi],         mm0
        movq        [rdi+8],       mm3
        movq        [rdi+rcx],     mm1
        movq        [rdi+rcx+8],   mm4
        movq        [rdi+rcx*2],   mm2
        movq        [rdi+rcx*2+8], mm5

        lea         rdi, [rdi+rcx*2]
        add         rdi, rcx                    ; dst += 3 * dst_stride
%endrep

        ; row 15
        movq        mm0, [rsi]
        movq        mm3, [rsi+8]
        movq        [rdi],   mm0
        movq        [rdi+8], mm3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
    322