Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 ;void vp8_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
        ;
        ; For each of 4 rows: widens 8 unsigned bytes from s to 16-bit words, adds
        ; the 8 signed 16-bit values from q with signed saturation (paddsw), narrows
        ; the sums back to bytes with unsigned saturation (packuswb) and stores the
        ; 8 result bytes to d.
        ;   s      - read as 4 consecutive 8-byte rows (offsets 0, 8, 16, 24)
        ;   q      - read as 4 consecutive 16-byte rows (offsets 0, 16, 32, 48); each
        ;            is a 128-bit memory operand of a non-VEX paddsw, so q has to be
        ;            16-byte aligned
        ;   d      - written 8 bytes per row at d, d+stride, d+2*stride, d+3*stride
        ;   stride - 32-bit value, sign-extended to 64 bits before use
        ; Registers written: rax, rdx, xmm0-xmm4, flags. rsi and rdi are pushed in
        ; the prolog and popped in the epilog.
        ; NOTE(review): SHADOW_ARGS_TO_STACK, UNSHADOW_ARGS, arg() and sym() come from
        ; x86_abi_support.asm, which is not visible here; presumably they make the
        ; four arguments addressable through arg(n) regardless of ABI -- confirm.
     14 global sym(vp8_recon2b_sse2)
     15 sym(vp8_recon2b_sse2):
     16     push        rbp
     17     mov         rbp, rsp
     18     SHADOW_ARGS_TO_STACK 4
     19     push        rsi
     20     push        rdi
     21     ; end prolog
     22 
     23         mov         rsi,        arg(0) ; rsi = s
     24         mov         rdi,        arg(2) ; rdi = d
     25         mov         rdx,        arg(1) ; rdx = q
     26         movsxd      rax,        dword ptr arg(3) ; rax = stride, sign-extended
     27         pxor        xmm0,       xmm0              ; xmm0 = 0: zero source for unpack/pack
     28 
     29         movq        xmm1,       MMWORD PTR [rsi]  ; row 0: load 8 bytes of s
     30         punpcklbw   xmm1,       xmm0              ; interleave with zero: bytes -> words
     31         paddsw      xmm1,       XMMWORD PTR [rdx] ; += q row 0 (signed saturating)
     32         packuswb    xmm1,       xmm0              ; words -> bytes, clamped to 0..255; high 8 bytes come from xmm0 (zero)
     33         movq        MMWORD PTR [rdi],   xmm1      ; store low 8 bytes to d
     34 
     35 
     36         movq        xmm2,       MMWORD PTR [rsi+8] ; row 1: load 8 bytes of s
     37         punpcklbw   xmm2,       xmm0
     38         paddsw      xmm2,       XMMWORD PTR [rdx+16] ; += q row 1
     39         packuswb    xmm2,       xmm0              ; words -> bytes, clamped to 0..255
     40         movq        MMWORD PTR [rdi+rax],   xmm2  ; store to d + stride
     41 
     42 
     43         movq        xmm3,       MMWORD PTR [rsi+16] ; row 2: load 8 bytes of s
     44         punpcklbw   xmm3,       xmm0
     45         paddsw      xmm3,       XMMWORD PTR [rdx+32] ; += q row 2
     46         packuswb    xmm3,       xmm0              ; words -> bytes, clamped to 0..255
     47         movq        MMWORD PTR [rdi+rax*2], xmm3  ; store to d + 2*stride
     48 
     49         add         rdi, rax                      ; rdi = d + stride, so [rdi+rax*2] below is d + 3*stride
     50         movq        xmm4,       MMWORD PTR [rsi+24] ; row 3: load 8 bytes of s
     51         punpcklbw   xmm4,       xmm0
     52         paddsw      xmm4,       XMMWORD PTR [rdx+48] ; += q row 3
     53         packuswb    xmm4,       xmm0              ; words -> bytes, clamped to 0..255
     54         movq        MMWORD PTR [rdi+rax*2], xmm4  ; store to d + 3*stride
     55 
     56     ; begin epilog
     57     pop rdi
     58     pop rsi
     59     UNSHADOW_ARGS
     60     pop         rbp
     61     ret
     62 
     63 
     64 ;void vp8_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
        ;
        ; For each of 4 rows: widens 16 unsigned bytes from s to two vectors of
        ; 16-bit words (low and high halves), adds 16 signed 16-bit values from q
        ; with signed saturation (paddsw), narrows both halves back to bytes with
        ; unsigned saturation (packuswb) and stores the 16 result bytes to d.
        ;   s      - read as 4 consecutive 16-byte rows (offsets 0, 16, 32, 48) with
        ;            movdqa, so s has to be 16-byte aligned
        ;   q      - read as 8 consecutive 16-byte vectors (offsets 0..112), two per
        ;            row; each is a 128-bit memory operand of a non-VEX paddsw, so q
        ;            has to be 16-byte aligned
        ;   d      - written 16 bytes per row with movdqa at d, d+stride, d+2*stride,
        ;            d+3*stride; every one of those addresses has to be 16-byte aligned
        ;   stride - 32-bit value, sign-extended to 64 bits before use
        ; Registers written: rax, rdx, xmm0-xmm7, flags. rsi and rdi are pushed in
        ; the prolog and popped in the epilog.
        ; NOTE(review): SAVE_XMM / RESTORE_XMM are defined outside this view;
        ; presumably they preserve the XMM registers the ABI treats as callee-saved
        ; (xmm6 and xmm7 are used below) -- confirm against x86_abi_support.asm.
        ; SHADOW_ARGS_TO_STACK, UNSHADOW_ARGS, arg() and sym() are likewise defined
        ; elsewhere.
     65 global sym(vp8_recon4b_sse2)
     66 sym(vp8_recon4b_sse2):
     67     push        rbp
     68     mov         rbp, rsp
     69     SHADOW_ARGS_TO_STACK 4
     70     SAVE_XMM
     71     push        rsi
     72     push        rdi
     73     ; end prolog
     74 
     75         mov         rsi,        arg(0) ; rsi = s
     76         mov         rdi,        arg(2) ; rdi = d
     77         mov         rdx,        arg(1) ; rdx = q
     78         movsxd      rax,        dword ptr arg(3) ; rax = stride, sign-extended
     79         pxor        xmm0,       xmm0              ; xmm0 = 0: zero source for the unpacks
     80 
     81         movdqa      xmm1,       XMMWORD PTR [rsi] ; row 0: load 16 bytes of s
     82         movdqa      xmm5,       xmm1              ; copy, used for the high 8 bytes
     83         punpcklbw   xmm1,       xmm0              ; low 8 bytes  -> words
     84         punpckhbw   xmm5,       xmm0              ; high 8 bytes -> words
     85         paddsw      xmm1,       XMMWORD PTR [rdx] ; += q words 0..7 of row 0
     86         paddsw      xmm5,       XMMWORD PTR [rdx+16] ; += q words 8..15 of row 0
     87         packuswb    xmm1,       xmm5              ; both halves -> 16 bytes, clamped to 0..255
     88         movdqa      XMMWORD PTR [rdi],  xmm1      ; store to d
     89 
     90 
     91         movdqa      xmm2,       XMMWORD PTR [rsi+16] ; row 1: load 16 bytes of s
     92         movdqa      xmm6,       xmm2
     93         punpcklbw   xmm2,       xmm0
     94         punpckhbw   xmm6,       xmm0
     95         paddsw      xmm2,       XMMWORD PTR [rdx+32] ; += q words 0..7 of row 1
     96         paddsw      xmm6,       XMMWORD PTR [rdx+48] ; += q words 8..15 of row 1
     97         packuswb    xmm2,       xmm6              ; both halves -> 16 bytes, clamped to 0..255
     98         movdqa      XMMWORD PTR [rdi+rax],  xmm2  ; store to d + stride
     99 
    100 
    101         movdqa      xmm3,       XMMWORD PTR [rsi+32] ; row 2: load 16 bytes of s
    102         movdqa      xmm7,       xmm3
    103         punpcklbw   xmm3,       xmm0
    104         punpckhbw   xmm7,       xmm0
    105         paddsw      xmm3,       XMMWORD PTR [rdx+64] ; += q words 0..7 of row 2
    106         paddsw      xmm7,       XMMWORD PTR [rdx+80] ; += q words 8..15 of row 2
    107         packuswb    xmm3,       xmm7              ; both halves -> 16 bytes, clamped to 0..255
    108         movdqa      XMMWORD PTR [rdi+rax*2],    xmm3 ; store to d + 2*stride
    109 
    110         add       rdi, rax                        ; rdi = d + stride, so [rdi+rax*2] below is d + 3*stride
    111         movdqa      xmm4,       XMMWORD PTR [rsi+48] ; row 3: load 16 bytes of s
    112         movdqa      xmm5,       xmm4              ; xmm5 is free again: row 0 was already stored
    113         punpcklbw   xmm4,       xmm0
    114         punpckhbw   xmm5,       xmm0
    115         paddsw      xmm4,       XMMWORD PTR [rdx+96] ; += q words 0..7 of row 3
    116         paddsw      xmm5,       XMMWORD PTR [rdx+112] ; += q words 8..15 of row 3
    117         packuswb    xmm4,       xmm5              ; both halves -> 16 bytes, clamped to 0..255
    118         movdqa      XMMWORD PTR [rdi+rax*2],    xmm4 ; store to d + 3*stride
    119 
    120     ; begin epilog
    121     pop rdi
    122     pop rsi
    123     RESTORE_XMM
    124     UNSHADOW_ARGS
    125     pop         rbp
    126     ret
    127 
    128 
    129 ;void vp8_copy_mem16x16_sse2(
    130 ;    unsigned char *src,
    131 ;    int src_stride,
    132 ;    unsigned char *dst,
    133 ;    int dst_stride
    134 ;    )
        ;
        ; Copies 16 rows of 16 bytes from src (row pitch src_stride) to dst (row
        ; pitch dst_stride). Straight-line code, no loop: rows 0-14 are handled in
        ; five groups of three, then row 15 on its own.
        ;   src        - rows are loaded with movdqu, so no alignment is required
        ;   dst        - rows are stored with movdqa, so dst + n*dst_stride has to be
        ;                16-byte aligned for every row n
        ;   src_stride,
        ;   dst_stride - 32-bit values, sign-extended to 64 bits before use
        ; Pointer stepping: within a group the rows are addressed as [p], [p+stride]
        ; and [p+stride*2]; p is then advanced by three rows with a
        ; "lea p, [p+stride*2]" / "add p, stride" pair. Loads, stores and pointer
        ; updates are interleaved, so the statement order matters.
        ; Registers written: rax, rcx, xmm0-xmm5, flags. rsi and rdi are pushed in
        ; the prolog and popped in the epilog.
        ; NOTE(review): SHADOW_ARGS_TO_STACK, UNSHADOW_ARGS, arg() and sym() are
        ; defined in x86_abi_support.asm, which is not visible here.
    135 global sym(vp8_copy_mem16x16_sse2)
    136 sym(vp8_copy_mem16x16_sse2):
    137     push        rbp
    138     mov         rbp, rsp
    139     SHADOW_ARGS_TO_STACK 4
    140     push        rsi
    141     push        rdi
    142     ; end prolog
    143 
    144         mov         rsi,        arg(0) ; rsi = src
    145         movdqu      xmm0,       [rsi]             ; load row 0
    146 
    147         movsxd      rax,        dword ptr arg(1) ; rax = src_stride, sign-extended
    148         mov         rdi,        arg(2) ; rdi = dst
    149 
    150         movdqu      xmm1,       [rsi+rax]         ; load row 1
    151         movdqu      xmm2,       [rsi+rax*2]       ; load row 2
    152 
    153         movsxd      rcx,        dword ptr arg(3) ; rcx = dst_stride, sign-extended
    154         lea         rsi,        [rsi+rax*2]
    155 
    156         movdqa      [rdi],      xmm0              ; store row 0
    157         add         rsi,        rax               ; rsi -> src row 3
    158 
    159         movdqa      [rdi+rcx],  xmm1              ; store row 1
    160         movdqa      [rdi+rcx*2],xmm2              ; store row 2
    161 
    162         lea         rdi,        [rdi+rcx*2]
    163         movdqu      xmm3,       [rsi]             ; load row 3
    164 
    165         add         rdi,        rcx               ; rdi -> dst row 3
    166         movdqu      xmm4,       [rsi+rax]         ; load row 4
    167 
    168         movdqu      xmm5,       [rsi+rax*2]       ; load row 5
    169         lea         rsi,        [rsi+rax*2]
    170 
    171         movdqa      [rdi],  xmm3                  ; store row 3
    172         add         rsi,        rax               ; rsi -> src row 6
    173 
    174         movdqa      [rdi+rcx],  xmm4              ; store row 4
    175         movdqa      [rdi+rcx*2],xmm5              ; store row 5
    176 
    177         lea         rdi,        [rdi+rcx*2]
    178         movdqu      xmm0,       [rsi]             ; load row 6
    179 
    180         add         rdi,        rcx               ; rdi -> dst row 6
    181         movdqu      xmm1,       [rsi+rax]         ; load row 7
    182 
    183         movdqu      xmm2,       [rsi+rax*2]       ; load row 8
    184         lea         rsi,        [rsi+rax*2]
    185 
    186         movdqa      [rdi],      xmm0              ; store row 6
    187         add         rsi,        rax               ; rsi -> src row 9
    188 
    189         movdqa      [rdi+rcx],  xmm1              ; store row 7
    190 
    191         movdqa      [rdi+rcx*2],    xmm2          ; store row 8
    192         movdqu      xmm3,       [rsi]             ; load row 9
    193 
    194         movdqu      xmm4,       [rsi+rax]         ; load row 10
    195         lea         rdi,        [rdi+rcx*2]
    196 
    197         add         rdi,        rcx               ; rdi -> dst row 9
    198         movdqu      xmm5,       [rsi+rax*2]       ; load row 11
    199 
    200         lea         rsi,        [rsi+rax*2]
    201         movdqa      [rdi],  xmm3                  ; store row 9
    202 
    203         add         rsi,        rax               ; rsi -> src row 12
    204         movdqa      [rdi+rcx],  xmm4              ; store row 10
    205 
    206         movdqa      [rdi+rcx*2],xmm5              ; store row 11
    207         movdqu      xmm0,       [rsi]             ; load row 12
    208 
    209         lea         rdi,        [rdi+rcx*2]
    210         movdqu      xmm1,       [rsi+rax]         ; load row 13
    211 
    212         add         rdi,        rcx               ; rdi -> dst row 12
    213         movdqu      xmm2,       [rsi+rax*2]       ; load row 14
    214 
    215         lea         rsi,        [rsi+rax*2]       ; rsi -> src row 14 (only one row left, no extra add)
    216         movdqa      [rdi],      xmm0              ; store row 12
    217 
    218         movdqa      [rdi+rcx],  xmm1              ; store row 13
    219         movdqa      [rdi+rcx*2],xmm2              ; store row 14
    220 
    221         movdqu      xmm3,       [rsi+rax]         ; load row 15
    222         lea         rdi,        [rdi+rcx*2]       ; rdi -> dst row 14
    223 
    224         movdqa      [rdi+rcx],  xmm3              ; store row 15
    225 
    226     ; begin epilog
    227     pop rdi
    228     pop rsi
    229     UNSHADOW_ARGS
    230     pop         rbp
    231     ret
    232