;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
     10 
     11 %include "vpx_ports/x86_abi_support.asm"
     12 
     13 %macro STACK_FRAME_CREATE_X3 0
     14 %if ABI_IS_32BIT
     15   %define     src_ptr       rsi
     16   %define     src_stride    rax
     17   %define     ref_ptr       rdi
     18   %define     ref_stride    rdx
     19   %define     end_ptr       rcx
     20   %define     ret_var       rbx
     21   %define     result_ptr    arg(4)
     22   %define     max_sad       arg(4)
     23   %define     height        dword ptr arg(4)
     24     push        rbp
     25     mov         rbp,        rsp
     26     push        rsi
     27     push        rdi
     28     push        rbx
     29 
     30     mov         rsi,        arg(0)              ; src_ptr
     31     mov         rdi,        arg(2)              ; ref_ptr
     32 
     33     movsxd      rax,        dword ptr arg(1)    ; src_stride
     34     movsxd      rdx,        dword ptr arg(3)    ; ref_stride
     35 %else
     36   %if LIBVPX_YASM_WIN64
     37     SAVE_XMM 7, u
     38     %define     src_ptr     rcx
     39     %define     src_stride  rdx
     40     %define     ref_ptr     r8
     41     %define     ref_stride  r9
     42     %define     end_ptr     r10
     43     %define     ret_var     r11
     44     %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
     45     %define     max_sad     [rsp+xmm_stack_space+8+4*8]
     46     %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
     47   %else
     48     %define     src_ptr     rdi
     49     %define     src_stride  rsi
     50     %define     ref_ptr     rdx
     51     %define     ref_stride  rcx
     52     %define     end_ptr     r9
     53     %define     ret_var     r10
     54     %define     result_ptr  r8
     55     %define     max_sad     r8
     56     %define     height      r8
     57   %endif
     58 %endif
     59 
     60 %endmacro
     61 
     62 %macro STACK_FRAME_DESTROY_X3 0
     63   %define     src_ptr
     64   %define     src_stride
     65   %define     ref_ptr
     66   %define     ref_stride
     67   %define     end_ptr
     68   %define     ret_var
     69   %define     result_ptr
     70   %define     max_sad
     71   %define     height
     72 
     73 %if ABI_IS_32BIT
     74     pop         rbx
     75     pop         rdi
     76     pop         rsi
     77     pop         rbp
     78 %else
     79   %if LIBVPX_YASM_WIN64
     80     RESTORE_XMM
     81   %endif
     82 %endif
     83     ret
     84 %endmacro
     85 
     86 
     87 ;void vp8_copy32xn_sse3(
     88 ;    unsigned char *src_ptr,
     89 ;    int  src_stride,
     90 ;    unsigned char *dst_ptr,
     91 ;    int  dst_stride,
     92 ;    int height);
     93 global sym(vp8_copy32xn_sse3) PRIVATE
     94 sym(vp8_copy32xn_sse3):
     95 
     96     STACK_FRAME_CREATE_X3
     97 
     98 .block_copy_sse3_loopx4:
     99         lea             end_ptr,    [src_ptr+src_stride*2]
    100 
    101         movdqu          xmm0,       XMMWORD PTR [src_ptr]
    102         movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
    103         movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
    104         movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
    105         movdqu          xmm4,       XMMWORD PTR [end_ptr]
    106         movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
    107         movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
    108         movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]
    109 
    110         lea             src_ptr,    [src_ptr+src_stride*4]
    111 
    112         lea             end_ptr,    [ref_ptr+ref_stride*2]
    113 
    114         movdqa          XMMWORD PTR [ref_ptr], xmm0
    115         movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
    116         movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
    117         movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
    118         movdqa          XMMWORD PTR [end_ptr], xmm4
    119         movdqa          XMMWORD PTR [end_ptr + 16], xmm5
    120         movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
    121         movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
    122 
    123         lea             ref_ptr,    [ref_ptr+ref_stride*4]
    124 
    125         sub             height,     4
    126         cmp             height,     4
    127         jge             .block_copy_sse3_loopx4
    128 
    129         ;Check to see if there is more rows need to be copied.
    130         cmp             height, 0
    131         je              .copy_is_done
    132 
    133 .block_copy_sse3_loop:
    134         movdqu          xmm0,       XMMWORD PTR [src_ptr]
    135         movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
    136         lea             src_ptr,    [src_ptr+src_stride]
    137 
    138         movdqa          XMMWORD PTR [ref_ptr], xmm0
    139         movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
    140         lea             ref_ptr,    [ref_ptr+ref_stride]
    141 
    142         sub             height,     1
    143         jne             .block_copy_sse3_loop
    144 
    145 .copy_is_done:
    146     STACK_FRAME_DESTROY_X3
    147