Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     ;-----------------------------------------------------------------------
     ; SAD16_LOAD_ROW dst, src_addr, ref_addr
     ;   Loads one 16-byte source row and 24 bytes of the reference row.
     ;   Out:   xmm0 = src[0..15]
     ;          dst  = ref[0..15]
     ;          xmm3 = ref[8..23]
     ;   Clobb: xmm2
     ;   The source row is fetched with movdqa, so src_addr must be 16-byte
     ;   aligned. The reference row is fetched with three 8-byte loads and
     ;   may sit at any address.
     ;-----------------------------------------------------------------------
     %macro SAD16_LOAD_ROW 3
             movdqa          xmm0,       XMMWORD PTR [%2]
             movq            %1,         MMWORD PTR [%3]
             movq            xmm3,       MMWORD PTR [%3+8]
             movq            xmm2,       MMWORD PTR [%3+16]
             punpcklqdq      %1,         xmm3            ; dst  = ref[0..15]
             punpcklqdq      xmm3,       xmm2            ; xmm3 = ref[8..23]
     %endmacro

     ;-----------------------------------------------------------------------
     ; SAD16_ACCUM_ROW dst
     ;   In:    xmm0 = src[0..15], dst = ref[0..15], xmm3 = ref[8..23]
     ;          (the state left by SAD16_LOAD_ROW)
     ;   Out:   dst word i (i = 0..7) = sum over k = 0..15 of
     ;          |src[k] - ref[i+k]|, i.e. the row SAD at eight consecutive
     ;          horizontal reference offsets.
     ;   Clobb: xmm0, xmm2, xmm3, xmm4
     ;
     ;   mpsadbw immediates: 0x0 compares source bytes 0-3 with the sliding
     ;   4-byte reference windows that start at byte 0; 0x5 compares source
     ;   bytes 4-7 with the windows that start at byte 4. Two such pairs
     ;   (low and high source halves) cover all 16 columns.
     ;-----------------------------------------------------------------------
     %macro SAD16_ACCUM_ROW 1
             movdqa          xmm2,       %1
             mpsadbw         %1,         xmm0,  0x0      ; columns 0-3
             mpsadbw         xmm2,       xmm0,  0x5      ; columns 4-7

             psrldq          xmm0,       8               ; src[8..15] -> low half

             movdqa          xmm4,       xmm3
             mpsadbw         xmm3,       xmm0,  0x0      ; columns 8-11
             mpsadbw         xmm4,       xmm0,  0x5      ; columns 12-15

             paddw           %1,         xmm2
             paddw           %1,         xmm3
             paddw           %1,         xmm4
     %endmacro

     ;-----------------------------------------------------------------------
     ; PROCESS_16X2X8 first
     ;   Processes two rows of a 16-pixel-wide block and adds their SADs for
     ;   eight horizontal reference offsets into the eight words of xmm1.
     ;
     ;   first != 0 : xmm1 is initialised from the first row (old contents
     ;                are discarded).
     ;   first == 0 : both rows are added to the running totals in xmm1.
     ;
     ;   In:    rsi = source row (16-byte aligned), rax = source stride
     ;          rdi = reference row,                rdx = reference stride
     ;   Out:   rsi, rdi advanced by two rows; xmm1 updated
     ;   Clobb: xmm0, xmm2, xmm3, xmm4, xmm5
     ;   Reads 24 bytes from each reference row.
     ;-----------------------------------------------------------------------
     %macro PROCESS_16X2X8 1
     %if %1
             SAD16_LOAD_ROW  xmm1, rsi, rdi
             SAD16_ACCUM_ROW xmm1
     %else
             SAD16_LOAD_ROW  xmm5, rsi, rdi
             SAD16_ACCUM_ROW xmm5

             paddw           xmm1,       xmm5
     %endif
             SAD16_LOAD_ROW  xmm5, rsi+rax, rdi+rdx

             ; Step both pointers to the next row pair once the second row
             ; has been loaded.
             lea             rsi,        [rsi+rax*2]
             lea             rdi,        [rdi+rdx*2]

             SAD16_ACCUM_ROW xmm5

             paddw           xmm1,       xmm5
     %endmacro
     85 
     ;-----------------------------------------------------------------------
     ; SAD8_LOAD_ROW dst, src_addr, ref_addr
     ;   Loads one 8-byte source row and 16 bytes of the reference row.
     ;   Out:   xmm0 = src[0..7] (upper half zero), dst = ref[0..15]
     ;   Clobb: xmm3
     ;   Every load is 8 bytes wide, so no alignment is required.
     ;-----------------------------------------------------------------------
     %macro SAD8_LOAD_ROW 3
             movq            xmm0,       MMWORD PTR [%2]
             movq            %1,         MMWORD PTR [%3]
             movq            xmm3,       MMWORD PTR [%3+8]
             punpcklqdq      %1,         xmm3            ; dst = ref[0..15]
     %endmacro

     ;-----------------------------------------------------------------------
     ; SAD8_ACCUM_ROW dst
     ;   In:    xmm0 = src[0..7], dst = ref[0..15]
     ;   Out:   dst word i (i = 0..7) = sum over k = 0..7 of
     ;          |src[k] - ref[i+k]|
     ;   Clobb: xmm2
     ;
     ;   mpsadbw 0x0 covers source bytes 0-3 against reference windows that
     ;   start at byte 0; 0x5 covers source bytes 4-7 against windows that
     ;   start at byte 4.
     ;-----------------------------------------------------------------------
     %macro SAD8_ACCUM_ROW 1
             movdqa          xmm2,       %1
             mpsadbw         %1,         xmm0,  0x0      ; columns 0-3
             mpsadbw         xmm2,       xmm0,  0x5      ; columns 4-7
             paddw           %1,         xmm2
     %endmacro

     ;-----------------------------------------------------------------------
     ; PROCESS_8X2X8 first
     ;   Processes two rows of an 8-pixel-wide block and adds their SADs for
     ;   eight horizontal reference offsets into the eight words of xmm1.
     ;
     ;   first != 0 : xmm1 is initialised from the first row.
     ;   first == 0 : both rows are added to the running totals in xmm1.
     ;
     ;   In:    rsi = source row,    rax = source stride
     ;          rdi = reference row, rdx = reference stride
     ;   Out:   rsi, rdi advanced by two rows; xmm1 updated
     ;   Clobb: xmm0, xmm2, xmm3, xmm5
     ;   Reads 16 bytes from each reference row.
     ;-----------------------------------------------------------------------
     %macro PROCESS_8X2X8 1
     %if %1
             SAD8_LOAD_ROW   xmm1, rsi, rdi
             SAD8_ACCUM_ROW  xmm1
     %else
             SAD8_LOAD_ROW   xmm5, rsi, rdi
             SAD8_ACCUM_ROW  xmm5

             paddw           xmm1,       xmm5
     %endif
             SAD8_LOAD_ROW   xmm5, rsi+rax, rdi+rdx

             ; Step both pointers to the next row pair once the second row
             ; has been loaded.
             lea             rsi,        [rsi+rax*2]
             lea             rdi,        [rdi+rdx*2]

             SAD8_ACCUM_ROW  xmm5

             paddw           xmm1,       xmm5
     %endmacro
    125 
    ;-----------------------------------------------------------------------
    ; SAD4_LOAD_ROW dst, src_addr, ref_addr
    ;   Loads one 4-byte source row and 16 bytes of the reference row.
    ;   Out:   xmm0 = src[0..3] (remaining bytes zero), dst = ref[0..15]
    ;   Clobb: xmm3
    ;-----------------------------------------------------------------------
    %macro SAD4_LOAD_ROW 3
            movd            xmm0,       [%2]
            movq            %1,         MMWORD PTR [%3]
            movq            xmm3,       MMWORD PTR [%3+8]
            punpcklqdq      %1,         xmm3            ; dst = ref[0..15]
    %endmacro

    ;-----------------------------------------------------------------------
    ; PROCESS_4X2X8 first
    ;   Processes two rows of a 4-pixel-wide block. For each row a single
    ;   mpsadbw (immediate 0x0) yields, in word i (i = 0..7), the sum over
    ;   k = 0..3 of |src[k] - ref[i+k]|; the results go into xmm1.
    ;
    ;   first != 0 : xmm1 is initialised from the first row.
    ;   first == 0 : both rows are added to the running totals in xmm1.
    ;
    ;   In:    rsi = source row,    rax = source stride
    ;          rdi = reference row, rdx = reference stride
    ;   Out:   rsi, rdi advanced by two rows; xmm1 updated
    ;   Clobb: xmm0, xmm3, xmm5
    ;   Reads 16 bytes from each reference row.
    ;-----------------------------------------------------------------------
    %macro PROCESS_4X2X8 1
    %if %1
            SAD4_LOAD_ROW   xmm1, rsi, rdi
            mpsadbw         xmm1,       xmm0,  0x0
    %else
            SAD4_LOAD_ROW   xmm5, rsi, rdi
            mpsadbw         xmm5,       xmm0,  0x0

            paddw           xmm1,       xmm5
    %endif
            SAD4_LOAD_ROW   xmm5, rsi+rax, rdi+rdx

            ; Step both pointers to the next row pair once the second row
            ; has been loaded.
            lea             rsi,        [rsi+rax*2]
            lea             rdi,        [rdi+rdx*2]

            mpsadbw         xmm5,       xmm0,  0x0

            paddw           xmm1,       xmm5
    %endmacro
    156 
    157 
    ;-----------------------------------------------------------------------
    ; void vp8_sad16x16x8_sse4(
    ;     const unsigned char *src_ptr,
    ;     int  src_stride,
    ;     const unsigned char *ref_ptr,
    ;     int  ref_stride,
    ;     unsigned short *sad_array);
    ;
    ; Sum of absolute differences of one 16x16 source block against eight
    ; reference blocks at consecutive horizontal positions:
    ;     sad_array[i] = SAD(src 16x16, ref block at ref_ptr + i), i = 0..7
    ;
    ; Requirements visible in this file:
    ;   * each source row is read with movdqa, so src_ptr and src_stride
    ;     must keep every row 16-byte aligned;
    ;   * 24 bytes are read from each of the 16 reference rows.
    ; The eight totals are 16-bit words; the worst case 16*16*255 = 65280
    ; still fits.
    ; Uses xmm0-xmm5, rax, rdx; rsi and rdi are saved and restored.
    ;-----------------------------------------------------------------------
    global sym(vp8_sad16x16x8_sse4)
    sym(vp8_sad16x16x8_sse4):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        push        rsi
        push        rdi
        ; end prolog

            mov             rsi,        arg(0)           ;src_ptr
            mov             rdi,        arg(2)           ;ref_ptr

            ; strides are C ints: sign-extend them for 64-bit addressing
            movsxd          rax,        dword ptr arg(1) ;src_stride
            movsxd          rdx,        dword ptr arg(3) ;ref_stride

            PROCESS_16X2X8 1                             ; rows 0-1 initialise xmm1
    %rep 7
            PROCESS_16X2X8 0                             ; rows 2-15 accumulate
    %endrep

            mov             rdi,        arg(4)           ;Results
            ; unaligned store: the prototype does not promise that
            ; sad_array is 16-byte aligned
            movdqu          XMMWORD PTR [rdi],    xmm1

        ; begin epilog
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
    197 
    198 
    ;-----------------------------------------------------------------------
    ; void vp8_sad16x8x8_sse4(
    ;     const unsigned char *src_ptr,
    ;     int  src_stride,
    ;     const unsigned char *ref_ptr,
    ;     int  ref_stride,
    ;     unsigned short *sad_array
    ; );
    ;
    ; Sum of absolute differences of one 16-wide, 8-row source block
    ; against eight reference blocks at consecutive horizontal positions:
    ;     sad_array[i] = SAD(src block, ref block at ref_ptr + i), i = 0..7
    ;
    ; Requirements visible in this file:
    ;   * each source row is read with movdqa, so src_ptr and src_stride
    ;     must keep every row 16-byte aligned;
    ;   * 24 bytes are read from each of the 8 reference rows.
    ; Uses xmm0-xmm5, rax, rdx; rsi and rdi are saved and restored.
    ;-----------------------------------------------------------------------
    global sym(vp8_sad16x8x8_sse4)
    sym(vp8_sad16x8x8_sse4):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        push        rsi
        push        rdi
        ; end prolog

            mov             rsi,        arg(0)           ;src_ptr
            mov             rdi,        arg(2)           ;ref_ptr

            ; strides are C ints: sign-extend them for 64-bit addressing
            movsxd          rax,        dword ptr arg(1) ;src_stride
            movsxd          rdx,        dword ptr arg(3) ;ref_stride

            PROCESS_16X2X8 1                             ; rows 0-1 initialise xmm1
    %rep 3
            PROCESS_16X2X8 0                             ; rows 2-7 accumulate
    %endrep

            mov             rdi,        arg(4)           ;Results
            ; unaligned store: the prototype does not promise that
            ; sad_array is 16-byte aligned
            movdqu          XMMWORD PTR [rdi],    xmm1

        ; begin epilog
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
    235 
    236 
    ;-----------------------------------------------------------------------
    ; void vp8_sad8x8x8_sse4(
    ;     const unsigned char *src_ptr,
    ;     int  src_stride,
    ;     const unsigned char *ref_ptr,
    ;     int  ref_stride,
    ;     unsigned short *sad_array
    ; );
    ;
    ; Sum of absolute differences of one 8x8 source block against eight
    ; reference blocks at consecutive horizontal positions:
    ;     sad_array[i] = SAD(src 8x8, ref block at ref_ptr + i), i = 0..7
    ;
    ; 16 bytes are read from each of the 8 reference rows; source rows are
    ; read 8 bytes at a time with no alignment requirement.
    ; Uses xmm0, xmm1, xmm2, xmm3, xmm5, rax, rdx; rsi and rdi are saved
    ; and restored.
    ;-----------------------------------------------------------------------
    global sym(vp8_sad8x8x8_sse4)
    sym(vp8_sad8x8x8_sse4):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        push        rsi
        push        rdi
        ; end prolog

            mov             rsi,        arg(0)           ;src_ptr
            mov             rdi,        arg(2)           ;ref_ptr

            ; strides are C ints: sign-extend them for 64-bit addressing
            movsxd          rax,        dword ptr arg(1) ;src_stride
            movsxd          rdx,        dword ptr arg(3) ;ref_stride

            PROCESS_8X2X8 1                              ; rows 0-1 initialise xmm1
    %rep 3
            PROCESS_8X2X8 0                              ; rows 2-7 accumulate
    %endrep

            mov             rdi,        arg(4)           ;Results
            ; unaligned store: the prototype does not promise that
            ; sad_array is 16-byte aligned
            movdqu          XMMWORD PTR [rdi],    xmm1

        ; begin epilog
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
    273 
    274 
    ;-----------------------------------------------------------------------
    ; void vp8_sad8x16x8_sse4(
    ;     const unsigned char *src_ptr,
    ;     int  src_stride,
    ;     const unsigned char *ref_ptr,
    ;     int  ref_stride,
    ;     unsigned short *sad_array
    ; );
    ;
    ; Sum of absolute differences of one 8-wide, 16-row source block
    ; against eight reference blocks at consecutive horizontal positions:
    ;     sad_array[i] = SAD(src block, ref block at ref_ptr + i), i = 0..7
    ;
    ; 16 bytes are read from each of the 16 reference rows; source rows are
    ; read 8 bytes at a time with no alignment requirement.
    ; Uses xmm0, xmm1, xmm2, xmm3, xmm5, rax, rdx; rsi and rdi are saved
    ; and restored.
    ;-----------------------------------------------------------------------
    global sym(vp8_sad8x16x8_sse4)
    sym(vp8_sad8x16x8_sse4):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        push        rsi
        push        rdi
        ; end prolog

            mov             rsi,        arg(0)           ;src_ptr
            mov             rdi,        arg(2)           ;ref_ptr

            ; strides are C ints: sign-extend them for 64-bit addressing
            movsxd          rax,        dword ptr arg(1) ;src_stride
            movsxd          rdx,        dword ptr arg(3) ;ref_stride

            PROCESS_8X2X8 1                              ; rows 0-1 initialise xmm1
    %rep 7
            PROCESS_8X2X8 0                              ; rows 2-15 accumulate
    %endrep

            mov             rdi,        arg(4)           ;Results
            ; unaligned store: the prototype does not promise that
            ; sad_array is 16-byte aligned
            movdqu          XMMWORD PTR [rdi],    xmm1

        ; begin epilog
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
    314 
    315 
    ;-----------------------------------------------------------------------
    ; void vp8_sad4x4x8_sse4(
    ;     const unsigned char *src_ptr,
    ;     int  src_stride,
    ;     const unsigned char *ref_ptr,
    ;     int  ref_stride,
    ;     unsigned short *sad_array
    ; );
    ;
    ; Sum of absolute differences of one 4x4 source block against eight
    ; reference blocks at consecutive horizontal positions:
    ;     sad_array[i] = SAD(src 4x4, ref block at ref_ptr + i), i = 0..7
    ;
    ; 16 bytes are read from each of the 4 reference rows; source rows are
    ; read 4 bytes at a time with no alignment requirement.
    ; Uses xmm0, xmm1, xmm3, xmm5, rax, rdx; rsi and rdi are saved and
    ; restored.
    ;-----------------------------------------------------------------------
    global sym(vp8_sad4x4x8_sse4)
    sym(vp8_sad4x4x8_sse4):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        push        rsi
        push        rdi
        ; end prolog

            mov             rsi,        arg(0)           ;src_ptr
            mov             rdi,        arg(2)           ;ref_ptr

            ; strides are C ints: sign-extend them for 64-bit addressing
            movsxd          rax,        dword ptr arg(1) ;src_stride
            movsxd          rdx,        dword ptr arg(3) ;ref_stride

            PROCESS_4X2X8 1                              ; rows 0-1 initialise xmm1
            PROCESS_4X2X8 0                              ; rows 2-3 accumulate

            mov             rdi,        arg(4)           ;Results
            ; unaligned store: the prototype does not promise that
            ; sad_array is 16-byte aligned
            movdqu          XMMWORD PTR [rdi],    xmm1

        ; begin epilog
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
    350 
    351 
    352 
    353 
    354