Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 %macro PROCESS_16X2X8 1
        ; PROCESS_16X2X8 first
        ;
        ; Handles two rows of 16 source bytes. For each row it forms eight
        ; 16-bit sums of absolute differences: word i is the SAD of the 16
        ; source bytes against the 16 reference bytes starting at offset i
        ; (i = 0..7). The sums of both rows are folded into xmm1.
        ;
        ; In:    rsi = current source row,    rax = source stride
        ;        rdi = current reference row, rdx = reference stride
        ;        %1  = nonzero: the first row's sums overwrite xmm1
        ;              zero:    the first row's sums are added to xmm1
        ; Out:   xmm1 = eight accumulated word sums (paddw, wraps at 16 bits)
        ;        rsi advanced by 2*rax, rdi advanced by 2*rdx
        ; Clobb: xmm0, xmm2, xmm3, xmm4, xmm5
        ; Note:  the source rows are loaded with movdqa, so [rsi] and
        ;        [rsi+rax] must be 16-byte aligned. 24 reference bytes are
        ;        read per row.
     15 %if %1
     16         movdqa          xmm0,       XMMWORD PTR [rsi]           ; 16 source bytes (aligned load)
     17         movq            xmm1,       MMWORD PTR [rdi]            ; ref bytes 0..7
     18         movq            xmm3,       MMWORD PTR [rdi+8]          ; ref bytes 8..15
     19         movq            xmm2,       MMWORD PTR [rdi+16]         ; ref bytes 16..23
     20         punpcklqdq      xmm1,       xmm3                        ; xmm1 = ref bytes 0..15
     21         punpcklqdq      xmm3,       xmm2                        ; xmm3 = ref bytes 8..23
     22 
     23         movdqa          xmm2,       xmm1                        ; mpsadbw overwrites its destination, keep a copy
     24         mpsadbw         xmm1,       xmm0,  0x0                  ; src[0..3]   vs ref[i..i+3],     i = 0..7
     25         mpsadbw         xmm2,       xmm0,  0x5                  ; src[4..7]   vs ref[i+4..i+7]
     26 
     27         psrldq          xmm0,       8                           ; bring src bytes 8..15 down to bytes 0..7
     28 
     29         movdqa          xmm4,       xmm3
     30         mpsadbw         xmm3,       xmm0,  0x0                  ; src[8..11]  vs ref[i+8..i+11]
     31         mpsadbw         xmm4,       xmm0,  0x5                  ; src[12..15] vs ref[i+12..i+15]
     32 
     33         paddw           xmm1,       xmm2                        ; xmm1 starts the accumulator with this row
     34         paddw           xmm1,       xmm3
     35         paddw           xmm1,       xmm4
     36 %else
        ; Same computation as above, but built in xmm5 so the running total
        ; already held in xmm1 is preserved and then added to.
     37         movdqa          xmm0,       XMMWORD PTR [rsi]
     38         movq            xmm5,       MMWORD PTR [rdi]
     39         movq            xmm3,       MMWORD PTR [rdi+8]
     40         movq            xmm2,       MMWORD PTR [rdi+16]
     41         punpcklqdq      xmm5,       xmm3
     42         punpcklqdq      xmm3,       xmm2
     43 
     44         movdqa          xmm2,       xmm5
     45         mpsadbw         xmm5,       xmm0,  0x0
     46         mpsadbw         xmm2,       xmm0,  0x5
     47 
     48         psrldq          xmm0,       8
     49 
     50         movdqa          xmm4,       xmm3
     51         mpsadbw         xmm3,       xmm0,  0x0
     52         mpsadbw         xmm4,       xmm0,  0x5
     53 
     54         paddw           xmm5,       xmm2
     55         paddw           xmm5,       xmm3
     56         paddw           xmm5,       xmm4
     57 
     58         paddw           xmm1,       xmm5                        ; accumulate this row into the running total
     59 %endif
        ; Second row of the pair: one stride further in both buffers.
     60         movdqa          xmm0,       XMMWORD PTR [rsi + rax]
     61         movq            xmm5,       MMWORD PTR [rdi+ rdx]
     62         movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
     63         movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
     64         punpcklqdq      xmm5,       xmm3
     65         punpcklqdq      xmm3,       xmm2
     66 
     67         lea             rsi,        [rsi+rax*2]                 ; step both cursors past the two rows just loaded
     68         lea             rdi,        [rdi+rdx*2]
     69 
     70         movdqa          xmm2,       xmm5
     71         mpsadbw         xmm5,       xmm0,  0x0
     72         mpsadbw         xmm2,       xmm0,  0x5
     73 
     74         psrldq          xmm0,       8
     75         movdqa          xmm4,       xmm3
     76         mpsadbw         xmm3,       xmm0,  0x0
     77         mpsadbw         xmm4,       xmm0,  0x5
     78 
     79         paddw           xmm5,       xmm2
     80         paddw           xmm5,       xmm3
     81         paddw           xmm5,       xmm4
     82 
     83         paddw           xmm1,       xmm5
     84 %endmacro
     85 
     86 %macro PROCESS_8X2X8 1
        ; PROCESS_8X2X8 first
        ;
        ; Handles two rows of 8 source bytes. For each row it forms eight
        ; 16-bit sums of absolute differences: word i is the SAD of the 8
        ; source bytes against the 8 reference bytes starting at offset i
        ; (i = 0..7). The sums of both rows are folded into xmm1.
        ;
        ; In:    rsi = current source row,    rax = source stride
        ;        rdi = current reference row, rdx = reference stride
        ;        %1  = nonzero: the first row's sums overwrite xmm1
        ;              zero:    the first row's sums are added to xmm1
        ; Out:   xmm1 = eight accumulated word sums (paddw, wraps at 16 bits)
        ;        rsi advanced by 2*rax, rdi advanced by 2*rdx
        ; Clobb: xmm0, xmm2, xmm3, xmm5
        ; Note:  all loads are 8-byte movq loads; 16 reference bytes are read
        ;        per row.
     87 %if %1
     88         movq            xmm0,       MMWORD PTR [rsi]            ; 8 source bytes
     89         movq            xmm1,       MMWORD PTR [rdi]            ; ref bytes 0..7
     90         movq            xmm3,       MMWORD PTR [rdi+8]          ; ref bytes 8..15
     91         punpcklqdq      xmm1,       xmm3                        ; xmm1 = ref bytes 0..15
     92 
     93         movdqa          xmm2,       xmm1                        ; mpsadbw overwrites its destination, keep a copy
     94         mpsadbw         xmm1,       xmm0,  0x0                  ; src[0..3] vs ref[i..i+3],   i = 0..7
     95         mpsadbw         xmm2,       xmm0,  0x5                  ; src[4..7] vs ref[i+4..i+7]
     96         paddw           xmm1,       xmm2                        ; xmm1 starts the accumulator with this row
     97 %else
        ; Same computation, built in xmm5 so the total in xmm1 is kept.
     98         movq            xmm0,       MMWORD PTR [rsi]
     99         movq            xmm5,       MMWORD PTR [rdi]
    100         movq            xmm3,       MMWORD PTR [rdi+8]
    101         punpcklqdq      xmm5,       xmm3
    102 
    103         movdqa          xmm2,       xmm5
    104         mpsadbw         xmm5,       xmm0,  0x0
    105         mpsadbw         xmm2,       xmm0,  0x5
    106         paddw           xmm5,       xmm2
    107 
    108         paddw           xmm1,       xmm5                        ; accumulate this row into the running total
    109 %endif
        ; Second row of the pair: one stride further in both buffers.
    110         movq            xmm0,       MMWORD PTR [rsi + rax]
    111         movq            xmm5,       MMWORD PTR [rdi+ rdx]
    112         movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
    113         punpcklqdq      xmm5,       xmm3
    114 
    115         lea             rsi,        [rsi+rax*2]                 ; step both cursors past the two rows just loaded
    116         lea             rdi,        [rdi+rdx*2]
    117 
    118         movdqa          xmm2,       xmm5
    119         mpsadbw         xmm5,       xmm0,  0x0
    120         mpsadbw         xmm2,       xmm0,  0x5
    121         paddw           xmm5,       xmm2
    122 
    123         paddw           xmm1,       xmm5
    124 %endmacro
    125 
    126 %macro PROCESS_4X2X8 1
        ; PROCESS_4X2X8 first
        ;
        ; Handles two rows of 4 source bytes. For each row it forms eight
        ; 16-bit sums of absolute differences: word i is the SAD of the 4
        ; source bytes against the 4 reference bytes starting at offset i
        ; (i = 0..7). The sums of both rows are folded into xmm1.
        ;
        ; In:    rsi = current source row,    rax = source stride
        ;        rdi = current reference row, rdx = reference stride
        ;        %1  = nonzero: the first row's sums overwrite xmm1
        ;              zero:    the first row's sums are added to xmm1
        ; Out:   xmm1 = eight accumulated word sums (paddw, wraps at 16 bits)
        ;        rsi advanced by 2*rax, rdi advanced by 2*rdx
        ; Clobb: xmm0, xmm3, xmm5
        ; Note:  16 reference bytes are loaded per row.
    127 %if %1
    128         movd            xmm0,       [rsi]                       ; 4 source bytes
    129         movq            xmm1,       MMWORD PTR [rdi]            ; ref bytes 0..7
    130         movq            xmm3,       MMWORD PTR [rdi+8]          ; ref bytes 8..15
    131         punpcklqdq      xmm1,       xmm3                        ; xmm1 = ref bytes 0..15
    132 
    133         mpsadbw         xmm1,       xmm0,  0x0                  ; src[0..3] vs ref[i..i+3], i = 0..7; starts the accumulator
    134 %else
        ; Same computation, built in xmm5 so the total in xmm1 is kept.
    135         movd            xmm0,       [rsi]
    136         movq            xmm5,       MMWORD PTR [rdi]
    137         movq            xmm3,       MMWORD PTR [rdi+8]
    138         punpcklqdq      xmm5,       xmm3
    139 
    140         mpsadbw         xmm5,       xmm0,  0x0
    141 
    142         paddw           xmm1,       xmm5                        ; accumulate this row into the running total
    143 %endif
        ; Second row of the pair: one stride further in both buffers.
    144         movd            xmm0,       [rsi + rax]
    145         movq            xmm5,       MMWORD PTR [rdi+ rdx]
    146         movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
    147         punpcklqdq      xmm5,       xmm3
    148 
    149         lea             rsi,        [rsi+rax*2]                 ; step both cursors past the two rows just loaded
    150         lea             rdi,        [rdi+rdx*2]
    151 
    152         mpsadbw         xmm5,       xmm0,  0x0
    153 
    154         paddw           xmm1,       xmm5
    155 %endmacro
    156 
    157 %macro WRITE_AS_INTS 0
        ; WRITE_AS_INTS
        ;
        ; Zero-extends the eight 16-bit sums in xmm1 to eight 32-bit values
        ; and stores them (32 bytes) at the address taken from arg(4).
        ;
        ; In:    xmm1 = eight word sums
        ; Clobb: rdi (overwritten with the result pointer), xmm0, xmm1, xmm2
        ; Note:  both stores are movdqa, so the result buffer must be
        ;        16-byte aligned.
    158     mov             rdi,        arg(4)           ;Results
    159     pxor            xmm0, xmm0                   ; zero source for the interleave below
    160     movdqa          xmm2, xmm1
    161     punpcklwd       xmm1, xmm0                   ; words 0..3 -> dwords 0..3
    162     punpckhwd       xmm2, xmm0                   ; words 4..7 -> dwords 4..7
    163 
    164     movdqa          [rdi],    xmm1
    165     movdqa          [rdi + 16],    xmm2
    166 %endmacro
    167 
    168 ;void vp9_sad16x16x8_sse4(
    169 ;    const unsigned char *src_ptr,
    170 ;    int  src_stride,
    171 ;    const unsigned char *ref_ptr,
    172 ;    int  ref_stride,
    173 ;    unsigned int *sad_array);
        ;
        ; Computes eight sums of absolute differences for a 16-wide, 16-row
        ; source block: result i compares the block with the reference block
        ; starting at ref_ptr + i (i = 0..7).
        ;
        ; Out:   sad_array[0..7], stored as eight 32-bit values by
        ;        WRITE_AS_INTS (aligned stores: sad_array must be 16-byte
        ;        aligned).
        ; Note:  source rows are read with movdqa, so src_ptr and src_stride
        ;        must keep every row 16-byte aligned.
        ;        Sums are accumulated in 16-bit lanes; the worst case here,
        ;        16*16*255 = 65280, still fits.
        ; NOTE(review): sad_array is written as 32-bit elements, so it is
        ;        declared "unsigned int" here - confirm against the C prototype.
    174 global sym(vp9_sad16x16x8_sse4) PRIVATE
    175 sym(vp9_sad16x16x8_sse4):
    176     push        rbp
    177     mov         rbp, rsp
    178     SHADOW_ARGS_TO_STACK 5                       ; presumably makes the 5 arguments reachable via arg(n) - see x86_abi_support.asm
    179     push        rsi                              ; rsi/rdi are used as row cursors; restored in the epilog
    180     push        rdi
    181     ; end prolog
    182 
    183     mov             rsi,        arg(0)           ;src_ptr
    184     mov             rdi,        arg(2)           ;ref_ptr
    185 
    186     movsxd          rax,        dword ptr arg(1) ;src_stride
    187     movsxd          rdx,        dword ptr arg(3) ;ref_stride
    188 
        ; 8 invocations x 2 rows each = 16 rows; only the first one
        ; initialises the xmm1 accumulator.
    189     PROCESS_16X2X8 1
    190     PROCESS_16X2X8 0
    191     PROCESS_16X2X8 0
    192     PROCESS_16X2X8 0
    193     PROCESS_16X2X8 0
    194     PROCESS_16X2X8 0
    195     PROCESS_16X2X8 0
    196     PROCESS_16X2X8 0
    197 
    198     WRITE_AS_INTS
    199 
    200     ; begin epilog
    201     pop         rdi
    202     pop         rsi
    203     UNSHADOW_ARGS
    204     pop         rbp
    205     ret
    206 
    207 
    208 ;void vp9_sad16x8x8_sse4(
    209 ;    const unsigned char *src_ptr,
    210 ;    int  src_stride,
    211 ;    const unsigned char *ref_ptr,
    212 ;    int  ref_stride,
    213 ;    unsigned int *sad_array
    214 ;);
        ;
        ; Computes eight sums of absolute differences for a 16-wide, 8-row
        ; source block: result i compares the block with the reference block
        ; starting at ref_ptr + i (i = 0..7).
        ;
        ; Out:   sad_array[0..7], stored as eight 32-bit values by
        ;        WRITE_AS_INTS (aligned stores: sad_array must be 16-byte
        ;        aligned).
        ; Note:  source rows are read with movdqa, so src_ptr and src_stride
        ;        must keep every row 16-byte aligned.
        ; NOTE(review): sad_array is written as 32-bit elements, so it is
        ;        declared "unsigned int" here - confirm against the C prototype.
    215 global sym(vp9_sad16x8x8_sse4) PRIVATE
    216 sym(vp9_sad16x8x8_sse4):
    217     push        rbp
    218     mov         rbp, rsp
    219     SHADOW_ARGS_TO_STACK 5                       ; presumably makes the 5 arguments reachable via arg(n) - see x86_abi_support.asm
    220     push        rsi                              ; rsi/rdi are used as row cursors; restored in the epilog
    221     push        rdi
    222     ; end prolog
    223 
    224     mov             rsi,        arg(0)           ;src_ptr
    225     mov             rdi,        arg(2)           ;ref_ptr
    226 
    227     movsxd          rax,        dword ptr arg(1) ;src_stride
    228     movsxd          rdx,        dword ptr arg(3) ;ref_stride
    229 
        ; 4 invocations x 2 rows each = 8 rows; only the first one
        ; initialises the xmm1 accumulator.
    230     PROCESS_16X2X8 1
    231     PROCESS_16X2X8 0
    232     PROCESS_16X2X8 0
    233     PROCESS_16X2X8 0
    234 
    235     WRITE_AS_INTS
    236 
    237     ; begin epilog
    238     pop         rdi
    239     pop         rsi
    240     UNSHADOW_ARGS
    241     pop         rbp
    242     ret
    243 
    244 
    245 ;void vp9_sad8x8x8_sse4(
    246 ;    const unsigned char *src_ptr,
    247 ;    int  src_stride,
    248 ;    const unsigned char *ref_ptr,
    249 ;    int  ref_stride,
    250 ;    unsigned int *sad_array
    251 ;);
        ;
        ; Computes eight sums of absolute differences for an 8-wide, 8-row
        ; source block: result i compares the block with the reference block
        ; starting at ref_ptr + i (i = 0..7).
        ;
        ; Out:   sad_array[0..7], stored as eight 32-bit values by
        ;        WRITE_AS_INTS (aligned stores: sad_array must be 16-byte
        ;        aligned).
        ; NOTE(review): sad_array is written as 32-bit elements, so it is
        ;        declared "unsigned int" here - confirm against the C prototype.
    252 global sym(vp9_sad8x8x8_sse4) PRIVATE
    253 sym(vp9_sad8x8x8_sse4):
    254     push        rbp
    255     mov         rbp, rsp
    256     SHADOW_ARGS_TO_STACK 5                       ; presumably makes the 5 arguments reachable via arg(n) - see x86_abi_support.asm
    257     push        rsi                              ; rsi/rdi are used as row cursors; restored in the epilog
    258     push        rdi
    259     ; end prolog
    260 
    261     mov             rsi,        arg(0)           ;src_ptr
    262     mov             rdi,        arg(2)           ;ref_ptr
    263 
    264     movsxd          rax,        dword ptr arg(1) ;src_stride
    265     movsxd          rdx,        dword ptr arg(3) ;ref_stride
    266 
        ; 4 invocations x 2 rows each = 8 rows; only the first one
        ; initialises the xmm1 accumulator.
    267     PROCESS_8X2X8 1
    268     PROCESS_8X2X8 0
    269     PROCESS_8X2X8 0
    270     PROCESS_8X2X8 0
    271 
    272     WRITE_AS_INTS
    273 
    274     ; begin epilog
    275     pop         rdi
    276     pop         rsi
    277     UNSHADOW_ARGS
    278     pop         rbp
    279     ret
    280 
    281 
    282 ;void vp9_sad8x16x8_sse4(
    283 ;    const unsigned char *src_ptr,
    284 ;    int  src_stride,
    285 ;    const unsigned char *ref_ptr,
    286 ;    int  ref_stride,
    287 ;    unsigned int *sad_array
    288 ;);
        ;
        ; Computes eight sums of absolute differences for an 8-wide, 16-row
        ; source block: result i compares the block with the reference block
        ; starting at ref_ptr + i (i = 0..7).
        ;
        ; Out:   sad_array[0..7], stored as eight 32-bit values by
        ;        WRITE_AS_INTS (aligned stores: sad_array must be 16-byte
        ;        aligned).
        ; NOTE(review): sad_array is written as 32-bit elements, so it is
        ;        declared "unsigned int" here - confirm against the C prototype.
    289 global sym(vp9_sad8x16x8_sse4) PRIVATE
    290 sym(vp9_sad8x16x8_sse4):
    291     push        rbp
    292     mov         rbp, rsp
    293     SHADOW_ARGS_TO_STACK 5                       ; presumably makes the 5 arguments reachable via arg(n) - see x86_abi_support.asm
    294     push        rsi                              ; rsi/rdi are used as row cursors; restored in the epilog
    295     push        rdi
    296     ; end prolog
    297 
    298     mov             rsi,        arg(0)           ;src_ptr
    299     mov             rdi,        arg(2)           ;ref_ptr
    300 
    301     movsxd          rax,        dword ptr arg(1) ;src_stride
    302     movsxd          rdx,        dword ptr arg(3) ;ref_stride
    303 
        ; 8 invocations x 2 rows each = 16 rows; only the first one
        ; initialises the xmm1 accumulator.
    304     PROCESS_8X2X8 1
    305     PROCESS_8X2X8 0
    306     PROCESS_8X2X8 0
    307     PROCESS_8X2X8 0
    308     PROCESS_8X2X8 0
    309     PROCESS_8X2X8 0
    310     PROCESS_8X2X8 0
    311     PROCESS_8X2X8 0
    312 
    313     WRITE_AS_INTS
    314 
    315     ; begin epilog
    316     pop         rdi
    317     pop         rsi
    318     UNSHADOW_ARGS
    319     pop         rbp
    320     ret
    321 
    322 
    323 ;void vp9_sad4x4x8_sse4(
    324 ;    const unsigned char *src_ptr,
    325 ;    int  src_stride,
    326 ;    const unsigned char *ref_ptr,
    327 ;    int  ref_stride,
    328 ;    unsigned int *sad_array
    329 ;);
        ;
        ; Computes eight sums of absolute differences for a 4-wide, 4-row
        ; source block: result i compares the block with the reference block
        ; starting at ref_ptr + i (i = 0..7).
        ;
        ; Out:   sad_array[0..7], stored as eight 32-bit values by
        ;        WRITE_AS_INTS (aligned stores: sad_array must be 16-byte
        ;        aligned).
        ; NOTE(review): sad_array is written as 32-bit elements, so it is
        ;        declared "unsigned int" here - confirm against the C prototype.
    330 global sym(vp9_sad4x4x8_sse4) PRIVATE
    331 sym(vp9_sad4x4x8_sse4):
    332     push        rbp
    333     mov         rbp, rsp
    334     SHADOW_ARGS_TO_STACK 5                       ; presumably makes the 5 arguments reachable via arg(n) - see x86_abi_support.asm
    335     push        rsi                              ; rsi/rdi are used as row cursors; restored in the epilog
    336     push        rdi
    337     ; end prolog
    338 
    339     mov             rsi,        arg(0)           ;src_ptr
    340     mov             rdi,        arg(2)           ;ref_ptr
    341 
    342     movsxd          rax,        dword ptr arg(1) ;src_stride
    343     movsxd          rdx,        dword ptr arg(3) ;ref_stride
    344 
        ; 2 invocations x 2 rows each = 4 rows; only the first one
        ; initialises the xmm1 accumulator.
    345     PROCESS_4X2X8 1
    346     PROCESS_4X2X8 0
    347 
    348     WRITE_AS_INTS
    349 
    350     ; begin epilog
    351     pop         rdi
    352     pop         rsi
    353     UNSHADOW_ARGS
    354     pop         rbp
    355     ret
    356 
    357 
    358 
    359 
    360