Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     ;unsigned int vp8_sad16x16_wmt(
     ;    unsigned char *src_ptr,
     ;    int  src_stride,
     ;    unsigned char *ref_ptr,
     ;    int  ref_stride)
     ;
     ; Returns, in rax, the sum of absolute byte differences between the
     ; 16x16 block at src_ptr and the 16x16 block at ref_ptr.
     ;
     ; Register roles:
     ;   rsi  = current src row          rax = src_stride (sign-extended)
     ;   rdi  = current ref row          rdx = ref_stride (sign-extended)
     ;   rcx  = src_ptr + 16*src_stride, the value rsi holds after 16 rows
     ;   xmm6 = two running 16-bit sums, one in each 64-bit half
     ;
     ; The prolog/epilog macros and arg() come from
     ; vpx_ports/x86_abi_support.asm, which is not visible from this file.
     global sym(vp8_sad16x16_wmt) PRIVATE
     sym(vp8_sad16x16_wmt):
         push        rbp
         mov         rbp, rsp
         SHADOW_ARGS_TO_STACK 4
         SAVE_XMM 6
         push        rsi
         push        rdi
         ; end prolog

             mov             rsi,        arg(0) ;src_ptr
             movsxd          rax,        dword ptr arg(1) ;src_stride
             mov             rdi,        arg(2) ;ref_ptr
             movsxd          rdx,        dword ptr arg(3) ;ref_stride

             ; rcx = rsi + 16*rax; done in two steps because an address
             ; expression can scale an index by at most 8
             lea             rcx,        [rsi+rax*8]
             lea             rcx,        [rcx+rax*8]

             pxor            xmm6,       xmm6        ; running sums = 0

     .sad16x16_row_pair:

             ; Each 16-byte row is read as two 8-byte halves which are then
             ; byte-interleaved. src and ref are permuted identically, so
             ; every src byte still lines up with its ref byte for psadbw.

             ; ---- first row of the pair ----
             movq            xmm0,       QWORD PTR [rsi]
             movq            xmm2,       QWORD PTR [rsi+8]
             punpcklbw       xmm0,       xmm2

             movq            xmm1,       QWORD PTR [rdi]
             movq            xmm3,       QWORD PTR [rdi+8]
             punpcklbw       xmm1,       xmm3

             psadbw          xmm0,       xmm1        ; one sum per 64-bit half

             ; ---- second row of the pair ----
             movq            xmm4,       QWORD PTR [rsi+rax]
             movq            xmm2,       QWORD PTR [rsi+rax+8]
             punpcklbw       xmm4,       xmm2

             movq            xmm5,       QWORD PTR [rdi+rdx]
             movq            xmm3,       QWORD PTR [rdi+rdx+8]
             punpcklbw       xmm5,       xmm3

             psadbw          xmm4,       xmm5

             ; Each half gains at most 8*255 = 2040 per row, i.e. 32640 over
             ; all 16 rows, so the 16-bit adds below cannot wrap.
             paddw           xmm6,       xmm0
             paddw           xmm6,       xmm4

             ; step both pointers down two rows
             lea             rsi,        [rsi+rax*2]
             lea             rdi,        [rdi+rdx*2]

             cmp             rsi,        rcx
             jne             .sad16x16_row_pair

             ; fold the upper half's sum onto the lower half's sum
             movq            xmm0,       xmm6        ; xmm0 = low 64 bits
             psrldq          xmm6,       8           ; xmm6 = former high 64 bits
             paddw           xmm0,       xmm6

             movq            rax,        xmm0        ; return value

         ; begin epilog
         pop         rdi
         pop         rsi
         RESTORE_XMM
         UNSHADOW_ARGS
         pop         rbp
         ret
     86 
     ;unsigned int vp8_sad8x16_wmt(
     ;    unsigned char *src_ptr,
     ;    int  src_stride,
     ;    unsigned char *ref_ptr,
     ;    int  ref_stride,
     ;    int  max_sad)
     ;
     ; Returns, in rax, the sum of absolute byte differences between an
     ; 8-byte-wide, 16-row block at src_ptr and one at ref_ptr.
     ;
     ; Early exit: before every pair of rows the running sum is compared
     ; (unsigned, via ja) against max_sad; once it is larger, the partial
     ; sum collected so far is returned instead of the full SAD.
     ;
     ; Register roles:
     ;   rsi = current src row           rbx = src_stride (sign-extended)
     ;   rdi = current ref row           rdx = ref_stride (sign-extended)
     ;   rcx = src_ptr + 16*src_stride, the value rsi holds after 16 rows
     ;   mm7 = running sum               rax = copy of mm7 for the compare
     ;
     ; NOTE(review): MMX registers are used and no emms is issued here;
     ; presumably the caller resets the MMX/x87 state - confirm.
     global sym(vp8_sad8x16_wmt) PRIVATE
     sym(vp8_sad8x16_wmt):
         push        rbp
         mov         rbp, rsp
         SHADOW_ARGS_TO_STACK 5
         push        rbx
         push        rsi
         push        rdi
         ; end prolog

             mov             rsi,        arg(0) ;src_ptr
             movsxd          rbx,        dword ptr arg(1) ;src_stride
             mov             rdi,        arg(2) ;ref_ptr
             movsxd          rdx,        dword ptr arg(3) ;ref_stride

             ; rcx = rsi + 16*rbx (two steps: index scale is limited to 8)
             lea             rcx,        [rsi+rbx*8]
             lea             rcx,        [rcx+rbx*8]

             pxor            mm7,        mm7         ; running sum = 0

     .sad8x16_row_pair:

             ; give up as soon as the running sum exceeds max_sad;
             ; rax already holds the value that will be returned
             movq            rax,        mm7
             cmp             eax,        arg(4)
             ja              .sad8x16_return

             ; ---- first row of the pair ----
             movq            mm0,        QWORD PTR [rsi]
             movq            mm1,        QWORD PTR [rdi]
             psadbw          mm0,        mm1

             ; ---- second row of the pair ----
             movq            mm2,        QWORD PTR [rsi+rbx]
             movq            mm3,        QWORD PTR [rdi+rdx]
             psadbw          mm2,        mm3

             ; at most 8*255 = 2040 per row, 32640 for 16 rows: no 16-bit wrap
             paddw           mm7,        mm0
             paddw           mm7,        mm2

             ; step both pointers down two rows
             lea             rsi,        [rsi+rbx*2]
             lea             rdi,        [rdi+rdx*2]

             cmp             rsi,        rcx
             jne             .sad8x16_row_pair

             movq            rax,        mm7         ; full 16-row result

     .sad8x16_return:

         ; begin epilog
         pop         rdi
         pop         rsi
         pop         rbx
         UNSHADOW_ARGS
         pop         rbp
         ret
    149 
    150 
    ;unsigned int vp8_sad8x8_wmt(
    ;    unsigned char *src_ptr,
    ;    int  src_stride,
    ;    unsigned char *ref_ptr,
    ;    int  ref_stride,
    ;    int  max_sad)
    ;
    ; Returns, in rax, the sum of absolute byte differences between an
    ; 8x8 block at src_ptr and an 8x8 block at ref_ptr.
    ;
    ; The fifth argument is read through arg(4): before every row the
    ; running sum is compared (unsigned, via ja) against max_sad, and once
    ; it is larger the partial sum collected so far is returned.
    ;
    ; Register roles:
    ;   rsi = current src row           rbx = src_stride (sign-extended)
    ;   rdi = current ref row           rdx = ref_stride (sign-extended)
    ;   rcx = src_ptr + 8*src_stride, the value rsi holds after 8 rows
    ;   mm7 = running sum               rax = copy of mm7 for the compare
    ;
    ; NOTE(review): MMX registers are used and no emms is issued here;
    ; presumably the caller resets the MMX/x87 state - confirm.
    global sym(vp8_sad8x8_wmt) PRIVATE
    sym(vp8_sad8x8_wmt):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        push        rbx
        push        rsi
        push        rdi
        ; end prolog

            mov             rsi,        arg(0) ;src_ptr
            movsxd          rbx,        dword ptr arg(1) ;src_stride
            mov             rdi,        arg(2) ;ref_ptr
            movsxd          rdx,        dword ptr arg(3) ;ref_stride

            lea             rcx,        [rsi+rbx*8] ; rsi after 8 rows
            pxor            mm7,        mm7         ; running sum = 0

    .sad8x8_row:

            ; give up as soon as the running sum exceeds max_sad;
            ; rax already holds the value that will be returned
            movq            rax,        mm7
            cmp             eax,        arg(4)
            ja              .sad8x8_return

            movq            mm0,        QWORD PTR [rsi]
            movq            mm1,        QWORD PTR [rdi]
            psadbw          mm0,        mm1

            ; at most 8*255 = 2040 per row, 16320 for 8 rows: no 16-bit wrap
            paddw           mm7,        mm0

            ; step both pointers down one row
            add             rsi,        rbx
            add             rdi,        rdx

            cmp             rsi,        rcx
            jne             .sad8x8_row

            movq            rax,        mm7         ; full 8-row result

    .sad8x8_return:

        ; begin epilog
        pop         rdi
        pop         rsi
        pop         rbx
        UNSHADOW_ARGS
        pop         rbp
        ret
    203 
    ;unsigned int vp8_sad4x4_wmt(
    ;    unsigned char *src_ptr,
    ;    int  src_stride,
    ;    unsigned char *ref_ptr,
    ;    int  ref_stride)
    ;
    ; Returns, in rax, the sum of absolute byte differences between a
    ; 4x4 block at src_ptr and a 4x4 block at ref_ptr.
    ;
    ; Straight-line code: rows are handled in two pairs. The two 4-byte
    ; rows of a pair are byte-interleaved into one 8-byte register (src
    ; and ref permuted identically), so one psadbw covers both rows.
    ;
    ; Register roles:
    ;   rsi = src row pointer           rax = src_stride (sign-extended)
    ;   rdi = ref row pointer           rdx = ref_stride (sign-extended)
    ;
    ; NOTE(review): MMX registers are used and no emms is issued here;
    ; presumably the caller resets the MMX/x87 state - confirm.
    global sym(vp8_sad4x4_wmt) PRIVATE
    sym(vp8_sad4x4_wmt):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 4
        push        rsi
        push        rdi
        ; end prolog

            mov             rsi,        arg(0) ;src_ptr
            movsxd          rax,        dword ptr arg(1) ;src_stride
            mov             rdi,        arg(2) ;ref_ptr
            movsxd          rdx,        dword ptr arg(3) ;ref_stride

            ; ---- rows 0 and 1 ----
            movd            mm0,        DWORD PTR [rsi]
            movd            mm2,        DWORD PTR [rsi+rax]
            punpcklbw       mm0,        mm2

            movd            mm1,        DWORD PTR [rdi]
            movd            mm3,        DWORD PTR [rdi+rdx]
            punpcklbw       mm1,        mm3

            psadbw          mm0,        mm1         ; mm0 = SAD of rows 0-1

            ; move both pointers to row 2
            lea             rsi,        [rsi+rax*2]
            lea             rdi,        [rdi+rdx*2]

            ; ---- rows 2 and 3 ----
            movd            mm4,        DWORD PTR [rsi]
            movd            mm6,        DWORD PTR [rsi+rax]
            punpcklbw       mm4,        mm6

            movd            mm5,        DWORD PTR [rdi]
            movd            mm7,        DWORD PTR [rdi+rdx]
            punpcklbw       mm5,        mm7

            psadbw          mm4,        mm5         ; mm4 = SAD of rows 2-3

            paddw           mm0,        mm4         ; total over all 4 rows
            movq            rax,        mm0         ; return value

        ; begin epilog
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
    257 
    258 
    ;unsigned int vp8_sad16x8_wmt(
    ;    unsigned char *src_ptr,
    ;    int  src_stride,
    ;    unsigned char *ref_ptr,
    ;    int  ref_stride,
    ;    int  max_sad)
    ;
    ; Returns, in rax, the sum of absolute byte differences between a
    ; 16-byte-wide, 8-row block at src_ptr and one at ref_ptr.
    ;
    ; The fifth argument is read through arg(4): before every pair of rows
    ; the running sum is compared (unsigned, via ja) against max_sad, and
    ; once it is larger the partial sum collected so far is returned.
    ;
    ; Register roles:
    ;   rsi = current src row           rbx = src_stride (sign-extended)
    ;   rdi = current ref row           rdx = ref_stride (sign-extended)
    ;   rcx = src_ptr + 8*src_stride, the value rsi holds after 8 rows
    ;   mm7 = running sum               rax = copy of mm7 for the compare
    ;
    ; NOTE(review): MMX registers are used and no emms is issued here;
    ; presumably the caller resets the MMX/x87 state - confirm.
    global sym(vp8_sad16x8_wmt) PRIVATE
    sym(vp8_sad16x8_wmt):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        push        rbx
        push        rsi
        push        rdi
        ; end prolog

            mov             rsi,        arg(0) ;src_ptr
            movsxd          rbx,        dword ptr arg(1) ;src_stride
            mov             rdi,        arg(2) ;ref_ptr
            movsxd          rdx,        dword ptr arg(3) ;ref_stride

            lea             rcx,        [rsi+rbx*8] ; rsi after 8 rows
            pxor            mm7,        mm7         ; running sum = 0

    .sad16x8_row_pair:

            ; give up as soon as the running sum exceeds max_sad;
            ; rax already holds the value that will be returned
            movq            rax,        mm7
            cmp             eax,        arg(4)
            ja              .sad16x8_return

            ; ---- first row: left 8 bytes in mm0/mm1, right 8 in mm2/mm3 ----
            movq            mm0,        QWORD PTR [rsi]
            movq            mm1,        QWORD PTR [rdi]
            psadbw          mm0,        mm1

            movq            mm2,        QWORD PTR [rsi+8]
            movq            mm3,        QWORD PTR [rdi+8]
            psadbw          mm2,        mm3

            paddw           mm0,        mm2         ; mm0 = SAD of first row

            ; ---- second row: left 8 bytes in mm4/mm5, right 8 in mm1/mm3 ----
            movq            mm4,        QWORD PTR [rsi+rbx]
            movq            mm5,        QWORD PTR [rdi+rdx]
            psadbw          mm4,        mm5

            movq            mm1,        QWORD PTR [rsi+rbx+8]
            movq            mm3,        QWORD PTR [rdi+rdx+8]
            psadbw          mm1,        mm3

            paddw           mm4,        mm1         ; mm4 = SAD of second row

            ; at most 16*255 = 4080 per row, 32640 for 8 rows: no 16-bit wrap
            paddw           mm7,        mm0
            paddw           mm7,        mm4

            ; step both pointers down two rows
            lea             rsi,        [rsi+rbx*2]
            lea             rdi,        [rdi+rdx*2]

            cmp             rsi,        rcx
            jne             .sad16x8_row_pair

            movq            rax,        mm7         ; full 8-row result

    .sad16x8_return:

        ; begin epilog
        pop         rdi
        pop         rsi
        pop         rbx
        UNSHADOW_ARGS
        pop         rbp
        ret
    331 
    ;void vp8_copy32xn_sse2(
    ;    unsigned char *src_ptr,
    ;    int  src_stride,
    ;    unsigned char *dst_ptr,
    ;    int  dst_stride,
    ;    int height);
    ;
    ; Copies `height` rows of 32 bytes each from src_ptr to dst_ptr.
    ; Rows are copied four at a time while at least four remain, then one
    ; at a time. A height of zero or less copies nothing.
    ;
    ; Source rows are read with movdqu (any alignment). Destination rows
    ; are written with movdqa, which faults on an address that is not
    ; 16-byte aligned, so dst_ptr and dst_stride must keep every
    ; destination row 16-byte aligned.
    ;
    ; Register roles:
    ;   rsi = current src row           rax = src_stride (sign-extended)
    ;   rdi = current dst row           rdx = dst_stride (sign-extended)
    ;   rcx = rows still to copy
    global sym(vp8_copy32xn_sse2) PRIVATE
    sym(vp8_copy32xn_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        SAVE_XMM 7
        push        rsi
        push        rdi
        ; end prolog

            mov             rsi,        arg(0) ;src_ptr
            mov             rdi,        arg(2) ;dst_ptr

            movsxd          rax,        dword ptr arg(1) ;src_stride
            movsxd          rdx,        dword ptr arg(3) ;dst_stride
            movsxd          rcx,        dword ptr arg(4) ;height

            ; The unrolled loop always moves four rows, so it must only be
            ; entered when at least four rows were requested; otherwise it
            ; would read and write past the end of both buffers.
            cmp             rcx,        4
            jl              .block_copy_sse2_remainder

    .block_copy_sse2_loopx4:
            ; load four source rows (2 x 16 bytes each) into xmm0-xmm7
            movdqu          xmm0,       XMMWORD PTR [rsi]
            movdqu          xmm1,       XMMWORD PTR [rsi + 16]
            movdqu          xmm2,       XMMWORD PTR [rsi + rax]
            movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]

            lea             rsi,        [rsi+rax*2]

            movdqu          xmm4,       XMMWORD PTR [rsi]
            movdqu          xmm5,       XMMWORD PTR [rsi + 16]
            movdqu          xmm6,       XMMWORD PTR [rsi + rax]
            movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]

            lea             rsi,        [rsi+rax*2]

            ; store them to four destination rows
            movdqa          XMMWORD PTR [rdi], xmm0
            movdqa          XMMWORD PTR [rdi + 16], xmm1
            movdqa          XMMWORD PTR [rdi + rdx], xmm2
            movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3

            lea             rdi,        [rdi+rdx*2]

            movdqa          XMMWORD PTR [rdi], xmm4
            movdqa          XMMWORD PTR [rdi + 16], xmm5
            movdqa          XMMWORD PTR [rdi + rdx], xmm6
            movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7

            lea             rdi,        [rdi+rdx*2]

            sub             rcx,        4
            cmp             rcx,        4
            jge             .block_copy_sse2_loopx4

    .block_copy_sse2_remainder:
            ; rcx is now below 4. Leave when nothing is left; the signed
            ; "<= 0" test also keeps a negative height out of the countdown
            ; loop below, which only stops when rcx reaches exactly zero.
            test            rcx,        rcx
            jle             .copy_is_done

    .block_copy_sse2_loop:
            ; copy the remaining 1-3 rows one at a time
            movdqu          xmm0,       XMMWORD PTR [rsi]
            movdqu          xmm1,       XMMWORD PTR [rsi + 16]
            lea             rsi,        [rsi+rax]

            movdqa          XMMWORD PTR [rdi], xmm0
            movdqa          XMMWORD PTR [rdi + 16], xmm1
            lea             rdi,        [rdi+rdx]

            sub             rcx,        1
            jne             .block_copy_sse2_loop

    .copy_is_done:
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret
    411