Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 %idefine QWORD
     15 
     ;-----------------------------------------------------------------------
     ; PROCESS_16X2X3 is_first
     ;   SAD of two 16-byte-wide rows against three reference candidates at
     ;   byte offsets 0, +1 and +2 from rdi.
     ;   is_first != 0 : xmm5/xmm6/xmm7 are initialised from the first row.
     ;   is_first == 0 : the first row is added to xmm5/xmm6/xmm7.
     ;   The second row is always added to xmm5/xmm6/xmm7.
     ; In      : rsi = source row, rax = source stride,
     ;           rdi = reference row, rdx = reference stride
     ; Out     : rsi += 2*rax, rdi += 2*rdx
     ; Clobbers: xmm0-xmm3
     ; Source rows are read with movdqa (aligned load), reference rows with
     ; lddqu (unaligned load).
     ;-----------------------------------------------------------------------
     %macro PROCESS_16X2X3 1
             movdqa      xmm0, [rsi]             ; first source row
     %if %1
             ; first row pair: the SADs become the accumulators directly
             lddqu       xmm5, [rdi]
             lddqu       xmm6, [rdi+1]
             lddqu       xmm7, [rdi+2]

             psadbw      xmm5, xmm0
             psadbw      xmm6, xmm0
             psadbw      xmm7, xmm0
     %else
             lddqu       xmm1, [rdi]
             lddqu       xmm2, [rdi+1]
             lddqu       xmm3, [rdi+2]

             psadbw      xmm1, xmm0
             psadbw      xmm2, xmm0
             psadbw      xmm3, xmm0

             paddw       xmm5, xmm1
             paddw       xmm6, xmm2
             paddw       xmm7, xmm3
     %endif
             movdqa      xmm0, [rsi+rax]         ; second source row
             lddqu       xmm1, [rdi+rdx]
             lddqu       xmm2, [rdi+rdx+1]
             lddqu       xmm3, [rdi+rdx+2]

             ; step both pointers down two rows (lea leaves the flags alone)
             lea         rsi, [rsi+rax*2]
             lea         rdi, [rdi+rdx*2]

             psadbw      xmm1, xmm0
             psadbw      xmm2, xmm0
             psadbw      xmm3, xmm0

             paddw       xmm5, xmm1
             paddw       xmm6, xmm2
             paddw       xmm7, xmm3
     %endmacro
     56 
     ;-----------------------------------------------------------------------
     ; PROCESS_8X2X3 is_first
     ;   MMX counterpart of PROCESS_16X2X3: SAD of two 8-byte-wide rows
     ;   against three reference candidates at byte offsets 0, +1 and +2
     ;   from rdi.
     ;   is_first != 0 : mm5/mm6/mm7 are initialised from the first row.
     ;   is_first == 0 : the first row is added to mm5/mm6/mm7.
     ; In      : rsi = source row, rax = source stride,
     ;           rdi = reference row, rdx = reference stride
     ; Out     : rsi += 2*rax, rdi += 2*rdx
     ; Clobbers: mm0-mm3
     ;-----------------------------------------------------------------------
     %macro PROCESS_8X2X3 1
             movq        mm0, [rsi]              ; first source row
     %if %1
             movq        mm5, [rdi]
             movq        mm6, [rdi+1]
             movq        mm7, [rdi+2]

             psadbw      mm5, mm0
             psadbw      mm6, mm0
             psadbw      mm7, mm0
     %else
             movq        mm1, [rdi]
             movq        mm2, [rdi+1]
             movq        mm3, [rdi+2]

             psadbw      mm1, mm0
             psadbw      mm2, mm0
             psadbw      mm3, mm0

             paddw       mm5, mm1
             paddw       mm6, mm2
             paddw       mm7, mm3
     %endif
             movq        mm0, [rsi+rax]          ; second source row
             movq        mm1, [rdi+rdx]
             movq        mm2, [rdi+rdx+1]
             movq        mm3, [rdi+rdx+2]

             lea         rsi, [rsi+rax*2]
             lea         rdi, [rdi+rdx*2]

             psadbw      mm1, mm0
             psadbw      mm2, mm0
             psadbw      mm3, mm0

             paddw       mm5, mm1
             paddw       mm6, mm2
             paddw       mm7, mm3
     %endmacro
     97 
     ;-----------------------------------------------------------------------
     ; LOAD_X4_ADDRESSES base, dst0, dst1, dst2, dst3
     ;   Loads four consecutive register-sized values from [base] into
     ;   dst0..dst3. The loads are issued in order, so dst3 may be the same
     ;   register as base (the callers in this file pass rdi for both);
     ;   dst0..dst2 must not alias base.
     ;-----------------------------------------------------------------------
     %macro LOAD_X4_ADDRESSES 5
             mov         %2, [%1+REG_SZ_BYTES*0]
             mov         %3, [%1+REG_SZ_BYTES*1]
             mov         %4, [%1+REG_SZ_BYTES*2]
             mov         %5, [%1+REG_SZ_BYTES*3]     ; last: may overwrite base
     %endmacro
    105 
    ;-----------------------------------------------------------------------
    ; PROCESS_16X2X4 is_first
    ;   SAD of two 16-byte-wide source rows against four independent
    ;   reference blocks addressed by rcx, rdx, rbx and rdi.
    ;   is_first != 0 : xmm4..xmm7 are initialised from the first row.
    ;   is_first == 0 : the first row is added to xmm4..xmm7.
    ;   The second row is always added to xmm4..xmm7.
    ; In      : rsi = source row, rax = source stride,
    ;           rcx/rdx/rbx/rdi = reference rows, rbp = reference stride
    ; Out     : rsi += 2*rax; rcx, rdx, rbx, rdi += 2*rbp
    ; Clobbers: xmm0-xmm3
    ; Only three scratch registers are used, so the fourth candidate (rdi)
    ; reuses xmm1 once the first candidate has been accumulated.
    ;-----------------------------------------------------------------------
    %macro PROCESS_16X2X4 1
            movdqa      xmm0, [rsi]             ; first source row (aligned)
    %if %1
            lddqu       xmm4, [rcx]
            lddqu       xmm5, [rdx]
            lddqu       xmm6, [rbx]
            lddqu       xmm7, [rdi]

            psadbw      xmm4, xmm0
            psadbw      xmm5, xmm0
            psadbw      xmm6, xmm0
            psadbw      xmm7, xmm0
    %else
            lddqu       xmm1, [rcx]
            lddqu       xmm2, [rdx]
            lddqu       xmm3, [rbx]

            psadbw      xmm1, xmm0
            psadbw      xmm2, xmm0
            psadbw      xmm3, xmm0

            paddw       xmm4, xmm1
            paddw       xmm5, xmm2
            paddw       xmm6, xmm3

            lddqu       xmm1, [rdi]             ; fourth candidate, xmm1 is free again
            psadbw      xmm1, xmm0
            paddw       xmm7, xmm1
    %endif
            movdqa      xmm0, [rsi+rax]         ; second source row
            lddqu       xmm1, [rcx+rbp]
            lddqu       xmm2, [rdx+rbp]
            lddqu       xmm3, [rbx+rbp]

            psadbw      xmm1, xmm0
            psadbw      xmm2, xmm0
            psadbw      xmm3, xmm0

            paddw       xmm4, xmm1
            paddw       xmm5, xmm2
            paddw       xmm6, xmm3

            lddqu       xmm1, [rdi+rbp]         ; fourth candidate
            psadbw      xmm1, xmm0
            paddw       xmm7, xmm1

            ; advance every pointer by two rows
            lea         rsi, [rsi+rax*2]
            lea         rcx, [rcx+rbp*2]
            lea         rdx, [rdx+rbp*2]
            lea         rbx, [rbx+rbp*2]
            lea         rdi, [rdi+rbp*2]
    %endmacro
    162 
    ;-----------------------------------------------------------------------
    ; PROCESS_8X2X4 is_first
    ;   MMX counterpart of PROCESS_16X2X4: SAD of two 8-byte-wide source
    ;   rows against four reference blocks addressed by rcx, rdx, rbx, rdi.
    ;   is_first != 0 : mm4..mm7 are initialised from the first row.
    ;   is_first == 0 : the first row is added to mm4..mm7.
    ; In      : rsi = source row, rax = source stride,
    ;           rcx/rdx/rbx/rdi = reference rows, rbp = reference stride
    ; Out     : rsi += 2*rax; rcx, rdx, rbx, rdi += 2*rbp
    ; Clobbers: mm0-mm3
    ;-----------------------------------------------------------------------
    %macro PROCESS_8X2X4 1
            movq        mm0, [rsi]              ; first source row
    %if %1
            movq        mm4, [rcx]
            movq        mm5, [rdx]
            movq        mm6, [rbx]
            movq        mm7, [rdi]

            psadbw      mm4, mm0
            psadbw      mm5, mm0
            psadbw      mm6, mm0
            psadbw      mm7, mm0
    %else
            movq        mm1, [rcx]
            movq        mm2, [rdx]
            movq        mm3, [rbx]

            psadbw      mm1, mm0
            psadbw      mm2, mm0
            psadbw      mm3, mm0

            paddw       mm4, mm1
            paddw       mm5, mm2
            paddw       mm6, mm3

            movq        mm1, [rdi]              ; fourth candidate, mm1 is free again
            psadbw      mm1, mm0
            paddw       mm7, mm1
    %endif
            movq        mm0, [rsi+rax]          ; second source row
            movq        mm1, [rcx+rbp]
            movq        mm2, [rdx+rbp]
            movq        mm3, [rbx+rbp]

            psadbw      mm1, mm0
            psadbw      mm2, mm0
            psadbw      mm3, mm0

            paddw       mm4, mm1
            paddw       mm5, mm2
            paddw       mm6, mm3

            movq        mm1, [rdi+rbp]          ; fourth candidate
            psadbw      mm1, mm0
            paddw       mm7, mm1

            ; advance every pointer by two rows
            lea         rsi, [rsi+rax*2]
            lea         rcx, [rcx+rbp*2]
            lea         rdx, [rdx+rbp*2]
            lea         rbx, [rbx+rbp*2]
            lea         rdi, [rdi+rbp*2]
    %endmacro
    219 
    ;-----------------------------------------------------------------------
    ; void vp8_sad16x16x3_sse3(
    ;     unsigned char *src_ptr,
    ;     int            src_stride,
    ;     unsigned char *ref_ptr,
    ;     int            ref_stride,
    ;     int           *results)
    ;
    ; Sum of absolute differences of a 16x16 source block against the three
    ; reference blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2.
    ; The three sums are stored to results[0], results[1] and results[2].
    ; Source rows are read with movdqa, so src_ptr rows must be 16-byte
    ; aligned.
    ;-----------------------------------------------------------------------
    global sym(vp8_sad16x16x3_sse3)
    sym(vp8_sad16x16x3_sse3):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        push        rsi
        push        rdi
    %ifidn __OUTPUT_FORMAT__,x64
        ; Win64 treats xmm6/xmm7 as callee-saved and PROCESS_16X2X3
        ; accumulates into them, so keep the caller's values on the stack.
        ; arg() is rbp-relative and is unaffected by moving rsp.
        sub         rsp, 32
        movdqu      [rsp], xmm6
        movdqu      [rsp+16], xmm7
    %endif
        ; end prolog

            mov         rsi, arg(0)             ; src_ptr
            mov         rdi, arg(2)             ; ref_ptr

            movsxd      rax, dword ptr arg(1)   ; src_stride
            movsxd      rdx, dword ptr arg(3)   ; ref_stride

            ; 16 rows, two per macro expansion
            PROCESS_16X2X3 1
    %rep 7
            PROCESS_16X2X3 0
    %endrep

            mov         rdi, arg(4)             ; results

            ; psadbw leaves one partial sum per 64-bit half: add the halves
            ; and store the low dword.
            movq        xmm0, xmm5
            psrldq      xmm5, 8
            paddw       xmm0, xmm5
            movd        [rdi], xmm0             ; results[0]

            movq        xmm0, xmm6
            psrldq      xmm6, 8
            paddw       xmm0, xmm6
            movd        [rdi+4], xmm0           ; results[1]

            movq        xmm0, xmm7
            psrldq      xmm7, 8
            paddw       xmm0, xmm7
            movd        [rdi+8], xmm0           ; results[2]

        ; begin epilog
    %ifidn __OUTPUT_FORMAT__,x64
        movdqu      xmm6, [rsp]
        movdqu      xmm7, [rsp+16]
        add         rsp, 32
    %endif
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
    276 
    ;-----------------------------------------------------------------------
    ; void vp8_sad16x8x3_sse3(
    ;     unsigned char *src_ptr,
    ;     int            src_stride,
    ;     unsigned char *ref_ptr,
    ;     int            ref_stride,
    ;     int           *results)
    ;
    ; Sum of absolute differences of a 16-wide, 8-row source block against
    ; the three reference blocks starting at ref_ptr, ref_ptr+1, ref_ptr+2.
    ; The three sums are stored to results[0], results[1] and results[2].
    ; Source rows are read with movdqa, so src_ptr rows must be 16-byte
    ; aligned.
    ;-----------------------------------------------------------------------
    global sym(vp8_sad16x8x3_sse3)
    sym(vp8_sad16x8x3_sse3):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        push        rsi
        push        rdi
    %ifidn __OUTPUT_FORMAT__,x64
        ; Win64 treats xmm6/xmm7 as callee-saved and PROCESS_16X2X3
        ; accumulates into them, so keep the caller's values on the stack.
        ; arg() is rbp-relative and is unaffected by moving rsp.
        sub         rsp, 32
        movdqu      [rsp], xmm6
        movdqu      [rsp+16], xmm7
    %endif
        ; end prolog

            mov         rsi, arg(0)             ; src_ptr
            mov         rdi, arg(2)             ; ref_ptr

            movsxd      rax, dword ptr arg(1)   ; src_stride
            movsxd      rdx, dword ptr arg(3)   ; ref_stride

            ; 8 rows, two per macro expansion
            PROCESS_16X2X3 1
    %rep 3
            PROCESS_16X2X3 0
    %endrep

            mov         rdi, arg(4)             ; results

            ; psadbw leaves one partial sum per 64-bit half: add the halves
            ; and store the low dword.
            movq        xmm0, xmm5
            psrldq      xmm5, 8
            paddw       xmm0, xmm5
            movd        [rdi], xmm0             ; results[0]

            movq        xmm0, xmm6
            psrldq      xmm6, 8
            paddw       xmm0, xmm6
            movd        [rdi+4], xmm0           ; results[1]

            movq        xmm0, xmm7
            psrldq      xmm7, 8
            paddw       xmm0, xmm7
            movd        [rdi+8], xmm0           ; results[2]

        ; begin epilog
    %ifidn __OUTPUT_FORMAT__,x64
        movdqu      xmm6, [rsp]
        movdqu      xmm7, [rsp+16]
        add         rsp, 32
    %endif
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
    329 
    ;-----------------------------------------------------------------------
    ; void vp8_sad8x16x3_sse3(
    ;     unsigned char *src_ptr,
    ;     int            src_stride,
    ;     unsigned char *ref_ptr,
    ;     int            ref_stride,
    ;     int           *results)
    ;
    ; Sum of absolute differences of an 8-wide, 16-row source block against
    ; the three reference blocks starting at ref_ptr, ref_ptr+1, ref_ptr+2.
    ; The three sums are stored to results[0], results[1] and results[2].
    ; NOTE(review): uses MMX registers and does not execute emms;
    ; presumably the caller clears the MMX state - confirm.
    ;-----------------------------------------------------------------------
    global sym(vp8_sad8x16x3_sse3)
    sym(vp8_sad8x16x3_sse3):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        push        rsi
        push        rdi
        ; end prolog

            mov         rsi, arg(0)             ; src_ptr
            mov         rdi, arg(2)             ; ref_ptr

            movsxd      rax, dword ptr arg(1)   ; src_stride
            movsxd      rdx, dword ptr arg(3)   ; ref_stride

            ; 16 rows, two per macro expansion
            PROCESS_8X2X3 1
    %rep 7
            PROCESS_8X2X3 0
    %endrep

            mov         rdi, arg(4)             ; results

            movd        [rdi], mm5              ; results[0]
            movd        [rdi+4], mm6            ; results[1]
            movd        [rdi+8], mm7            ; results[2]

        ; begin epilog
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
    372 
    ;-----------------------------------------------------------------------
    ; void vp8_sad8x8x3_sse3(
    ;     unsigned char *src_ptr,
    ;     int            src_stride,
    ;     unsigned char *ref_ptr,
    ;     int            ref_stride,
    ;     int           *results)
    ;
    ; Sum of absolute differences of an 8x8 source block against the three
    ; reference blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2.
    ; The three sums are stored to results[0], results[1] and results[2].
    ; NOTE(review): uses MMX registers and does not execute emms;
    ; presumably the caller clears the MMX state - confirm.
    ;-----------------------------------------------------------------------
    global sym(vp8_sad8x8x3_sse3)
    sym(vp8_sad8x8x3_sse3):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        push        rsi
        push        rdi
        ; end prolog

            mov         rsi, arg(0)             ; src_ptr
            mov         rdi, arg(2)             ; ref_ptr

            movsxd      rax, dword ptr arg(1)   ; src_stride
            movsxd      rdx, dword ptr arg(3)   ; ref_stride

            ; 8 rows, two per macro expansion
            PROCESS_8X2X3 1
    %rep 3
            PROCESS_8X2X3 0
    %endrep

            mov         rdi, arg(4)             ; results

            movd        [rdi], mm5              ; results[0]
            movd        [rdi+4], mm6            ; results[1]
            movd        [rdi+8], mm7            ; results[2]

        ; begin epilog
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
    411 
    ;-----------------------------------------------------------------------
    ; void vp8_sad4x4x3_sse3(
    ;     unsigned char *src_ptr,
    ;     int            src_stride,
    ;     unsigned char *ref_ptr,
    ;     int            ref_stride,
    ;     int           *results)
    ;
    ; Sum of absolute differences of a 4x4 source block against the three
    ; reference blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2.
    ; The three sums are stored to results[0], results[1] and results[2].
    ;
    ; Two 4-byte rows are interleaved into one 8-byte MMX register with
    ; punpcklbw (source and reference alike), so a single psadbw covers two
    ; rows at a time.
    ; NOTE(review): uses MMX registers and does not execute emms;
    ; presumably the caller clears the MMX state - confirm.
    ;-----------------------------------------------------------------------
    global sym(vp8_sad4x4x3_sse3)
    sym(vp8_sad4x4x3_sse3):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        push        rsi
        push        rdi
        ; end prolog

            mov         rsi, arg(0)             ; src_ptr
            mov         rdi, arg(2)             ; ref_ptr

            movsxd      rax, dword ptr arg(1)   ; src_stride
            movsxd      rdx, dword ptr arg(3)   ; ref_stride

            ; ---- rows 0 and 1 ----
            movd        mm0, [rsi]              ; src row 0 (4 bytes)
            movd        mm1, [rdi]              ; ref+0 row 0

            movd        mm2, [rsi+rax]          ; src row 1
            movd        mm3, [rdi+rdx]          ; ref+0 row 1

            punpcklbw   mm0, mm2                ; mm0 = src rows 0/1 interleaved
            punpcklbw   mm1, mm3

            movd        mm4, [rdi+1]            ; ref+1 row 0
            movd        mm5, [rdi+2]            ; ref+2 row 0

            movd        mm2, [rdi+rdx+1]        ; ref+1 row 1
            movd        mm3, [rdi+rdx+2]        ; ref+2 row 1

            psadbw      mm1, mm0                ; mm1 = SAD(ref+0), rows 0-1

            punpcklbw   mm4, mm2
            punpcklbw   mm5, mm3

            psadbw      mm4, mm0                ; mm4 = SAD(ref+1), rows 0-1
            psadbw      mm5, mm0                ; mm5 = SAD(ref+2), rows 0-1

            ; ---- rows 2 and 3 ----
            lea         rsi, [rsi+rax*2]
            lea         rdi, [rdi+rdx*2]

            movd        mm0, [rsi]              ; src row 2
            movd        mm2, [rdi]              ; ref+0 row 2

            movd        mm3, [rsi+rax]          ; src row 3
            movd        mm6, [rdi+rdx]          ; ref+0 row 3

            punpcklbw   mm0, mm3                ; mm0 = src rows 2/3 interleaved
            punpcklbw   mm2, mm6

            movd        mm3, [rdi+1]            ; ref+1 row 2
            movd        mm7, [rdi+2]            ; ref+2 row 2

            psadbw      mm2, mm0

            paddw       mm1, mm2                ; mm1 = total SAD(ref+0)

            movd        mm2, [rdi+rdx+1]        ; ref+1 row 3
            movd        mm6, [rdi+rdx+2]        ; ref+2 row 3

            punpcklbw   mm3, mm2
            punpcklbw   mm7, mm6

            psadbw      mm3, mm0
            psadbw      mm7, mm0

            paddw       mm3, mm4                ; mm3 = total SAD(ref+1)
            paddw       mm7, mm5                ; mm7 = total SAD(ref+2)

            mov         rdi, arg(4)             ; results
            movd        [rdi], mm1              ; results[0]

            movd        [rdi+4], mm3            ; results[1]
            movd        [rdi+8], mm7            ; results[2]

        ; begin epilog
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
    502 
    ;-----------------------------------------------------------------------
    ; unsigned int vp8_sad16x16_sse3(
    ;     unsigned char *src_ptr,
    ;     int            src_stride,
    ;     unsigned char *ref_ptr,
    ;     int            ref_stride,
    ;     int            max_err)
    ;
    ; Sum of absolute differences of a 16x16 source block against one
    ; reference block, with early termination: before each pair of rows the
    ; running sum is compared with max_err and, once it is greater, the
    ; partial sum is returned immediately.
    ; Returns the (possibly partial) SAD in rax.
    ; NOTE(review): uses MMX registers and does not execute emms;
    ; presumably the caller clears the MMX state - confirm.
    ;-----------------------------------------------------------------------
    ;%define lddqu movdqu
    global sym(vp8_sad16x16_sse3)
    sym(vp8_sad16x16_sse3):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        push        rbx
        push        rsi
        push        rdi
        ; end prolog

            mov         rsi, arg(0)             ; src_ptr
            mov         rdi, arg(2)             ; ref_ptr

            movsxd      rbx, dword ptr arg(1)   ; src_stride
            movsxd      rdx, dword ptr arg(3)   ; ref_stride

            ; rcx = src_ptr + 16*src_stride, the loop's end-of-block sentinel
            lea         rcx, [rsi+rbx*8]
            lea         rcx, [rcx+rbx*8]

            pxor        mm7, mm7                ; running SAD = 0

    vp8_sad16x16_sse3_loop:

            movd        rax, mm7                ; rax = running SAD
            ; max_err is a 32-bit int: compare only the low dword. The upper
            ; half of the shadowed argument slot is not defined for an int
            ; parameter, so a full-width compare could give a wrong answer.
            cmp         eax, dword ptr arg(4)
            jg          vp8_sad16x16_early_exit

            movq        mm0, [rsi]              ; src row 0, bytes 0-7
            movq        mm2, [rsi+8]            ; src row 0, bytes 8-15

            movq        mm1, [rdi]              ; ref row 0, bytes 0-7
            movq        mm3, [rdi+8]            ; ref row 0, bytes 8-15

            movq        mm4, [rsi+rbx]          ; src row 1, bytes 0-7
            movq        mm5, [rdi+rdx]          ; ref row 1, bytes 0-7

            psadbw      mm0, mm1
            psadbw      mm2, mm3

            movq        mm1, [rsi+rbx+8]        ; src row 1, bytes 8-15
            movq        mm3, [rdi+rdx+8]        ; ref row 1, bytes 8-15

            psadbw      mm4, mm5
            psadbw      mm1, mm3

            lea         rsi, [rsi+rbx*2]
            lea         rdi, [rdi+rdx*2]

            paddw       mm0, mm2                ; row 0 total
            paddw       mm4, mm1                ; row 1 total

            paddw       mm7, mm0
            paddw       mm7, mm4

            cmp         rsi, rcx
            jne         vp8_sad16x16_sse3_loop

            movd        rax, mm7                ; full SAD

    vp8_sad16x16_early_exit:

        ; begin epilog
        pop         rdi
        pop         rsi
        pop         rbx
        UNSHADOW_ARGS
        pop         rbp
        ret
    578 
    ;-----------------------------------------------------------------------
    ; void vp8_sad16x16x4d_sse3(
    ;     unsigned char *src_ptr,
    ;     int            src_stride,
    ;     unsigned char *ref_ptr_base,
    ;     int            ref_stride,
    ;     int           *results)
    ;
    ; Sum of absolute differences of a 16x16 source block against four
    ; reference blocks. ref_ptr_base addresses four consecutive
    ; register-sized pointers (read by LOAD_X4_ADDRESSES); the four sums are
    ; stored to results[0..3] in the same order.
    ; Source rows are read with movdqa, so src_ptr rows must be 16-byte
    ; aligned.
    ;-----------------------------------------------------------------------
    global sym(vp8_sad16x16x4d_sse3)
    sym(vp8_sad16x16x4d_sse3):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        push        rsi
        push        rdi
        push        rbx
    %ifidn __OUTPUT_FORMAT__,x64
        ; Win64 treats xmm6/xmm7 as callee-saved and PROCESS_16X2X4
        ; accumulates into them, so keep the caller's values on the stack.
        sub         rsp, 32
        movdqu      [rsp], xmm6
        movdqu      [rsp+16], xmm7
    %endif
        ; end prolog

            push        rbp                     ; rbp doubles as ref_stride below
            mov         rdi, arg(2)             ; ref_ptr_base

            LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

            mov         rsi, arg(0)             ; src_ptr

            movsxd      rbx, dword ptr arg(1)   ; src_stride
            movsxd      rbp, dword ptr arg(3)   ; ref_stride; arg() is unusable until rbp is popped

            xchg        rbx, rax                ; rax = src_stride, rbx = third reference pointer

            ; 16 rows, two per macro expansion
            PROCESS_16X2X4 1
    %rep 7
            PROCESS_16X2X4 0
    %endrep

            pop         rbp                     ; frame pointer back, arg() valid again
            mov         rdi, arg(4)             ; results

            ; psadbw leaves one partial sum per 64-bit half: add the halves
            ; and store the low dword.
            movq        xmm0, xmm4
            psrldq      xmm4, 8
            paddw       xmm0, xmm4
            movd        [rdi], xmm0             ; results[0]

            movq        xmm0, xmm5
            psrldq      xmm5, 8
            paddw       xmm0, xmm5
            movd        [rdi+4], xmm0           ; results[1]

            movq        xmm0, xmm6
            psrldq      xmm6, 8
            paddw       xmm0, xmm6
            movd        [rdi+8], xmm0           ; results[2]

            movq        xmm0, xmm7
            psrldq      xmm7, 8
            paddw       xmm0, xmm7
            movd        [rdi+12], xmm0          ; results[3]

        ; begin epilog
    %ifidn __OUTPUT_FORMAT__,x64
        movdqu      xmm6, [rsp]
        movdqu      xmm7, [rsp+16]
        add         rsp, 32
    %endif
        pop         rbx
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
    650 
    ;-----------------------------------------------------------------------
    ; void vp8_sad16x8x4d_sse3(
    ;     unsigned char *src_ptr,
    ;     int            src_stride,
    ;     unsigned char *ref_ptr_base,
    ;     int            ref_stride,
    ;     int           *results)
    ;
    ; Sum of absolute differences of a 16-wide, 8-row source block against
    ; four reference blocks. ref_ptr_base addresses four consecutive
    ; register-sized pointers (read by LOAD_X4_ADDRESSES); the four sums are
    ; stored to results[0..3] in the same order.
    ; Source rows are read with movdqa, so src_ptr rows must be 16-byte
    ; aligned.
    ;-----------------------------------------------------------------------
    global sym(vp8_sad16x8x4d_sse3)
    sym(vp8_sad16x8x4d_sse3):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        push        rsi
        push        rdi
        push        rbx
    %ifidn __OUTPUT_FORMAT__,x64
        ; Win64 treats xmm6/xmm7 as callee-saved and PROCESS_16X2X4
        ; accumulates into them, so keep the caller's values on the stack.
        sub         rsp, 32
        movdqu      [rsp], xmm6
        movdqu      [rsp+16], xmm7
    %endif
        ; end prolog

            push        rbp                     ; rbp doubles as ref_stride below
            mov         rdi, arg(2)             ; ref_ptr_base

            LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

            mov         rsi, arg(0)             ; src_ptr

            movsxd      rbx, dword ptr arg(1)   ; src_stride
            movsxd      rbp, dword ptr arg(3)   ; ref_stride; arg() is unusable until rbp is popped

            xchg        rbx, rax                ; rax = src_stride, rbx = third reference pointer

            ; 8 rows, two per macro expansion
            PROCESS_16X2X4 1
    %rep 3
            PROCESS_16X2X4 0
    %endrep

            pop         rbp                     ; frame pointer back, arg() valid again
            mov         rdi, arg(4)             ; results

            ; psadbw leaves one partial sum per 64-bit half: add the halves
            ; and store the low dword.
            movq        xmm0, xmm4
            psrldq      xmm4, 8
            paddw       xmm0, xmm4
            movd        [rdi], xmm0             ; results[0]

            movq        xmm0, xmm5
            psrldq      xmm5, 8
            paddw       xmm0, xmm5
            movd        [rdi+4], xmm0           ; results[1]

            movq        xmm0, xmm6
            psrldq      xmm6, 8
            paddw       xmm0, xmm6
            movd        [rdi+8], xmm0           ; results[2]

            movq        xmm0, xmm7
            psrldq      xmm7, 8
            paddw       xmm0, xmm7
            movd        [rdi+12], xmm0          ; results[3]

        ; begin epilog
    %ifidn __OUTPUT_FORMAT__,x64
        movdqu      xmm6, [rsp]
        movdqu      xmm7, [rsp+16]
        add         rsp, 32
    %endif
        pop         rbx
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
    718 
    ;-----------------------------------------------------------------------
    ; void vp8_sad8x16x4d_sse3(
    ;     unsigned char *src_ptr,
    ;     int            src_stride,
    ;     unsigned char *ref_ptr,
    ;     int            ref_stride,
    ;     int           *results)
    ;
    ; Sum of absolute differences of an 8-wide, 16-row source block against
    ; four reference blocks. The third argument addresses four consecutive
    ; register-sized pointers (read by LOAD_X4_ADDRESSES); the four sums are
    ; stored to results[0..3] in the same order.
    ; NOTE(review): uses MMX registers and does not execute emms;
    ; presumably the caller clears the MMX state - confirm.
    ;-----------------------------------------------------------------------
    global sym(vp8_sad8x16x4d_sse3)
    sym(vp8_sad8x16x4d_sse3):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        push        rsi
        push        rdi
        push        rbx
        ; end prolog

            push        rbp                     ; rbp doubles as ref_stride below
            mov         rdi, arg(2)             ; base of the four reference pointers

            LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

            mov         rsi, arg(0)             ; src_ptr

            movsxd      rbx, dword ptr arg(1)   ; src_stride
            movsxd      rbp, dword ptr arg(3)   ; ref_stride; arg() is unusable until rbp is popped

            xchg        rbx, rax                ; rax = src_stride, rbx = third reference pointer

            ; 16 rows, two per macro expansion
            PROCESS_8X2X4 1
    %rep 7
            PROCESS_8X2X4 0
    %endrep

            pop         rbp                     ; frame pointer back, arg() valid again
            mov         rdi, arg(4)             ; results

            movd        [rdi], mm4              ; results[0]
            movd        [rdi+4], mm5            ; results[1]
            movd        [rdi+8], mm6            ; results[2]
            movd        [rdi+12], mm7           ; results[3]

        ; begin epilog
        pop         rbx
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
    771 
    ;-----------------------------------------------------------------------
    ; void vp8_sad8x8x4d_sse3(
    ;     unsigned char *src_ptr,
    ;     int            src_stride,
    ;     unsigned char *ref_ptr,
    ;     int            ref_stride,
    ;     int           *results)
    ;
    ; Sum of absolute differences of an 8x8 source block against four
    ; reference blocks. The third argument addresses four consecutive
    ; register-sized pointers (read by LOAD_X4_ADDRESSES); the four sums are
    ; stored to results[0..3] in the same order.
    ; NOTE(review): uses MMX registers and does not execute emms;
    ; presumably the caller clears the MMX state - confirm.
    ;-----------------------------------------------------------------------
    global sym(vp8_sad8x8x4d_sse3)
    sym(vp8_sad8x8x4d_sse3):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        push        rsi
        push        rdi
        push        rbx
        ; end prolog

            push        rbp                     ; rbp doubles as ref_stride below
            mov         rdi, arg(2)             ; base of the four reference pointers

            LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

            mov         rsi, arg(0)             ; src_ptr

            movsxd      rbx, dword ptr arg(1)   ; src_stride
            movsxd      rbp, dword ptr arg(3)   ; ref_stride; arg() is unusable until rbp is popped

            xchg        rbx, rax                ; rax = src_stride, rbx = third reference pointer

            ; 8 rows, two per macro expansion
            PROCESS_8X2X4 1
    %rep 3
            PROCESS_8X2X4 0
    %endrep

            pop         rbp                     ; frame pointer back, arg() valid again
            mov         rdi, arg(4)             ; results

            movd        [rdi], mm4              ; results[0]
            movd        [rdi+4], mm5            ; results[1]
            movd        [rdi+8], mm6            ; results[2]
            movd        [rdi+12], mm7           ; results[3]

        ; begin epilog
        pop         rbx
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
    820 
    ;-----------------------------------------------------------------------
    ; void vp8_sad4x4x4d_sse3(
    ;     unsigned char *src_ptr,
    ;     int            src_stride,
    ;     unsigned char *ref_ptr,
    ;     int            ref_stride,
    ;     int           *results)
    ;
    ; Sum of absolute differences of a 4x4 source block against four
    ; reference blocks. The third argument addresses four consecutive
    ; register-sized pointers (read by LOAD_X4_ADDRESSES); the four sums are
    ; stored to results[0..3] in the same order.
    ;
    ; Two 4-byte rows are interleaved into one MMX register with punpcklbw
    ; so each psadbw covers two rows. Register roles change half-way:
    ; after "pop rbp" the reference stride lives in rax and rsi points at
    ; results, so the instruction order below matters.
    ; NOTE(review): uses MMX registers and does not execute emms;
    ; presumably the caller clears the MMX state - confirm.
    ;-----------------------------------------------------------------------
    global sym(vp8_sad4x4x4d_sse3)
    sym(vp8_sad4x4x4d_sse3):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        push        rsi
        push        rdi
        push        rbx
        ; end prolog

            push        rbp                     ; rbp doubles as ref_stride below
            mov         rdi, arg(2)             ; base of the four reference pointers

            LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

            mov         rsi, arg(0)             ; src_ptr

            movsxd      rbx, dword ptr arg(1)   ; src_stride
            movsxd      rbp, dword ptr arg(3)   ; ref_stride; arg() is unusable until rbp is popped

            xchg        rbx, rax                ; rax = src_stride, rbx = third reference pointer

            ; ---- rows 0 and 1 ----
            movd        mm0, [rsi]              ; src row 0 (4 bytes)
            movd        mm1, [rcx]              ; ref0 row 0

            movd        mm2, [rsi+rax]          ; src row 1
            movd        mm3, [rcx+rbp]          ; ref0 row 1

            punpcklbw   mm0, mm2                ; mm0 = src rows 0/1 interleaved
            punpcklbw   mm1, mm3

            movd        mm4, [rdx]              ; ref1 row 0
            movd        mm5, [rbx]              ; ref2 row 0

            movd        mm6, [rdi]              ; ref3 row 0
            movd        mm2, [rdx+rbp]          ; ref1 row 1

            movd        mm3, [rbx+rbp]          ; ref2 row 1
            movd        mm7, [rdi+rbp]          ; ref3 row 1

            psadbw      mm1, mm0                ; mm1 = SAD(ref0), rows 0-1

            punpcklbw   mm4, mm2
            punpcklbw   mm5, mm3

            punpcklbw   mm6, mm7
            psadbw      mm4, mm0                ; mm4 = SAD(ref1), rows 0-1

            psadbw      mm5, mm0                ; mm5 = SAD(ref2), rows 0-1
            psadbw      mm6, mm0                ; mm6 = SAD(ref3), rows 0-1

            ; ---- rows 2 and 3 ----
            lea         rsi, [rsi+rax*2]
            lea         rcx, [rcx+rbp*2]

            lea         rdx, [rdx+rbp*2]
            lea         rbx, [rbx+rbp*2]

            lea         rdi, [rdi+rbp*2]

            movd        mm0, [rsi]              ; src row 2
            movd        mm2, [rcx]              ; ref0 row 2

            movd        mm3, [rsi+rax]          ; src row 3 (last use of rax as src_stride)
            movd        mm7, [rcx+rbp]          ; ref0 row 3

            punpcklbw   mm0, mm3                ; mm0 = src rows 2/3 interleaved
            punpcklbw   mm2, mm7

            movd        mm3, [rdx]              ; ref1 row 2
            movd        mm7, [rbx]              ; ref2 row 2

            psadbw      mm2, mm0
            mov         rax, rbp                ; rax = ref_stride from here on

            pop         rbp                     ; frame pointer back, arg() valid again
            mov         rsi, arg(4)             ; rsi = results (source rows are all loaded)

            paddw       mm1, mm2
            movd        [rsi], mm1              ; results[0]

            movd        mm2, [rdx+rax]          ; ref1 row 3
            movd        mm1, [rbx+rax]          ; ref2 row 3

            punpcklbw   mm3, mm2
            punpcklbw   mm7, mm1

            psadbw      mm3, mm0
            psadbw      mm7, mm0

            movd        mm2, [rdi]              ; ref3 row 2
            movd        mm1, [rdi+rax]          ; ref3 row 3

            paddw       mm3, mm4
            paddw       mm7, mm5

            movd        [rsi+4], mm3            ; results[1]
            punpcklbw   mm2, mm1

            movd        [rsi+8], mm7            ; results[2]
            psadbw      mm2, mm0

            paddw       mm2, mm6
            movd        [rsi+12], mm2           ; results[3]

        ; begin epilog
        pop         rbx
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
    941