;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

     11 %include "vpx_ports/x86_abi_support.asm"
     12 
     13 %macro STACK_FRAME_CREATE_X3 0
     14 %if ABI_IS_32BIT
     15   %define     src_ptr       rsi
     16   %define     src_stride    rax
     17   %define     ref_ptr       rdi
     18   %define     ref_stride    rdx
     19   %define     end_ptr       rcx
     20   %define     ret_var       rbx
     21   %define     result_ptr    arg(4)
     22   %define     max_err       arg(4)
     23     push        rbp
     24     mov         rbp,        rsp
     25     push        rsi
     26     push        rdi
     27     push        rbx
     28 
     29     mov         rsi,        arg(0)              ; src_ptr
     30     mov         rdi,        arg(2)              ; ref_ptr
     31 
     32     movsxd      rax,        dword ptr arg(1)    ; src_stride
     33     movsxd      rdx,        dword ptr arg(3)    ; ref_stride
     34 %else
     35   %ifidn __OUTPUT_FORMAT__,x64
     36     %define     src_ptr     rcx
     37     %define     src_stride  rdx
     38     %define     ref_ptr     r8
     39     %define     ref_stride  r9
     40     %define     end_ptr     r10
     41     %define     ret_var     r11
     42     %define     result_ptr  [rsp+8+4*8]
     43     %define     max_err     [rsp+8+4*8]
     44   %else
     45     %define     src_ptr     rdi
     46     %define     src_stride  rsi
     47     %define     ref_ptr     rdx
     48     %define     ref_stride  rcx
     49     %define     end_ptr     r9
     50     %define     ret_var     r10
     51     %define     result_ptr  r8
     52     %define     max_err     r8
     53   %endif
     54 %endif
     55 
     56 %endmacro
     57 
     58 %macro STACK_FRAME_DESTROY_X3 0
     59   %define     src_ptr
     60   %define     src_stride
     61   %define     ref_ptr
     62   %define     ref_stride
     63   %define     end_ptr
     64   %define     ret_var
     65   %define     result_ptr
     66   %define     max_err
     67 
     68 %if ABI_IS_32BIT
     69     pop         rbx
     70     pop         rdi
     71     pop         rsi
     72     pop         rbp
     73 %else
     74   %ifidn __OUTPUT_FORMAT__,x64
     75   %endif
     76 %endif
     77     ret
     78 %endmacro
     79 
     80 %macro STACK_FRAME_CREATE_X4 0
     81 %if ABI_IS_32BIT
     82   %define     src_ptr       rsi
     83   %define     src_stride    rax
     84   %define     r0_ptr        rcx
     85   %define     r1_ptr        rdx
     86   %define     r2_ptr        rbx
     87   %define     r3_ptr        rdi
     88   %define     ref_stride    rbp
     89   %define     result_ptr    arg(4)
     90     push        rbp
     91     mov         rbp,        rsp
     92     push        rsi
     93     push        rdi
     94     push        rbx
     95 
     96     push        rbp
     97     mov         rdi,        arg(2)              ; ref_ptr_base
     98 
     99     LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
    100 
    101     mov         rsi,        arg(0)              ; src_ptr
    102 
    103     movsxd      rbx,        dword ptr arg(1)    ; src_stride
    104     movsxd      rbp,        dword ptr arg(3)    ; ref_stride
    105 
    106     xchg        rbx,        rax
    107 %else
    108   %ifidn __OUTPUT_FORMAT__,x64
    109     %define     src_ptr     rcx
    110     %define     src_stride  rdx
    111     %define     r0_ptr      rsi
    112     %define     r1_ptr      r10
    113     %define     r2_ptr      r11
    114     %define     r3_ptr      r8
    115     %define     ref_stride  r9
    116     %define     result_ptr  [rsp+16+4*8]
    117     push        rsi
    118 
    119     LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
    120   %else
    121     %define     src_ptr     rdi
    122     %define     src_stride  rsi
    123     %define     r0_ptr      r9
    124     %define     r1_ptr      r10
    125     %define     r2_ptr      r11
    126     %define     r3_ptr      rdx
    127     %define     ref_stride  rcx
    128     %define     result_ptr  r8
    129 
    130     LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
    131 
    132   %endif
    133 %endif
    134 %endmacro
    135 
    136 %macro STACK_FRAME_DESTROY_X4 0
    137   %define     src_ptr
    138   %define     src_stride
    139   %define     r0_ptr
    140   %define     r1_ptr
    141   %define     r2_ptr
    142   %define     r3_ptr
    143   %define     ref_stride
    144   %define     result_ptr
    145 
    146 %if ABI_IS_32BIT
    147     pop         rbx
    148     pop         rdi
    149     pop         rsi
    150     pop         rbp
    151 %else
    152   %ifidn __OUTPUT_FORMAT__,x64
    153     pop         rsi
    154   %endif
    155 %endif
    156     ret
    157 %endmacro
    158 
    159 %macro PROCESS_16X2X3 5
    160 %if %1==0
    161         movdqa          xmm0,       XMMWORD PTR [%2]
    162         lddqu           xmm5,       XMMWORD PTR [%3]
    163         lddqu           xmm6,       XMMWORD PTR [%3+1]
    164         lddqu           xmm7,       XMMWORD PTR [%3+2]
    165 
    166         psadbw          xmm5,       xmm0
    167         psadbw          xmm6,       xmm0
    168         psadbw          xmm7,       xmm0
    169 %else
    170         movdqa          xmm0,       XMMWORD PTR [%2]
    171         lddqu           xmm1,       XMMWORD PTR [%3]
    172         lddqu           xmm2,       XMMWORD PTR [%3+1]
    173         lddqu           xmm3,       XMMWORD PTR [%3+2]
    174 
    175         psadbw          xmm1,       xmm0
    176         psadbw          xmm2,       xmm0
    177         psadbw          xmm3,       xmm0
    178 
    179         paddw           xmm5,       xmm1
    180         paddw           xmm6,       xmm2
    181         paddw           xmm7,       xmm3
    182 %endif
    183         movdqa          xmm0,       XMMWORD PTR [%2+%4]
    184         lddqu           xmm1,       XMMWORD PTR [%3+%5]
    185         lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
    186         lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
    187 
    188 %if %1==0 || %1==1
    189         lea             %2,         [%2+%4*2]
    190         lea             %3,         [%3+%5*2]
    191 %endif
    192 
    193         psadbw          xmm1,       xmm0
    194         psadbw          xmm2,       xmm0
    195         psadbw          xmm3,       xmm0
    196 
    197         paddw           xmm5,       xmm1
    198         paddw           xmm6,       xmm2
    199         paddw           xmm7,       xmm3
    200 %endmacro
    201 
    202 %macro PROCESS_8X2X3 5
    203 %if %1==0
    204         movq            mm0,       QWORD PTR [%2]
    205         movq            mm5,       QWORD PTR [%3]
    206         movq            mm6,       QWORD PTR [%3+1]
    207         movq            mm7,       QWORD PTR [%3+2]
    208 
    209         psadbw          mm5,       mm0
    210         psadbw          mm6,       mm0
    211         psadbw          mm7,       mm0
    212 %else
    213         movq            mm0,       QWORD PTR [%2]
    214         movq            mm1,       QWORD PTR [%3]
    215         movq            mm2,       QWORD PTR [%3+1]
    216         movq            mm3,       QWORD PTR [%3+2]
    217 
    218         psadbw          mm1,       mm0
    219         psadbw          mm2,       mm0
    220         psadbw          mm3,       mm0
    221 
    222         paddw           mm5,       mm1
    223         paddw           mm6,       mm2
    224         paddw           mm7,       mm3
    225 %endif
    226         movq            mm0,       QWORD PTR [%2+%4]
    227         movq            mm1,       QWORD PTR [%3+%5]
    228         movq            mm2,       QWORD PTR [%3+%5+1]
    229         movq            mm3,       QWORD PTR [%3+%5+2]
    230 
    231 %if %1==0 || %1==1
    232         lea             %2,        [%2+%4*2]
    233         lea             %3,        [%3+%5*2]
    234 %endif
    235 
    236         psadbw          mm1,       mm0
    237         psadbw          mm2,       mm0
    238         psadbw          mm3,       mm0
    239 
    240         paddw           mm5,       mm1
    241         paddw           mm6,       mm2
    242         paddw           mm7,       mm3
    243 %endmacro
    244 
    245 %macro LOAD_X4_ADDRESSES 5
    246         mov             %2,         [%1+REG_SZ_BYTES*0]
    247         mov             %3,         [%1+REG_SZ_BYTES*1]
    248 
    249         mov             %4,         [%1+REG_SZ_BYTES*2]
    250         mov             %5,         [%1+REG_SZ_BYTES*3]
    251 %endmacro
    252 
    253 %macro PROCESS_16X2X4 8
    254 %if %1==0
    255         movdqa          xmm0,       XMMWORD PTR [%2]
    256         lddqu           xmm4,       XMMWORD PTR [%3]
    257         lddqu           xmm5,       XMMWORD PTR [%4]
    258         lddqu           xmm6,       XMMWORD PTR [%5]
    259         lddqu           xmm7,       XMMWORD PTR [%6]
    260 
    261         psadbw          xmm4,       xmm0
    262         psadbw          xmm5,       xmm0
    263         psadbw          xmm6,       xmm0
    264         psadbw          xmm7,       xmm0
    265 %else
    266         movdqa          xmm0,       XMMWORD PTR [%2]
    267         lddqu           xmm1,       XMMWORD PTR [%3]
    268         lddqu           xmm2,       XMMWORD PTR [%4]
    269         lddqu           xmm3,       XMMWORD PTR [%5]
    270 
    271         psadbw          xmm1,       xmm0
    272         psadbw          xmm2,       xmm0
    273         psadbw          xmm3,       xmm0
    274 
    275         paddw           xmm4,       xmm1
    276         lddqu           xmm1,       XMMWORD PTR [%6]
    277         paddw           xmm5,       xmm2
    278         paddw           xmm6,       xmm3
    279 
    280         psadbw          xmm1,       xmm0
    281         paddw           xmm7,       xmm1
    282 %endif
    283         movdqa          xmm0,       XMMWORD PTR [%2+%7]
    284         lddqu           xmm1,       XMMWORD PTR [%3+%8]
    285         lddqu           xmm2,       XMMWORD PTR [%4+%8]
    286         lddqu           xmm3,       XMMWORD PTR [%5+%8]
    287 
    288         psadbw          xmm1,       xmm0
    289         psadbw          xmm2,       xmm0
    290         psadbw          xmm3,       xmm0
    291 
    292         paddw           xmm4,       xmm1
    293         lddqu           xmm1,       XMMWORD PTR [%6+%8]
    294         paddw           xmm5,       xmm2
    295         paddw           xmm6,       xmm3
    296 
    297 %if %1==0 || %1==1
    298         lea             %2,         [%2+%7*2]
    299         lea             %3,         [%3+%8*2]
    300 
    301         lea             %4,         [%4+%8*2]
    302         lea             %5,         [%5+%8*2]
    303 
    304         lea             %6,         [%6+%8*2]
    305 %endif
    306         psadbw          xmm1,       xmm0
    307         paddw           xmm7,       xmm1
    308 
    309 %endmacro
    310 
    311 %macro PROCESS_8X2X4 8
    312 %if %1==0
    313         movq            mm0,        QWORD PTR [%2]
    314         movq            mm4,        QWORD PTR [%3]
    315         movq            mm5,        QWORD PTR [%4]
    316         movq            mm6,        QWORD PTR [%5]
    317         movq            mm7,        QWORD PTR [%6]
    318 
    319         psadbw          mm4,        mm0
    320         psadbw          mm5,        mm0
    321         psadbw          mm6,        mm0
    322         psadbw          mm7,        mm0
    323 %else
    324         movq            mm0,        QWORD PTR [%2]
    325         movq            mm1,        QWORD PTR [%3]
    326         movq            mm2,        QWORD PTR [%4]
    327         movq            mm3,        QWORD PTR [%5]
    328 
    329         psadbw          mm1,        mm0
    330         psadbw          mm2,        mm0
    331         psadbw          mm3,        mm0
    332 
    333         paddw           mm4,        mm1
    334         movq            mm1,        QWORD PTR [%6]
    335         paddw           mm5,        mm2
    336         paddw           mm6,        mm3
    337 
    338         psadbw          mm1,        mm0
    339         paddw           mm7,        mm1
    340 %endif
    341         movq            mm0,        QWORD PTR [%2+%7]
    342         movq            mm1,        QWORD PTR [%3+%8]
    343         movq            mm2,        QWORD PTR [%4+%8]
    344         movq            mm3,        QWORD PTR [%5+%8]
    345 
    346         psadbw          mm1,        mm0
    347         psadbw          mm2,        mm0
    348         psadbw          mm3,        mm0
    349 
    350         paddw           mm4,        mm1
    351         movq            mm1,        QWORD PTR [%6+%8]
    352         paddw           mm5,        mm2
    353         paddw           mm6,        mm3
    354 
    355 %if %1==0 || %1==1
    356         lea             %2,         [%2+%7*2]
    357         lea             %3,         [%3+%8*2]
    358 
    359         lea             %4,         [%4+%8*2]
    360         lea             %5,         [%5+%8*2]
    361 
    362         lea             %6,         [%6+%8*2]
    363 %endif
    364         psadbw          mm1,        mm0
    365         paddw           mm7,        mm1
    366 
    367 %endmacro
    368 
    369 ;void int vp8_sad16x16x3_sse3(
    370 ;    unsigned char *src_ptr,
    371 ;    int  src_stride,
    372 ;    unsigned char *ref_ptr,
    373 ;    int  ref_stride,
    374 ;    int  *results)
    375 global sym(vp8_sad16x16x3_sse3)
    376 sym(vp8_sad16x16x3_sse3):
    377 
    378     STACK_FRAME_CREATE_X3
    379 
    380         PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    381         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    382         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    383         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    384         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    385         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    386         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    387         PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
    388 
    389         mov             rcx,        result_ptr
    390 
    391         movq            xmm0,       xmm5
    392         psrldq          xmm5,       8
    393 
    394         paddw           xmm0,       xmm5
    395         movd            [rcx],      xmm0
    396 ;-
    397         movq            xmm0,       xmm6
    398         psrldq          xmm6,       8
    399 
    400         paddw           xmm0,       xmm6
    401         movd            [rcx+4],    xmm0
    402 ;-
    403         movq            xmm0,       xmm7
    404         psrldq          xmm7,       8
    405 
    406         paddw           xmm0,       xmm7
    407         movd            [rcx+8],    xmm0
    408 
    409     STACK_FRAME_DESTROY_X3
    410 
    411 ;void int vp8_sad16x8x3_sse3(
    412 ;    unsigned char *src_ptr,
    413 ;    int  src_stride,
    414 ;    unsigned char *ref_ptr,
    415 ;    int  ref_stride,
    416 ;    int  *results)
    417 global sym(vp8_sad16x8x3_sse3)
    418 sym(vp8_sad16x8x3_sse3):
    419 
    420     STACK_FRAME_CREATE_X3
    421 
    422         PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    423         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    424         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    425         PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
    426 
    427         mov             rcx,        result_ptr
    428 
    429         movq            xmm0,       xmm5
    430         psrldq          xmm5,       8
    431 
    432         paddw           xmm0,       xmm5
    433         movd            [rcx],      xmm0
    434 ;-
    435         movq            xmm0,       xmm6
    436         psrldq          xmm6,       8
    437 
    438         paddw           xmm0,       xmm6
    439         movd            [rcx+4],    xmm0
    440 ;-
    441         movq            xmm0,       xmm7
    442         psrldq          xmm7,       8
    443 
    444         paddw           xmm0,       xmm7
    445         movd            [rcx+8],    xmm0
    446 
    447     STACK_FRAME_DESTROY_X3
    448 
    449 ;void int vp8_sad8x16x3_sse3(
    450 ;    unsigned char *src_ptr,
    451 ;    int  src_stride,
    452 ;    unsigned char *ref_ptr,
    453 ;    int  ref_stride,
    454 ;    int  *results)
    455 global sym(vp8_sad8x16x3_sse3)
    456 sym(vp8_sad8x16x3_sse3):
    457 
    458     STACK_FRAME_CREATE_X3
    459 
    460         PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    461         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    462         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    463         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    464         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    465         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    466         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    467         PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
    468 
    469         mov             rcx,        result_ptr
    470 
    471         punpckldq       mm5,        mm6
    472 
    473         movq            [rcx],      mm5
    474         movd            [rcx+8],    mm7
    475 
    476     STACK_FRAME_DESTROY_X3
    477 
    478 ;void int vp8_sad8x8x3_sse3(
    479 ;    unsigned char *src_ptr,
    480 ;    int  src_stride,
    481 ;    unsigned char *ref_ptr,
    482 ;    int  ref_stride,
    483 ;    int  *results)
    484 global sym(vp8_sad8x8x3_sse3)
    485 sym(vp8_sad8x8x3_sse3):
    486 
    487     STACK_FRAME_CREATE_X3
    488 
    489         PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    490         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    491         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    492         PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
    493 
    494         mov             rcx,        result_ptr
    495 
    496         punpckldq       mm5,        mm6
    497 
    498         movq            [rcx],      mm5
    499         movd            [rcx+8],    mm7
    500 
    501     STACK_FRAME_DESTROY_X3
    502 
    503 ;void int vp8_sad4x4x3_sse3(
    504 ;    unsigned char *src_ptr,
    505 ;    int  src_stride,
    506 ;    unsigned char *ref_ptr,
    507 ;    int  ref_stride,
    508 ;    int  *results)
    509 global sym(vp8_sad4x4x3_sse3)
    510 sym(vp8_sad4x4x3_sse3):
    511 
    512     STACK_FRAME_CREATE_X3
    513 
    514         movd            mm0,        DWORD PTR [src_ptr]
    515         movd            mm1,        DWORD PTR [ref_ptr]
    516 
    517         movd            mm2,        DWORD PTR [src_ptr+src_stride]
    518         movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
    519 
    520         punpcklbw       mm0,        mm2
    521         punpcklbw       mm1,        mm3
    522 
    523         movd            mm4,        DWORD PTR [ref_ptr+1]
    524         movd            mm5,        DWORD PTR [ref_ptr+2]
    525 
    526         movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
    527         movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
    528 
    529         psadbw          mm1,        mm0
    530 
    531         punpcklbw       mm4,        mm2
    532         punpcklbw       mm5,        mm3
    533 
    534         psadbw          mm4,        mm0
    535         psadbw          mm5,        mm0
    536 
    537         lea             src_ptr,    [src_ptr+src_stride*2]
    538         lea             ref_ptr,    [ref_ptr+ref_stride*2]
    539 
    540         movd            mm0,        DWORD PTR [src_ptr]
    541         movd            mm2,        DWORD PTR [ref_ptr]
    542 
    543         movd            mm3,        DWORD PTR [src_ptr+src_stride]
    544         movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
    545 
    546         punpcklbw       mm0,        mm3
    547         punpcklbw       mm2,        mm6
    548 
    549         movd            mm3,        DWORD PTR [ref_ptr+1]
    550         movd            mm7,        DWORD PTR [ref_ptr+2]
    551 
    552         psadbw          mm2,        mm0
    553 
    554         paddw           mm1,        mm2
    555 
    556         movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
    557         movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
    558 
    559         punpcklbw       mm3,        mm2
    560         punpcklbw       mm7,        mm6
    561 
    562         psadbw          mm3,        mm0
    563         psadbw          mm7,        mm0
    564 
    565         paddw           mm3,        mm4
    566         paddw           mm7,        mm5
    567 
    568         mov             rcx,        result_ptr
    569 
    570         punpckldq       mm1,        mm3
    571 
    572         movq            [rcx],      mm1
    573         movd            [rcx+8],    mm7
    574 
    575     STACK_FRAME_DESTROY_X3
    576 
    577 ;unsigned int vp8_sad16x16_sse3(
    578 ;    unsigned char *src_ptr,
    579 ;    int  src_stride,
    580 ;    unsigned char *ref_ptr,
    581 ;    int  ref_stride,
    582 ;    int  max_err)
    583 ;%define lddqu movdqu
    584 global sym(vp8_sad16x16_sse3)
    585 sym(vp8_sad16x16_sse3):
    586 
    587     STACK_FRAME_CREATE_X3
    588 
    589         mov             end_ptr,    4
    590         pxor            xmm7,        xmm7
    591 
    592 .vp8_sad16x16_sse3_loop:
    593         movdqa          xmm0,       XMMWORD PTR [src_ptr]
    594         movdqu          xmm1,       XMMWORD PTR [ref_ptr]
    595         movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
    596         movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]
    597 
    598         lea             src_ptr,    [src_ptr+src_stride*2]
    599         lea             ref_ptr,    [ref_ptr+ref_stride*2]
    600 
    601         movdqa          xmm4,       XMMWORD PTR [src_ptr]
    602         movdqu          xmm5,       XMMWORD PTR [ref_ptr]
    603         movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]
    604 
    605         psadbw          xmm0,       xmm1
    606 
    607         movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]
    608 
    609         psadbw          xmm2,       xmm3
    610         psadbw          xmm4,       xmm5
    611         psadbw          xmm6,       xmm1
    612 
    613         lea             src_ptr,    [src_ptr+src_stride*2]
    614         lea             ref_ptr,    [ref_ptr+ref_stride*2]
    615 
    616         paddw           xmm7,        xmm0
    617         paddw           xmm7,        xmm2
    618         paddw           xmm7,        xmm4
    619         paddw           xmm7,        xmm6
    620 
    621         sub             end_ptr,     1
    622         jne             .vp8_sad16x16_sse3_loop
    623 
    624         movq            xmm0,       xmm7
    625         psrldq          xmm7,       8
    626         paddw           xmm0,       xmm7
    627         movq            rax,        xmm0
    628 
    629     STACK_FRAME_DESTROY_X3
    630 
    631 ;void vp8_sad16x16x4d_sse3(
    632 ;    unsigned char *src_ptr,
    633 ;    int  src_stride,
    634 ;    unsigned char *ref_ptr_base,
    635 ;    int  ref_stride,
    636 ;    int  *results)
    637 global sym(vp8_sad16x16x4d_sse3)
    638 sym(vp8_sad16x16x4d_sse3):
    639 
    640     STACK_FRAME_CREATE_X4
    641 
    642         PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    643         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    644         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    645         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    646         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    647         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    648         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    649         PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    650 
    651 %if ABI_IS_32BIT
    652         pop             rbp
    653 %endif
    654         mov             rcx,        result_ptr
    655 
    656         movq            xmm0,       xmm4
    657         psrldq          xmm4,       8
    658 
    659         paddw           xmm0,       xmm4
    660         movd            [rcx],      xmm0
    661 ;-
    662         movq            xmm0,       xmm5
    663         psrldq          xmm5,       8
    664 
    665         paddw           xmm0,       xmm5
    666         movd            [rcx+4],    xmm0
    667 ;-
    668         movq            xmm0,       xmm6
    669         psrldq          xmm6,       8
    670 
    671         paddw           xmm0,       xmm6
    672         movd            [rcx+8],    xmm0
    673 ;-
    674         movq            xmm0,       xmm7
    675         psrldq          xmm7,       8
    676 
    677         paddw           xmm0,       xmm7
    678         movd            [rcx+12],   xmm0
    679 
    680     STACK_FRAME_DESTROY_X4
    681 
    682 ;void vp8_sad16x8x4d_sse3(
    683 ;    unsigned char *src_ptr,
    684 ;    int  src_stride,
    685 ;    unsigned char *ref_ptr_base,
    686 ;    int  ref_stride,
    687 ;    int  *results)
    688 global sym(vp8_sad16x8x4d_sse3)
    689 sym(vp8_sad16x8x4d_sse3):
    690 
    691     STACK_FRAME_CREATE_X4
    692 
    693         PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    694         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    695         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    696         PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    697 
    698 %if ABI_IS_32BIT
    699         pop             rbp
    700 %endif
    701         mov             rcx,        result_ptr
    702 
    703         movq            xmm0,       xmm4
    704         psrldq          xmm4,       8
    705 
    706         paddw           xmm0,       xmm4
    707         movd            [rcx],      xmm0
    708 ;-
    709         movq            xmm0,       xmm5
    710         psrldq          xmm5,       8
    711 
    712         paddw           xmm0,       xmm5
    713         movd            [rcx+4],    xmm0
    714 ;-
    715         movq            xmm0,       xmm6
    716         psrldq          xmm6,       8
    717 
    718         paddw           xmm0,       xmm6
    719         movd            [rcx+8],    xmm0
    720 ;-
    721         movq            xmm0,       xmm7
    722         psrldq          xmm7,       8
    723 
    724         paddw           xmm0,       xmm7
    725         movd            [rcx+12],   xmm0
    726 
    727     STACK_FRAME_DESTROY_X4
    728 
    729 ;void int vp8_sad8x16x4d_sse3(
    730 ;    unsigned char *src_ptr,
    731 ;    int  src_stride,
    732 ;    unsigned char *ref_ptr,
    733 ;    int  ref_stride,
    734 ;    int  *results)
    735 global sym(vp8_sad8x16x4d_sse3)
    736 sym(vp8_sad8x16x4d_sse3):
    737 
    738     STACK_FRAME_CREATE_X4
    739 
    740         PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    741         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    742         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    743         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    744         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    745         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    746         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    747         PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    748 
    749 %if ABI_IS_32BIT
    750         pop             rbp
    751 %endif
    752         mov             rcx,        result_ptr
    753 
    754         punpckldq       mm4,        mm5
    755         punpckldq       mm6,        mm7
    756 
    757         movq            [rcx],      mm4
    758         movq            [rcx+8],    mm6
    759 
    760     STACK_FRAME_DESTROY_X4
    761 
    762 ;void int vp8_sad8x8x4d_sse3(
    763 ;    unsigned char *src_ptr,
    764 ;    int  src_stride,
    765 ;    unsigned char *ref_ptr,
    766 ;    int  ref_stride,
    767 ;    int  *results)
    768 global sym(vp8_sad8x8x4d_sse3)
    769 sym(vp8_sad8x8x4d_sse3):
    770 
    771     STACK_FRAME_CREATE_X4
    772 
    773         PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    774         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    775         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    776         PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    777 
    778 %if ABI_IS_32BIT
    779         pop             rbp
    780 %endif
    781         mov             rcx,        result_ptr
    782 
    783         punpckldq       mm4,        mm5
    784         punpckldq       mm6,        mm7
    785 
    786         movq            [rcx],      mm4
    787         movq            [rcx+8],    mm6
    788 
    789     STACK_FRAME_DESTROY_X4
    790 
    791 ;void int vp8_sad4x4x4d_sse3(
    792 ;    unsigned char *src_ptr,
    793 ;    int  src_stride,
    794 ;    unsigned char *ref_ptr,
    795 ;    int  ref_stride,
    796 ;    int  *results)
    797 global sym(vp8_sad4x4x4d_sse3)
    798 sym(vp8_sad4x4x4d_sse3):
    799 
    800     STACK_FRAME_CREATE_X4
    801 
    802         movd            mm0,        DWORD PTR [src_ptr]
    803         movd            mm1,        DWORD PTR [r0_ptr]
    804 
    805         movd            mm2,        DWORD PTR [src_ptr+src_stride]
    806         movd            mm3,        DWORD PTR [r0_ptr+ref_stride]
    807 
    808         punpcklbw       mm0,        mm2
    809         punpcklbw       mm1,        mm3
    810 
    811         movd            mm4,        DWORD PTR [r1_ptr]
    812         movd            mm5,        DWORD PTR [r2_ptr]
    813 
    814         movd            mm6,        DWORD PTR [r3_ptr]
    815         movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
    816 
    817         movd            mm3,        DWORD PTR [r2_ptr+ref_stride]
    818         movd            mm7,        DWORD PTR [r3_ptr+ref_stride]
    819 
    820         psadbw          mm1,        mm0
    821 
    822         punpcklbw       mm4,        mm2
    823         punpcklbw       mm5,        mm3
    824 
    825         punpcklbw       mm6,        mm7
    826         psadbw          mm4,        mm0
    827 
    828         psadbw          mm5,        mm0
    829         psadbw          mm6,        mm0
    830 
    831 
    832 
    833         lea             src_ptr,    [src_ptr+src_stride*2]
    834         lea             r0_ptr,     [r0_ptr+ref_stride*2]
    835 
    836         lea             r1_ptr,     [r1_ptr+ref_stride*2]
    837         lea             r2_ptr,     [r2_ptr+ref_stride*2]
    838 
    839         lea             r3_ptr,     [r3_ptr+ref_stride*2]
    840 
    841         movd            mm0,        DWORD PTR [src_ptr]
    842         movd            mm2,        DWORD PTR [r0_ptr]
    843 
    844         movd            mm3,        DWORD PTR [src_ptr+src_stride]
    845         movd            mm7,        DWORD PTR [r0_ptr+ref_stride]
    846 
    847         punpcklbw       mm0,        mm3
    848         punpcklbw       mm2,        mm7
    849 
    850         movd            mm3,        DWORD PTR [r1_ptr]
    851         movd            mm7,        DWORD PTR [r2_ptr]
    852 
    853         psadbw          mm2,        mm0
    854 %if ABI_IS_32BIT
    855         mov             rax,        rbp
    856 
    857         pop             rbp
    858 %define     ref_stride    rax
    859 %endif
    860         mov             rsi,        result_ptr
    861 
    862         paddw           mm1,        mm2
    863         movd            [rsi],      mm1
    864 
    865         movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
    866         movd            mm1,        DWORD PTR [r2_ptr+ref_stride]
    867 
    868         punpcklbw       mm3,        mm2
    869         punpcklbw       mm7,        mm1
    870 
    871         psadbw          mm3,        mm0
    872         psadbw          mm7,        mm0
    873 
    874         movd            mm2,        DWORD PTR [r3_ptr]
    875         movd            mm1,        DWORD PTR [r3_ptr+ref_stride]
    876 
    877         paddw           mm3,        mm4
    878         paddw           mm7,        mm5
    879 
    880         movd            [rsi+4],    mm3
    881         punpcklbw       mm2,        mm1
    882 
    883         movd            [rsi+8],    mm7
    884         psadbw          mm2,        mm0
    885 
    886         paddw           mm2,        mm6
    887         movd            [rsi+12],   mm2
    888 
    889 
    890     STACK_FRAME_DESTROY_X4
    891