Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 %include "vpx_ports/x86_abi_support.asm"
     12 
     13 %macro STACK_FRAME_CREATE_X3 0
        ; Prologue for the five-argument routines in this file
        ; (src_ptr, src_stride, ref_ptr, ref_stride, <fifth argument>).
        ; Binds the symbolic names src_ptr, src_stride, ref_ptr, ref_stride,
        ; end_ptr and ret_var to registers for the ABI being assembled, and
        ; binds result_ptr / max_sad / height -- three aliases of the fifth
        ; argument -- to wherever that argument lives.
        ; Must be paired with STACK_FRAME_DESTROY_X3, which undoes the pushes.
     14 %if ABI_IS_32BIT
        ; 32-bit: build a frame, save the registers used below, then load the
        ; first four arguments through arg(n) (arg is defined outside this file).
     15   %define     src_ptr       rsi
     16   %define     src_stride    rax
     17   %define     ref_ptr       rdi
     18   %define     ref_stride    rdx
     19   %define     end_ptr       rcx
     20   %define     ret_var       rbx
     21   %define     result_ptr    arg(4)
     22   %define     max_sad       arg(4)
     23   %define     height        dword ptr arg(4)
     24     push        rbp
     25     mov         rbp,        rsp
     26     push        rsi                             ; popped again in STACK_FRAME_DESTROY_X3
     27     push        rdi
     28     push        rbx
     29 
     30     mov         rsi,        arg(0)              ; src_ptr
     31     mov         rdi,        arg(2)              ; ref_ptr
     32 
     33     movsxd      rax,        dword ptr arg(1)    ; src_stride (int, sign-extended)
     34     movsxd      rdx,        dword ptr arg(3)    ; ref_stride (int, sign-extended)
     35 %else
     36   %ifidn __OUTPUT_FORMAT__,x64
        ; x64 output format: the first four arguments are used directly in
        ; rcx, rdx, r8, r9 and the fifth is read from the stack.
        ; SAVE_XMM is defined outside this file; presumably it preserves
        ; xmm6-xmm7 and defines xmm_stack_space -- confirm there.
     37     SAVE_XMM 7, u
     38     %define     src_ptr     rcx
     39     %define     src_stride  rdx
     40     %define     ref_ptr     r8
     41     %define     ref_stride  r9
     42     %define     end_ptr     r10
     43     %define     ret_var     r11
        ; NOTE(review): the +8+4*8 offset presumably skips the return address
        ; and four 8-byte argument home slots -- confirm against SAVE_XMM.
     44     %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
     45     %define     max_sad     [rsp+xmm_stack_space+8+4*8]
     46     %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
     47   %else
        ; Other 64-bit output formats: all five arguments stay in registers
        ; (rdi, rsi, rdx, rcx, r8); nothing is pushed.
        ; NOTE(review): src_stride/ref_stride/height are used as full 64-bit
        ; registers here without an explicit sign extension of the int values.
     48     %define     src_ptr     rdi
     49     %define     src_stride  rsi
     50     %define     ref_ptr     rdx
     51     %define     ref_stride  rcx
     52     %define     end_ptr     r9
     53     %define     ret_var     r10
     54     %define     result_ptr  r8
     55     %define     max_sad     r8
     56     %define     height      r8
     57   %endif
     58 %endif
     59 
     60 %endmacro
     61 
     62 %macro STACK_FRAME_DESTROY_X3 0
        ; Epilogue matching STACK_FRAME_CREATE_X3: clears the symbolic register
        ; names, restores whatever the prologue saved for the current ABI, and
        ; returns to the caller (the ret is part of this macro).
        ;
        ; Redefine every symbolic name as empty so a stale binding cannot leak
        ; into code assembled after this point.
     63   %define     src_ptr
     64   %define     src_stride
     65   %define     ref_ptr
     66   %define     ref_stride
     67   %define     end_ptr
     68   %define     ret_var
     69   %define     result_ptr
     70   %define     max_sad
     71   %define     height
     72 
     73 %if ABI_IS_32BIT
        ; Pops mirror the pushes in STACK_FRAME_CREATE_X3, in reverse order.
     74     pop         rbx
     75     pop         rdi
     76     pop         rsi
     77     pop         rbp
     78 %else
     79   %ifidn __OUTPUT_FORMAT__,x64
        ; Counterpart of the SAVE_XMM issued by the prologue.
     80     RESTORE_XMM
     81   %endif
     82 %endif
     83     ret
     84 %endmacro
     85 
     86 %macro STACK_FRAME_CREATE_X4 0
        ; Prologue for the "x4d" routines, whose third argument is the address
        ; of an array of four reference pointers.
        ; Binds src_ptr, src_stride, r0_ptr..r3_ptr, ref_stride and result_ptr
        ; for the ABI being assembled and loads the four reference pointers via
        ; LOAD_X4_ADDRESSES. Must be paired with STACK_FRAME_DESTROY_X4.
     87 %if ABI_IS_32BIT
        ; 32-bit: every general register is needed, including rbp, which ends
        ; up holding ref_stride.
     88   %define     src_ptr       rsi
     89   %define     src_stride    rax
     90   %define     r0_ptr        rcx
     91   %define     r1_ptr        rdx
     92   %define     r2_ptr        rbx
     93   %define     r3_ptr        rdi
     94   %define     ref_stride    rbp
     95   %define     result_ptr    arg(4)
     96     push        rbp
     97     mov         rbp,        rsp
     98     push        rsi
     99     push        rdi
    100     push        rbx
    101 
    102     push        rbp                             ; second copy of the frame pointer; the x4d routines pop it before reading result_ptr
    103     mov         rdi,        arg(2)              ; ref_ptr_base
    104 
        ; rcx = pointer 0, rdx = pointer 1, rax = pointer 2 (temporarily),
        ; rdi = pointer 3. rdi is both the base and the last destination, which
        ; works because LOAD_X4_ADDRESSES writes its fifth operand last.
    105     LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
    106 
    107     mov         rsi,        arg(0)              ; src_ptr
    108 
    109     movsxd      rbx,        dword ptr arg(1)    ; src_stride (temporarily in rbx)
    110     movsxd      rbp,        dword ptr arg(3)    ; ref_stride; rbp stops being the frame pointer here
    111 
    112     xchg        rbx,        rax                 ; rbx = pointer 2 (r2_ptr), rax = src_stride
    113 %else
    114   %ifidn __OUTPUT_FORMAT__,x64
        ; x64 output format: arguments arrive in rcx, rdx, r8, r9; rsi is saved
        ; because it is used as r0_ptr and popped again in
        ; STACK_FRAME_DESTROY_X4.
        ; SAVE_XMM is defined outside this file; presumably it preserves
        ; xmm6-xmm7 and defines xmm_stack_space -- confirm there.
    115     SAVE_XMM 7, u
    116     %define     src_ptr     rcx
    117     %define     src_stride  rdx
    118     %define     r0_ptr      rsi
    119     %define     r1_ptr      r10
    120     %define     r2_ptr      r11
    121     %define     r3_ptr      r8
    122     %define     ref_stride  r9
        ; The offset is 8 bytes larger than in STACK_FRAME_CREATE_X3, which
        ; matches the extra push of rsi below.
    123     %define     result_ptr  [rsp+xmm_stack_space+16+4*8]
    124     push        rsi
    125 
        ; r8 is both the array base and r3_ptr; it is overwritten last.
    126     LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
    127   %else
        ; Other 64-bit output formats: arguments stay in rdi, rsi, rdx, rcx, r8;
        ; r9-r11 receive the first three reference pointers and nothing is
        ; pushed.
    128     %define     src_ptr     rdi
    129     %define     src_stride  rsi
    130     %define     r0_ptr      r9
    131     %define     r1_ptr      r10
    132     %define     r2_ptr      r11
    133     %define     r3_ptr      rdx
    134     %define     ref_stride  rcx
    135     %define     result_ptr  r8
    136 
        ; rdx is both the array base and r3_ptr; it is overwritten last.
    137     LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
    138 
    139   %endif
    140 %endif
    141 %endmacro
    142 
    143 %macro STACK_FRAME_DESTROY_X4 0
        ; Epilogue matching STACK_FRAME_CREATE_X4: clears the symbolic register
        ; names, restores the saved registers for the current ABI and returns
        ; (the ret is part of this macro).
        ; In the 32-bit build the prologue pushes rbp twice; the x4d routines in
        ; this file pop the second copy themselves before invoking this macro,
        ; so only the four pushes of the regular frame are undone here.
    144   %define     src_ptr
    145   %define     src_stride
    146   %define     r0_ptr
    147   %define     r1_ptr
    148   %define     r2_ptr
    149   %define     r3_ptr
    150   %define     ref_stride
    151   %define     result_ptr
    152 
    153 %if ABI_IS_32BIT
    154     pop         rbx
    155     pop         rdi
    156     pop         rsi
    157     pop         rbp
    158 %else
    159   %ifidn __OUTPUT_FORMAT__,x64
        ; Reverse of the prologue order (SAVE_XMM, then push rsi).
    160     pop         rsi
    161     RESTORE_XMM
    162   %endif
    163 %endif
    164     ret
    165 %endmacro
    166 
    167 %macro PROCESS_16X2X3 5
        ; Accumulates sums of absolute differences for two 16-byte rows of the
        ; source against the reference at byte offsets +0, +1 and +2.
        ;   %1  stage: 0 = first call (initialises the accumulators),
        ;              1 = middle call, 2 = last call (pointers not advanced)
        ;   %2  source pointer register   (read with movdqa: must be 16-byte aligned)
        ;   %3  reference pointer register (read with lddqu: any alignment)
        ;   %4  source stride register
        ;   %5  reference stride register
        ; Accumulators: xmm5 (offset +0), xmm6 (+1), xmm7 (+2).
        ; Scratch:      xmm0-xmm3.
        ; For stages 0 and 1, %2 and %3 are advanced by two rows.
    168 %if %1==0
        ; First row of the first call: the psadbw results become the initial
        ; accumulator values, so no clearing is needed.
    169         movdqa          xmm0,       XMMWORD PTR [%2]
    170         lddqu           xmm5,       XMMWORD PTR [%3]
    171         lddqu           xmm6,       XMMWORD PTR [%3+1]
    172         lddqu           xmm7,       XMMWORD PTR [%3+2]
    173 
    174         psadbw          xmm5,       xmm0
    175         psadbw          xmm6,       xmm0
    176         psadbw          xmm7,       xmm0
    177 %else
        ; First row of a later call: compute into scratch, then accumulate.
    178         movdqa          xmm0,       XMMWORD PTR [%2]
    179         lddqu           xmm1,       XMMWORD PTR [%3]
    180         lddqu           xmm2,       XMMWORD PTR [%3+1]
    181         lddqu           xmm3,       XMMWORD PTR [%3+2]
    182 
    183         psadbw          xmm1,       xmm0
    184         psadbw          xmm2,       xmm0
    185         psadbw          xmm3,       xmm0
    186 
    187         paddw           xmm5,       xmm1
    188         paddw           xmm6,       xmm2
    189         paddw           xmm7,       xmm3
    190 %endif
        ; Second row: loaded one stride further on.
    191         movdqa          xmm0,       XMMWORD PTR [%2+%4]
    192         lddqu           xmm1,       XMMWORD PTR [%3+%5]
    193         lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
    194         lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
    195 
    196 %if %1==0 || %1==1
        ; Advance both pointers by two rows, placed between the loads and the
        ; arithmetic that consumes them. Skipped on the last call (stage 2).
    197         lea             %2,         [%2+%4*2]
    198         lea             %3,         [%3+%5*2]
    199 %endif
    200 
    201         psadbw          xmm1,       xmm0
    202         psadbw          xmm2,       xmm0
    203         psadbw          xmm3,       xmm0
    204 
    205         paddw           xmm5,       xmm1
    206         paddw           xmm6,       xmm2
    207         paddw           xmm7,       xmm3
    208 %endmacro
    209 
    210 %macro PROCESS_8X2X3 5
        ; 8-byte-wide (MMX register) counterpart of PROCESS_16X2X3: accumulates
        ; sums of absolute differences for two 8-byte source rows against the
        ; reference at byte offsets +0, +1 and +2.
        ;   %1  stage: 0 = first call (initialises the accumulators),
        ;              1 = middle call, 2 = last call (pointers not advanced)
        ;   %2  source pointer register
        ;   %3  reference pointer register
        ;   %4  source stride register
        ;   %5  reference stride register
        ; Accumulators: mm5 (offset +0), mm6 (+1), mm7 (+2).
        ; Scratch:      mm0-mm3.
        ; For stages 0 and 1, %2 and %3 are advanced by two rows.
    211 %if %1==0
        ; First row of the first call initialises the accumulators directly.
    212         movq            mm0,       QWORD PTR [%2]
    213         movq            mm5,       QWORD PTR [%3]
    214         movq            mm6,       QWORD PTR [%3+1]
    215         movq            mm7,       QWORD PTR [%3+2]
    216 
    217         psadbw          mm5,       mm0
    218         psadbw          mm6,       mm0
    219         psadbw          mm7,       mm0
    220 %else
        ; First row of a later call: compute into scratch, then accumulate.
    221         movq            mm0,       QWORD PTR [%2]
    222         movq            mm1,       QWORD PTR [%3]
    223         movq            mm2,       QWORD PTR [%3+1]
    224         movq            mm3,       QWORD PTR [%3+2]
    225 
    226         psadbw          mm1,       mm0
    227         psadbw          mm2,       mm0
    228         psadbw          mm3,       mm0
    229 
    230         paddw           mm5,       mm1
    231         paddw           mm6,       mm2
    232         paddw           mm7,       mm3
    233 %endif
        ; Second row: loaded one stride further on.
    234         movq            mm0,       QWORD PTR [%2+%4]
    235         movq            mm1,       QWORD PTR [%3+%5]
    236         movq            mm2,       QWORD PTR [%3+%5+1]
    237         movq            mm3,       QWORD PTR [%3+%5+2]
    238 
    239 %if %1==0 || %1==1
        ; Advance both pointers by two rows; skipped on the last call (stage 2).
    240         lea             %2,        [%2+%4*2]
    241         lea             %3,        [%3+%5*2]
    242 %endif
    243 
    244         psadbw          mm1,       mm0
    245         psadbw          mm2,       mm0
    246         psadbw          mm3,       mm0
    247 
    248         paddw           mm5,       mm1
    249         paddw           mm6,       mm2
    250         paddw           mm7,       mm3
    251 %endmacro
    252 
    253 %macro LOAD_X4_ADDRESSES 5
        ; Loads four consecutive pointer-sized values from the array at [%1]
        ; into %2, %3, %4 and %5 (elements 0..3, REG_SZ_BYTES apart;
        ; REG_SZ_BYTES is defined outside this file).
        ; %5 is written last, so it may be the same register as the base %1;
        ; STACK_FRAME_CREATE_X4 relies on that. %2-%4 must differ from %1.
    254         mov             %2,         [%1+REG_SZ_BYTES*0]
    255         mov             %3,         [%1+REG_SZ_BYTES*1]
    256 
    257         mov             %4,         [%1+REG_SZ_BYTES*2]
    258         mov             %5,         [%1+REG_SZ_BYTES*3]   ; may overwrite the base register
    259 %endmacro
    260 
    261 %macro PROCESS_16X2X4 8
        ; Accumulates sums of absolute differences for two 16-byte source rows
        ; against four independent reference blocks.
        ;   %1      stage: 0 = first call (initialises the accumulators),
        ;                  1 = middle call, 2 = last call (pointers not advanced)
        ;   %2      source pointer register (read with movdqa: must be 16-byte aligned)
        ;   %3-%6   the four reference pointer registers (read with lddqu)
        ;   %7      source stride register
        ;   %8      reference stride register (shared by all four references)
        ; Accumulators: xmm4..xmm7, one per reference in the order %3..%6.
        ; Scratch:      xmm0-xmm3; xmm1 is reused for the fourth reference.
        ; For stages 0 and 1, %2-%6 are advanced by two rows.
    262 %if %1==0
        ; First row of the first call initialises the accumulators directly.
    263         movdqa          xmm0,       XMMWORD PTR [%2]
    264         lddqu           xmm4,       XMMWORD PTR [%3]
    265         lddqu           xmm5,       XMMWORD PTR [%4]
    266         lddqu           xmm6,       XMMWORD PTR [%5]
    267         lddqu           xmm7,       XMMWORD PTR [%6]
    268 
    269         psadbw          xmm4,       xmm0
    270         psadbw          xmm5,       xmm0
    271         psadbw          xmm6,       xmm0
    272         psadbw          xmm7,       xmm0
    273 %else
        ; First row of a later call: three references go through xmm1-xmm3,
        ; then xmm1 is reloaded for the fourth once its sum has been added.
    274         movdqa          xmm0,       XMMWORD PTR [%2]
    275         lddqu           xmm1,       XMMWORD PTR [%3]
    276         lddqu           xmm2,       XMMWORD PTR [%4]
    277         lddqu           xmm3,       XMMWORD PTR [%5]
    278 
    279         psadbw          xmm1,       xmm0
    280         psadbw          xmm2,       xmm0
    281         psadbw          xmm3,       xmm0
    282 
    283         paddw           xmm4,       xmm1
    284         lddqu           xmm1,       XMMWORD PTR [%6]
    285         paddw           xmm5,       xmm2
    286         paddw           xmm6,       xmm3
    287 
    288         psadbw          xmm1,       xmm0
    289         paddw           xmm7,       xmm1
    290 %endif
        ; Second row: same pattern, one stride further on.
    291         movdqa          xmm0,       XMMWORD PTR [%2+%7]
    292         lddqu           xmm1,       XMMWORD PTR [%3+%8]
    293         lddqu           xmm2,       XMMWORD PTR [%4+%8]
    294         lddqu           xmm3,       XMMWORD PTR [%5+%8]
    295 
    296         psadbw          xmm1,       xmm0
    297         psadbw          xmm2,       xmm0
    298         psadbw          xmm3,       xmm0
    299 
    300         paddw           xmm4,       xmm1
    301         lddqu           xmm1,       XMMWORD PTR [%6+%8]
    302         paddw           xmm5,       xmm2
    303         paddw           xmm6,       xmm3
    304 
    305 %if %1==0 || %1==1
        ; Advance all five pointers by two rows, after the last load that uses
        ; them. Skipped on the last call (stage 2).
    306         lea             %2,         [%2+%7*2]
    307         lea             %3,         [%3+%8*2]
    308 
    309         lea             %4,         [%4+%8*2]
    310         lea             %5,         [%5+%8*2]
    311 
    312         lea             %6,         [%6+%8*2]
    313 %endif
    314         psadbw          xmm1,       xmm0
    315         paddw           xmm7,       xmm1
    316 
    317 %endmacro
    318 
    319 %macro PROCESS_8X2X4 8
        ; 8-byte-wide (MMX register) counterpart of PROCESS_16X2X4: accumulates
        ; sums of absolute differences for two 8-byte source rows against four
        ; independent reference blocks.
        ;   %1      stage: 0 = first call (initialises the accumulators),
        ;                  1 = middle call, 2 = last call (pointers not advanced)
        ;   %2      source pointer register
        ;   %3-%6   the four reference pointer registers
        ;   %7      source stride register
        ;   %8      reference stride register (shared by all four references)
        ; Accumulators: mm4..mm7, one per reference in the order %3..%6.
        ; Scratch:      mm0-mm3; mm1 is reused for the fourth reference.
        ; For stages 0 and 1, %2-%6 are advanced by two rows.
    320 %if %1==0
        ; First row of the first call initialises the accumulators directly.
    321         movq            mm0,        QWORD PTR [%2]
    322         movq            mm4,        QWORD PTR [%3]
    323         movq            mm5,        QWORD PTR [%4]
    324         movq            mm6,        QWORD PTR [%5]
    325         movq            mm7,        QWORD PTR [%6]
    326 
    327         psadbw          mm4,        mm0
    328         psadbw          mm5,        mm0
    329         psadbw          mm6,        mm0
    330         psadbw          mm7,        mm0
    331 %else
        ; First row of a later call: three references go through mm1-mm3, then
        ; mm1 is reloaded for the fourth once its sum has been added.
    332         movq            mm0,        QWORD PTR [%2]
    333         movq            mm1,        QWORD PTR [%3]
    334         movq            mm2,        QWORD PTR [%4]
    335         movq            mm3,        QWORD PTR [%5]
    336 
    337         psadbw          mm1,        mm0
    338         psadbw          mm2,        mm0
    339         psadbw          mm3,        mm0
    340 
    341         paddw           mm4,        mm1
    342         movq            mm1,        QWORD PTR [%6]
    343         paddw           mm5,        mm2
    344         paddw           mm6,        mm3
    345 
    346         psadbw          mm1,        mm0
    347         paddw           mm7,        mm1
    348 %endif
        ; Second row: same pattern, one stride further on.
    349         movq            mm0,        QWORD PTR [%2+%7]
    350         movq            mm1,        QWORD PTR [%3+%8]
    351         movq            mm2,        QWORD PTR [%4+%8]
    352         movq            mm3,        QWORD PTR [%5+%8]
    353 
    354         psadbw          mm1,        mm0
    355         psadbw          mm2,        mm0
    356         psadbw          mm3,        mm0
    357 
    358         paddw           mm4,        mm1
    359         movq            mm1,        QWORD PTR [%6+%8]
    360         paddw           mm5,        mm2
    361         paddw           mm6,        mm3
    362 
    363 %if %1==0 || %1==1
        ; Advance all five pointers by two rows, after the last load that uses
        ; them. Skipped on the last call (stage 2).
    364         lea             %2,         [%2+%7*2]
    365         lea             %3,         [%3+%8*2]
    366 
    367         lea             %4,         [%4+%8*2]
    368         lea             %5,         [%5+%8*2]
    369 
    370         lea             %6,         [%6+%8*2]
    371 %endif
    372         psadbw          mm1,        mm0
    373         paddw           mm7,        mm1
    374 
    375 %endmacro
    376 
    377 ;void vp8_sad16x16x3_sse3(
    378 ;    unsigned char *src_ptr,
    379 ;    int  src_stride,
    380 ;    unsigned char *ref_ptr,
    381 ;    int  ref_stride,
    382 ;    int  *results)
        ;
        ; Sum of absolute differences of a 16x16 source block against the
        ; reference block at ref_ptr, ref_ptr+1 and ref_ptr+2.
        ; Writes three 32-bit sums to results[0], results[1], results[2].
        ; src_ptr rows are read with movdqa and must be 16-byte aligned.
    383 global sym(vp8_sad16x16x3_sse3) PRIVATE
    384 sym(vp8_sad16x16x3_sse3):
    385 
    386     STACK_FRAME_CREATE_X3
    387 
        ; Eight invocations of two rows each = 16 rows. Each psadbw lane adds at
        ; most 8*255 = 2040 per row, so the 16-bit paddw accumulation cannot
        ; overflow over 16 rows.
    388         PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    389         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    390         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    391         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    392         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    393         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    394         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    395         PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
    396 
    397         mov             rcx,        result_ptr      ; rcx = results
    398 
        ; psadbw leaves one partial sum in each 64-bit half of the register;
        ; fold the high half onto the low half before storing.
    399         movq            xmm0,       xmm5            ; low half of the offset +0 sum
    400         psrldq          xmm5,       8               ; bring the high half down
    401 
    402         paddw           xmm0,       xmm5
    403         movd            [rcx],      xmm0            ; results[0]
    404 ;-
    405         movq            xmm0,       xmm6
    406         psrldq          xmm6,       8
    407 
    408         paddw           xmm0,       xmm6
    409         movd            [rcx+4],    xmm0            ; results[1] (offset +1)
    410 ;-
    411         movq            xmm0,       xmm7
    412         psrldq          xmm7,       8
    413 
    414         paddw           xmm0,       xmm7
    415         movd            [rcx+8],    xmm0            ; results[2] (offset +2)
    416 
    417     STACK_FRAME_DESTROY_X3
    418 
    419 ;void vp8_sad16x8x3_sse3(
    420 ;    unsigned char *src_ptr,
    421 ;    int  src_stride,
    422 ;    unsigned char *ref_ptr,
    423 ;    int  ref_stride,
    424 ;    int  *results)
        ;
        ; Sum of absolute differences of a 16-wide, 8-row source block against
        ; the reference block at ref_ptr, ref_ptr+1 and ref_ptr+2.
        ; Writes three 32-bit sums to results[0], results[1], results[2].
        ; src_ptr rows are read with movdqa and must be 16-byte aligned.
    425 global sym(vp8_sad16x8x3_sse3) PRIVATE
    426 sym(vp8_sad16x8x3_sse3):
    427 
    428     STACK_FRAME_CREATE_X3
    429 
        ; Four invocations of two rows each = 8 rows.
    430         PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    431         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    432         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    433         PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
    434 
    435         mov             rcx,        result_ptr      ; rcx = results
    436 
        ; Fold the two 64-bit partial sums of each accumulator and store.
    437         movq            xmm0,       xmm5
    438         psrldq          xmm5,       8
    439 
    440         paddw           xmm0,       xmm5
    441         movd            [rcx],      xmm0            ; results[0] (offset +0)
    442 ;-
    443         movq            xmm0,       xmm6
    444         psrldq          xmm6,       8
    445 
    446         paddw           xmm0,       xmm6
    447         movd            [rcx+4],    xmm0            ; results[1] (offset +1)
    448 ;-
    449         movq            xmm0,       xmm7
    450         psrldq          xmm7,       8
    451 
    452         paddw           xmm0,       xmm7
    453         movd            [rcx+8],    xmm0            ; results[2] (offset +2)
    454 
    455     STACK_FRAME_DESTROY_X3
    456 
    457 ;void vp8_sad8x16x3_sse3(
    458 ;    unsigned char *src_ptr,
    459 ;    int  src_stride,
    460 ;    unsigned char *ref_ptr,
    461 ;    int  ref_stride,
    462 ;    int  *results)
        ;
        ; Sum of absolute differences of an 8-wide, 16-row source block against
        ; the reference block at ref_ptr, ref_ptr+1 and ref_ptr+2.
        ; Writes three 32-bit sums to results[0], results[1], results[2].
        ; Uses MMX registers mm0-mm7 and does not execute emms.
        ; NOTE(review): presumably the caller clears MMX state -- confirm.
    463 global sym(vp8_sad8x16x3_sse3) PRIVATE
    464 sym(vp8_sad8x16x3_sse3):
    465 
    466     STACK_FRAME_CREATE_X3
    467 
        ; Eight invocations of two rows each = 16 rows.
    468         PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    469         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    470         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    471         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    472         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    473         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    474         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    475         PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
    476 
    477         mov             rcx,        result_ptr      ; rcx = results
    478 
    479         punpckldq       mm5,        mm6             ; mm5 = { offset +0 sum, offset +1 sum } as two dwords
    480 
    481         movq            [rcx],      mm5             ; results[0], results[1]
    482         movd            [rcx+8],    mm7             ; results[2] (offset +2)
    483 
    484     STACK_FRAME_DESTROY_X3
    485 
    486 ;void vp8_sad8x8x3_sse3(
    487 ;    unsigned char *src_ptr,
    488 ;    int  src_stride,
    489 ;    unsigned char *ref_ptr,
    490 ;    int  ref_stride,
    491 ;    int  *results)
        ;
        ; Sum of absolute differences of an 8x8 source block against the
        ; reference block at ref_ptr, ref_ptr+1 and ref_ptr+2.
        ; Writes three 32-bit sums to results[0], results[1], results[2].
        ; Uses MMX registers mm0-mm7 and does not execute emms.
        ; NOTE(review): presumably the caller clears MMX state -- confirm.
    492 global sym(vp8_sad8x8x3_sse3) PRIVATE
    493 sym(vp8_sad8x8x3_sse3):
    494 
    495     STACK_FRAME_CREATE_X3
    496 
        ; Four invocations of two rows each = 8 rows.
    497         PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    498         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    499         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    500         PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
    501 
    502         mov             rcx,        result_ptr      ; rcx = results
    503 
    504         punpckldq       mm5,        mm6             ; mm5 = { offset +0 sum, offset +1 sum } as two dwords
    505 
    506         movq            [rcx],      mm5             ; results[0], results[1]
    507         movd            [rcx+8],    mm7             ; results[2] (offset +2)
    508 
    509     STACK_FRAME_DESTROY_X3
    510 
    511 ;void vp8_sad4x4x3_sse3(
    512 ;    unsigned char *src_ptr,
    513 ;    int  src_stride,
    514 ;    unsigned char *ref_ptr,
    515 ;    int  ref_stride,
    516 ;    int  *results)
        ;
        ; Sum of absolute differences of a 4x4 source block against the
        ; reference block at ref_ptr, ref_ptr+1 and ref_ptr+2.
        ; Writes three 32-bit sums to results[0], results[1], results[2].
        ; Two 4-byte rows are interleaved into one 8-byte MMX register so that a
        ; single psadbw covers two rows; source and reference are interleaved
        ; the same way, so matching bytes still line up.
        ; Uses MMX registers mm0-mm7 and does not execute emms.
        ; NOTE(review): presumably the caller clears MMX state -- confirm.
    517 global sym(vp8_sad4x4x3_sse3) PRIVATE
    518 sym(vp8_sad4x4x3_sse3):
    519 
    520     STACK_FRAME_CREATE_X3
    521 
        ; ---- rows 0 and 1 ----
    522         movd            mm0,        DWORD PTR [src_ptr]
    523         movd            mm1,        DWORD PTR [ref_ptr]
    524 
    525         movd            mm2,        DWORD PTR [src_ptr+src_stride]
    526         movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
    527 
    528         punpcklbw       mm0,        mm2             ; mm0 = source rows 0/1 interleaved
    529         punpcklbw       mm1,        mm3             ; mm1 = reference rows 0/1, offset +0
    530 
    531         movd            mm4,        DWORD PTR [ref_ptr+1]
    532         movd            mm5,        DWORD PTR [ref_ptr+2]
    533 
    534         movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
    535         movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
    536 
    537         psadbw          mm1,        mm0             ; mm1 = rows 0-1 sum, offset +0
    538 
    539         punpcklbw       mm4,        mm2             ; reference rows 0/1, offset +1
    540         punpcklbw       mm5,        mm3             ; reference rows 0/1, offset +2
    541 
    542         psadbw          mm4,        mm0             ; mm4 = rows 0-1 sum, offset +1
    543         psadbw          mm5,        mm0             ; mm5 = rows 0-1 sum, offset +2
    544 
        ; ---- rows 2 and 3 ----
    545         lea             src_ptr,    [src_ptr+src_stride*2]
    546         lea             ref_ptr,    [ref_ptr+ref_stride*2]
    547 
    548         movd            mm0,        DWORD PTR [src_ptr]
    549         movd            mm2,        DWORD PTR [ref_ptr]
    550 
    551         movd            mm3,        DWORD PTR [src_ptr+src_stride]
    552         movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
    553 
    554         punpcklbw       mm0,        mm3             ; mm0 = source rows 2/3 interleaved
    555         punpcklbw       mm2,        mm6             ; reference rows 2/3, offset +0
    556 
    557         movd            mm3,        DWORD PTR [ref_ptr+1]
    558         movd            mm7,        DWORD PTR [ref_ptr+2]
    559 
    560         psadbw          mm2,        mm0
    561 
    562         paddw           mm1,        mm2             ; mm1 = full 4x4 sum, offset +0
    563 
    564         movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
    565         movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
    566 
    567         punpcklbw       mm3,        mm2             ; reference rows 2/3, offset +1
    568         punpcklbw       mm7,        mm6             ; reference rows 2/3, offset +2
    569 
    570         psadbw          mm3,        mm0
    571         psadbw          mm7,        mm0
    572 
    573         paddw           mm3,        mm4             ; mm3 = full 4x4 sum, offset +1
    574         paddw           mm7,        mm5             ; mm7 = full 4x4 sum, offset +2
    575 
    576         mov             rcx,        result_ptr      ; rcx = results
    577 
    578         punpckldq       mm1,        mm3             ; mm1 = { offset +0 sum, offset +1 sum } as two dwords
    579 
    580         movq            [rcx],      mm1             ; results[0], results[1]
    581         movd            [rcx+8],    mm7             ; results[2]
    582 
    583     STACK_FRAME_DESTROY_X3
    584 
    585 ;unsigned int vp8_sad16x16_sse3(
    586 ;    unsigned char *src_ptr,
    587 ;    int  src_stride,
    588 ;    unsigned char *ref_ptr,
    589 ;    int  ref_stride,
    590 ;    int  max_sad)
        ;
        ; Returns (in rax) the sum of absolute differences between a 16x16
        ; source block and a 16x16 reference block.
        ; src_ptr rows are read with movdqa and must be 16-byte aligned;
        ; ref_ptr rows are read with movdqu and may be unaligned.
        ; max_sad is accepted but never read by this routine.
    591 ;%define lddqu movdqu
    592 global sym(vp8_sad16x16_sse3) PRIVATE
    593 sym(vp8_sad16x16_sse3):
    594 
    595     STACK_FRAME_CREATE_X3
    596 
    597         mov             end_ptr,    4               ; loop counter: 4 iterations of 4 rows each
    598         pxor            xmm7,        xmm7           ; xmm7 = running sum
    599 
    600 .vp8_sad16x16_sse3_loop:
        ; Rows n and n+1.
    601         movdqa          xmm0,       XMMWORD PTR [src_ptr]
    602         movdqu          xmm1,       XMMWORD PTR [ref_ptr]
    603         movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
    604         movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]
    605 
    606         lea             src_ptr,    [src_ptr+src_stride*2]
    607         lea             ref_ptr,    [ref_ptr+ref_stride*2]
    608 
        ; Rows n+2 and n+3; the loads are interleaved with the psadbw work.
    609         movdqa          xmm4,       XMMWORD PTR [src_ptr]
    610         movdqu          xmm5,       XMMWORD PTR [ref_ptr]
    611         movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]
    612 
    613         psadbw          xmm0,       xmm1
    614 
    615         movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]   ; xmm1 is free again after the psadbw above
    616 
    617         psadbw          xmm2,       xmm3
    618         psadbw          xmm4,       xmm5
    619         psadbw          xmm6,       xmm1
    620 
    621         lea             src_ptr,    [src_ptr+src_stride*2]
    622         lea             ref_ptr,    [ref_ptr+ref_stride*2]
    623 
    624         paddw           xmm7,        xmm0
    625         paddw           xmm7,        xmm2
    626         paddw           xmm7,        xmm4
    627         paddw           xmm7,        xmm6
    628 
    629         sub             end_ptr,     1
    630         jne             .vp8_sad16x16_sse3_loop
    631 
        ; Fold the two 64-bit partial sums and return the total.
    632         movq            xmm0,       xmm7
    633         psrldq          xmm7,       8
    634         paddw           xmm0,       xmm7
    635         movq            rax,        xmm0            ; return value
    636 
    637     STACK_FRAME_DESTROY_X3
    638 
    639 ;void vp8_copy32xn_sse3(
    640 ;    unsigned char *src_ptr,
    641 ;    int  src_stride,
    642 ;    unsigned char *dst_ptr,
    643 ;    int  dst_stride,
    644 ;    int height);
        ;
        ; Copies `height` rows of 32 bytes from src_ptr to dst_ptr.
        ; Source rows are read with movdqu (any alignment). Destination rows are
        ; written with movdqa, so dst_ptr and dst_stride must keep every row
        ; 16-byte aligned.
        ; Rows are copied four at a time while at least four remain, then one at
        ; a time. A height below 4 goes straight to the single-row loop, and a
        ; height of zero or less copies nothing.
        ; Inside the body the destination is referred to as ref_ptr/ref_stride,
        ; the names bound by STACK_FRAME_CREATE_X3.
    645 global sym(vp8_copy32xn_sse3) PRIVATE
    646 sym(vp8_copy32xn_sse3):
    647 
    648     STACK_FRAME_CREATE_X3
    649 
                ; Do not enter the 4-row loop unless four rows are available;
                ; its body copies four rows before it tests the counter.
                cmp             height,     4
                jl              .block_copy_sse3_tail

    650 .block_copy_sse3_loopx4:
    651         lea             end_ptr,    [src_ptr+src_stride*2]     ; end_ptr = third source row
    652 
    653         movdqu          xmm0,       XMMWORD PTR [src_ptr]
    654         movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
    655         movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
    656         movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
    657         movdqu          xmm4,       XMMWORD PTR [end_ptr]
    658         movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
    659         movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
    660         movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]
    661 
    662         lea             src_ptr,    [src_ptr+src_stride*4]
    663 
    664         lea             end_ptr,    [ref_ptr+ref_stride*2]     ; end_ptr = third destination row
    665 
    666         movdqa          XMMWORD PTR [ref_ptr], xmm0
    667         movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
    668         movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
    669         movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
    670         movdqa          XMMWORD PTR [end_ptr], xmm4
    671         movdqa          XMMWORD PTR [end_ptr + 16], xmm5
    672         movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
    673         movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
    674 
    675         lea             ref_ptr,    [ref_ptr+ref_stride*4]
    676 
    677         sub             height,     4
    678         cmp             height,     4
    679         jge             .block_copy_sse3_loopx4
    680 
        .block_copy_sse3_tail:
    681         ;Check whether any rows (fewer than four) remain to be copied.
    682         cmp             height, 0
    683         jle             .copy_is_done
    684 
    685 .block_copy_sse3_loop:
    686         movdqu          xmm0,       XMMWORD PTR [src_ptr]
    687         movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
    688         lea             src_ptr,    [src_ptr+src_stride]
    689 
    690         movdqa          XMMWORD PTR [ref_ptr], xmm0
    691         movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
    692         lea             ref_ptr,    [ref_ptr+ref_stride]
    693 
    694         sub             height,     1
    695         jne             .block_copy_sse3_loop
    696 
    697 .copy_is_done:
    698     STACK_FRAME_DESTROY_X3
    699 
    700 ;void vp8_sad16x16x4d_sse3(
    701 ;    unsigned char *src_ptr,
    702 ;    int  src_stride,
    703 ;    unsigned char *ref_ptr_base,
    704 ;    int  ref_stride,
    705 ;    int  *results)
        ;
        ; Sum of absolute differences of a 16x16 source block against four
        ; reference blocks. ref_ptr_base is the address of an array of four
        ; reference pointers (loaded by STACK_FRAME_CREATE_X4); all four share
        ; ref_stride. Writes four 32-bit sums to results[0..3] in array order.
        ; src_ptr rows are read with movdqa and must be 16-byte aligned.
    706 global sym(vp8_sad16x16x4d_sse3) PRIVATE
    707 sym(vp8_sad16x16x4d_sse3):
    708 
    709     STACK_FRAME_CREATE_X4
    710 
        ; Eight invocations of two rows each = 16 rows.
    711         PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    712         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    713         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    714         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    715         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    716         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    717         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    718         PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    719 
    720 %if ABI_IS_32BIT
        ; rbp held ref_stride during the loop; restore the frame pointer that
        ; STACK_FRAME_CREATE_X4 pushed so result_ptr (arg(4)) can be read.
    721         pop             rbp
    722 %endif
    723         mov             rcx,        result_ptr      ; rcx = results
    724 
        ; Fold the two 64-bit partial sums of each accumulator and store.
    725         movq            xmm0,       xmm4
    726         psrldq          xmm4,       8
    727 
    728         paddw           xmm0,       xmm4
    729         movd            [rcx],      xmm0            ; results[0]
    730 ;-
    731         movq            xmm0,       xmm5
    732         psrldq          xmm5,       8
    733 
    734         paddw           xmm0,       xmm5
    735         movd            [rcx+4],    xmm0            ; results[1]
    736 ;-
    737         movq            xmm0,       xmm6
    738         psrldq          xmm6,       8
    739 
    740         paddw           xmm0,       xmm6
    741         movd            [rcx+8],    xmm0            ; results[2]
    742 ;-
    743         movq            xmm0,       xmm7
    744         psrldq          xmm7,       8
    745 
    746         paddw           xmm0,       xmm7
    747         movd            [rcx+12],   xmm0            ; results[3]
    748 
    749     STACK_FRAME_DESTROY_X4
    750 
    751 ;void vp8_sad16x8x4d_sse3(
    752 ;    unsigned char *src_ptr,
    753 ;    int  src_stride,
    754 ;    unsigned char *ref_ptr_base,
    755 ;    int  ref_stride,
    756 ;    int  *results)
        ;
        ; Sum of absolute differences of a 16-wide, 8-row source block against
        ; four reference blocks. ref_ptr_base is the address of an array of four
        ; reference pointers (loaded by STACK_FRAME_CREATE_X4); all four share
        ; ref_stride. Writes four 32-bit sums to results[0..3] in array order.
        ; src_ptr rows are read with movdqa and must be 16-byte aligned.
    757 global sym(vp8_sad16x8x4d_sse3) PRIVATE
    758 sym(vp8_sad16x8x4d_sse3):
    759 
    760     STACK_FRAME_CREATE_X4
    761 
        ; Four invocations of two rows each = 8 rows.
    762         PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    763         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    764         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    765         PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    766 
    767 %if ABI_IS_32BIT
        ; rbp held ref_stride during the loop; restore the frame pointer that
        ; STACK_FRAME_CREATE_X4 pushed so result_ptr (arg(4)) can be read.
    768         pop             rbp
    769 %endif
    770         mov             rcx,        result_ptr      ; rcx = results
    771 
        ; Fold the two 64-bit partial sums of each accumulator and store.
    772         movq            xmm0,       xmm4
    773         psrldq          xmm4,       8
    774 
    775         paddw           xmm0,       xmm4
    776         movd            [rcx],      xmm0            ; results[0]
    777 ;-
    778         movq            xmm0,       xmm5
    779         psrldq          xmm5,       8
    780 
    781         paddw           xmm0,       xmm5
    782         movd            [rcx+4],    xmm0            ; results[1]
    783 ;-
    784         movq            xmm0,       xmm6
    785         psrldq          xmm6,       8
    786 
    787         paddw           xmm0,       xmm6
    788         movd            [rcx+8],    xmm0            ; results[2]
    789 ;-
    790         movq            xmm0,       xmm7
    791         psrldq          xmm7,       8
    792 
    793         paddw           xmm0,       xmm7
    794         movd            [rcx+12],   xmm0            ; results[3]
    795 
    796     STACK_FRAME_DESTROY_X4
    797 
    798 ;void vp8_sad8x16x4d_sse3(
    799 ;    unsigned char *src_ptr,
    800 ;    int  src_stride,
    801 ;    unsigned char *ref_ptr,
    802 ;    int  ref_stride,
    803 ;    int  *results)
        ;
        ; Sum of absolute differences of an 8-wide, 16-row source block against
        ; four reference blocks. The third argument is used as the address of an
        ; array of four reference pointers (loaded by STACK_FRAME_CREATE_X4);
        ; all four share ref_stride.
        ; Writes four 32-bit sums to results[0..3] in array order.
        ; Uses MMX registers mm0-mm7 and does not execute emms.
        ; NOTE(review): presumably the caller clears MMX state -- confirm.
    804 global sym(vp8_sad8x16x4d_sse3) PRIVATE
    805 sym(vp8_sad8x16x4d_sse3):
    806 
    807     STACK_FRAME_CREATE_X4
    808 
        ; Eight invocations of two rows each = 16 rows.
    809         PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    810         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    811         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    812         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    813         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    814         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    815         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    816         PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    817 
    818 %if ABI_IS_32BIT
        ; rbp held ref_stride during the loop; restore the frame pointer that
        ; STACK_FRAME_CREATE_X4 pushed so result_ptr (arg(4)) can be read.
    819         pop             rbp
    820 %endif
    821         mov             rcx,        result_ptr      ; rcx = results
    822 
    823         punpckldq       mm4,        mm5             ; mm4 = { sum 0, sum 1 } as two dwords
    824         punpckldq       mm6,        mm7             ; mm6 = { sum 2, sum 3 } as two dwords
    825 
    826         movq            [rcx],      mm4             ; results[0], results[1]
    827         movq            [rcx+8],    mm6             ; results[2], results[3]
    828 
    829     STACK_FRAME_DESTROY_X4
    830 
    ;void vp8_sad8x8x4d_sse3(
    ;    unsigned char *src_ptr,
    ;    int  src_stride,
    ;    unsigned char *ref_ptr,
    ;    int  ref_stride,
    ;    int  *results)
    ;
    ; Runs four PROCESS_8X2X4 steps (one "first", two "middle", one "last"
    ; variant) and then writes the low dwords of mm4, mm5, mm6 and mm7 to
    ; results[0], results[1], results[2] and results[3].
    ; NOTE(review): PROCESS_8X2X4 and the STACK_FRAME_*_X4 macros are defined
    ; elsewhere in this file; presumably each step covers two 8-pixel rows and
    ; mm4-mm7 carry one running SAD per reference block -- confirm there.
    global sym(vp8_sad8x8x4d_sse3) PRIVATE
    sym(vp8_sad8x8x4d_sse3):

        STACK_FRAME_CREATE_X4

            ; first step (variant 0)
            PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

            ; two identical middle steps (variant 1)
    %rep 2
            PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    %endrep

            ; last step (variant 2)
            PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

            ; 32-bit builds restore rbp before result_ptr is read (presumably
            ; because result_ptr is addressed through the frame pointer there).
    %if ABI_IS_32BIT
            pop             rbp
    %endif
            mov             rcx,        result_ptr

            ; pack {mm4, mm5} into one qword -> results[0], results[1]
            punpckldq       mm4,        mm5
            movq            [rcx],      mm4

            ; pack {mm6, mm7} into one qword -> results[2], results[3]
            punpckldq       mm6,        mm7
            movq            [rcx+8],    mm6

        STACK_FRAME_DESTROY_X4
    859 
    ;void vp8_sad4x4x4d_sse3(
    ;    unsigned char *src_ptr,
    ;    int  src_stride,
    ;    unsigned char *ref_ptr,
    ;    int  ref_stride,
    ;    int  *results)
    ;
    ; Computes four 4x4 SADs of one source block against the blocks addressed
    ; by r0_ptr..r3_ptr and stores them to results[0..3]. Two 4-byte rows are
    ; interleaved into a single 8-byte MMX register so that one psadbw covers
    ; a pair of rows; the block is therefore handled as rows 0-1, then 2-3.
    ; NOTE(review): r0_ptr..r3_ptr, the strides and result_ptr are set up by
    ; STACK_FRAME_CREATE_X4, which is defined elsewhere in this file.
    global sym(vp8_sad4x4x4d_sse3) PRIVATE
    sym(vp8_sad4x4x4d_sse3):

        STACK_FRAME_CREATE_X4

            ; ---- rows 0-1 --------------------------------------------------
            ; mm0 = source rows 0 and 1, byte-interleaved
            movd            mm0,        DWORD PTR [src_ptr]
            movd            mm2,        DWORD PTR [src_ptr+src_stride]
            punpcklbw       mm0,        mm2

            ; mm1 = partial SAD against reference 0
            movd            mm1,        DWORD PTR [r0_ptr]
            movd            mm3,        DWORD PTR [r0_ptr+ref_stride]
            punpcklbw       mm1,        mm3
            psadbw          mm1,        mm0

            ; mm4 = partial SAD against reference 1
            movd            mm4,        DWORD PTR [r1_ptr]
            movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
            punpcklbw       mm4,        mm2
            psadbw          mm4,        mm0

            ; mm5 = partial SAD against reference 2
            movd            mm5,        DWORD PTR [r2_ptr]
            movd            mm3,        DWORD PTR [r2_ptr+ref_stride]
            punpcklbw       mm5,        mm3
            psadbw          mm5,        mm0

            ; mm6 = partial SAD against reference 3
            movd            mm6,        DWORD PTR [r3_ptr]
            movd            mm7,        DWORD PTR [r3_ptr+ref_stride]
            punpcklbw       mm6,        mm7
            psadbw          mm6,        mm0

            ; ---- step every pointer down two rows --------------------------
            lea             src_ptr,    [src_ptr+src_stride*2]
            lea             r0_ptr,     [r0_ptr+ref_stride*2]
            lea             r1_ptr,     [r1_ptr+ref_stride*2]
            lea             r2_ptr,     [r2_ptr+ref_stride*2]
            lea             r3_ptr,     [r3_ptr+ref_stride*2]

            ; ---- rows 2-3 --------------------------------------------------
            ; mm0 = source rows 2 and 3, byte-interleaved
            movd            mm0,        DWORD PTR [src_ptr]
            movd            mm3,        DWORD PTR [src_ptr+src_stride]
            punpcklbw       mm0,        mm3

            ; reference 0: finish the total in mm1
            movd            mm2,        DWORD PTR [r0_ptr]
            movd            mm7,        DWORD PTR [r0_ptr+ref_stride]
            punpcklbw       mm2,        mm7
            psadbw          mm2,        mm0
            paddw           mm1,        mm2

            ; row 2 of references 1 and 2 is fetched before anything is stored
            movd            mm3,        DWORD PTR [r1_ptr]
            movd            mm7,        DWORD PTR [r2_ptr]

            ; 32-bit only: the stride is copied from rbp into rax so that rbp
            ; can be restored, and ref_stride is re-pointed at rax for the
            ; remaining loads (src_stride is no longer needed past this point).
    %if ABI_IS_32BIT
            mov             rax,        rbp

            pop             rbp
    %define     ref_stride    rax
    %endif
            ; rsi becomes the results pointer from here on.
            ; NOTE(review): rsi may alias src_ptr, src_stride or r0_ptr
            ; depending on the ABI (defines not visible here); none of those
            ; three is referenced below this line.
            mov             rsi,        result_ptr

            movd            [rsi],      mm1             ; results[0]

            ; reference 1: row 3, then total in mm3
            movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
            punpcklbw       mm3,        mm2
            psadbw          mm3,        mm0
            paddw           mm3,        mm4

            ; reference 2: row 3, then total in mm7
            movd            mm1,        DWORD PTR [r2_ptr+ref_stride]
            punpcklbw       mm7,        mm1
            psadbw          mm7,        mm0
            paddw           mm7,        mm5

            ; reference 3: rows 2 and 3, then total in mm2
            movd            mm2,        DWORD PTR [r3_ptr]
            movd            mm1,        DWORD PTR [r3_ptr+ref_stride]
            punpcklbw       mm2,        mm1
            psadbw          mm2,        mm0
            paddw           mm2,        mm6

            movd            [rsi+4],    mm3             ; results[1]
            movd            [rsi+8],    mm7             ; results[2]
            movd            [rsi+12],   mm2             ; results[3]

        STACK_FRAME_DESTROY_X4
    960 
    961