Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 ;unsigned int vp9_get_mb_ss_sse2
     15 ;(
     16 ;    short *src_ptr     ; 256 consecutive 16-bit values, read with aligned 16-byte loads (movdqa)
     17 ;)
     18 global sym(vp9_get_mb_ss_sse2) PRIVATE
     19 sym(vp9_get_mb_ss_sse2):
     20     push        rbp
     21     mov         rbp, rsp
     22     SHADOW_ARGS_TO_STACK 1
     23     GET_GOT     rbx
     24     push rsi
     25     push rdi
     26     sub         rsp, 16
     27     ; end prolog
     28     ; Computes the sum of squares of the 256 words at src_ptr: 8 iterations of
     29     ; 64 bytes each, kept as four dword partial sums in xmm4 until the final fold.
     30         mov         rax, arg(0) ;[src_ptr]
     31         mov         rcx, 8                  ; iterations remaining
     32         pxor        xmm4, xmm4              ; clear the four dword accumulators
     33 
     34 .NEXTROW:
     35         movdqa      xmm0, [rax]
     36         movdqa      xmm1, [rax+16]
     37         movdqa      xmm2, [rax+32]
     38         movdqa      xmm3, [rax+48]
     39         pmaddwd     xmm0, xmm0              ; square each word and add adjacent pairs into dwords
     40         pmaddwd     xmm1, xmm1
     41         pmaddwd     xmm2, xmm2
     42         pmaddwd     xmm3, xmm3
     43 
     44         paddd       xmm0, xmm1
     45         paddd       xmm2, xmm3
     46         paddd       xmm4, xmm0
     47         paddd       xmm4, xmm2
     48 
     49         add         rax, 0x40               ; advance 64 bytes
     50         sub         rcx, 1
     51         jnz         .NEXTROW                ; loop control depends only on the counter reaching zero
     52 
     53         movdqa      xmm3,xmm4               ; fold the four dword partial sums:
     54         psrldq      xmm4,8
     55         paddd       xmm4,xmm3               ;   upper qword added onto the lower qword
     56         movdqa      xmm3,xmm4
     57         psrldq      xmm4,4
     58         paddd       xmm4,xmm3               ;   low dword of xmm4 = total
     59         movq        rax,xmm4                ; return value (declared unsigned int, i.e. the low dword)
     60 
     61 
     62     ; begin epilog
     63     add rsp, 16
     64     pop rdi
     65     pop rsi
     66     RESTORE_GOT
     67     UNSHADOW_ARGS
     68     pop         rbp
     69     ret
     70 
     71 
     72 ;unsigned int vp9_get16x16var_sse2
     73 ;(
     74 ;    unsigned char   *  src_ptr,
     75 ;    int             source_stride,
     76 ;    unsigned char   *  ref_ptr,
     77 ;    int             recon_stride,
     78 ;    unsigned int    *  SSE,           ; out: sum of squared (src - ref) differences
     79 ;    int             *  Sum            ; out: sum of (src - ref) differences
     80 ;)
     81 global sym(vp9_get16x16var_sse2) PRIVATE
     82 sym(vp9_get16x16var_sse2):
     83     push        rbp
     84     mov         rbp, rsp
     85     SHADOW_ARGS_TO_STACK 6
     86     SAVE_XMM 7
     87     push rbx
     88     push rsi
     89     push rdi
     90     ; end prolog
     91     ; Walks 16 rows of 16 bytes in src and ref, accumulating the byte differences and their squares.
     92         mov         rsi,            arg(0) ;[src_ptr]
     93         mov         rdi,            arg(2) ;[ref_ptr]
     94 
     95         movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
     96         movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
     97 
     98         ; Prefetch rows 0-7 of src, then rows 0-7 of ref
     99         lea             rcx,    [rax+rax*2]             ; rcx = 3 * source_stride
    100         prefetcht0      [rsi]
    101         prefetcht0      [rsi+rax]
    102         prefetcht0      [rsi+rax*2]
    103         prefetcht0      [rsi+rcx]
    104         lea             rbx,    [rsi+rax*4]             ; rbx = src + 4 * source_stride
    105         prefetcht0      [rbx]
    106         prefetcht0      [rbx+rax]
    107         prefetcht0      [rbx+rax*2]
    108         prefetcht0      [rbx+rcx]
    109 
    110         lea             rcx,    [rdx+rdx*2]             ; rcx = 3 * recon_stride
    111         prefetcht0      [rdi]
    112         prefetcht0      [rdi+rdx]
    113         prefetcht0      [rdi+rdx*2]
    114         prefetcht0      [rdi+rcx]
    115         lea             rbx,    [rdi+rdx*4]             ; rbx = ref + 4 * recon_stride
    116         prefetcht0      [rbx]
    117         prefetcht0      [rbx+rdx]
    118         prefetcht0      [rbx+rdx*2]
    119         prefetcht0      [rbx+rcx]
    120 
    121         pxor        xmm0,           xmm0                        ; xmm0 = 0, used to zero-extend bytes and dwords
    122         pxor        xmm7,           xmm7                        ; xmm7 = eight word accumulators for the differences
    123 
    124         pxor        xmm6,           xmm6                        ; xmm6 = four dword accumulators for the squared differences
    125         mov         rcx,            16                          ; rows remaining
    126 
    127 .var16loop:
    128         movdqu      xmm1,           XMMWORD PTR [rsi]           ; 16 src bytes (unaligned load)
    129         movdqu      xmm2,           XMMWORD PTR [rdi]           ; 16 ref bytes (unaligned load)
    130 
    131         prefetcht0      [rsi+rax*8]                             ; prefetch 8 rows ahead of the current one
    132         prefetcht0      [rdi+rdx*8]
    133 
    134         movdqa      xmm3,           xmm1                        ; copies used for the upper 8 bytes
    135         movdqa      xmm4,           xmm2
    136 
    137 
    138         punpcklbw   xmm1,           xmm0                        ; src bytes 0-7  -> words
    139         punpckhbw   xmm3,           xmm0                        ; src bytes 8-15 -> words
    140 
    141         punpcklbw   xmm2,           xmm0                        ; ref bytes 0-7  -> words
    142         punpckhbw   xmm4,           xmm0                        ; ref bytes 8-15 -> words
    143 
    144 
    145         psubw       xmm1,           xmm2                        ; src - ref, low half
    146         psubw       xmm3,           xmm4                        ; src - ref, high half
    147 
    148         paddw       xmm7,           xmm1                        ; accumulate differences before xmm1 is squared
    149         pmaddwd     xmm1,           xmm1                        ; square and add adjacent pairs -> dwords
    150 
    151         paddw       xmm7,           xmm3
    152         pmaddwd     xmm3,           xmm3
    153 
    154         paddd       xmm6,           xmm1                        ; accumulate squared differences
    155         paddd       xmm6,           xmm3
    156 
    157         add         rsi,            rax                         ; next src row
    158         add         rdi,            rdx                         ; next ref row
    159 
    160         sub         rcx,            1
    161         jnz         .var16loop
    162 
    163         ; Horizontal reduction of both accumulators (the two reductions are interleaved)
    164         movdqa      xmm1,           xmm6                        ; xmm1 = squared-difference dwords
    165         pxor        xmm6,           xmm6
    166 
    167         pxor        xmm5,           xmm5
    168         punpcklwd   xmm6,           xmm7                        ; difference words 0-3 into the upper half of each dword
    169 
    170         punpckhwd   xmm5,           xmm7                        ; difference words 4-7 likewise
    171         psrad       xmm5,           16                          ; arithmetic shift sign-extends each word to a dword
    172 
    173         psrad       xmm6,           16
    174         paddd       xmm6,           xmm5                        ; four dword difference sums
    175 
    176         movdqa      xmm2,           xmm1
    177         punpckldq   xmm1,           xmm0                        ; squared sums: dwords 0,1 interleaved with zero
    178 
    179         punpckhdq   xmm2,           xmm0                        ; squared sums: dwords 2,3 interleaved with zero
    180         movdqa      xmm7,           xmm6
    181 
    182         paddd       xmm1,           xmm2                        ; two squared partial sums remain (dwords 0 and 2)
    183         punpckldq   xmm6,           xmm0
    184 
    185         punpckhdq   xmm7,           xmm0
    186         paddd       xmm6,           xmm7                        ; two difference partial sums remain (dwords 0 and 2)
    187 
    188         movdqa      xmm2,           xmm1
    189         movdqa      xmm7,           xmm6
    190 
    191         psrldq      xmm1,           8                           ; bring the upper qword down
    192         psrldq      xmm6,           8
    193 
    194         paddd       xmm7,           xmm6                        ; low dword of xmm7 = sum of differences
    195         paddd       xmm1,           xmm2                        ; low dword of xmm1 = sum of squared differences
    196 
    197         mov         rax,            arg(5) ;[Sum]
    198         mov         rdi,            arg(4) ;[SSE]
    199 
    200         movd DWORD PTR [rax],       xmm7                        ; *Sum
    201         movd DWORD PTR [rdi],       xmm1                        ; *SSE
    202         ; NOTE(review): prototype says unsigned int, but no result is loaded into rax here (it holds the Sum pointer) - confirm callers ignore it
    203 
    204     ; begin epilog
    205     pop rdi
    206     pop rsi
    207     pop rbx
    208     RESTORE_XMM
    209     UNSHADOW_ARGS
    210     pop         rbp
    211     ret
    212 
    213 
    214 
    215 
    216 ;unsigned int vp9_get8x8var_sse2
    217 ;(
    218 ;    unsigned char   *  src_ptr,
    219 ;    int             source_stride,
    220 ;    unsigned char   *  ref_ptr,
    221 ;    int             recon_stride,
    222 ;    unsigned int    *  SSE,           ; out: sum of squared (src - ref) differences
    223 ;    int             *  Sum            ; out: sum of (src - ref) differences
    224 ;)
    225 global sym(vp9_get8x8var_sse2) PRIVATE
    226 sym(vp9_get8x8var_sse2):
    227     push        rbp
    228     mov         rbp, rsp
    229     SHADOW_ARGS_TO_STACK 6
    230     SAVE_XMM 7
    231     GET_GOT     rbx
    232     push rsi
    233     push rdi
    234     sub         rsp, 16
    235     ; end prolog
    236     ; Fully unrolled over 8 rows of 8 bytes: xmm7 collects the word differences, xmm1 the squared differences.
    237         mov         rsi,            arg(0) ;[src_ptr]
    238         mov         rdi,            arg(2) ;[ref_ptr]
    239 
    240         movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
    241         movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
    242 
    243         pxor        xmm0,           xmm0                        ; xmm0 = 0, used for every zero-extension below
    244         pxor        xmm7,           xmm7                        ; xmm7 = eight word accumulators for the differences
    245         ; row 0 (its squares initialise xmm1, the squared-difference accumulator)
    246         movq        xmm1,           QWORD PTR [rsi]
    247         movq        xmm2,           QWORD PTR [rdi]
    248 
    249         punpcklbw   xmm1,           xmm0                        ; bytes -> words
    250         punpcklbw   xmm2,           xmm0
    251 
    252         psubsw      xmm1,           xmm2                        ; src - ref (signed saturating word subtract)
    253         paddw       xmm7,           xmm1
    254 
    255         pmaddwd     xmm1,           xmm1                        ; square and add adjacent pairs -> dwords
    256         ; row 1
    257         movq        xmm2,           QWORD PTR[rsi + rax]
    258         movq        xmm3,           QWORD PTR[rdi + rdx]
    259 
    260         punpcklbw   xmm2,           xmm0
    261         punpcklbw   xmm3,           xmm0
    262 
    263         psubsw      xmm2,           xmm3
    264         paddw       xmm7,           xmm2
    265 
    266         pmaddwd     xmm2,           xmm2
    267         paddd       xmm1,           xmm2
    268 
    269         ; row 2
    270         movq        xmm2,           QWORD PTR[rsi + rax * 2]
    271         movq        xmm3,           QWORD PTR[rdi + rdx * 2]
    272 
    273         punpcklbw   xmm2,           xmm0
    274         punpcklbw   xmm3,           xmm0
    275 
    276         psubsw      xmm2,           xmm3
    277         paddw       xmm7,           xmm2
    278 
    279         pmaddwd     xmm2,           xmm2
    280         paddd       xmm1,           xmm2
    281 
    282         ; advance both pointers to row 2, then handle rows 3 and 4
    283         lea         rsi,            [rsi + rax * 2]
    284         lea         rdi,            [rdi + rdx * 2]
    285         movq        xmm2,           QWORD PTR[rsi + rax]
    286         movq        xmm3,           QWORD PTR[rdi + rdx]
    287 
    288         punpcklbw   xmm2,           xmm0
    289         punpcklbw   xmm3,           xmm0
    290 
    291         psubsw      xmm2,           xmm3
    292         paddw       xmm7,           xmm2
    293 
    294         pmaddwd     xmm2,           xmm2
    295         paddd       xmm1,           xmm2
    296         ; row 4
    297         movq        xmm2,           QWORD PTR[rsi + rax *2]
    298         movq        xmm3,           QWORD PTR[rdi + rdx *2]
    299 
    300         punpcklbw   xmm2,           xmm0
    301         punpcklbw   xmm3,           xmm0
    302 
    303         psubsw      xmm2,           xmm3
    304         paddw       xmm7,           xmm2
    305 
    306         pmaddwd     xmm2,           xmm2
    307         paddd       xmm1,           xmm2
    308 
    309         ; advance both pointers to row 4, then handle rows 5 and 6
    310         lea         rsi,            [rsi + rax * 2]
    311         lea         rdi,            [rdi + rdx * 2]
    312 
    313 
    314         movq        xmm2,           QWORD PTR[rsi + rax]
    315         movq        xmm3,           QWORD PTR[rdi + rdx]
    316 
    317         punpcklbw   xmm2,           xmm0
    318         punpcklbw   xmm3,           xmm0
    319 
    320         psubsw      xmm2,           xmm3
    321         paddw       xmm7,           xmm2
    322 
    323         pmaddwd     xmm2,           xmm2
    324         paddd       xmm1,           xmm2
    325         ; row 6
    326         movq        xmm2,           QWORD PTR[rsi + rax *2]
    327         movq        xmm3,           QWORD PTR[rdi + rdx *2]
    328 
    329         punpcklbw   xmm2,           xmm0
    330         punpcklbw   xmm3,           xmm0
    331 
    332         psubsw      xmm2,           xmm3
    333         paddw       xmm7,           xmm2
    334 
    335         pmaddwd     xmm2,           xmm2
    336         paddd       xmm1,           xmm2
    337 
    338         ; advance both pointers to row 6, then handle row 7
    339         lea         rsi,            [rsi + rax * 2]
    340         lea         rdi,            [rdi + rdx * 2]
    341 
    342         movq        xmm2,           QWORD PTR[rsi + rax]
    343         movq        xmm3,           QWORD PTR[rdi + rdx]
    344 
    345         punpcklbw   xmm2,           xmm0
    346         punpcklbw   xmm3,           xmm0
    347 
    348         psubsw      xmm2,           xmm3
    349         paddw       xmm7,           xmm2
    350 
    351         pmaddwd     xmm2,           xmm2
    352         paddd       xmm1,           xmm2
    353 
    354         ; Horizontal reduction. The difference sum is folded with word adds, so only its low 16 bits are meaningful.
    355         movdqa      xmm6,           xmm7
    356         punpcklwd   xmm6,           xmm0                        ; difference words 0-3, each followed by a zero word
    357 
    358         punpckhwd   xmm7,           xmm0                        ; difference words 4-7, each followed by a zero word
    359         movdqa      xmm2,           xmm1                        ; xmm2 = copy of the squared-difference dwords
    360 
    361         paddw       xmm6,           xmm7                        ; four word sums (modulo 2^16)
    362         punpckldq   xmm1,           xmm0                        ; squared sums: dwords 0,1 interleaved with zero
    363 
    364         punpckhdq   xmm2,           xmm0                        ; squared sums: dwords 2,3 interleaved with zero
    365         movdqa      xmm7,           xmm6
    366 
    367         paddd       xmm1,           xmm2                        ; two squared partial sums remain
    368         punpckldq   xmm6,           xmm0
    369 
    370         punpckhdq   xmm7,           xmm0
    371         paddw       xmm6,           xmm7                        ; two difference partial sums remain
    372 
    373         movdqa      xmm2,           xmm1
    374         movdqa      xmm7,           xmm6
    375 
    376         psrldq      xmm1,           8                           ; bring the upper qword down
    377         psrldq      xmm6,           8
    378 
    379         paddw       xmm7,           xmm6                        ; low word of xmm7 = sum of differences (16-bit)
    380         paddd       xmm1,           xmm2                        ; low dword of xmm1 = sum of squared differences
    381 
    382         mov         rax,            arg(5) ;[Sum]
    383         mov         rdi,            arg(4) ;[SSE]
    384 
    385         movq        rdx,            xmm7
    386         movsx       rcx,            dx                          ; sign-extend the 16-bit difference sum
    387 
    388         mov  dword ptr [rax],       ecx                         ; *Sum
    389         movd DWORD PTR [rdi],       xmm1                        ; *SSE
    390         ; NOTE(review): prototype says unsigned int, but no result is loaded into rax here (it holds the Sum pointer) - confirm callers ignore it
    391     ; begin epilog
    392     add rsp, 16
    393     pop rdi
    394     pop rsi
    395     RESTORE_GOT
    396     RESTORE_XMM
    397     UNSHADOW_ARGS
    398     pop         rbp
    399     ret
    400 
    401 ;void vp9_half_horiz_vert_variance8x_h_sse2
    402 ;(
    403 ;    unsigned char *ref_ptr,           ; Height+1 rows are read, 9 bytes from each
    404 ;    int ref_pixels_per_line,
    405 ;    unsigned char *src_ptr,           ; Height rows are read, 8 bytes from each
    406 ;    int src_pixels_per_line,
    407 ;    unsigned int Height,              ; must be non-zero: the loop body runs before the counter is tested
    408 ;    int *sum,                         ; out: sum of (filtered ref - src) differences
    409 ;    unsigned int *sumsquared          ; out: sum of the squared differences
    410 ;)
    411 global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
    412 sym(vp9_half_horiz_vert_variance8x_h_sse2):
    413     push        rbp
    414     mov         rbp, rsp
    415     SHADOW_ARGS_TO_STACK 7
    416     SAVE_XMM 7
    417     GET_GOT     rbx
    418     push rsi
    419     push rdi
    420     ; end prolog
    421     ; ref is averaged horizontally (pixel i with i+1), then vertically (row j with j+1), before being compared with src.
    422 %if ABI_IS_32BIT=0
    423     movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
    424     movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
    425 %endif
    426 
    427         pxor            xmm6,           xmm6                ;  difference accumulator (eight words)
    428         pxor            xmm7,           xmm7                ;  squared-difference accumulator (four dwords)
    429         mov             rsi,            arg(0) ;ref_ptr
    430 
    431         mov             rdi,            arg(2) ;src_ptr
    432         movsxd          rcx,            dword ptr arg(4) ;Height = rows remaining
    433         ; the ref stride is applied through r8 (64-bit) or arg(1) (32-bit); no other register copy is needed
    434 
    435         pxor            xmm0,           xmm0                ;  xmm0 = 0 for byte -> word unpacking
    436 
    437         movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0..s7 of the first ref row
    438         movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1..s8 of the first ref row
    439         pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3): horizontal average of the first row
    440 
    441 %if ABI_IS_32BIT
    442         add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next ref row
    443 %else
    444         add             rsi, r8
    445 %endif
    446 
    447 .half_horiz_vert_variance8x_h_1:
    448 
    449         movq            xmm1,           QWORD PTR [rsi]     ;  s0..s7 of the next ref row
    450         movq            xmm2,           QWORD PTR [rsi+1]   ;  s1..s8 of the next ref row
    451         pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2): horizontal average of the next row
    452 
    453         pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the two horizontal averages
    454         punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
    455 
    456         movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0..d7 of the src row
    457         punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
    458 
    459         psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
    460         paddw           xmm6,           xmm5                ;  xmm6 += column differences
    461         pmaddwd         xmm5,           xmm5                ;  square and add adjacent pairs -> dwords
    462         paddd           xmm7,           xmm5                ;  xmm7 += squared column differences
    463 
    464         movdqa          xmm5,           xmm1                ;  keep this row's horizontal average for the next iteration
    465 
    466 %if ABI_IS_32BIT
    467         add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next ref row
    468         add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next src row
    469 %else
    470         add             rsi, r8
    471         add             rdi, r9
    472 %endif
    473 
    474         sub             rcx,            1                   ;
    475         jnz             .half_horiz_vert_variance8x_h_1     ;
    476         ; Final reduction runs in MMX registers. NOTE(review): no emms is issued here; presumably the caller resets MMX/x87 state - confirm
    477         movdq2q         mm6,            xmm6                ;  low four difference words
    478         movdq2q         mm7,            xmm7                ;  low two squared-difference dwords
    479 
    480         psrldq          xmm6,           8
    481         psrldq          xmm7,           8
    482 
    483         movdq2q         mm2,            xmm6                ;  high four difference words
    484         movdq2q         mm3,            xmm7                ;  high two squared-difference dwords
    485 
    486         paddw           mm6,            mm2
    487         paddd           mm7,            mm3
    488 
    489         pxor            mm3,            mm3                 ;
    490         pxor            mm2,            mm2                 ;
    491 
    492         punpcklwd       mm2,            mm6                 ;  difference words 0,1 into the upper half of each dword
    493         punpckhwd       mm3,            mm6                 ;  difference words 2,3 likewise
    494 
    495         paddd           mm2,            mm3                 ;
    496         movq            mm6,            mm2                 ;
    497 
    498         psrlq           mm6,            32                  ;
    499         paddd           mm2,            mm6                 ;  low dword = (sum of differences) << 16
    500 
    501         psrad           mm2,            16                  ;  arithmetic shift restores the signed sum
    502         movq            mm4,            mm7                 ;
    503 
    504         psrlq           mm4,            32                  ;
    505         paddd           mm4,            mm7                 ;  low dword = sum of squared differences
    506 
    507         mov             rsi,            arg(5) ; sum
    508         mov             rdi,            arg(6) ; sumsquared
    509 
    510         movd            [rsi],          mm2                 ;  *sum
    511         movd            [rdi],          mm4                 ;  *sumsquared
    512 
    513 
    514     ; begin epilog
    515     pop rdi
    516     pop rsi
    517     RESTORE_GOT
    518     RESTORE_XMM
    519     UNSHADOW_ARGS
    520     pop         rbp
    521     ret
    522 
    523 ;void vp9_half_vert_variance8x_h_sse2
    524 ;(
    525 ;    unsigned char *ref_ptr,           ; Height+1 rows are read, 8 bytes from each
    526 ;    int ref_pixels_per_line,
    527 ;    unsigned char *src_ptr,           ; Height rows are read, 8 bytes from each
    528 ;    int src_pixels_per_line,
    529 ;    unsigned int Height,              ; must be non-zero: the loop body runs before the counter is tested
    530 ;    int *sum,                         ; out: sum of (filtered ref - src) differences
    531 ;    unsigned int *sumsquared          ; out: sum of the squared differences
    532 ;)
    533 global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
    534 sym(vp9_half_vert_variance8x_h_sse2):
    535     push        rbp
    536     mov         rbp, rsp
    537     SHADOW_ARGS_TO_STACK 7
    538     SAVE_XMM 7
    539     GET_GOT     rbx
    540     push rsi
    541     push rdi
    542     ; end prolog
    543     ; Each ref row is averaged with the row below it before being compared with the src row.
    544 %if ABI_IS_32BIT=0
    545     movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
    546     movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
    547 %endif
    548 
    549         pxor            xmm6,           xmm6                ;  difference accumulator (eight words)
    550         pxor            xmm7,           xmm7                ;  squared-difference accumulator (four dwords)
    551         mov             rsi,            arg(0) ;ref_ptr
    552 
    553         mov             rdi,            arg(2) ;src_ptr
    554         movsxd          rcx,            dword ptr arg(4) ;Height = rows remaining
    555         movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line, used to address the row below
    556 
    557         pxor            xmm0,           xmm0                ;  xmm0 = 0 for byte -> word unpacking
    558 .half_vert_variance8x_h_1:
    559         movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0..s7 of the current ref row
    560         movq            xmm3,           QWORD PTR [rsi+rax] ;  xmm3 = s0..s7 of the ref row below
    561 
    562         pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3): vertical average
    563         punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
    564 
    565         movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0..d7 of the src row
    566         punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
    567 
    568         psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
    569         paddw           xmm6,           xmm5                ;  xmm6 += column differences
    570         pmaddwd         xmm5,           xmm5                ;  square and add adjacent pairs -> dwords
    571         paddd           xmm7,           xmm5                ;  xmm7 += squared column differences
    572 
    573 %if ABI_IS_32BIT
    574         add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next ref row
    575         add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next src row
    576 %else
    577         add             rsi, r8
    578         add             rdi, r9
    579 %endif
    580 
    581         sub             rcx,            1                   ;
    582         jnz             .half_vert_variance8x_h_1          ;
    583         ; Final reduction runs in MMX registers. NOTE(review): no emms is issued here; presumably the caller resets MMX/x87 state - confirm
    584         movdq2q         mm6,            xmm6                ;  low four difference words
    585         movdq2q         mm7,            xmm7                ;  low two squared-difference dwords
    586 
    587         psrldq          xmm6,           8
    588         psrldq          xmm7,           8
    589 
    590         movdq2q         mm2,            xmm6                ;  high four difference words
    591         movdq2q         mm3,            xmm7                ;  high two squared-difference dwords
    592 
    593         paddw           mm6,            mm2
    594         paddd           mm7,            mm3
    595 
    596         pxor            mm3,            mm3                 ;
    597         pxor            mm2,            mm2                 ;
    598 
    599         punpcklwd       mm2,            mm6                 ;  difference words 0,1 into the upper half of each dword
    600         punpckhwd       mm3,            mm6                 ;  difference words 2,3 likewise
    601 
    602         paddd           mm2,            mm3                 ;
    603         movq            mm6,            mm2                 ;
    604 
    605         psrlq           mm6,            32                  ;
    606         paddd           mm2,            mm6                 ;  low dword = (sum of differences) << 16
    607 
    608         psrad           mm2,            16                  ;  arithmetic shift restores the signed sum
    609         movq            mm4,            mm7                 ;
    610 
    611         psrlq           mm4,            32                  ;
    612         paddd           mm4,            mm7                 ;  low dword = sum of squared differences
    613 
    614         mov             rsi,            arg(5) ; sum
    615         mov             rdi,            arg(6) ; sumsquared
    616 
    617         movd            [rsi],          mm2                 ;  *sum
    618         movd            [rdi],          mm4                 ;  *sumsquared
    619 
    620 
    621     ; begin epilog
    622     pop rdi
    623     pop rsi
    624     RESTORE_GOT
    625     RESTORE_XMM
    626     UNSHADOW_ARGS
    627     pop         rbp
    628     ret
    629 
    630 
    631 ;void vp9_half_horiz_variance8x_h_sse2
    632 ;(
    633 ;    unsigned char *ref_ptr,           ; Height rows are read, 9 bytes from each
    634 ;    int ref_pixels_per_line,
    635 ;    unsigned char *src_ptr,           ; Height rows are read, 8 bytes from each
    636 ;    int src_pixels_per_line,
    637 ;    unsigned int Height,              ; must be non-zero: the loop body runs before the counter is tested
    638 ;    int *sum,                         ; out: sum of (filtered ref - src) differences
    639 ;    unsigned int *sumsquared          ; out: sum of the squared differences
    640 ;)
    641 global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
    642 sym(vp9_half_horiz_variance8x_h_sse2):
    643     push        rbp
    644     mov         rbp, rsp
    645     SHADOW_ARGS_TO_STACK 7
    646     SAVE_XMM 7
    647     GET_GOT     rbx
    648     push rsi
    649     push rdi
    650     ; end prolog
    651     ; Each ref pixel is averaged with its right-hand neighbour before being compared with the src pixel.
    652 %if ABI_IS_32BIT=0
    653     movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
    654     movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
    655 %endif
    656 
    657         pxor            xmm6,           xmm6                ;  difference accumulator (eight words)
    658         pxor            xmm7,           xmm7                ;  squared-difference accumulator (four dwords)
    659         mov             rsi,            arg(0) ;ref_ptr
    660 
    661         mov             rdi,            arg(2) ;src_ptr
    662         movsxd          rcx,            dword ptr arg(4) ;Height = rows remaining
    663 
    664         pxor            xmm0,           xmm0                ;  xmm0 = 0 for byte -> word unpacking
    665 .half_horiz_variance8x_h_1:
    666         movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0..s7
    667         movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1..s8
    668 
    669         pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3): horizontal average
    670         punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
    671 
    672         movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0..d7 of the src row
    673         punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
    674 
    675         psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
    676         paddw           xmm6,           xmm5                ;  xmm6 += column differences
    677         pmaddwd         xmm5,           xmm5                ;  square and add adjacent pairs -> dwords
    678         paddd           xmm7,           xmm5                ;  xmm7 += squared column differences
    679 
    680 %if ABI_IS_32BIT
    681         add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next ref row
    682         add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next src row
    683 %else
    684         add             rsi, r8
    685         add             rdi, r9
    686 %endif
    687         sub             rcx,            1                   ;
    688         jnz             .half_horiz_variance8x_h_1          ;
    689         ; Final reduction runs in MMX registers. NOTE(review): no emms is issued here; presumably the caller resets MMX/x87 state - confirm
    690         movdq2q         mm6,            xmm6                ;  low four difference words
    691         movdq2q         mm7,            xmm7                ;  low two squared-difference dwords
    692 
    693         psrldq          xmm6,           8
    694         psrldq          xmm7,           8
    695 
    696         movdq2q         mm2,            xmm6                ;  high four difference words
    697         movdq2q         mm3,            xmm7                ;  high two squared-difference dwords
    698 
    699         paddw           mm6,            mm2
    700         paddd           mm7,            mm3
    701 
    702         pxor            mm3,            mm3                 ;
    703         pxor            mm2,            mm2                 ;
    704 
    705         punpcklwd       mm2,            mm6                 ;  difference words 0,1 into the upper half of each dword
    706         punpckhwd       mm3,            mm6                 ;  difference words 2,3 likewise
    707 
    708         paddd           mm2,            mm3                 ;
    709         movq            mm6,            mm2                 ;
    710 
    711         psrlq           mm6,            32                  ;
    712         paddd           mm2,            mm6                 ;  low dword = (sum of differences) << 16
    713 
    714         psrad           mm2,            16                  ;  arithmetic shift restores the signed sum
    715         movq            mm4,            mm7                 ;
    716 
    717         psrlq           mm4,            32                  ;
    718         paddd           mm4,            mm7                 ;  low dword = sum of squared differences
    719 
    720         mov             rsi,            arg(5) ; sum
    721         mov             rdi,            arg(6) ; sumsquared
    722 
    723         movd            [rsi],          mm2                 ;  *sum
    724         movd            [rdi],          mm4                 ;  *sumsquared
    725 
    726 
    727     ; begin epilog
    728     pop rdi
    729     pop rsi
    730     RESTORE_GOT
    731     RESTORE_XMM
    732     UNSHADOW_ARGS
    733     pop         rbp
    734     ret
    735