;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

     11 %include "vpx_ports/x86_abi_support.asm"
     12 
     13 ;void vp9_half_horiz_vert_variance16x_h_sse2
     14 ;(
     15 ;    unsigned char *ref_ptr,
     16 ;    int ref_pixels_per_line,
     17 ;    unsigned char *src_ptr,
     18 ;    int src_pixels_per_line,
     19 ;    unsigned int Height,
     20 ;    int *sum,
     21 ;    unsigned int *sumsquared
     22 ;)
     23 global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
     24 sym(vp9_half_horiz_vert_variance16x_h_sse2):
     25     push        rbp
     26     mov         rbp, rsp
     27     SHADOW_ARGS_TO_STACK 7
     28     SAVE_XMM 7
     29     GET_GOT     rbx
     30     push rsi
     31     push rdi
     32     ; end prolog
     33 
     34         pxor            xmm6,           xmm6                ;  error accumulator
     35         pxor            xmm7,           xmm7                ;  sse eaccumulator
     36         mov             rsi,            arg(0) ;ref_ptr              ;
     37 
     38         mov             rdi,            arg(2) ;src_ptr              ;
     39         movsxd          rcx,            dword ptr arg(4) ;Height              ;
     40         movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
     41         movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
     42 
     43         pxor            xmm0,           xmm0                ;
     44 
     45         movdqu          xmm5,           XMMWORD PTR [rsi]
     46         movdqu          xmm3,           XMMWORD PTR [rsi+1]
     47         pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3) horizontal line 1
     48 
     49         lea             rsi,            [rsi + rax]
     50 
     51 .half_horiz_vert_variance16x_h_1:
     52         movdqu          xmm1,           XMMWORD PTR [rsi]     ;
     53         movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;
     54         pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm3) horizontal line i+1
     55 
     56         pavgb           xmm5,           xmm1                ;  xmm = vertical average of the above
     57 
     58         movdqa          xmm4,           xmm5
     59         punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
     60         punpckhbw       xmm4,           xmm0
     61 
     62         movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
     63         punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
     64         psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
     65 
     66         movq            xmm3,           QWORD PTR [rdi+8]
     67         punpcklbw       xmm3,           xmm0
     68         psubw           xmm4,           xmm3
     69 
     70         paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
     71         paddw           xmm6,           xmm4
     72         pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
     73         pmaddwd         xmm4,           xmm4
     74         paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
     75         paddd           xmm7,           xmm4
     76 
     77         movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
     78 
     79         lea             rsi,            [rsi + rax]
     80         lea             rdi,            [rdi + rdx]
     81 
     82         sub             rcx,            1                   ;
     83         jnz             .half_horiz_vert_variance16x_h_1    ;
     84 
     85         pxor        xmm1,           xmm1
     86         pxor        xmm5,           xmm5
     87 
     88         punpcklwd   xmm0,           xmm6
     89         punpckhwd   xmm1,           xmm6
     90         psrad       xmm0,           16
     91         psrad       xmm1,           16
     92         paddd       xmm0,           xmm1
     93         movdqa      xmm1,           xmm0
     94 
     95         movdqa      xmm6,           xmm7
     96         punpckldq   xmm6,           xmm5
     97         punpckhdq   xmm7,           xmm5
     98         paddd       xmm6,           xmm7
     99 
    100         punpckldq   xmm0,           xmm5
    101         punpckhdq   xmm1,           xmm5
    102         paddd       xmm0,           xmm1
    103 
    104         movdqa      xmm7,           xmm6
    105         movdqa      xmm1,           xmm0
    106 
    107         psrldq      xmm7,           8
    108         psrldq      xmm1,           8
    109 
    110         paddd       xmm6,           xmm7
    111         paddd       xmm0,           xmm1
    112 
    113         mov         rsi,            arg(5) ;[Sum]
    114         mov         rdi,            arg(6) ;[SSE]
    115 
    116         movd        [rsi],       xmm0
    117         movd        [rdi],       xmm6
    118 
    119     ; begin epilog
    120     pop rdi
    121     pop rsi
    122     RESTORE_GOT
    123     RESTORE_XMM
    124     UNSHADOW_ARGS
    125     pop         rbp
    126     ret
    127 
    128 ;void vp9_half_vert_variance16x_h_sse2
    129 ;(
    130 ;    unsigned char *ref_ptr,
    131 ;    int ref_pixels_per_line,
    132 ;    unsigned char *src_ptr,
    133 ;    int src_pixels_per_line,
    134 ;    unsigned int Height,
    135 ;    int *sum,
    136 ;    unsigned int *sumsquared
    137 ;)
    138 global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
    139 sym(vp9_half_vert_variance16x_h_sse2):
    140     push        rbp
    141     mov         rbp, rsp
    142     SHADOW_ARGS_TO_STACK 7
    143     SAVE_XMM 7
    144     GET_GOT     rbx
    145     push rsi
    146     push rdi
    147     ; end prolog
    148 
    149         pxor            xmm6,           xmm6                ;  error accumulator
    150         pxor            xmm7,           xmm7                ;  sse eaccumulator
    151         mov             rsi,            arg(0)              ;ref_ptr
    152 
    153         mov             rdi,            arg(2)              ;src_ptr
    154         movsxd          rcx,            dword ptr arg(4)    ;Height
    155         movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
    156         movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
    157 
    158         movdqu          xmm5,           XMMWORD PTR [rsi]
    159         lea             rsi,            [rsi + rax          ]
    160         pxor            xmm0,           xmm0
    161 
    162 .half_vert_variance16x_h_1:
    163         movdqu          xmm3,           XMMWORD PTR [rsi]
    164 
    165         pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
    166         movdqa          xmm4,           xmm5
    167         punpcklbw       xmm5,           xmm0
    168         punpckhbw       xmm4,           xmm0
    169 
    170         movq            xmm2,           QWORD PTR [rdi]
    171         punpcklbw       xmm2,           xmm0
    172         psubw           xmm5,           xmm2
    173         movq            xmm2,           QWORD PTR [rdi+8]
    174         punpcklbw       xmm2,           xmm0
    175         psubw           xmm4,           xmm2
    176 
    177         paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
    178         paddw           xmm6,           xmm4
    179         pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
    180         pmaddwd         xmm4,           xmm4
    181         paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
    182         paddd           xmm7,           xmm4
    183 
    184         movdqa          xmm5,           xmm3
    185 
    186         lea             rsi,            [rsi + rax]
    187         lea             rdi,            [rdi + rdx]
    188 
    189         sub             rcx,            1
    190         jnz             .half_vert_variance16x_h_1
    191 
    192         pxor        xmm1,           xmm1
    193         pxor        xmm5,           xmm5
    194 
    195         punpcklwd   xmm0,           xmm6
    196         punpckhwd   xmm1,           xmm6
    197         psrad       xmm0,           16
    198         psrad       xmm1,           16
    199         paddd       xmm0,           xmm1
    200         movdqa      xmm1,           xmm0
    201 
    202         movdqa      xmm6,           xmm7
    203         punpckldq   xmm6,           xmm5
    204         punpckhdq   xmm7,           xmm5
    205         paddd       xmm6,           xmm7
    206 
    207         punpckldq   xmm0,           xmm5
    208         punpckhdq   xmm1,           xmm5
    209         paddd       xmm0,           xmm1
    210 
    211         movdqa      xmm7,           xmm6
    212         movdqa      xmm1,           xmm0
    213 
    214         psrldq      xmm7,           8
    215         psrldq      xmm1,           8
    216 
    217         paddd       xmm6,           xmm7
    218         paddd       xmm0,           xmm1
    219 
    220         mov         rsi,            arg(5) ;[Sum]
    221         mov         rdi,            arg(6) ;[SSE]
    222 
    223         movd        [rsi],       xmm0
    224         movd        [rdi],       xmm6
    225 
    226     ; begin epilog
    227     pop rdi
    228     pop rsi
    229     RESTORE_GOT
    230     RESTORE_XMM
    231     UNSHADOW_ARGS
    232     pop         rbp
    233     ret
    234 
    235 ;void vp9_half_horiz_variance16x_h_sse2
    236 ;(
    237 ;    unsigned char *ref_ptr,
    238 ;    int ref_pixels_per_line,
    239 ;    unsigned char *src_ptr,
    240 ;    int src_pixels_per_line,
    241 ;    unsigned int Height,
    242 ;    int *sum,
    243 ;    unsigned int *sumsquared
    244 ;)
    245 global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
    246 sym(vp9_half_horiz_variance16x_h_sse2):
    247     push        rbp
    248     mov         rbp, rsp
    249     SHADOW_ARGS_TO_STACK 7
    250     SAVE_XMM 7
    251     GET_GOT     rbx
    252     push rsi
    253     push rdi
    254     ; end prolog
    255 
    256         pxor            xmm6,           xmm6                ;  error accumulator
    257         pxor            xmm7,           xmm7                ;  sse eaccumulator
    258         mov             rsi,            arg(0) ;ref_ptr              ;
    259 
    260         mov             rdi,            arg(2) ;src_ptr              ;
    261         movsxd          rcx,            dword ptr arg(4) ;Height              ;
    262         movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
    263         movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
    264 
    265         pxor            xmm0,           xmm0                ;
    266 
    267 .half_horiz_variance16x_h_1:
    268         movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
    269         movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
    270 
    271         pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
    272         movdqa          xmm1,           xmm5
    273         punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
    274         punpckhbw       xmm1,           xmm0
    275 
    276         movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
    277         punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
    278         movq            xmm2,           QWORD PTR [rdi+8]
    279         punpcklbw       xmm2,           xmm0
    280 
    281         psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
    282         psubw           xmm1,           xmm2
    283         paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
    284         paddw           xmm6,           xmm1
    285         pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
    286         pmaddwd         xmm1,           xmm1
    287         paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
    288         paddd           xmm7,           xmm1
    289 
    290         lea             rsi,            [rsi + rax]
    291         lea             rdi,            [rdi + rdx]
    292 
    293         sub             rcx,            1                   ;
    294         jnz             .half_horiz_variance16x_h_1         ;
    295 
    296         pxor        xmm1,           xmm1
    297         pxor        xmm5,           xmm5
    298 
    299         punpcklwd   xmm0,           xmm6
    300         punpckhwd   xmm1,           xmm6
    301         psrad       xmm0,           16
    302         psrad       xmm1,           16
    303         paddd       xmm0,           xmm1
    304         movdqa      xmm1,           xmm0
    305 
    306         movdqa      xmm6,           xmm7
    307         punpckldq   xmm6,           xmm5
    308         punpckhdq   xmm7,           xmm5
    309         paddd       xmm6,           xmm7
    310 
    311         punpckldq   xmm0,           xmm5
    312         punpckhdq   xmm1,           xmm5
    313         paddd       xmm0,           xmm1
    314 
    315         movdqa      xmm7,           xmm6
    316         movdqa      xmm1,           xmm0
    317 
    318         psrldq      xmm7,           8
    319         psrldq      xmm1,           8
    320 
    321         paddd       xmm6,           xmm7
    322         paddd       xmm0,           xmm1
    323 
    324         mov         rsi,            arg(5) ;[Sum]
    325         mov         rdi,            arg(6) ;[SSE]
    326 
    327         movd        [rsi],       xmm0
    328         movd        [rdi],       xmm6
    329 
    330     ; begin epilog
    331     pop rdi
    332     pop rsi
    333     RESTORE_GOT
    334     RESTORE_XMM
    335     UNSHADOW_ARGS
    336     pop         rbp
    337     ret
    338