;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 ;int vp8_block_error_xmm(short *coeff_ptr,  short *dcoef_ptr)
     15 global sym(vp8_block_error_xmm) PRIVATE
     16 sym(vp8_block_error_xmm):
     17     push        rbp
     18     mov         rbp, rsp
     19     SHADOW_ARGS_TO_STACK 2
     20     push rsi
     21     push rdi
     22     ; end prologue
     23 
     24         mov         rsi,        arg(0) ;coeff_ptr
     25         mov         rdi,        arg(1) ;dcoef_ptr
     26 
     27         movdqa      xmm0,       [rsi]
     28         movdqa      xmm1,       [rdi]
     29 
     30         movdqa      xmm2,       [rsi+16]
     31         movdqa      xmm3,       [rdi+16]
     32 
     33         psubw       xmm0,       xmm1
     34         psubw       xmm2,       xmm3
     35 
     36         pmaddwd     xmm0,       xmm0
     37         pmaddwd     xmm2,       xmm2
     38 
     39         paddd       xmm0,       xmm2
     40 
     41         pxor        xmm5,       xmm5
     42         movdqa      xmm1,       xmm0
     43 
     44         punpckldq   xmm0,       xmm5
     45         punpckhdq   xmm1,       xmm5
     46 
     47         paddd       xmm0,       xmm1
     48         movdqa      xmm1,       xmm0
     49 
     50         psrldq      xmm0,       8
     51         paddd       xmm0,       xmm1
     52 
     53         movq        rax,        xmm0
     54 
     55     pop rdi
     56     pop rsi
     57     ; begin epilog
     58     UNSHADOW_ARGS
     59     pop         rbp
     60     ret
     61 
     62 ;int vp8_block_error_mmx(short *coeff_ptr,  short *dcoef_ptr)
     63 global sym(vp8_block_error_mmx) PRIVATE
     64 sym(vp8_block_error_mmx):
     65     push        rbp
     66     mov         rbp, rsp
     67     SHADOW_ARGS_TO_STACK 2
     68     push rsi
     69     push rdi
     70     ; end prolog
     71 
     72 
     73         mov         rsi,        arg(0) ;coeff_ptr
     74         pxor        mm7,        mm7
     75 
     76         mov         rdi,        arg(1) ;dcoef_ptr
     77         movq        mm3,        [rsi]
     78 
     79         movq        mm4,        [rdi]
     80         movq        mm5,        [rsi+8]
     81 
     82         movq        mm6,        [rdi+8]
     83         pxor        mm1,        mm1 ; from movd mm1, dc ; dc =0
     84 
     85         movq        mm2,        mm7
     86         psubw       mm5,        mm6
     87 
     88         por         mm1,        mm2
     89         pmaddwd     mm5,        mm5
     90 
     91         pcmpeqw     mm1,        mm7
     92         psubw       mm3,        mm4
     93 
     94         pand        mm1,        mm3
     95         pmaddwd     mm1,        mm1
     96 
     97         paddd       mm1,        mm5
     98         movq        mm3,        [rsi+16]
     99 
    100         movq        mm4,        [rdi+16]
    101         movq        mm5,        [rsi+24]
    102 
    103         movq        mm6,        [rdi+24]
    104         psubw       mm5,        mm6
    105 
    106         pmaddwd     mm5,        mm5
    107         psubw       mm3,        mm4
    108 
    109         pmaddwd     mm3,        mm3
    110         paddd       mm3,        mm5
    111 
    112         paddd       mm1,        mm3
    113         movq        mm0,        mm1
    114 
    115         psrlq       mm1,        32
    116         paddd       mm0,        mm1
    117 
    118         movq        rax,        mm0
    119 
    120     pop rdi
    121     pop rsi
    122     ; begin epilog
    123     UNSHADOW_ARGS
    124     pop         rbp
    125     ret
    126 
    127 
    128 ;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
    129 global sym(vp8_mbblock_error_mmx_impl) PRIVATE
    130 sym(vp8_mbblock_error_mmx_impl):
    131     push        rbp
    132     mov         rbp, rsp
    133     SHADOW_ARGS_TO_STACK 3
    134     push rsi
    135     push rdi
    136     ; end prolog
    137 
    138 
    139         mov         rsi,        arg(0) ;coeff_ptr
    140         pxor        mm7,        mm7
    141 
    142         mov         rdi,        arg(1) ;dcoef_ptr
    143         pxor        mm2,        mm2
    144 
    145         movd        mm1,        dword ptr arg(2) ;dc
    146         por         mm1,        mm2
    147 
    148         pcmpeqw     mm1,        mm7
    149         mov         rcx,        16
    150 
    151 .mberror_loop_mmx:
    152         movq        mm3,       [rsi]
    153         movq        mm4,       [rdi]
    154 
    155         movq        mm5,       [rsi+8]
    156         movq        mm6,       [rdi+8]
    157 
    158 
    159         psubw       mm5,        mm6
    160         pmaddwd     mm5,        mm5
    161 
    162         psubw       mm3,        mm4
    163         pand        mm3,        mm1
    164 
    165         pmaddwd     mm3,        mm3
    166         paddd       mm2,        mm5
    167 
    168         paddd       mm2,        mm3
    169         movq        mm3,       [rsi+16]
    170 
    171         movq        mm4,       [rdi+16]
    172         movq        mm5,       [rsi+24]
    173 
    174         movq        mm6,       [rdi+24]
    175         psubw       mm5,        mm6
    176 
    177         pmaddwd     mm5,        mm5
    178         psubw       mm3,        mm4
    179 
    180         pmaddwd     mm3,        mm3
    181         paddd       mm2,        mm5
    182 
    183         paddd       mm2,        mm3
    184         add         rsi,        32
    185 
    186         add         rdi,        32
    187         sub         rcx,        1
    188 
    189         jnz         .mberror_loop_mmx
    190 
    191         movq        mm0,        mm2
    192         psrlq       mm2,        32
    193 
    194         paddd       mm0,        mm2
    195         movq        rax,        mm0
    196 
    197     pop rdi
    198     pop rsi
    199     ; begin epilog
    200     UNSHADOW_ARGS
    201     pop         rbp
    202     ret
    203 
    204 
    205 ;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
    206 global sym(vp8_mbblock_error_xmm_impl) PRIVATE
    207 sym(vp8_mbblock_error_xmm_impl):
    208     push        rbp
    209     mov         rbp, rsp
    210     SHADOW_ARGS_TO_STACK 3
    211     SAVE_XMM 6
    212     push rsi
    213     push rdi
    214     ; end prolog
    215 
    216 
    217         mov         rsi,        arg(0) ;coeff_ptr
    218         pxor        xmm6,       xmm6
    219 
    220         mov         rdi,        arg(1) ;dcoef_ptr
    221         pxor        xmm4,       xmm4
    222 
    223         movd        xmm5,       dword ptr arg(2) ;dc
    224         por         xmm5,       xmm4
    225 
    226         pcmpeqw     xmm5,       xmm6
    227         mov         rcx,        16
    228 
    229 .mberror_loop:
    230         movdqa      xmm0,       [rsi]
    231         movdqa      xmm1,       [rdi]
    232 
    233         movdqa      xmm2,       [rsi+16]
    234         movdqa      xmm3,       [rdi+16]
    235 
    236 
    237         psubw       xmm2,       xmm3
    238         pmaddwd     xmm2,       xmm2
    239 
    240         psubw       xmm0,       xmm1
    241         pand        xmm0,       xmm5
    242 
    243         pmaddwd     xmm0,       xmm0
    244         add         rsi,        32
    245 
    246         add         rdi,        32
    247 
    248         sub         rcx,        1
    249         paddd       xmm4,       xmm2
    250 
    251         paddd       xmm4,       xmm0
    252         jnz         .mberror_loop
    253 
    254         movdqa      xmm0,       xmm4
    255         punpckldq   xmm0,       xmm6
    256 
    257         punpckhdq   xmm4,       xmm6
    258         paddd       xmm0,       xmm4
    259 
    260         movdqa      xmm1,       xmm0
    261         psrldq      xmm0,       8
    262 
    263         paddd       xmm0,       xmm1
    264         movq        rax,        xmm0
    265 
    266     pop rdi
    267     pop rsi
    268     ; begin epilog
    269     RESTORE_XMM
    270     UNSHADOW_ARGS
    271     pop         rbp
    272     ret
    273 
    274 
    275 ;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
    276 global sym(vp8_mbuverror_mmx_impl) PRIVATE
    277 sym(vp8_mbuverror_mmx_impl):
    278     push        rbp
    279     mov         rbp, rsp
    280     SHADOW_ARGS_TO_STACK 2
    281     push rsi
    282     push rdi
    283     ; end prolog
    284 
    285 
    286         mov             rsi,        arg(0) ;s_ptr
    287         mov             rdi,        arg(1) ;d_ptr
    288 
    289         mov             rcx,        16
    290         pxor            mm7,        mm7
    291 
    292 .mbuverror_loop_mmx:
    293 
    294         movq            mm1,        [rsi]
    295         movq            mm2,        [rdi]
    296 
    297         psubw           mm1,        mm2
    298         pmaddwd         mm1,        mm1
    299 
    300 
    301         movq            mm3,        [rsi+8]
    302         movq            mm4,        [rdi+8]
    303 
    304         psubw           mm3,        mm4
    305         pmaddwd         mm3,        mm3
    306 
    307 
    308         paddd           mm7,        mm1
    309         paddd           mm7,        mm3
    310 
    311 
    312         add             rsi,        16
    313         add             rdi,        16
    314 
    315         dec             rcx
    316         jnz             .mbuverror_loop_mmx
    317 
    318         movq            mm0,        mm7
    319         psrlq           mm7,        32
    320 
    321         paddd           mm0,        mm7
    322         movq            rax,        mm0
    323 
    324     pop rdi
    325     pop rsi
    326     ; begin epilog
    327     UNSHADOW_ARGS
    328     pop         rbp
    329     ret
    330 
    331 
    332 ;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
    333 global sym(vp8_mbuverror_xmm_impl) PRIVATE
    334 sym(vp8_mbuverror_xmm_impl):
    335     push        rbp
    336     mov         rbp, rsp
    337     SHADOW_ARGS_TO_STACK 2
    338     push rsi
    339     push rdi
    340     ; end prolog
    341 
    342 
    343         mov             rsi,        arg(0) ;s_ptr
    344         mov             rdi,        arg(1) ;d_ptr
    345 
    346         mov             rcx,        16
    347         pxor            xmm3,       xmm3
    348 
    349 .mbuverror_loop:
    350 
    351         movdqa          xmm1,       [rsi]
    352         movdqa          xmm2,       [rdi]
    353 
    354         psubw           xmm1,       xmm2
    355         pmaddwd         xmm1,       xmm1
    356 
    357         paddd           xmm3,       xmm1
    358 
    359         add             rsi,        16
    360         add             rdi,        16
    361 
    362         dec             rcx
    363         jnz             .mbuverror_loop
    364 
    365         pxor        xmm0,           xmm0
    366         movdqa      xmm1,           xmm3
    367 
    368         movdqa      xmm2,           xmm1
    369         punpckldq   xmm1,           xmm0
    370 
    371         punpckhdq   xmm2,           xmm0
    372         paddd       xmm1,           xmm2
    373 
    374         movdqa      xmm2,           xmm1
    375 
    376         psrldq      xmm1,           8
    377         paddd       xmm1,           xmm2
    378 
    379         movq            rax,            xmm1
    380 
    381     pop rdi
    382     pop rsi
    383     ; begin epilog
    384     UNSHADOW_ARGS
    385     pop         rbp
    386     ret
    387