Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     ;-----------------------------------------------------------------------
     ; int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
     ;
     ; Returns the sum of squared differences between the 16 signed 16-bit
     ; values at coeff_ptr and the 16 at dcoef_ptr (32 bytes from each).
     ;
     ; In:    arg(0) = coeff_ptr, arg(1) = dcoef_ptr
     ;        Both are read with movdqa, so both must be 16-byte aligned.
     ; Out:   rax = sum (32-bit total in the low dword, upper dword zero)
     ; Clobb: xmm0-xmm3, xmm5 (rsi and rdi are saved and restored)
     ;
     ; Differences are formed with psubw, i.e. they wrap at 16 bits.
     ; Only xmm0-xmm5 are touched: xmm6 and above are callee-saved under the
     ; Microsoft x64 calling convention and this routine does not save them.
     ;-----------------------------------------------------------------------
     global sym(vp8_block_error_xmm)
     sym(vp8_block_error_xmm):
         push        rbp
         mov         rbp, rsp
         SHADOW_ARGS_TO_STACK 2
         push rsi
         push rdi
         ; end prolog

         mov         rsi,        arg(0)          ; coeff_ptr
         mov         rdi,        arg(1)          ; dcoef_ptr

         movdqa      xmm0,       [rsi]           ; coeff[0..7]
         movdqa      xmm1,       [rdi]           ; dcoef[0..7]
         movdqa      xmm2,       [rsi+16]        ; coeff[8..15]
         movdqa      xmm3,       [rdi+16]        ; dcoef[8..15]

         psubw       xmm0,       xmm1            ; 16-bit differences
         psubw       xmm2,       xmm3

         pmaddwd     xmm0,       xmm0            ; square, add adjacent pairs
         pmaddwd     xmm2,       xmm2
         paddd       xmm0,       xmm2            ; four dword partial sums

         ; Horizontal add: spread the four dwords into zero-extended qword
         ; lanes, then fold the high qword onto the low one.
         pxor        xmm5,       xmm5
         movdqa      xmm1,       xmm0
         punpckldq   xmm0,       xmm5            ; [d0, 0, d1, 0]
         punpckhdq   xmm1,       xmm5            ; [d2, 0, d3, 0]
         paddd       xmm0,       xmm1

         movdqa      xmm1,       xmm0
         psrldq      xmm0,       8
         paddd       xmm0,       xmm1            ; low dword = d0+d1+d2+d3

         movq        rax,        xmm0

         pop rdi
         pop rsi
         ; begin epilog
         UNSHADOW_ARGS
         pop         rbp
         ret
     61 
     ;-----------------------------------------------------------------------
     ; int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
     ;
     ; Returns the sum of squared differences between the 16 signed 16-bit
     ; values at coeff_ptr and the 16 at dcoef_ptr (32 bytes from each).
     ;
     ; In:    arg(0) = coeff_ptr, arg(1) = dcoef_ptr
     ; Out:   low 32 bits of mm0 / rax = sum. The upper 32 bits of mm0 still
     ;        hold the high-lane partial sum and are not part of the result.
     ; Clobb: mm0, mm1, mm3-mm6 (rsi and rdi are saved and restored)
     ;
     ; Differences are formed with psubw, i.e. they wrap at 16 bits.
     ; No emms is executed here; the MMX state is left as-is on return.
     ;
     ; Every coefficient contributes to the sum, so no mask is built or
     ; applied to the first group of differences.
     ;-----------------------------------------------------------------------
     global sym(vp8_block_error_mmx)
     sym(vp8_block_error_mmx):
         push        rbp
         mov         rbp, rsp
         SHADOW_ARGS_TO_STACK 2
         push rsi
         push rdi
         ; end prolog

         mov         rsi,        arg(0)          ; coeff_ptr
         mov         rdi,        arg(1)          ; dcoef_ptr

         ; coefficients 0..7
         movq        mm1,        [rsi]
         movq        mm4,        [rdi]
         movq        mm5,        [rsi+8]
         movq        mm6,        [rdi+8]

         psubw       mm1,        mm4
         psubw       mm5,        mm6

         pmaddwd     mm1,        mm1             ; square, add adjacent pairs
         pmaddwd     mm5,        mm5
         paddd       mm1,        mm5

         ; coefficients 8..15
         movq        mm3,        [rsi+16]
         movq        mm4,        [rdi+16]
         movq        mm5,        [rsi+24]
         movq        mm6,        [rdi+24]

         psubw       mm3,        mm4
         psubw       mm5,        mm6

         pmaddwd     mm3,        mm3
         pmaddwd     mm5,        mm5
         paddd       mm3,        mm5

         paddd       mm1,        mm3             ; two dword partial sums

         ; fold the high dword onto the low one
         movq        mm0,        mm1
         psrlq       mm1,        32
         paddd       mm0,        mm1

         movq        rax,        mm0

         pop rdi
         pop rsi
         ; begin epilog
         UNSHADOW_ARGS
         pop         rbp
         ret
    126 
    127 
    ;-----------------------------------------------------------------------
    ; int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
    ;
    ; Sums squared 16-bit differences over 16 consecutive groups of 16
    ; coefficients (32 bytes per group, 512 bytes from each pointer).
    ;
    ; In:    arg(0) = coeff_ptr, arg(1) = dcoef_ptr, arg(2) = dc
    ; Out:   low 32 bits of mm0 / rax = sum. The upper 32 bits of mm0 still
    ;        hold the high-lane partial sum and are not part of the result.
    ; Clobb: mm0-mm4, mm6, mm7, rcx, flags (rsi and rdi are saved/restored)
    ;
    ; dc handling: dc is loaded as a dword and each of its 16-bit halves is
    ; compared with zero. A non-zero low half removes coefficient 0 of every
    ; group from the sum; a non-zero high half removes coefficient 1.
    ; Presumably callers pass 0 or 1 so only coefficient 0 is affected -
    ; TODO confirm against callers.
    ;
    ; No emms is executed here; the MMX state is left as-is on return.
    ;-----------------------------------------------------------------------
    global sym(vp8_mbblock_error_mmx_impl)
    sym(vp8_mbblock_error_mmx_impl):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 3
        push rsi
        push rdi
        ; end prolog

        mov         rsi,        arg(0)              ; coeff_ptr
        mov         rdi,        arg(1)              ; dcoef_ptr

        pxor        mm0,        mm0                 ; all-zero compare operand
        pxor        mm7,        mm7                 ; running sum (two dwords)

        movd        mm6,        dword ptr arg(2)    ; dc
        pcmpeqw     mm6,        mm0                 ; word mask: 0xFFFF where dc word == 0

        mov         rcx,        16                  ; groups remaining

    .next_group:
        movq        mm1,        [rsi]               ; coefficients 0..3
        movq        mm2,        [rsi+8]             ;              4..7
        movq        mm3,        [rsi+16]            ;              8..11
        movq        mm4,        [rsi+24]            ;              12..15

        psubw       mm1,        [rdi]
        psubw       mm2,        [rdi+8]
        psubw       mm3,        [rdi+16]
        psubw       mm4,        [rdi+24]

        pand        mm1,        mm6                 ; apply dc mask to first four

        pmaddwd     mm1,        mm1                 ; square, add adjacent pairs
        pmaddwd     mm2,        mm2
        pmaddwd     mm3,        mm3
        pmaddwd     mm4,        mm4

        paddd       mm7,        mm1
        paddd       mm7,        mm2
        paddd       mm7,        mm3
        paddd       mm7,        mm4

        add         rsi,        32
        add         rdi,        32

        dec         rcx
        jnz         .next_group

        ; fold the high dword of the running sum onto the low one
        movq        mm0,        mm7
        psrlq       mm7,        32
        paddd       mm0,        mm7

        movq        rax,        mm0

        pop rdi
        pop rsi
        ; begin epilog
        UNSHADOW_ARGS
        pop         rbp
        ret
    203 
    204 
    ;-----------------------------------------------------------------------
    ; int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
    ;
    ; Sums squared 16-bit differences over 16 consecutive groups of 16
    ; coefficients (32 bytes per group, 512 bytes from each pointer).
    ;
    ; In:    arg(0) = coeff_ptr, arg(1) = dcoef_ptr, arg(2) = dc
    ;        Both pointers are read with movdqa and must be 16-byte aligned.
    ; Out:   rax = sum (32-bit total in the low dword, upper dword zero)
    ; Clobb: xmm0-xmm5, rcx, flags (rsi and rdi are saved and restored)
    ;
    ; dc handling: dc is loaded as a dword and each of its 16-bit halves is
    ; compared with zero. A non-zero low half removes coefficient 0 of every
    ; group from the sum; a non-zero high half removes coefficient 1.
    ; Presumably callers pass 0 or 1 so only coefficient 0 is affected -
    ; TODO confirm against callers.
    ;
    ; Only xmm0-xmm5 are touched: xmm6 and above are callee-saved under the
    ; Microsoft x64 calling convention and this routine does not save them.
    ;-----------------------------------------------------------------------
    global sym(vp8_mbblock_error_xmm_impl)
    sym(vp8_mbblock_error_xmm_impl):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 3
        push rsi
        push rdi
        ; end prolog

        mov         rsi,        arg(0)              ; coeff_ptr
        mov         rdi,        arg(1)              ; dcoef_ptr

        pxor        xmm5,       xmm5                ; all-zero operand, kept for the whole routine
        pxor        xmm2,       xmm2                ; running sum (four dwords)

        movd        xmm1,       dword ptr arg(2)    ; dc
        pcmpeqw     xmm1,       xmm5                ; word mask: 0xFFFF where dc word == 0

        mov         rcx,        16                  ; groups remaining

    .mberror_loop:
        movdqa      xmm0,       [rsi]               ; coefficients 0..7
        movdqa      xmm3,       [rdi]
        psubw       xmm0,       xmm3

        movdqa      xmm3,       [rsi+16]            ; coefficients 8..15
        movdqa      xmm4,       [rdi+16]
        psubw       xmm3,       xmm4

        pand        xmm0,       xmm1                ; apply dc mask to first eight

        pmaddwd     xmm0,       xmm0                ; square, add adjacent pairs
        pmaddwd     xmm3,       xmm3

        paddd       xmm2,       xmm0
        paddd       xmm2,       xmm3

        add         rsi,        32
        add         rdi,        32

        sub         rcx,        1
        jnz         .mberror_loop

        ; Horizontal add: spread the four dwords into zero-extended qword
        ; lanes, then fold the high qword onto the low one.
        movdqa      xmm0,       xmm2
        punpckldq   xmm0,       xmm5
        punpckhdq   xmm2,       xmm5
        paddd       xmm0,       xmm2

        movdqa      xmm1,       xmm0
        psrldq      xmm0,       8
        paddd       xmm0,       xmm1

        movq        rax,        xmm0

        pop rdi
        pop rsi
        ; begin epilog
        UNSHADOW_ARGS
        pop         rbp
        ret
    271 
    272 
    ;-----------------------------------------------------------------------
    ; int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
    ;
    ; Sums squared 16-bit differences between s_ptr and d_ptr over 16
    ; iterations of 8 values each (16 bytes per iteration, 256 bytes from
    ; each pointer).
    ;
    ; In:    arg(0) = s_ptr, arg(1) = d_ptr
    ; Out:   low 32 bits of mm0 / rax = sum. The upper 32 bits of mm0 still
    ;        hold the high-lane partial sum and are not part of the result.
    ; Clobb: mm0-mm2, rcx, flags (rsi and rdi are saved and restored)
    ;
    ; Differences are formed with psubw, i.e. they wrap at 16 bits.
    ; No emms is executed here; the MMX state is left as-is on return.
    ;-----------------------------------------------------------------------
    global sym(vp8_mbuverror_mmx_impl)
    sym(vp8_mbuverror_mmx_impl):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 2
        push rsi
        push rdi
        ; end prolog

        mov         rsi,        arg(0)          ; s_ptr
        mov         rdi,        arg(1)          ; d_ptr

        pxor        mm0,        mm0             ; running sum (two dwords)
        mov         rcx,        16              ; iterations remaining

    .next_chunk:
        movq        mm1,        [rsi]           ; values 0..3 of this chunk
        movq        mm2,        [rsi+8]         ; values 4..7

        psubw       mm1,        [rdi]
        psubw       mm2,        [rdi+8]

        pmaddwd     mm1,        mm1             ; square, add adjacent pairs
        pmaddwd     mm2,        mm2

        paddd       mm0,        mm1
        paddd       mm0,        mm2

        add         rsi,        16
        add         rdi,        16

        sub         rcx,        1
        jnz         .next_chunk

        ; fold the high dword of the running sum onto the low one
        movq        mm1,        mm0
        psrlq       mm1,        32
        paddd       mm0,        mm1

        movq        rax,        mm0

        pop rdi
        pop rsi
        ; begin epilog
        UNSHADOW_ARGS
        pop         rbp
        ret
    328 
    329 
    ;-----------------------------------------------------------------------
    ; int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
    ;
    ; Sums squared 16-bit differences between s_ptr and d_ptr over 16
    ; iterations of 8 values each (16 bytes per iteration, 256 bytes from
    ; each pointer).
    ;
    ; In:    arg(0) = s_ptr, arg(1) = d_ptr
    ;        Both are read with movdqa, so both must be 16-byte aligned.
    ; Out:   rax = sum (32-bit total in the low dword, upper dword zero)
    ; Clobb: xmm0-xmm3, rcx, flags (rsi and rdi are saved and restored)
    ;
    ; Differences are formed with psubw, i.e. they wrap at 16 bits.
    ; The accumulator lives in xmm3: xmm6 and above are callee-saved under
    ; the Microsoft x64 calling convention and this routine does not save
    ; them.
    ;-----------------------------------------------------------------------
    global sym(vp8_mbuverror_xmm_impl)
    sym(vp8_mbuverror_xmm_impl):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 2
        push rsi
        push rdi
        ; end prolog

        mov         rsi,        arg(0)          ; s_ptr
        mov         rdi,        arg(1)          ; d_ptr

        mov         rcx,        16              ; iterations remaining
        pxor        xmm3,       xmm3            ; running sum (four dwords)

    .mbuverror_loop:
        movdqa      xmm1,       [rsi]
        movdqa      xmm2,       [rdi]

        psubw       xmm1,       xmm2
        pmaddwd     xmm1,       xmm1            ; square, add adjacent pairs

        paddd       xmm3,       xmm1

        add         rsi,        16
        add         rdi,        16

        dec         rcx
        jnz         .mbuverror_loop

        ; Horizontal add: spread the four dwords into zero-extended qword
        ; lanes, then fold the high qword onto the low one.
        pxor        xmm0,       xmm0
        movdqa      xmm1,       xmm3

        punpckldq   xmm1,       xmm0
        punpckhdq   xmm3,       xmm0
        paddd       xmm1,       xmm3

        movdqa      xmm2,       xmm1
        psrldq      xmm1,       8
        paddd       xmm1,       xmm2

        movq        rax,        xmm1

        pop rdi
        pop rsi
        ; begin epilog
        UNSHADOW_ARGS
        pop         rbp
        ret
    385