Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 global sym(vp8_sad16x16_mmx) PRIVATE
     15 global sym(vp8_sad8x16_mmx) PRIVATE
     16 global sym(vp8_sad8x8_mmx) PRIVATE
     17 global sym(vp8_sad4x4_mmx) PRIVATE
     18 global sym(vp8_sad16x8_mmx) PRIVATE
     19 
     20 ;unsigned int vp8_sad16x16_mmx(
     21 ;    unsigned char *src_ptr,
     22 ;    int  src_stride,
     23 ;    unsigned char *ref_ptr,
     24 ;    int  ref_stride)
     25 sym(vp8_sad16x16_mmx):
     26     push        rbp
     27     mov         rbp, rsp
     28     SHADOW_ARGS_TO_STACK 4
     29     push rsi
     30     push rdi
     31     ; end prolog
     32 
     33         mov             rsi,        arg(0) ;src_ptr
     34         mov             rdi,        arg(2) ;ref_ptr
     35 
     36         movsxd          rax,        dword ptr arg(1) ;src_stride
     37         movsxd          rdx,        dword ptr arg(3) ;ref_stride
     38 
     39         lea             rcx,        [rsi+rax*8]
     40 
     41         lea             rcx,        [rcx+rax*8]
     42         pxor            mm7,        mm7
     43 
     44         pxor            mm6,        mm6
     45 
     46 .x16x16sad_mmx_loop:
     47 
     48         movq            mm0,        QWORD PTR [rsi]
     49         movq            mm2,        QWORD PTR [rsi+8]
     50 
     51         movq            mm1,        QWORD PTR [rdi]
     52         movq            mm3,        QWORD PTR [rdi+8]
     53 
     54         movq            mm4,        mm0
     55         movq            mm5,        mm2
     56 
     57         psubusb         mm0,        mm1
     58         psubusb         mm1,        mm4
     59 
     60         psubusb         mm2,        mm3
     61         psubusb         mm3,        mm5
     62 
     63         por             mm0,        mm1
     64         por             mm2,        mm3
     65 
     66         movq            mm1,        mm0
     67         movq            mm3,        mm2
     68 
     69         punpcklbw       mm0,        mm6
     70         punpcklbw       mm2,        mm6
     71 
     72         punpckhbw       mm1,        mm6
     73         punpckhbw       mm3,        mm6
     74 
     75         paddw           mm0,        mm2
     76         paddw           mm1,        mm3
     77 
     78 
     79         lea             rsi,        [rsi+rax]
     80         add             rdi,        rdx
     81 
     82         paddw           mm7,        mm0
     83         paddw           mm7,        mm1
     84 
     85         cmp             rsi,        rcx
     86         jne             .x16x16sad_mmx_loop
     87 
     88 
     89         movq            mm0,        mm7
     90 
     91         punpcklwd       mm0,        mm6
     92         punpckhwd       mm7,        mm6
     93 
     94         paddw           mm0,        mm7
     95         movq            mm7,        mm0
     96 
     97 
     98         psrlq           mm0,        32
     99         paddw           mm7,        mm0
    100 
    101         movq            rax,        mm7
    102 
    103     pop rdi
    104     pop rsi
    105     mov rsp, rbp
    106     ; begin epilog
    107     UNSHADOW_ARGS
    108     pop         rbp
    109     ret
    110 
    111 
    112 ;unsigned int vp8_sad8x16_mmx(
    113 ;    unsigned char *src_ptr,
    114 ;    int  src_stride,
    115 ;    unsigned char *ref_ptr,
    116 ;    int  ref_stride)
    117 sym(vp8_sad8x16_mmx):
    118     push        rbp
    119     mov         rbp, rsp
    120     SHADOW_ARGS_TO_STACK 4
    121     push rsi
    122     push rdi
    123     ; end prolog
    124 
    125         mov             rsi,        arg(0) ;src_ptr
    126         mov             rdi,        arg(2) ;ref_ptr
    127 
    128         movsxd          rax,        dword ptr arg(1) ;src_stride
    129         movsxd          rdx,        dword ptr arg(3) ;ref_stride
    130 
    131         lea             rcx,        [rsi+rax*8]
    132 
    133         lea             rcx,        [rcx+rax*8]
    134         pxor            mm7,        mm7
    135 
    136         pxor            mm6,        mm6
    137 
    138 .x8x16sad_mmx_loop:
    139 
    140         movq            mm0,        QWORD PTR [rsi]
    141         movq            mm1,        QWORD PTR [rdi]
    142 
    143         movq            mm2,        mm0
    144         psubusb         mm0,        mm1
    145 
    146         psubusb         mm1,        mm2
    147         por             mm0,        mm1
    148 
    149         movq            mm2,        mm0
    150         punpcklbw       mm0,        mm6
    151 
    152         punpckhbw       mm2,        mm6
    153         lea             rsi,        [rsi+rax]
    154 
    155         add             rdi,        rdx
    156         paddw           mm7,        mm0
    157 
    158         paddw           mm7,        mm2
    159         cmp             rsi,        rcx
    160 
    161         jne             .x8x16sad_mmx_loop
    162 
    163         movq            mm0,        mm7
    164         punpcklwd       mm0,        mm6
    165 
    166         punpckhwd       mm7,        mm6
    167         paddw           mm0,        mm7
    168 
    169         movq            mm7,        mm0
    170         psrlq           mm0,        32
    171 
    172         paddw           mm7,        mm0
    173         movq            rax,        mm7
    174 
    175     pop rdi
    176     pop rsi
    177     mov rsp, rbp
    178     ; begin epilog
    179     UNSHADOW_ARGS
    180     pop         rbp
    181     ret
    182 
    183 
    184 ;unsigned int vp8_sad8x8_mmx(
    185 ;    unsigned char *src_ptr,
    186 ;    int  src_stride,
    187 ;    unsigned char *ref_ptr,
    188 ;    int  ref_stride)
    189 sym(vp8_sad8x8_mmx):
    190     push        rbp
    191     mov         rbp, rsp
    192     SHADOW_ARGS_TO_STACK 4
    193     push rsi
    194     push rdi
    195     ; end prolog
    196 
    197         mov             rsi,        arg(0) ;src_ptr
    198         mov             rdi,        arg(2) ;ref_ptr
    199 
    200         movsxd          rax,        dword ptr arg(1) ;src_stride
    201         movsxd          rdx,        dword ptr arg(3) ;ref_stride
    202 
    203         lea             rcx,        [rsi+rax*8]
    204         pxor            mm7,        mm7
    205 
    206         pxor            mm6,        mm6
    207 
    208 .x8x8sad_mmx_loop:
    209 
    210         movq            mm0,        QWORD PTR [rsi]
    211         movq            mm1,        QWORD PTR [rdi]
    212 
    213         movq            mm2,        mm0
    214         psubusb         mm0,        mm1
    215 
    216         psubusb         mm1,        mm2
    217         por             mm0,        mm1
    218 
    219         movq            mm2,        mm0
    220         punpcklbw       mm0,        mm6
    221 
    222         punpckhbw       mm2,        mm6
    223         paddw           mm0,        mm2
    224 
    225         lea             rsi,       [rsi+rax]
    226         add             rdi,        rdx
    227 
    228         paddw           mm7,       mm0
    229         cmp             rsi,        rcx
    230 
    231         jne             .x8x8sad_mmx_loop
    232 
    233         movq            mm0,        mm7
    234         punpcklwd       mm0,        mm6
    235 
    236         punpckhwd       mm7,        mm6
    237         paddw           mm0,        mm7
    238 
    239         movq            mm7,        mm0
    240         psrlq           mm0,        32
    241 
    242         paddw           mm7,        mm0
    243         movq            rax,        mm7
    244 
    245     pop rdi
    246     pop rsi
    247     mov rsp, rbp
    248     ; begin epilog
    249     UNSHADOW_ARGS
    250     pop         rbp
    251     ret
    252 
    253 
    254 ;unsigned int vp8_sad4x4_mmx(
    255 ;    unsigned char *src_ptr,
    256 ;    int  src_stride,
    257 ;    unsigned char *ref_ptr,
    258 ;    int  ref_stride)
    259 sym(vp8_sad4x4_mmx):
    260     push        rbp
    261     mov         rbp, rsp
    262     SHADOW_ARGS_TO_STACK 4
    263     push rsi
    264     push rdi
    265     ; end prolog
    266 
    267         mov             rsi,        arg(0) ;src_ptr
    268         mov             rdi,        arg(2) ;ref_ptr
    269 
    270         movsxd          rax,        dword ptr arg(1) ;src_stride
    271         movsxd          rdx,        dword ptr arg(3) ;ref_stride
    272 
    273         movd            mm0,        DWORD PTR [rsi]
    274         movd            mm1,        DWORD PTR [rdi]
    275 
    276         movd            mm2,        DWORD PTR [rsi+rax]
    277         movd            mm3,        DWORD PTR [rdi+rdx]
    278 
    279         punpcklbw       mm0,        mm2
    280         punpcklbw       mm1,        mm3
    281 
    282         movq            mm2,        mm0
    283         psubusb         mm0,        mm1
    284 
    285         psubusb         mm1,        mm2
    286         por             mm0,        mm1
    287 
    288         movq            mm2,        mm0
    289         pxor            mm3,        mm3
    290 
    291         punpcklbw       mm0,        mm3
    292         punpckhbw       mm2,        mm3
    293 
    294         paddw           mm0,        mm2
    295 
    296         lea             rsi,        [rsi+rax*2]
    297         lea             rdi,        [rdi+rdx*2]
    298 
    299         movd            mm4,        DWORD PTR [rsi]
    300         movd            mm5,        DWORD PTR [rdi]
    301 
    302         movd            mm6,        DWORD PTR [rsi+rax]
    303         movd            mm7,        DWORD PTR [rdi+rdx]
    304 
    305         punpcklbw       mm4,        mm6
    306         punpcklbw       mm5,        mm7
    307 
    308         movq            mm6,        mm4
    309         psubusb         mm4,        mm5
    310 
    311         psubusb         mm5,        mm6
    312         por             mm4,        mm5
    313 
    314         movq            mm5,        mm4
    315         punpcklbw       mm4,        mm3
    316 
    317         punpckhbw       mm5,        mm3
    318         paddw           mm4,        mm5
    319 
    320         paddw           mm0,        mm4
    321         movq            mm1,        mm0
    322 
    323         punpcklwd       mm0,        mm3
    324         punpckhwd       mm1,        mm3
    325 
    326         paddw           mm0,        mm1
    327         movq            mm1,        mm0
    328 
    329         psrlq           mm0,        32
    330         paddw           mm0,        mm1
    331 
    332         movq            rax,        mm0
    333 
    334     pop rdi
    335     pop rsi
    336     mov rsp, rbp
    337     ; begin epilog
    338     UNSHADOW_ARGS
    339     pop         rbp
    340     ret
    341 
    342 
    343 ;unsigned int vp8_sad16x8_mmx(
    344 ;    unsigned char *src_ptr,
    345 ;    int  src_stride,
    346 ;    unsigned char *ref_ptr,
    347 ;    int  ref_stride)
    348 sym(vp8_sad16x8_mmx):
    349     push        rbp
    350     mov         rbp, rsp
    351     SHADOW_ARGS_TO_STACK 4
    352     push rsi
    353     push rdi
    354     ; end prolog
    355 
    356         mov             rsi,        arg(0) ;src_ptr
    357         mov             rdi,        arg(2) ;ref_ptr
    358 
    359         movsxd          rax,        dword ptr arg(1) ;src_stride
    360         movsxd          rdx,        dword ptr arg(3) ;ref_stride
    361 
    362         lea             rcx,        [rsi+rax*8]
    363         pxor            mm7,        mm7
    364 
    365         pxor            mm6,        mm6
    366 
    367 .x16x8sad_mmx_loop:
    368 
    369         movq            mm0,       [rsi]
    370         movq            mm1,       [rdi]
    371 
    372         movq            mm2,        [rsi+8]
    373         movq            mm3,        [rdi+8]
    374 
    375         movq            mm4,        mm0
    376         movq            mm5,        mm2
    377 
    378         psubusb         mm0,        mm1
    379         psubusb         mm1,        mm4
    380 
    381         psubusb         mm2,        mm3
    382         psubusb         mm3,        mm5
    383 
    384         por             mm0,        mm1
    385         por             mm2,        mm3
    386 
    387         movq            mm1,        mm0
    388         movq            mm3,        mm2
    389 
    390         punpcklbw       mm0,        mm6
    391         punpckhbw       mm1,        mm6
    392 
    393         punpcklbw       mm2,        mm6
    394         punpckhbw       mm3,        mm6
    395 
    396 
    397         paddw           mm0,        mm2
    398         paddw           mm1,        mm3
    399 
    400         paddw           mm0,        mm1
    401         lea             rsi,        [rsi+rax]
    402 
    403         add             rdi,        rdx
    404         paddw           mm7,        mm0
    405 
    406         cmp             rsi,        rcx
    407         jne             .x16x8sad_mmx_loop
    408 
    409         movq            mm0,        mm7
    410         punpcklwd       mm0,        mm6
    411 
    412         punpckhwd       mm7,        mm6
    413         paddw           mm0,        mm7
    414 
    415         movq            mm7,        mm0
    416         psrlq           mm0,        32
    417 
    418         paddw           mm7,        mm0
    419         movq            rax,        mm7
    420 
    421     pop rdi
    422     pop rsi
    423     mov rsp, rbp
    424     ; begin epilog
    425     UNSHADOW_ARGS
    426     pop         rbp
    427     ret
    428