Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 ;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
     15 global sym(vp9_get_mb_ss_mmx) PRIVATE
     16 sym(vp9_get_mb_ss_mmx):
     17     push        rbp
     18     mov         rbp, rsp
     19     SHADOW_ARGS_TO_STACK 7
     20     GET_GOT     rbx
     21     push rsi
     22     push rdi
     23     sub         rsp, 8
     24     ; end prolog
     25 
     26         mov         rax, arg(0) ;src_ptr
     27         mov         rcx, 16
     28         pxor        mm4, mm4
     29 
     30 .NEXTROW:
     31         movq        mm0, [rax]
     32         movq        mm1, [rax+8]
     33         movq        mm2, [rax+16]
     34         movq        mm3, [rax+24]
     35         pmaddwd     mm0, mm0
     36         pmaddwd     mm1, mm1
     37         pmaddwd     mm2, mm2
     38         pmaddwd     mm3, mm3
     39 
     40         paddd       mm4, mm0
     41         paddd       mm4, mm1
     42         paddd       mm4, mm2
     43         paddd       mm4, mm3
     44 
     45         add         rax, 32
     46         dec         rcx
     47         ja          .NEXTROW
     48         movq        QWORD PTR [rsp], mm4
     49 
     50         ;return sum[0]+sum[1];
     51         movsxd      rax, dword ptr [rsp]
     52         movsxd      rcx, dword ptr [rsp+4]
     53         add         rax, rcx
     54 
     55 
     56     ; begin epilog
     57     add rsp, 8
     58     pop rdi
     59     pop rsi
     60     RESTORE_GOT
     61     UNSHADOW_ARGS
     62     pop         rbp
     63     ret
     64 
     65 
     66 ;unsigned int vp9_get8x8var_mmx
     67 ;(
     68 ;    unsigned char *src_ptr,
     69 ;    int  source_stride,
     70 ;    unsigned char *ref_ptr,
     71 ;    int  recon_stride,
     72 ;    unsigned int *SSE,
     73 ;    int *Sum
     74 ;)
     75 global sym(vp9_get8x8var_mmx) PRIVATE
     76 sym(vp9_get8x8var_mmx):
     77     push        rbp
     78     mov         rbp, rsp
     79     SHADOW_ARGS_TO_STACK 6
     80     push rsi
     81     push rdi
     82     push rbx
     83     sub         rsp, 16
     84     ; end prolog
     85 
     86 
     87         pxor        mm5, mm5                    ; Blank mmx6
     88         pxor        mm6, mm6                    ; Blank mmx7
     89         pxor        mm7, mm7                    ; Blank mmx7
     90 
     91         mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
     92         mov         rbx, arg(2) ;[ref_ptr]
     93         movsxd      rcx, dword ptr arg(1) ;[source_stride]
     94         movsxd      rdx, dword ptr arg(3) ;[recon_stride]
     95 
     96         ; Row 1
     97         movq        mm0, [rax]                  ; Copy eight bytes to mm0
     98         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
     99         movq        mm2, mm0                    ; Take copies
    100         movq        mm3, mm1                    ; Take copies
    101 
    102         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
    103         punpcklbw   mm1, mm6
    104         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
    105         punpckhbw   mm3, mm6
    106         psubsw      mm0, mm1                    ; A-B (low order) to MM0
    107         psubsw      mm2, mm3                    ; A-B (high order) to MM2
    108 
    109         paddw       mm5, mm0                    ; accumulate differences in mm5
    110         paddw       mm5, mm2                    ; accumulate differences in mm5
    111 
    112         pmaddwd     mm0, mm0                    ; square and accumulate
    113         pmaddwd     mm2, mm2                    ; square and accumulate
    114         add         rbx,rdx                     ; Inc pointer into ref data
    115         add         rax,rcx                     ; Inc pointer into the new data
    116         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
    117         paddd       mm7, mm0                    ; accumulate in mm7
    118         paddd       mm7, mm2                    ; accumulate in mm7
    119 
    120 
    121         ; Row 2
    122         movq        mm0, [rax]                  ; Copy eight bytes to mm0
    123         movq        mm2, mm0                    ; Take copies
    124         movq        mm3, mm1                    ; Take copies
    125 
    126         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
    127         punpcklbw   mm1, mm6
    128         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
    129         punpckhbw   mm3, mm6
    130         psubsw      mm0, mm1                    ; A-B (low order) to MM0
    131         psubsw      mm2, mm3                    ; A-B (high order) to MM2
    132 
    133         paddw       mm5, mm0                    ; accumulate differences in mm5
    134         paddw       mm5, mm2                    ; accumulate differences in mm5
    135 
    136         pmaddwd     mm0, mm0                    ; square and accumulate
    137         pmaddwd     mm2, mm2                    ; square and accumulate
    138         add         rbx,rdx                     ; Inc pointer into ref data
    139         add         rax,rcx                     ; Inc pointer into the new data
    140         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
    141         paddd       mm7, mm0                    ; accumulate in mm7
    142         paddd       mm7, mm2                    ; accumulate in mm7
    143 
    144         ; Row 3
    145         movq        mm0, [rax]                  ; Copy eight bytes to mm0
    146         movq        mm2, mm0                    ; Take copies
    147         movq        mm3, mm1                    ; Take copies
    148 
    149         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
    150         punpcklbw   mm1, mm6
    151         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
    152         punpckhbw   mm3, mm6
    153         psubsw      mm0, mm1                    ; A-B (low order) to MM0
    154         psubsw      mm2, mm3                    ; A-B (high order) to MM2
    155 
    156         paddw       mm5, mm0                    ; accumulate differences in mm5
    157         paddw       mm5, mm2                    ; accumulate differences in mm5
    158 
    159         pmaddwd     mm0, mm0                    ; square and accumulate
    160         pmaddwd     mm2, mm2                    ; square and accumulate
    161         add         rbx,rdx                     ; Inc pointer into ref data
    162         add         rax,rcx                     ; Inc pointer into the new data
    163         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
    164         paddd       mm7, mm0                    ; accumulate in mm7
    165         paddd       mm7, mm2                    ; accumulate in mm7
    166 
    167         ; Row 4
    168         movq        mm0, [rax]                  ; Copy eight bytes to mm0
    169         movq        mm2, mm0                    ; Take copies
    170         movq        mm3, mm1                    ; Take copies
    171 
    172         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
    173         punpcklbw   mm1, mm6
    174         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
    175         punpckhbw   mm3, mm6
    176         psubsw      mm0, mm1                    ; A-B (low order) to MM0
    177         psubsw      mm2, mm3                    ; A-B (high order) to MM2
    178 
    179         paddw       mm5, mm0                    ; accumulate differences in mm5
    180         paddw       mm5, mm2                    ; accumulate differences in mm5
    181 
    182         pmaddwd     mm0, mm0                    ; square and accumulate
    183         pmaddwd     mm2, mm2                    ; square and accumulate
    184         add         rbx,rdx                     ; Inc pointer into ref data
    185         add         rax,rcx                     ; Inc pointer into the new data
    186         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
    187         paddd       mm7, mm0                    ; accumulate in mm7
    188         paddd       mm7, mm2                    ; accumulate in mm7
    189 
    190         ; Row 5
    191         movq        mm0, [rax]                  ; Copy eight bytes to mm0
    192         movq        mm2, mm0                    ; Take copies
    193         movq        mm3, mm1                    ; Take copies
    194 
    195         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
    196         punpcklbw   mm1, mm6
    197         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
    198         punpckhbw   mm3, mm6
    199         psubsw      mm0, mm1                    ; A-B (low order) to MM0
    200         psubsw      mm2, mm3                    ; A-B (high order) to MM2
    201 
    202         paddw       mm5, mm0                    ; accumulate differences in mm5
    203         paddw       mm5, mm2                    ; accumulate differences in mm5
    204 
    205         pmaddwd     mm0, mm0                    ; square and accumulate
    206         pmaddwd     mm2, mm2                    ; square and accumulate
    207         add         rbx,rdx                     ; Inc pointer into ref data
    208         add         rax,rcx                     ; Inc pointer into the new data
    209         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
    210         ;              movq        mm4, [rbx + rdx]
    211         paddd       mm7, mm0                    ; accumulate in mm7
    212         paddd       mm7, mm2                    ; accumulate in mm7
    213 
    214         ; Row 6
    215         movq        mm0, [rax]                  ; Copy eight bytes to mm0
    216         movq        mm2, mm0                    ; Take copies
    217         movq        mm3, mm1                    ; Take copies
    218 
    219         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
    220         punpcklbw   mm1, mm6
    221         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
    222         punpckhbw   mm3, mm6
    223         psubsw      mm0, mm1                    ; A-B (low order) to MM0
    224         psubsw      mm2, mm3                    ; A-B (high order) to MM2
    225 
    226         paddw       mm5, mm0                    ; accumulate differences in mm5
    227         paddw       mm5, mm2                    ; accumulate differences in mm5
    228 
    229         pmaddwd     mm0, mm0                    ; square and accumulate
    230         pmaddwd     mm2, mm2                    ; square and accumulate
    231         add         rbx,rdx                     ; Inc pointer into ref data
    232         add         rax,rcx                     ; Inc pointer into the new data
    233         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
    234         paddd       mm7, mm0                    ; accumulate in mm7
    235         paddd       mm7, mm2                    ; accumulate in mm7
    236 
    237         ; Row 7
    238         movq        mm0, [rax]                  ; Copy eight bytes to mm0
    239         movq        mm2, mm0                    ; Take copies
    240         movq        mm3, mm1                    ; Take copies
    241 
    242         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
    243         punpcklbw   mm1, mm6
    244         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
    245         punpckhbw   mm3, mm6
    246         psubsw      mm0, mm1                    ; A-B (low order) to MM0
    247         psubsw      mm2, mm3                    ; A-B (high order) to MM2
    248 
    249         paddw       mm5, mm0                    ; accumulate differences in mm5
    250         paddw       mm5, mm2                    ; accumulate differences in mm5
    251 
    252         pmaddwd     mm0, mm0                    ; square and accumulate
    253         pmaddwd     mm2, mm2                    ; square and accumulate
    254         add         rbx,rdx                     ; Inc pointer into ref data
    255         add         rax,rcx                     ; Inc pointer into the new data
    256         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
    257         paddd       mm7, mm0                    ; accumulate in mm7
    258         paddd       mm7, mm2                    ; accumulate in mm7
    259 
    260         ; Row 8
    261         movq        mm0, [rax]                  ; Copy eight bytes to mm0
    262         movq        mm2, mm0                    ; Take copies
    263         movq        mm3, mm1                    ; Take copies
    264 
    265         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
    266         punpcklbw   mm1, mm6
    267         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
    268         punpckhbw   mm3, mm6
    269         psubsw      mm0, mm1                    ; A-B (low order) to MM0
    270         psubsw      mm2, mm3                    ; A-B (high order) to MM2
    271 
    272         paddw       mm5, mm0                    ; accumulate differences in mm5
    273         paddw       mm5, mm2                    ; accumulate differences in mm5
    274 
    275         pmaddwd     mm0, mm0                    ; square and accumulate
    276         pmaddwd     mm2, mm2                    ; square and accumulate
    277         add         rbx,rdx                     ; Inc pointer into ref data
    278         add         rax,rcx                     ; Inc pointer into the new data
    279         paddd       mm7, mm0                    ; accumulate in mm7
    280         paddd       mm7, mm2                    ; accumulate in mm7
    281 
    282         ; Now accumulate the final results.
    283         movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
    284         movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
    285         movsx       rdx, WORD PTR [rsp+8]
    286         movsx       rcx, WORD PTR [rsp+10]
    287         movsx       rbx, WORD PTR [rsp+12]
    288         movsx       rax, WORD PTR [rsp+14]
    289         add         rdx, rcx
    290         add         rbx, rax
    291         add         rdx, rbx    ;XSum
    292         movsxd      rax, DWORD PTR [rsp]
    293         movsxd      rcx, DWORD PTR [rsp+4]
    294         add         rax, rcx    ;XXSum
    295         mov         rsi, arg(4) ;SSE
    296         mov         rdi, arg(5) ;Sum
    297         mov         dword ptr [rsi], eax
    298         mov         dword ptr [rdi], edx
    299         xor         rax, rax    ; return 0
    300 
    301 
    302     ; begin epilog
    303     add rsp, 16
    304     pop rbx
    305     pop rdi
    306     pop rsi
    307     UNSHADOW_ARGS
    308     pop         rbp
    309     ret
    310 
    311 
    312 
    313 ;unsigned int
    314 ;vp9_get4x4var_mmx
    315 ;(
    316 ;    unsigned char *src_ptr,
    317 ;    int  source_stride,
    318 ;    unsigned char *ref_ptr,
    319 ;    int  recon_stride,
    320 ;    unsigned int *SSE,
    321 ;    int *Sum
    322 ;)
    323 global sym(vp9_get4x4var_mmx) PRIVATE
    324 sym(vp9_get4x4var_mmx):
    325     push        rbp
    326     mov         rbp, rsp
    327     SHADOW_ARGS_TO_STACK 6
    328     push rsi
    329     push rdi
    330     push rbx
    331     sub         rsp, 16
    332     ; end prolog
    333 
    334 
    335         pxor        mm5, mm5                    ; Blank mmx6
    336         pxor        mm6, mm6                    ; Blank mmx7
    337         pxor        mm7, mm7                    ; Blank mmx7
    338 
    339         mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
    340         mov         rbx, arg(2) ;[ref_ptr]
    341         movsxd      rcx, dword ptr arg(1) ;[source_stride]
    342         movsxd      rdx, dword ptr arg(3) ;[recon_stride]
    343 
    344         ; Row 1
    345         movd        mm0, [rax]                  ; Copy 4 bytes to mm0
    346         movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
    347         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
    348         punpcklbw   mm1, mm6
    349         psubsw      mm0, mm1                    ; A-B (low order) to MM0
    350         paddw       mm5, mm0                    ; accumulate differences in mm5
    351         pmaddwd     mm0, mm0                    ; square and accumulate
    352         add         rbx,rdx                     ; Inc pointer into ref data
    353         add         rax,rcx                     ; Inc pointer into the new data
    354         movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
    355         paddd       mm7, mm0                    ; accumulate in mm7
    356 
    357 
    358         ; Row 2
    359         movd        mm0, [rax]                  ; Copy 4 bytes to mm0
    360         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
    361         punpcklbw   mm1, mm6
    362         psubsw      mm0, mm1                    ; A-B (low order) to MM0
    363         paddw       mm5, mm0                    ; accumulate differences in mm5
    364 
    365         pmaddwd     mm0, mm0                    ; square and accumulate
    366         add         rbx,rdx                     ; Inc pointer into ref data
    367         add         rax,rcx                     ; Inc pointer into the new data
    368         movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
    369         paddd       mm7, mm0                    ; accumulate in mm7
    370 
    371         ; Row 3
    372         movd        mm0, [rax]                  ; Copy 4 bytes to mm0
    373         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
    374         punpcklbw   mm1, mm6
    375         psubsw      mm0, mm1                    ; A-B (low order) to MM0
    376         paddw       mm5, mm0                    ; accumulate differences in mm5
    377 
    378         pmaddwd     mm0, mm0                    ; square and accumulate
    379         add         rbx,rdx                     ; Inc pointer into ref data
    380         add         rax,rcx                     ; Inc pointer into the new data
    381         movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
    382         paddd       mm7, mm0                    ; accumulate in mm7
    383 
    384         ; Row 4
    385         movd        mm0, [rax]                  ; Copy 4 bytes to mm0
    386 
    387         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
    388         punpcklbw   mm1, mm6
    389         psubsw      mm0, mm1                    ; A-B (low order) to MM0
    390 
    391         paddw       mm5, mm0                    ; accumulate differences in mm5
    392 
    393         pmaddwd     mm0, mm0                    ; square and accumulate
    394         paddd       mm7, mm0                    ; accumulate in mm7
    395 
    396 
    397         ; Now accumulate the final results.
    398         movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
    399         movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
    400         movsx       rdx, WORD PTR [rsp+8]
    401         movsx       rcx, WORD PTR [rsp+10]
    402         movsx       rbx, WORD PTR [rsp+12]
    403         movsx       rax, WORD PTR [rsp+14]
    404         add         rdx, rcx
    405         add         rbx, rax
    406         add         rdx, rbx    ;XSum
    407         movsxd      rax, DWORD PTR [rsp]
    408         movsxd      rcx, DWORD PTR [rsp+4]
    409         add         rax, rcx    ;XXSum
    410         mov         rsi, arg(4) ;SSE
    411         mov         rdi, arg(5) ;Sum
    412         mov         dword ptr [rsi], eax
    413         mov         dword ptr [rdi], edx
    414         xor         rax, rax    ; return 0
    415 
    416 
    417     ; begin epilog
    418     add rsp, 16
    419     pop rbx
    420     pop rdi
    421     pop rsi
    422     UNSHADOW_ARGS
    423     pop         rbp
    424     ret
    425 
    426 
    427 
    428 ;unsigned int
    429 ;vp9_get4x4sse_cs_mmx
    430 ;(
    431 ;    unsigned char *src_ptr,
    432 ;    int  source_stride,
    433 ;    unsigned char *ref_ptr,
    434 ;    int  recon_stride
    435 ;)
    436 global sym(vp9_get4x4sse_cs_mmx) PRIVATE
    437 sym(vp9_get4x4sse_cs_mmx):
    438     push        rbp
    439     mov         rbp, rsp
    440     SHADOW_ARGS_TO_STACK 4
    441     push rsi
    442     push rdi
    443     push rbx
    444     ; end prolog
    445 
    446 
    447         pxor        mm6, mm6                    ; Blank mmx7
    448         pxor        mm7, mm7                    ; Blank mmx7
    449 
    450         mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
    451         mov         rbx, arg(2) ;[ref_ptr]
    452         movsxd      rcx, dword ptr arg(1) ;[source_stride]
    453         movsxd      rdx, dword ptr arg(3) ;[recon_stride]
    454         ; Row 1
    455         movd        mm0, [rax]                  ; Copy eight bytes to mm0
    456         movd        mm1, [rbx]                  ; Copy eight bytes to mm1
    457         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
    458         punpcklbw   mm1, mm6
    459         psubsw      mm0, mm1                    ; A-B (low order) to MM0
    460         pmaddwd     mm0, mm0                    ; square and accumulate
    461         add         rbx,rdx                     ; Inc pointer into ref data
    462         add         rax,rcx                     ; Inc pointer into the new data
    463         movd        mm1, [rbx]                  ; Copy eight bytes to mm1
    464         paddd       mm7, mm0                    ; accumulate in mm7
    465 
    466         ; Row 2
    467         movd        mm0, [rax]                  ; Copy eight bytes to mm0
    468         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
    469         punpcklbw   mm1, mm6
    470         psubsw      mm0, mm1                    ; A-B (low order) to MM0
    471         pmaddwd     mm0, mm0                    ; square and accumulate
    472         add         rbx,rdx                     ; Inc pointer into ref data
    473         add         rax,rcx                     ; Inc pointer into the new data
    474         movd        mm1, [rbx]                  ; Copy eight bytes to mm1
    475         paddd       mm7, mm0                    ; accumulate in mm7
    476 
    477         ; Row 3
    478         movd        mm0, [rax]                  ; Copy eight bytes to mm0
    479         punpcklbw   mm1, mm6
    480         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
    481         psubsw      mm0, mm1                    ; A-B (low order) to MM0
    482 
    483         pmaddwd     mm0, mm0                    ; square and accumulate
    484         add         rbx,rdx                     ; Inc pointer into ref data
    485         add         rax,rcx                     ; Inc pointer into the new data
    486         movd        mm1, [rbx]                  ; Copy eight bytes to mm1
    487         paddd       mm7, mm0                    ; accumulate in mm7
    488 
    489         ; Row 4
    490         movd        mm0, [rax]                  ; Copy eight bytes to mm0
    491         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
    492         punpcklbw   mm1, mm6
    493         psubsw      mm0, mm1                    ; A-B (low order) to MM0
    494         pmaddwd     mm0, mm0                    ; square and accumulate
    495         paddd       mm7, mm0                    ; accumulate in mm7
    496 
    497         movq        mm0,    mm7                 ;
    498         psrlq       mm7,    32
    499 
    500         paddd       mm0,    mm7
    501         movq        rax,    mm0
    502 
    503 
    504     ; begin epilog
    505     pop rbx
    506     pop rdi
    507     pop rsi
    508     UNSHADOW_ARGS
    509     pop         rbp
    510     ret
    511