Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 
     15 ;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
     16 ;
     17 ;  Word-wise multiply of 16 shorts: dq[i] = low 16 bits of (sq[i] * q[i]).
     18 ;  arg(0) sq and arg(2) q are only read; arg(1) dq receives the 32 result bytes.
     19 ;  rsi and rdi are saved and restored here; rax and mm1 are overwritten.
     20 global sym(vp8_dequantize_b_impl_mmx) PRIVATE
     21 sym(vp8_dequantize_b_impl_mmx):
     22     push        rbp
     23     mov         rbp, rsp
     24     SHADOW_ARGS_TO_STACK 3
     25     push        rsi
     26     push        rdi
     27     ; end prolog
     28 
     29         mov         rsi, arg(0)             ; rsi = sq
     30         mov         rdi, arg(1)             ; rdi = dq
     31         mov         rax, arg(2)             ; rax = q
     32 
     33         ; One MMX quadword holds 4 words, so 4 steps at byte offsets
     34         ; 0, 8, 16 and 24 cover the block.  The steps are emitted by the
     35         ; preprocessor instead of being written out four times.
     36 %assign deq_off 0
     37 %rep 4
     38         movq        mm1, [rsi + deq_off]    ; 4 words of sq
     39         pmullw      mm1, [rax + deq_off]    ; * 4 words of q (low 16 bits kept)
     40         movq        [rdi + deq_off], mm1    ; -> 4 words of dq
     41 %assign deq_off deq_off + 8
     42 %endrep
     43 %undef deq_off
     44 
     45     ; begin epilog
     46     pop         rdi
     47     pop         rsi
     48     UNSHADOW_ARGS
     49     pop         rbp
     50     ret
     51 
     52 
     53 ;void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride)
     54 ;  Multiplies the 16 words at input (arg 0) by the 16 words at dq (arg 1), zeroes
     55 ;  input, runs two butterfly passes (each followed by a 4x4 word transpose), rounds
     56 ;  each result as (x + 4) >> 3 and adds it to the 4x4 bytes at dest (arg 2), whose rows
     57 ;  are stride (arg 3) bytes apart.  Sums are saturated to 0..255 before being stored.
     58 global sym(vp8_dequant_idct_add_mmx) PRIVATE
     59 sym(vp8_dequant_idct_add_mmx):
     60     push        rbp
     61     mov         rbp, rsp
     62     SHADOW_ARGS_TO_STACK 4
     63     GET_GOT     rbx
     64     push        rdi
     65     ; end prolog
     66         ; rax = input, rdx = dq (rdx is reloaded with dest once dq is no longer needed)
     67         mov         rax,    arg(0) ;input
     68         mov         rdx,    arg(1) ;dq
     69         ; Dequantize: mm0..mm3 = rows 0..3 of input (4 words each) times the matching
     70         ; dq words.  pmullw keeps the low 16 bits of each product.
     71         movq        mm0,    [rax   ]
     72         pmullw      mm0,    [rdx]
     73 
     74         movq        mm1,    [rax +8]
     75         pmullw      mm1,    [rdx +8]
     76 
     77         movq        mm2,    [rax+16]
     78         pmullw      mm2,    [rdx+16]
     79 
     80         movq        mm3,    [rax+24]
     81         pmullw      mm3,    [rdx+24]
     82 
     83         mov         rdx,    arg(2) ;dest
     84 
     85         pxor        mm7,    mm7             ; mm7 = 0
     86 
     87         ; The products now live in mm0..mm3, so the 32 input bytes are cleared.
     88         movq        [rax],   mm7
     89         movq        [rax+8], mm7
     90 
     91         movq        [rax+16],mm7
     92         movq        [rax+24],mm7
     93 
     94         ; stride is a 32-bit int; movsxd sign-extends it so it can index off rdx.
     95         movsxd      rdi,            dword ptr arg(3) ;stride
     96         ; ---- Pass 1: word-wise butterflies between rows (4 columns at once) ----
     97         psubw       mm0,            mm2             ; b1 = row0 - row2
     98         paddw       mm2,            mm2             ; mm2 = 2 * row2
     99 
    100         movq        mm5,            mm1             ; mm5 = row1 (ip1)
    101         paddw       mm2,            mm0             ; a1 = (row0 - row2) + 2*row2 = row0 + row2
    102 
    103         pmulhw      mm5,            [GLOBAL(x_s1sqr2)]; high 16 bits of ip1 * 0x8A8C (signed multiply)
    104         paddw       mm5,            mm1             ; + ip1 => ip1 * sin(pi/8) * sqrt(2) (0x8A8C is negative as a signed word)
    105 
    106         movq        mm7,            mm3             ; mm7 = row3 (ip3)
    107         pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]; high 16 bits of ip3 * 0x4E7B
    108 
    109         paddw       mm7,            mm3             ; + ip3 => ip3 * cos(pi/8) * sqrt(2)
    110         psubw       mm7,            mm5             ; c1 = ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)
    111 
    112         movq        mm5,            mm1
    113         movq        mm4,            mm3             ; keep ip3 for the add after pmulhw
    114 
    115         pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
    116         paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
    117 
    118         pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
    119         paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
    120 
    121         paddw       mm3,            mm5             ; d1 = ip1*cos(pi/8)*sqrt(2) + ip3*sin(pi/8)*sqrt(2)
    122         movq        mm6,            mm2             ; a1
    123 
    124         movq        mm4,            mm0             ; b1
    125         paddw       mm2,            mm3             ; a1 + d1 -> row 0
    126 
    127         paddw       mm4,            mm7             ; b1 + c1 -> row 2 (see the interleave order below)
    128         psubw       mm0,            mm7             ; b1 - c1 -> row 1
    129 
    130         psubw       mm6,            mm3             ; a1 - d1 -> row 3
    131         ; ---- 4x4 word transpose; lane lists read high -> low as <row><col> ----
    132         movq        mm1,            mm2             ; 03 02 01 00
    133         movq        mm3,            mm4             ; 23 22 21 20
    134 
    135         punpcklwd   mm1,            mm0             ; 11 01 10 00
    136         punpckhwd   mm2,            mm0             ; 13 03 12 02
    137 
    138         punpcklwd   mm3,            mm6             ; 31 21 30 20
    139         punpckhwd   mm4,            mm6             ; 33 23 32 22
    140 
    141         movq        mm0,            mm1             ; 11 01 10 00
    142         movq        mm5,            mm2             ; 13 03 12 02
    143 
    144         punpckldq   mm0,            mm3             ; 30 20 10 00
    145         punpckhdq   mm1,            mm3             ; 31 21 11 01
    146 
    147         punpckldq   mm2,            mm4             ; 32 22 12 02
    148         punpckhdq   mm5,            mm4             ; 33 23 13 03
    149 
    150         movq        mm3,            mm5             ; 33 23 13 03
    151         ; ---- Pass 2: the same butterflies on the transposed data, plus rounding ----
    152         psubw       mm0,            mm2             ; b1 = col0 - col2
    153         paddw       mm2,            mm2             ; mm2 = 2 * col2
    154 
    155         movq        mm5,            mm1             ; mm5 = col1 (ip1)
    156         paddw       mm2,            mm0             ; a1 = col0 + col2
    157 
    158         pmulhw      mm5,            [GLOBAL(x_s1sqr2)]; high 16 bits of ip1 * 0x8A8C (signed multiply)
    159         paddw       mm5,            mm1             ; + ip1 => ip1 * sin(pi/8) * sqrt(2)
    160 
    161         movq        mm7,            mm3             ; mm7 = col3 (ip3)
    162         pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]; high 16 bits of ip3 * 0x4E7B
    163 
    164         paddw       mm7,            mm3             ; + ip3 => ip3 * cos(pi/8) * sqrt(2)
    165         psubw       mm7,            mm5             ; c1 = ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)
    166 
    167         movq        mm5,            mm1
    168         movq        mm4,            mm3             ; keep ip3 for the add after pmulhw
    169 
    170         pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
    171         paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
    172 
    173         pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
    174         paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
    175 
    176         paddw       mm3,            mm5             ; d1
    177         paddw       mm0,            [GLOBAL(fours)] ; b1 += 4 (rounding for the >> 3 below)
    178 
    179         paddw       mm2,            [GLOBAL(fours)] ; a1 += 4 (rounding for the >> 3 below)
    180         movq        mm6,            mm2             ; a1
    181 
    182         movq        mm4,            mm0             ; b1
    183         paddw       mm2,            mm3             ; a1 + d1 -> column 0
    184 
    185         paddw       mm4,            mm7             ; b1 + c1 -> column 2 (see the interleave order below)
    186         psubw       mm0,            mm7             ; b1 - c1 -> column 1
    187 
    188         psubw       mm6,            mm3             ; a1 - d1 -> column 3
    189         psraw       mm2,            3               ; arithmetic >> 3 on each of the four results
    190 
    191         psraw       mm0,            3
    192         psraw       mm4,            3
    193 
    194         psraw       mm6,            3
    195         ; ---- Transpose back; mm0, mm1, mm2, mm5 end up as output rows 0..3 ----
    196         movq        mm1,            mm2             ; 03 02 01 00
    197         movq        mm3,            mm4             ; 23 22 21 20
    198 
    199         punpcklwd   mm1,            mm0             ; 11 01 10 00
    200         punpckhwd   mm2,            mm0             ; 13 03 12 02
    201 
    202         punpcklwd   mm3,            mm6             ; 31 21 30 20
    203         punpckhwd   mm4,            mm6             ; 33 23 32 22
    204 
    205         movq        mm0,            mm1             ; 11 01 10 00
    206         movq        mm5,            mm2             ; 13 03 12 02
    207 
    208         punpckldq   mm0,            mm3             ; 30 20 10 00
    209         punpckhdq   mm1,            mm3             ; 31 21 11 01
    210 
    211         punpckldq   mm2,            mm4             ; 32 22 12 02
    212         punpckhdq   mm5,            mm4             ; 33 23 13 03
    213         ; ---- Add to dest: widen 4 bytes to words, add, saturate, store 4 bytes ----
    214         pxor        mm7,            mm7             ; mm7 = 0 (zero bytes for unpack / pack)
    215         ; dest row 0
    216         movd        mm4,            [rdx]
    217         punpcklbw   mm4,            mm7             ; 4 dest bytes -> 4 words
    218         paddsw      mm0,            mm4             ; signed saturating add
    219         packuswb    mm0,            mm7             ; clamp words to unsigned bytes
    220         movd        [rdx],          mm0
    221         ; dest row 1
    222         movd        mm4,            [rdx+rdi]
    223         punpcklbw   mm4,            mm7
    224         paddsw      mm1,            mm4
    225         packuswb    mm1,            mm7
    226         movd        [rdx+rdi],      mm1
    227         ; dest row 2
    228         movd        mm4,            [rdx+2*rdi]
    229         punpcklbw   mm4,            mm7
    230         paddsw      mm2,            mm4
    231         packuswb    mm2,            mm7
    232         movd        [rdx+rdi*2],    mm2
    233 
    234         add         rdx,            rdi             ; rdx = dest + stride
    235         ; dest row 3, addressed as (dest + stride) + 2*stride
    236         movd        mm4,            [rdx+2*rdi]
    237         punpcklbw   mm4,            mm7
    238         paddsw      mm5,            mm4
    239         packuswb    mm5,            mm7
    240         movd        [rdx+rdi*2],    mm5
    241 
    242     ; begin epilog
    243     pop rdi
    244     RESTORE_GOT
    245     UNSHADOW_ARGS
    246     pop         rbp
    247     ret
    248 
    249 SECTION_RODATA
    250 ; Word multipliers for pmulhw, which keeps the high 16 bits of a signed 16x16 product.
    251 ; 0x8A8C = 35468; 35468/65536 ~= 0.5412 ~= sin(pi/8)*sqrt(2)
    252 align 16
    253 x_s1sqr2:       dw 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C
    254 ; 0x4E7B = 20091; 20091/65536 ~= 0.3066 ~= cos(pi/8)*sqrt(2) - 1
    255 align 16
    256 x_c1sqr2less1:  dw 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B
    257 align 16
    258 fours:          dw 0x0004, 0x0004, 0x0004, 0x0004   ; rounding term added before the >> 3
    259