Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 SECTION .text
     15 
     16 ;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q) -- dq[i] = sq[i] * q[i] for 16 words
     17 global sym(vp8_dequantize_b_impl_mmx) PRIVATE
     18 sym(vp8_dequantize_b_impl_mmx):
     19     push        rbp
     20     mov         rbp, rsp
     21     SHADOW_ARGS_TO_STACK 3
     22     push        rsi                     ; rsi/rdi are used as pointers below and restored in the epilog
     23     push        rdi
     24     ; end prolog
     25         ; Four identical steps, each handling 4 words (8 bytes): load sq, multiply by q, store to dq.
     26         mov       rsi, arg(0) ;sq  (read only)
     27         mov       rdi, arg(1) ;dq  (written)
     28         mov       rax, arg(2) ;q   (read only)
     29 
     30         movq      mm1, [rsi]
     31         pmullw    mm1, [rax+0]            ; mm1 = sq[0..3] * q[0..3], low 16 bits of each product
     32         movq      [rdi], mm1
     33 
     34         movq      mm1, [rsi+8]
     35         pmullw    mm1, [rax+8]            ; mm1 = sq[4..7] * q[4..7], low 16 bits of each product
     36         movq      [rdi+8], mm1
     37 
     38         movq      mm1, [rsi+16]
     39         pmullw    mm1, [rax+16]            ; mm1 = sq[8..11] * q[8..11], low 16 bits of each product
     40         movq      [rdi+16], mm1
     41 
     42         movq      mm1, [rsi+24]
     43         pmullw    mm1, [rax+24]            ; mm1 = sq[12..15] * q[12..15], low 16 bits of each product
     44         movq      [rdi+24], mm1
     45         ; NOTE(review): no emms is executed in this routine; presumably the caller resets MMX state -- confirm.
     46     ; begin epilog
     47     pop rdi
     48     pop rsi
     49     UNSHADOW_ARGS
     50     pop         rbp
     51     ret
     52 
     53 
     54 ;void vp8_dequant_idct_add_mmx(
     55 ;short *input,            0  16 words; multiplied by dq, then overwritten with zeros
     56 ;short *dq,               1  16 multiplier words (read only)
     57 ;unsigned char *dest,     2  4 rows of 4 bytes; the transform result is added in place
     58 ;int stride)              3  byte offset from one dest row to the next
     59 global sym(vp8_dequant_idct_add_mmx) PRIVATE
     60 sym(vp8_dequant_idct_add_mmx):
     61     push        rbp
     62     mov         rbp, rsp
     63     SHADOW_ARGS_TO_STACK 4
     64     GET_GOT     rbx
     65     push        rdi
     66     ; end prolog (GET_GOT presumably sets up addressing for the GLOBAL() constant loads -- see x86_abi_support.asm)
     67         ; Step 1: mm0..mm3 = input words 0-3, 4-7, 8-11, 12-15, each multiplied by the matching dq words.
     68         mov         rax,    arg(0) ;input
     69         mov         rdx,    arg(1) ;dq
     70 
     71 
     72         movq        mm0,    [rax   ]
     73         pmullw      mm0,    [rdx]
     74 
     75         movq        mm1,    [rax +8]
     76         pmullw      mm1,    [rdx +8]
     77 
     78         movq        mm2,    [rax+16]
     79         pmullw      mm2,    [rdx+16]
     80 
     81         movq        mm3,    [rax+24]
     82         pmullw      mm3,    [rdx+24]
     83 
     84         mov         rdx,    arg(2) ;dest  (rdx no longer points at dq)
     85 
     86         pxor        mm7,    mm7             ; mm7 = 0
     87 
     88         ; Step 2: clear all 16 input words now that their products are held in mm0..mm3.
     89         movq        [rax],   mm7
     90         movq        [rax+8], mm7
     91 
     92         movq        [rax+16],mm7
     93         movq        [rax+24],mm7
     94 
     95 
     96         movsxd      rdi,            dword ptr arg(3) ;stride, sign-extended to pointer width
     97         ; Step 3: first butterfly pass. Each word lane is independent; "0..3" / "ip0..ip3" name mm0..mm3.
     98         psubw       mm0,            mm2             ; b1= 0-2
     99         paddw       mm2,            mm2             ; mm2 = 2 * (original mm2)
    100 
    101         movq        mm5,            mm1
    102         paddw       mm2,            mm0             ; a1 =0+2  (2*ip2 + (ip0 - ip2))
    103 
    104         pmulhw      mm5,            [GLOBAL(x_s1sqr2)]; 0x8A8C is negative as a signed word: result is (ip1*35468 >> 16) - ip1
    105         paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
    106 
    107         movq        mm7,            mm3             ; mm7 = ip3 (the zero in mm7 is no longer needed)
    108         pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]; (ip3 * 0x4E7B) >> 16
    109 
    110         paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
    111         psubw       mm7,            mm5             ; c1 = ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)
    112 
    113         movq        mm5,            mm1             ; mm5 = ip1
    114         movq        mm4,            mm3             ; mm4 = ip3
    115 
    116         pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)] ; (ip1 * 0x4E7B) >> 16
    117         paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
    118 
    119         pmulhw      mm3,            [GLOBAL(x_s1sqr2)] ; (ip3*35468 >> 16) - ip3, same signed-constant trick as above
    120         paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
    121 
    122         paddw       mm3,            mm5             ; d1
    123         movq        mm6,            mm2             ; a1
    124 
    125         movq        mm4,            mm0             ; b1
    126         paddw       mm2,            mm3             ; mm2 = a1 + d1 -> row 0 of the transpose below
    127 
    128         paddw       mm4,            mm7             ; mm4 = b1 + c1 -> row 2 of the transpose below
    129         psubw       mm0,            mm7             ; mm0 = b1 - c1 -> row 1 of the transpose below
    130 
    131         psubw       mm6,            mm3             ; mm6 = a1 - d1 -> row 3 of the transpose below
    132         ; Step 4: transpose the 4x4 word matrix whose rows are mm2, mm0, mm4, mm6 (digits below are row,column).
    133         movq        mm1,            mm2             ; 03 02 01 00
    134         movq        mm3,            mm4             ; 23 22 21 20
    135 
    136         punpcklwd   mm1,            mm0             ; 11 01 10 00
    137         punpckhwd   mm2,            mm0             ; 13 03 12 02
    138 
    139         punpcklwd   mm3,            mm6             ; 31 21 30 20
    140         punpckhwd   mm4,            mm6             ; 33 23 32 22
    141 
    142         movq        mm0,            mm1             ; 11 01 10 00
    143         movq        mm5,            mm2             ; 13 03 12 02
    144 
    145         punpckldq   mm0,            mm3             ; 30 20 10 00
    146         punpckhdq   mm1,            mm3             ; 31 21 11 01
    147 
    148         punpckldq   mm2,            mm4             ; 32 22 12 02
    149         punpckhdq   mm5,            mm4             ; 33 23 13 03
    150 
    151         movq        mm3,            mm5             ; 33 23 13 03
    152         ; Step 5: second butterfly pass on the transposed data (now in mm0..mm3), same arithmetic as step 3.
    153         psubw       mm0,            mm2             ; b1= 0-2
    154         paddw       mm2,            mm2             ; mm2 = 2 * (original mm2)
    155 
    156         movq        mm5,            mm1
    157         paddw       mm2,            mm0             ; a1 =0+2
    158 
    159         pmulhw      mm5,            [GLOBAL(x_s1sqr2)]; (ip1*35468 >> 16) - ip1
    160         paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
    161 
    162         movq        mm7,            mm3             ; mm7 = ip3
    163         pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]; (ip3 * 0x4E7B) >> 16
    164 
    165         paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
    166         psubw       mm7,            mm5             ; c1 = ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)
    167 
    168         movq        mm5,            mm1             ; mm5 = ip1
    169         movq        mm4,            mm3             ; mm4 = ip3
    170 
    171         pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)] ; (ip1 * 0x4E7B) >> 16
    172         paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
    173 
    174         pmulhw      mm3,            [GLOBAL(x_s1sqr2)] ; (ip3*35468 >> 16) - ip3
    175         paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
    176 
    177         paddw       mm3,            mm5             ; d1
    178         paddw       mm0,            [GLOBAL(fours)] ; b1 += 4: rounding bias for the >> 3 below
    179 
    180         paddw       mm2,            [GLOBAL(fours)] ; a1 += 4: with the line above, all four outputs carry the bias
    181         movq        mm6,            mm2             ; a1
    182 
    183         movq        mm4,            mm0             ; b1
    184         paddw       mm2,            mm3             ; mm2 = a1 + d1 -> row 0 of the transpose below
    185 
    186         paddw       mm4,            mm7             ; mm4 = b1 + c1 -> row 2 of the transpose below
    187         psubw       mm0,            mm7             ; mm0 = b1 - c1 -> row 1 of the transpose below
    188 
    189         psubw       mm6,            mm3             ; mm6 = a1 - d1 -> row 3 of the transpose below
    190         psraw       mm2,            3               ; arithmetic >> 3 on each of the four result registers
    191 
    192         psraw       mm0,            3
    193         psraw       mm4,            3
    194 
    195         psraw       mm6,            3
    196         ; Step 6: transpose back (rows mm2, mm0, mm4, mm6); results land in mm0, mm1, mm2, mm5.
    197         movq        mm1,            mm2             ; 03 02 01 00
    198         movq        mm3,            mm4             ; 23 22 21 20
    199 
    200         punpcklwd   mm1,            mm0             ; 11 01 10 00
    201         punpckhwd   mm2,            mm0             ; 13 03 12 02
    202 
    203         punpcklwd   mm3,            mm6             ; 31 21 30 20
    204         punpckhwd   mm4,            mm6             ; 33 23 32 22
    205 
    206         movq        mm0,            mm1             ; 11 01 10 00
    207         movq        mm5,            mm2             ; 13 03 12 02
    208 
    209         punpckldq   mm0,            mm3             ; 30 20 10 00
    210         punpckhdq   mm1,            mm3             ; 31 21 11 01
    211 
    212         punpckldq   mm2,            mm4             ; 32 22 12 02
    213         punpckhdq   mm5,            mm4             ; 33 23 13 03
    214         ; Step 7: add each result register to 4 dest bytes and write them back (dest rows 0..3 <- mm0, mm1, mm2, mm5).
    215         pxor        mm7,            mm7             ; mm7 = 0 for the byte<->word conversions
    216 
    217         movd        mm4,            [rdx]           ; 4 bytes at dest
    218         punpcklbw   mm4,            mm7             ; zero-extend the 4 bytes to 4 words
    219         paddsw      mm0,            mm4             ; signed saturating add
    220         packuswb    mm0,            mm7             ; pack words to bytes with unsigned saturation (0..255)
    221         movd        [rdx],          mm0
    222 
    223         movd        mm4,            [rdx+rdi]       ; 4 bytes at dest + stride
    224         punpcklbw   mm4,            mm7
    225         paddsw      mm1,            mm4
    226         packuswb    mm1,            mm7
    227         movd        [rdx+rdi],      mm1
    228 
    229         movd        mm4,            [rdx+2*rdi]     ; 4 bytes at dest + 2*stride
    230         punpcklbw   mm4,            mm7
    231         paddsw      mm2,            mm4
    232         packuswb    mm2,            mm7
    233         movd        [rdx+rdi*2],    mm2
    234 
    235         add         rdx,            rdi             ; rdx = dest + stride, so [rdx+2*rdi] below is dest + 3*stride
    236 
    237         movd        mm4,            [rdx+2*rdi]     ; 4 bytes at dest + 3*stride
    238         punpcklbw   mm4,            mm7
    239         paddsw      mm5,            mm4
    240         packuswb    mm5,            mm7
    241         movd        [rdx+rdi*2],    mm5
    242         ; NOTE(review): no emms is executed in this routine; presumably the caller resets MMX state -- confirm.
    243     ; begin epilog
    244     pop rdi
    245     RESTORE_GOT
    246     UNSHADOW_ARGS
    247     pop         rbp
    248     ret
    249 
    250 SECTION_RODATA
    251 align 16
    252 x_s1sqr2:                   ; sin(pi/8)*sqrt(2) in 16.16 fixed point, four copies (one per word lane)
    253     times 4 dw 0x8A8C       ; 35468 ~= 0.5412 * 65536; negative as a signed word, so users add the operand back after pmulhw
    254 align 16
    255 x_c1sqr2less1:              ; cos(pi/8)*sqrt(2) - 1 in 16.16 fixed point, four copies
    256     times 4 dw 0x4E7B       ; 20091 ~= 0.3066 * 65536; users add the operand back after pmulhw to restore the "+1"
    257 align 16
    258 fours:                      ; rounding bias added before the arithmetic shift right by 3
    259     times 4 dw 0x0004
    260