Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 ; /****************************************************************************
     15 ; * Notes:
     16 ; *
     17 ; * This implementation makes use of 16 bit fixed point version of two multiply
     18 ; * constants:
     19 ; *        1.   sqrt(2) * cos (pi/8)
     20 ; *        2.   sqrt(2) * sin (pi/8)
     21 ; * Because the first constant is bigger than 1, to maintain the same 16 bit
     22 ; * fixed point precision as the second one, we use a trick of
     23 ; *        x * a = x + x*(a-1)
     24 ; * so
     25 ; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
     26 ; *
; * The 16 bit version of the second constant is 35468. That is bigger than
; * 32767, so in a signed 16 bit multiply it is read as the negative number
; * 35468 - 65536. The missing x * 65536 >> 16 = x is added back afterwards:
; *        (x * (unsigned)35468) >> 16 = ((x * (signed)35468) >> 16) + x
; *
     31 ; *
     32 ; **************************************************************************/
     33 
     34 SECTION .text
     35 
     36 ;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
     37 ;int pitch, unsigned char *dest,int stride)
     38 global sym(vp8_short_idct4x4llm_mmx) PRIVATE
     39 sym(vp8_short_idct4x4llm_mmx):
     40     push        rbp
     41     mov         rbp, rsp
     42     SHADOW_ARGS_TO_STACK 5
     43     GET_GOT     rbx
     44     push        rsi
     45     push        rdi
     46     ; end prolog
     47 
     48     mov         rax,    arg(0)              ;input
     49     mov         rsi,    arg(1)              ;pred
     50 
     51     movq        mm0,    [rax   ]
     52     movq        mm1,    [rax+ 8]
     53     movq        mm2,    [rax+16]
     54     movq        mm3,    [rax+24]
     55 
     56 %if 0
     57     pxor        mm7,    mm7
     58     movq        [rax],   mm7
     59     movq        [rax+8], mm7
     60     movq        [rax+16],mm7
     61     movq        [rax+24],mm7
     62 %endif
     63     movsxd      rax,    dword ptr arg(2)    ;pitch
     64     mov         rdx,    arg(3)              ;dest
     65     movsxd      rdi,    dword ptr arg(4)    ;stride
     66 
     67 
     68     psubw       mm0,            mm2             ; b1= 0-2
     69     paddw       mm2,            mm2             ;
     70 
     71     movq        mm5,            mm1
     72     paddw       mm2,            mm0             ; a1 =0+2
     73 
     74     pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
     75     paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
     76 
     77     movq        mm7,            mm3             ;
     78     pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
     79 
     80     paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
     81     psubw       mm7,            mm5             ; c1
     82 
     83     movq        mm5,            mm1
     84     movq        mm4,            mm3
     85 
     86     pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
     87     paddw       mm5,            mm1
     88 
     89     pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
     90     paddw       mm3,            mm4
     91 
     92     paddw       mm3,            mm5             ; d1
     93     movq        mm6,            mm2             ; a1
     94 
     95     movq        mm4,            mm0             ; b1
     96     paddw       mm2,            mm3             ;0
     97 
     98     paddw       mm4,            mm7             ;1
     99     psubw       mm0,            mm7             ;2
    100 
    101     psubw       mm6,            mm3             ;3
    102 
    103     movq        mm1,            mm2             ; 03 02 01 00
    104     movq        mm3,            mm4             ; 23 22 21 20
    105 
    106     punpcklwd   mm1,            mm0             ; 11 01 10 00
    107     punpckhwd   mm2,            mm0             ; 13 03 12 02
    108 
    109     punpcklwd   mm3,            mm6             ; 31 21 30 20
    110     punpckhwd   mm4,            mm6             ; 33 23 32 22
    111 
    112     movq        mm0,            mm1             ; 11 01 10 00
    113     movq        mm5,            mm2             ; 13 03 12 02
    114 
    115     punpckldq   mm0,            mm3             ; 30 20 10 00
    116     punpckhdq   mm1,            mm3             ; 31 21 11 01
    117 
    118     punpckldq   mm2,            mm4             ; 32 22 12 02
    119     punpckhdq   mm5,            mm4             ; 33 23 13 03
    120 
    121     movq        mm3,            mm5             ; 33 23 13 03
    122 
    123     psubw       mm0,            mm2             ; b1= 0-2
    124     paddw       mm2,            mm2             ;
    125 
    126     movq        mm5,            mm1
    127     paddw       mm2,            mm0             ; a1 =0+2
    128 
    129     pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
    130     paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
    131 
    132     movq        mm7,            mm3             ;
    133     pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
    134 
    135     paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
    136     psubw       mm7,            mm5             ; c1
    137 
    138     movq        mm5,            mm1
    139     movq        mm4,            mm3
    140 
    141     pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
    142     paddw       mm5,            mm1
    143 
    144     pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
    145     paddw       mm3,            mm4
    146 
    147     paddw       mm3,            mm5             ; d1
    148     paddw       mm0,            [GLOBAL(fours)]
    149 
    150     paddw       mm2,            [GLOBAL(fours)]
    151     movq        mm6,            mm2             ; a1
    152 
    153     movq        mm4,            mm0             ; b1
    154     paddw       mm2,            mm3             ;0
    155 
    156     paddw       mm4,            mm7             ;1
    157     psubw       mm0,            mm7             ;2
    158 
    159     psubw       mm6,            mm3             ;3
    160     psraw       mm2,            3
    161 
    162     psraw       mm0,            3
    163     psraw       mm4,            3
    164 
    165     psraw       mm6,            3
    166 
    167     movq        mm1,            mm2             ; 03 02 01 00
    168     movq        mm3,            mm4             ; 23 22 21 20
    169 
    170     punpcklwd   mm1,            mm0             ; 11 01 10 00
    171     punpckhwd   mm2,            mm0             ; 13 03 12 02
    172 
    173     punpcklwd   mm3,            mm6             ; 31 21 30 20
    174     punpckhwd   mm4,            mm6             ; 33 23 32 22
    175 
    176     movq        mm0,            mm1             ; 11 01 10 00
    177     movq        mm5,            mm2             ; 13 03 12 02
    178 
    179     punpckldq   mm0,            mm3             ; 30 20 10 00
    180     punpckhdq   mm1,            mm3             ; 31 21 11 01
    181 
    182     punpckldq   mm2,            mm4             ; 32 22 12 02
    183     punpckhdq   mm5,            mm4             ; 33 23 13 03
    184 
    185     pxor        mm7,            mm7
    186 
    187     movd        mm4,            [rsi]
    188     punpcklbw   mm4,            mm7
    189     paddsw      mm0,            mm4
    190     packuswb    mm0,            mm7
    191     movd        [rdx],          mm0
    192 
    193     movd        mm4,            [rsi+rax]
    194     punpcklbw   mm4,            mm7
    195     paddsw      mm1,            mm4
    196     packuswb    mm1,            mm7
    197     movd        [rdx+rdi],      mm1
    198 
    199     movd        mm4,            [rsi+2*rax]
    200     punpcklbw   mm4,            mm7
    201     paddsw      mm2,            mm4
    202     packuswb    mm2,            mm7
    203     movd        [rdx+rdi*2],    mm2
    204 
    205     add         rdx,            rdi
    206     add         rsi,            rax
    207 
    208     movd        mm4,            [rsi+2*rax]
    209     punpcklbw   mm4,            mm7
    210     paddsw      mm5,            mm4
    211     packuswb    mm5,            mm7
    212     movd        [rdx+rdi*2],    mm5
    213 
    214     ; begin epilog
    215     pop rdi
    216     pop rsi
    217     RESTORE_GOT
    218     UNSHADOW_ARGS
    219     pop         rbp
    220     ret
    221 
    222 ;void vp8_dc_only_idct_add_mmx(
    223 ;short input_dc,
    224 ;unsigned char *pred_ptr,
    225 ;int pred_stride,
    226 ;unsigned char *dst_ptr,
    227 ;int stride)
    228 global sym(vp8_dc_only_idct_add_mmx) PRIVATE
    229 sym(vp8_dc_only_idct_add_mmx):
    230     push        rbp
    231     mov         rbp, rsp
    232     SHADOW_ARGS_TO_STACK 5
    233     GET_GOT     rbx
    234     ; end prolog
    235 
    236         movd        mm5,            arg(0) ;input_dc
    237         mov         rax,            arg(1) ;pred_ptr
    238         movsxd      rdx,            dword ptr arg(2) ;pred_stride
    239 
    240         pxor        mm0,            mm0
    241 
    242         paddw       mm5,            [GLOBAL(fours)]
    243         lea         rcx,            [rdx + rdx*2]
    244 
    245         psraw       mm5,            3
    246 
    247         punpcklwd   mm5,            mm5
    248 
    249         punpckldq   mm5,            mm5
    250 
    251         movd        mm1,            [rax]
    252         movd        mm2,            [rax+rdx]
    253         movd        mm3,            [rax+2*rdx]
    254         movd        mm4,            [rax+rcx]
    255 
    256         mov         rax,            arg(3) ;d -- destination
    257         movsxd      rdx,            dword ptr arg(4) ;dst_stride
    258 
    259         punpcklbw   mm1,            mm0
    260         paddsw      mm1,            mm5
    261         packuswb    mm1,            mm0              ; pack and unpack to saturate
    262         lea         rcx,            [rdx + rdx*2]
    263 
    264         punpcklbw   mm2,            mm0
    265         paddsw      mm2,            mm5
    266         packuswb    mm2,            mm0              ; pack and unpack to saturate
    267 
    268         punpcklbw   mm3,            mm0
    269         paddsw      mm3,            mm5
    270         packuswb    mm3,            mm0              ; pack and unpack to saturate
    271 
    272         punpcklbw   mm4,            mm0
    273         paddsw      mm4,            mm5
    274         packuswb    mm4,            mm0              ; pack and unpack to saturate
    275 
    276         movd        [rax],          mm1
    277         movd        [rax+rdx],      mm2
    278         movd        [rax+2*rdx],    mm3
    279         movd        [rax+rcx],      mm4
    280 
    281     ; begin epilog
    282     RESTORE_GOT
    283     UNSHADOW_ARGS
    284     pop         rbp
    285     ret
    286 
    287 SECTION_RODATA
    288 align 16
    289 x_s1sqr2:
    290     times 4 dw 0x8A8C
    291 align 16
    292 x_c1sqr2less1:
    293     times 4 dw 0x4E7B
    294 align 16
    295 fours:
    296     times 4 dw 0x0004
    297