Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 ; /****************************************************************************
     15 ; * Notes:
     16 ; *
     17 ; * This implementation makes use of 16-bit fixed-point versions of two
     18 ; * multiplier constants:
     19 ; *        1.   sqrt(2) * cos (pi/8)
     20 ; *        2.   sqrt(2) * sin (pi/8)
     21 ; * Because the first constant is bigger than 1, to maintain the same 16 bit
     22 ; * fixed point precision as the second one, we use a trick of
     23 ; *        x * a = x + x*(a-1)
     24 ; * so
     25 ; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
     26 ; *
     27 ; * For the second constant, the 16-bit fixed-point value is 35468, which
     28 ; * is bigger than 32767, so in a signed 16-bit multiply it is treated as
     29 ; * the negative number 35468 - 65536. Adding x back compensates for that:
     30 ; *        ((x * (unsigned)35468) >> 16) = ((x * (signed)35468) >> 16) + x
     31 ; *
     32 ; **************************************************************************/
     33 
     34 
     35 ;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
     36 ;int pitch, unsigned char *dest,int stride)
        ;
        ; Two-pass 4x4 inverse transform of the 16 shorts at `input`; the result
        ; is added to a 4x4 block of prediction bytes and stored as bytes at
        ; `dest`.
        ;
        ;   input  - 16 signed 16-bit values, read as four 8-byte rows
        ;   pred   - prediction block; 4 bytes are read from each of 4 rows
        ;   pitch  - byte distance between pred rows (sign-extended from 32 bits)
        ;   dest   - output block; 4 bytes are written to each of 4 rows
        ;   stride - byte distance between dest rows (sign-extended from 32 bits)
        ;
        ; Each pass applies the same butterfly to four 16-bit lanes at once and
        ; is followed by a 4x4 word transpose. The second pass additionally adds
        ; the rounding constant 4 and shifts right arithmetically by 3.
        ;
        ; Register use after the prolog:
        ;   rax = input, later reused for pitch      rsi = pred
        ;   rdx = dest                               rdi = stride
        ;   mm0-mm7 = working data
        ;
        ; sym(), PRIVATE, SHADOW_ARGS_TO_STACK, GET_GOT, arg(), GLOBAL(),
        ; RESTORE_GOT and UNSHADOW_ARGS are not defined in this file; presumably
        ; they are provided by the included vpx_ports/x86_abi_support.asm.
     37 global sym(vp8_short_idct4x4llm_mmx) PRIVATE
     38 sym(vp8_short_idct4x4llm_mmx):
     39     push        rbp
     40     mov         rbp, rsp
     41     SHADOW_ARGS_TO_STACK 5
     42     GET_GOT     rbx
     43     push        rsi
     44     push        rdi
     45     ; end prolog
     46 
     47     mov         rax,    arg(0)              ;input
     48     mov         rsi,    arg(1)              ;pred
     49 
            ; mm0..mm3 = the four input rows (ip0..ip3), four 16-bit words each
     50     movq        mm0,    [rax   ]
     51     movq        mm1,    [rax+ 8]
     52     movq        mm2,    [rax+16]
     53     movq        mm3,    [rax+24]
     54 
            ; Disabled code: would zero the 32 bytes at input once they are loaded.
     55 %if 0
     56     pxor        mm7,    mm7
     57     movq        [rax],   mm7
     58     movq        [rax+8], mm7
     59     movq        [rax+16],mm7
     60     movq        [rax+24],mm7
     61 %endif
            ; All input has been loaded, so rax is reused for pitch from here on.
     62     movsxd      rax,    dword ptr arg(2)    ;pitch
     63     mov         rdx,    arg(3)              ;dest
     64     movsxd      rdi,    dword ptr arg(4)    ;stride
     65 
     66 
            ; ---- first pass -------------------------------------------------
            ; a1 is built as 2*ip2 + (ip0 - ip2) so that no extra copy of ip0
            ; is needed.
     67     psubw       mm0,            mm2             ; b1= 0-2
     68     paddw       mm2,            mm2             ; 2 * ip2
     69 
     70     movq        mm5,            mm1
     71     paddw       mm2,            mm0             ; a1 =0+2
     72 
            ; pmulhw keeps the high 16 bits of the signed product; adding the
            ; multiplicand back afterwards is the trick described in the notes
            ; at the top of the file.
     73     pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
     74     paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
     75 
     76     movq        mm7,            mm3             ;
     77     pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
     78 
     79     paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
     80     psubw       mm7,            mm5             ; c1 (mm7 = ip3 term - ip1 term)
     81 
     82     movq        mm5,            mm1
     83     movq        mm4,            mm3
     84 
     85     pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
     86     paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
     87 
     88     pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
     89     paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
     90 
     91     paddw       mm3,            mm5             ; d1
     92     movq        mm6,            mm2             ; a1
     93 
     94     movq        mm4,            mm0             ; b1
     95     paddw       mm2,            mm3             ;0
     96 
            ; NOTE(review): the ";1" / ";2" labels below name mm4 as output 1 and
            ; mm0 as output 2, while the transpose comments that follow label mm4
            ; as "2x" and mm0 as "1x". The instructions are unchanged here;
            ; confirm which labelling is intended.
     97     paddw       mm4,            mm7             ;1
     98     psubw       mm0,            mm7             ;2
     99 
    100     psubw       mm6,            mm3             ;3
    101 
            ; ---- 4x4 word transpose ------------------------------------------
            ; In the comments below "rc" is row r, column c, with the highest
            ; word of the register written first.
    102     movq        mm1,            mm2             ; 03 02 01 00
    103     movq        mm3,            mm4             ; 23 22 21 20
    104 
    105     punpcklwd   mm1,            mm0             ; 11 01 10 00
    106     punpckhwd   mm2,            mm0             ; 13 03 12 02
    107 
    108     punpcklwd   mm3,            mm6             ; 31 21 30 20
    109     punpckhwd   mm4,            mm6             ; 33 23 32 22
    110 
    111     movq        mm0,            mm1             ; 11 01 10 00
    112     movq        mm5,            mm2             ; 13 03 12 02
    113 
    114     punpckldq   mm0,            mm3             ; 30 20 10 00
    115     punpckhdq   mm1,            mm3             ; 31 21 11 01
    116 
    117     punpckldq   mm2,            mm4             ; 32 22 12 02
    118     punpckhdq   mm5,            mm4             ; 33 23 13 03
    119 
            ; Move the fourth vector into mm3 so the second pass can use the same
            ; register assignment (mm0..mm3) as the first.
    120     movq        mm3,            mm5             ; 33 23 13 03
    121 
            ; ---- second pass: same butterfly, plus rounding and >> 3 ----------
    122     psubw       mm0,            mm2             ; b1= 0-2
    123     paddw       mm2,            mm2             ; 2 * ip2
    124 
    125     movq        mm5,            mm1
    126     paddw       mm2,            mm0             ; a1 =0+2
    127 
    128     pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
    129     paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
    130 
    131     movq        mm7,            mm3             ;
    132     pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
    133 
    134     paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
    135     psubw       mm7,            mm5             ; c1 (mm7 = ip3 term - ip1 term)
    136 
    137     movq        mm5,            mm1
    138     movq        mm4,            mm3
    139 
    140     pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
    141     paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
    142 
    143     pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
    144     paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
    145 
    146     paddw       mm3,            mm5             ; d1
            ; Adding 4 to a1 and b1 puts the rounding bias into all four outputs,
            ; since each output is a1 or b1 plus/minus one other term.
    147     paddw       mm0,            [GLOBAL(fours)] ; b1 + 4
    148 
    149     paddw       mm2,            [GLOBAL(fours)] ; a1 + 4
    150     movq        mm6,            mm2             ; a1
    151 
    152     movq        mm4,            mm0             ; b1
    153     paddw       mm2,            mm3             ;0
    154 
    155     paddw       mm4,            mm7             ;1
    156     psubw       mm0,            mm7             ;2
    157 
    158     psubw       mm6,            mm3             ;3
            ; Arithmetic shift right by 3 of every output word.
    159     psraw       mm2,            3
    160 
    161     psraw       mm0,            3
    162     psraw       mm4,            3
    163 
    164     psraw       mm6,            3
    165 
            ; ---- transpose again (same sequence as after the first pass) ------
    166     movq        mm1,            mm2             ; 03 02 01 00
    167     movq        mm3,            mm4             ; 23 22 21 20
    168 
    169     punpcklwd   mm1,            mm0             ; 11 01 10 00
    170     punpckhwd   mm2,            mm0             ; 13 03 12 02
    171 
    172     punpcklwd   mm3,            mm6             ; 31 21 30 20
    173     punpckhwd   mm4,            mm6             ; 33 23 32 22
    174 
    175     movq        mm0,            mm1             ; 11 01 10 00
    176     movq        mm5,            mm2             ; 13 03 12 02
    177 
    178     punpckldq   mm0,            mm3             ; 30 20 10 00
    179     punpckhdq   mm1,            mm3             ; 31 21 11 01
    180 
    181     punpckldq   mm2,            mm4             ; 32 22 12 02
    182     punpckhdq   mm5,            mm4             ; 33 23 13 03
    183 
            ; ---- add prediction and store -------------------------------------
            ; mm7 = 0 serves both as the high bytes when widening the prediction
            ; bytes to words and as the unused upper half of each pack.
            ; For each of the four rows: load 4 pred bytes, widen to words, add
            ; with signed saturation, pack to bytes with unsigned saturation,
            ; store 4 bytes.
    184     pxor        mm7,            mm7
    185 
    186     movd        mm4,            [rsi]           ; pred row 0
    187     punpcklbw   mm4,            mm7
    188     paddsw      mm0,            mm4
    189     packuswb    mm0,            mm7
    190     movd        [rdx],          mm0             ; dest row 0
    191 
    192     movd        mm4,            [rsi+rax]       ; pred row 1
    193     punpcklbw   mm4,            mm7
    194     paddsw      mm1,            mm4
    195     packuswb    mm1,            mm7
    196     movd        [rdx+rdi],      mm1             ; dest row 1
    197 
    198     movd        mm4,            [rsi+2*rax]     ; pred row 2
    199     punpcklbw   mm4,            mm7
    200     paddsw      mm2,            mm4
    201     packuswb    mm2,            mm7
    202     movd        [rdx+rdi*2],    mm2             ; dest row 2
    203 
            ; Advance both pointers by one row so that base + 2*step addresses
            ; row 3.
    204     add         rdx,            rdi
    205     add         rsi,            rax
    206 
    207     movd        mm4,            [rsi+2*rax]     ; pred row 3
    208     punpcklbw   mm4,            mm7
    209     paddsw      mm5,            mm4
    210     packuswb    mm5,            mm7
    211     movd        [rdx+rdi*2],    mm5             ; dest row 3
    212 
    213     ; begin epilog
            ; Undo the prolog in reverse order.
    214     pop rdi
    215     pop rsi
    216     RESTORE_GOT
    217     UNSHADOW_ARGS
    218     pop         rbp
    219     ret
    220 
    221 ;void vp8_dc_only_idct_add_mmx(
    222 ;short input_dc,
    223 ;unsigned char *pred_ptr,
    224 ;int pred_stride,
    225 ;unsigned char *dst_ptr,
    226 ;int stride)
        ;
        ; Adds the single value (input_dc + 4) >> 3 to every byte of a 4x4
        ; prediction block and stores the result, saturated to unsigned bytes,
        ; as a 4x4 block at dst_ptr.
        ;
        ;   input_dc    - only the low 16 bits of the loaded argument reach the
        ;                 result; they are broadcast to all four word lanes
        ;   pred_ptr    - prediction block; 4 bytes are read from each of 4 rows
        ;   pred_stride - byte distance between prediction rows
        ;   dst_ptr     - output block; 4 bytes are written to each of 4 rows
        ;   stride      - byte distance between output rows
        ;
        ; All four prediction rows are loaded before any output row is stored.
        ;
        ; sym(), PRIVATE, SHADOW_ARGS_TO_STACK, GET_GOT, arg(), GLOBAL(),
        ; RESTORE_GOT and UNSHADOW_ARGS are not defined in this file; presumably
        ; they are provided by the included vpx_ports/x86_abi_support.asm.
    227 global sym(vp8_dc_only_idct_add_mmx) PRIVATE
    228 sym(vp8_dc_only_idct_add_mmx):
    229     push        rbp
    230     mov         rbp, rsp
    231     SHADOW_ARGS_TO_STACK 5
    232     GET_GOT     rbx
    233     ; end prolog
    234 
    235         movd        mm5,            arg(0) ;input_dc
    236         mov         rax,            arg(1) ;pred_ptr
    237         movsxd      rdx,            dword ptr arg(2) ;pred_stride
    238 
    239         pxor        mm0,            mm0              ; mm0 = 0, used for unpack/pack
    240 
    241         paddw       mm5,            [GLOBAL(fours)]  ; rounding bias before the shift
    242         lea         rcx,            [rdx + rdx*2]    ; rcx = 3 * pred_stride
    243 
    244         psraw       mm5,            3                ; (input_dc + 4) >> 3
    245 
                ; Broadcast the low word of mm5 to all four word lanes.
    246         punpcklwd   mm5,            mm5
    247 
    248         punpckldq   mm5,            mm5
    249 
                ; Load the four prediction rows (4 bytes each).
    250         movd        mm1,            [rax]
    251         movd        mm2,            [rax+rdx]
    252         movd        mm3,            [rax+2*rdx]
    253         movd        mm4,            [rax+rcx]
    254 
                ; The prediction pointers are no longer needed; rax/rdx/rcx are
                ; rebound to the destination pointer, its stride and 3 * stride.
    255         mov         rax,            arg(3) ;dst_ptr
    256         movsxd      rdx,            dword ptr arg(4) ;stride
    257 
                ; For each row: widen bytes to words, add the broadcast value with
                ; signed saturation, pack back to bytes with unsigned saturation.
    258         punpcklbw   mm1,            mm0
    259         paddsw      mm1,            mm5
    260         packuswb    mm1,            mm0              ; pack and unpack to saturate
    261         lea         rcx,            [rdx + rdx*2]    ; rcx = 3 * stride
    262 
    263         punpcklbw   mm2,            mm0
    264         paddsw      mm2,            mm5
    265         packuswb    mm2,            mm0              ; pack and unpack to saturate
    266 
    267         punpcklbw   mm3,            mm0
    268         paddsw      mm3,            mm5
    269         packuswb    mm3,            mm0              ; pack and unpack to saturate
    270 
    271         punpcklbw   mm4,            mm0
    272         paddsw      mm4,            mm5
    273         packuswb    mm4,            mm0              ; pack and unpack to saturate
    274 
                ; Store the four output rows (4 bytes each).
    275         movd        [rax],          mm1
    276         movd        [rax+rdx],      mm2
    277         movd        [rax+2*rdx],    mm3
    278         movd        [rax+rcx],      mm4
    279 
    280     ; begin epilog
    281     RESTORE_GOT
    282     UNSHADOW_ARGS
    283     pop         rbp
    284     ret
    285 
    286 SECTION_RODATA
        ; Constants used by the routines above. Each is one 16-bit value repeated
        ; four times so that it fills a 64-bit MMX memory operand.
        ; SECTION_RODATA is not defined in this file; presumably it comes from the
        ; included vpx_ports/x86_abi_support.asm.
    287 align 16
        ; 0x8A8C = 35468, the 16-bit fixed-point form of sqrt(2) * sin(pi/8)
        ; (about 0.5412 * 65536). As a signed word it is negative, which is why
        ; the code adds the multiplicand back after each pmulhw with it.
    288 x_s1sqr2:
    289     times 4 dw 0x8A8C
    290 align 16
        ; 0x4E7B = 20091, the 16-bit fixed-point form of sqrt(2) * cos(pi/8) - 1
        ; (about 0.3066 * 65536); the code adds the multiplicand back to restore
        ; the "+ 1".
    291 x_c1sqr2less1:
    292     times 4 dw 0x4E7B
    293 align 16
        ; Rounding bias added before the arithmetic shift right by 3.
    294 fours:
    295     times 4 dw 0x0004
    296