Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 ; /****************************************************************************
     15 ; * Notes:
     16 ; *
     17 ; * This implementation makes use of 16 bit fixed point versions of two multiply
     18 ; * constants:
     19 ; *        1.   sqrt(2) * cos (pi/8)
     20 ; *        2.   sqrt(2) * sin (pi/8)
     21 ; * Because the first constant is bigger than 1, to maintain the same 16 bit
     22 ; * fixed point precision as the second one, we use a trick of
     23 ; *        x * a = x + x*(a-1)
     24 ; * so
     25 ; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
     26 ; *
     27 ; * For the second constant, because the 16 bit version is 35468, which
     28 ; * is bigger than 32768, in a signed 16 bit multiply it becomes a negative
     29 ; * number, so x is added back after the multiply:
     30 ; *        (x * (unsigned)35468 >> 16) = (x * (signed)35468 >> 16) + x
     31 ; *
     32 ; **************************************************************************/
     33 
     34 
     35 ;void vp8_short_idct4x4llm_mmx(short *input, short *output, int pitch)
     36 global sym(vp8_short_idct4x4llm_mmx)
     37 sym(vp8_short_idct4x4llm_mmx):
     38     push        rbp
     39     mov         rbp, rsp
     40     SHADOW_ARGS_TO_STACK 3
     41     GET_GOT     rbx
     42     ; end prolog
     43         ; Body: two identical butterfly passes over four 4-word rows, each followed by a 4x4 word transpose.
     44         mov         rax,            arg(0) ;input
     45         mov         rdx,            arg(1) ;output
     46         ; load the 16 input words as four rows ip0..ip3 (8 bytes = 4 words each)
     47         movq        mm0,            [rax   ]        ; ip0 = input words  0.. 3
     48         movq        mm1,            [rax+ 8]        ; ip1 = input words  4.. 7
     49 
     50         movq        mm2,            [rax+16]        ; ip2 = input words  8..11
     51         movq        mm3,            [rax+24]        ; ip3 = input words 12..15
     52 
     53         movsxd      rax,            dword ptr arg(2) ;pitch, sign-extended; rax is no longer needed as the input pointer
     54         ; ---- pass 1 ----
     55         psubw       mm0,            mm2             ; b1= 0-2
     56         paddw       mm2,            mm2             ; 2*ip2
     57 
     58         movq        mm5,            mm1
     59         paddw       mm2,            mm0             ; a1 =0+2  (2*ip2 + (ip0 - ip2))
     60 
     61         pmulhw      mm5,            [x_s1sqr2 GLOBAL]        ; signed high multiply; 35468 reads as negative, so ip1 is added back next
     62         paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
     63 
     64         movq        mm7,            mm3             ;
     65         pmulhw      mm7,            [x_c1sqr2less1 GLOBAL]    ; ip3 * (sqrt(2)*cos(pi/8) - 1)
     66 
     67         paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
     68         psubw       mm7,            mm5             ; c1 = ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)
     69 
     70         movq        mm5,            mm1
     71         movq        mm4,            mm3             ; keep ip3; mm3 is overwritten by the multiply below
     72 
     73         pmulhw      mm5,            [x_c1sqr2less1 GLOBAL]
     74         paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
     75 
     76         pmulhw      mm3,            [x_s1sqr2 GLOBAL]
     77         paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
     78 
     79         paddw       mm3,            mm5             ; d1 = ip1*cos(pi/8)*sqrt(2) + ip3*sin(pi/8)*sqrt(2)
     80         movq        mm6,            mm2             ; a1
     81 
     82         movq        mm4,            mm0             ; b1
     83         paddw       mm2,            mm3             ;0 = a1 + d1
     84 
     85         paddw       mm4,            mm7             ; b1 + c1 -> treated as row 2 by the transpose below
     86         psubw       mm0,            mm7             ; b1 - c1 -> treated as row 1 by the transpose below
     87 
     88         psubw       mm6,            mm3             ;3 = a1 - d1
     89         ; transpose the 4x4 word matrix held in mm2 (row 0), mm0 (row 1), mm4 (row 2), mm6 (row 3)
     90         movq        mm1,            mm2             ; 03 02 01 00
     91         movq        mm3,            mm4             ; 23 22 21 20
     92 
     93         punpcklwd   mm1,            mm0             ; 11 01 10 00
     94         punpckhwd   mm2,            mm0             ; 13 03 12 02
     95 
     96         punpcklwd   mm3,            mm6             ; 31 21 30 20
     97         punpckhwd   mm4,            mm6             ; 33 23 32 22
     98 
     99         movq        mm0,            mm1             ; 11 01 10 00
    100         movq        mm5,            mm2             ; 13 03 12 02
    101 
    102         punpckldq   mm0,            mm3             ; 30 20 10 00
    103         punpckhdq   mm1,            mm3             ; 31 21 11 01
    104 
    105         punpckldq   mm2,            mm4             ; 32 22 12 02
    106         punpckhdq   mm5,            mm4             ; 33 23 13 03
    107 
    108         movq        mm3,            mm5             ; 33 23 13 03
    109         ; ---- pass 2: same butterfly on the transposed data in mm0..mm3, plus rounding and >>3 ----
    110         psubw       mm0,            mm2             ; b1= 0-2
    111         paddw       mm2,            mm2             ; 2 * (input 2)
    112 
    113         movq        mm5,            mm1
    114         paddw       mm2,            mm0             ; a1 =0+2
    115 
    116         pmulhw      mm5,            [x_s1sqr2 GLOBAL]         ;
    117         paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
    118 
    119         movq        mm7,            mm3             ;
    120         pmulhw      mm7,            [x_c1sqr2less1 GLOBAL]    ;
    121 
    122         paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
    123         psubw       mm7,            mm5             ; c1 = ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)
    124 
    125         movq        mm5,            mm1
    126         movq        mm4,            mm3
    127 
    128         pmulhw      mm5,            [x_c1sqr2less1 GLOBAL]
    129         paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
    130 
    131         pmulhw      mm3,            [x_s1sqr2 GLOBAL]
    132         paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
    133 
    134         paddw       mm3,            mm5             ; d1
    135         paddw       mm0,            [fours GLOBAL]  ; b1 + 4: rounding bias for the >>3 below
    136 
    137         paddw       mm2,            [fours GLOBAL]  ; a1 + 4: every output is a1+-d1 or b1+-c1, so each gets the bias once
    138         movq        mm6,            mm2             ; a1
    139 
    140         movq        mm4,            mm0             ; b1
    141         paddw       mm2,            mm3             ;0 = a1 + d1
    142 
    143         paddw       mm4,            mm7             ; b1 + c1 -> treated as row 2 by the transpose below
    144         psubw       mm0,            mm7             ; b1 - c1 -> treated as row 1 by the transpose below
    145 
    146         psubw       mm6,            mm3             ;3 = a1 - d1
    147         psraw       mm2,            3               ; arithmetic >>3 on all four result rows
    148 
    149         psraw       mm0,            3
    150         psraw       mm4,            3
    151 
    152         psraw       mm6,            3
    153         ; transpose again: rows are mm2 (0), mm0 (1), mm4 (2), mm6 (3)
    154         movq        mm1,            mm2             ; 03 02 01 00
    155         movq        mm3,            mm4             ; 23 22 21 20
    156 
    157         punpcklwd   mm1,            mm0             ; 11 01 10 00
    158         punpckhwd   mm2,            mm0             ; 13 03 12 02
    159 
    160         punpcklwd   mm3,            mm6             ; 31 21 30 20
    161         punpckhwd   mm4,            mm6             ; 33 23 32 22
    162 
    163         movq        mm0,            mm1             ; 11 01 10 00
    164         movq        mm5,            mm2             ; 13 03 12 02
    165 
    166         punpckldq   mm0,            mm3             ; 30 20 10 00
    167         punpckhdq   mm1,            mm3             ; 31 21 11 01
    168 
    169         punpckldq   mm2,            mm4             ; 32 22 12 02
    170         punpckhdq   mm5,            mm4             ; 33 23 13 03
    171         ; store four 8-byte rows; pitch (rax) is applied as a byte offset between rows
    172         movq        [rdx],          mm0             ; row at output
    173 
    174         movq        [rdx+rax],      mm1             ; row at output + pitch
    175         movq        [rdx+rax*2],    mm2             ; row at output + 2*pitch
    176 
    177         add         rdx,            rax             ; rdx = output + pitch
    178         movq        [rdx+rax*2],    mm5             ; row at output + 3*pitch
    179 
    180     ; begin epilog
    181     RESTORE_GOT
    182     UNSHADOW_ARGS
    183     pop         rbp
    184     ret
    185 
    186 
    187 ;void vp8_short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
    188 global sym(vp8_short_idct4x4llm_1_mmx)
    189 sym(vp8_short_idct4x4llm_1_mmx):
    190     push        rbp
    191     mov         rbp, rsp
    192     SHADOW_ARGS_TO_STACK 3
    193     GET_GOT     rbx
    194     ; end prolog
    195 
    196         ; fetch all three arguments before doing any arithmetic
    197         mov         rax,            arg(0)              ; rax = input
    198         mov         rdx,            arg(1)              ; rdx = output (first row)
    199         movd        mm0,            [rax]               ; 32-bit load; low word = input[0]
    200         movsxd      rax,            dword ptr arg(2)    ; rax = pitch, replaces the input pointer
    201 
    202         ; dc = (input[0] + 4) >> 3
    203         paddw       mm0,            [fours GLOBAL]
    204         psraw       mm0,            3
    205 
    206         ; replicate the low word into all four words of mm0
    207         punpcklwd   mm0,            mm0
    208         punpckldq   mm0,            mm0
    209 
    210         ; write the same four words to four rows, pitch bytes apart
    211         movq        [rdx],          mm0                 ; output
    212         movq        [rdx+rax],      mm0                 ; output + pitch
    213         movq        [rdx+rax*2],    mm0                 ; output + 2*pitch
    214         add         rdx,            rax                 ; step one row so the next store lands on row 3
    215         movq        [rdx+rax*2],    mm0                 ; output + 3*pitch
    216 
    217     ; begin epilog
    218     RESTORE_GOT
    219     UNSHADOW_ARGS
    220     pop         rbp
    221     ret
    222 
    223 ;void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
    224 global sym(vp8_dc_only_idct_add_mmx)
    225 sym(vp8_dc_only_idct_add_mmx):
    226     push        rbp
    227     mov         rbp, rsp
    228     SHADOW_ARGS_TO_STACK 5
    229     GET_GOT     rbx
    230     push        rsi
    231     push        rdi
    232     ; end prolog
    233         ; mm5 = (input_dc + 4) >> 3, replicated into all four words
    234         movd        mm5,            arg(0) ;input_dc
    235         paddw       mm5,            [fours GLOBAL]
    236         psraw       mm5,            3
    237         punpcklwd   mm5,            mm5
    238         punpckldq   mm5,            mm5
    239 
    240         ; row pointers and row steps; mm0 = 0 for the byte<->word conversions
    241         mov         rsi,            arg(1) ;s -- prediction
    242         mov         rdi,            arg(2) ;d -- destination
    243         movsxd      rdx,            dword ptr arg(3) ;pitch  (step between prediction rows)
    244         movsxd      rax,            dword ptr arg(4) ;stride (step between destination rows)
    245         pxor        mm0,            mm0
    246 
    247         ; Per row: widen 4 prediction bytes to words, add dc with signed saturation,
    248         ; pack back to unsigned-saturated bytes, store 4 bytes to the destination.
    249         movd        mm1,            [rsi]            ; row 0
    250         punpcklbw   mm1,            mm0
    251         paddsw      mm1,            mm5
    252         packuswb    mm1,            mm0              ; clamp each word to 0..255
    253         movd        [rdi],          mm1
    254         ; row 1
    255         movd        mm2,            [rsi+rdx]
    256         punpcklbw   mm2,            mm0
    257         paddsw      mm2,            mm5
    258         packuswb    mm2,            mm0              ; clamp each word to 0..255
    259         movd        [rdi+rax],      mm2
    260         ; row 2
    261         movd        mm3,            [rsi+2*rdx]
    262         punpcklbw   mm3,            mm0
    263         paddsw      mm3,            mm5
    264         packuswb    mm3,            mm0              ; clamp each word to 0..255
    265         movd        [rdi+2*rax],    mm3
    266         ; row 3: step both pointers one row so base+2*step addresses the fourth row
    267         add         rsi,            rdx
    268         add         rdi,            rax
    269         movd        mm4,            [rsi+2*rdx]
    270         punpcklbw   mm4,            mm0
    271         paddsw      mm4,            mm5
    272         packuswb    mm4,            mm0              ; clamp each word to 0..255
    273         movd        [rdi+2*rax],    mm4
    274 
    275     ; begin epilog
    276     pop rdi
    277     pop rsi
    278     RESTORE_GOT
    279     UNSHADOW_ARGS
    280     pop         rbp
    281     ret
    282 
    283 SECTION_RODATA
    284 align 16
    285 x_s1sqr2:                   ; sqrt(2) * sin(pi/8) in 16 bit fixed point, one copy per MMX word lane
    286     times 4 dw 0x8A8C       ; 35468; negative as a signed word, so users add x back after pmulhw
    287 align 16
    288 x_c1sqr2less1:              ; sqrt(2) * cos(pi/8) - 1 in 16 bit fixed point
    289     times 4 dw 0x4E7B       ; 20091; users add x back after pmulhw to get x * sqrt(2) * cos(pi/8)
    290 align 16
    291 fours:                      ; rounding bias added before the arithmetic shift right by 3
    292     times 4 dw 0x0004
    293