Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 
     15 ;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
        ;
        ; Element-wise multiply of 16 16-bit words: dq[i] = sq[i] * q[i] for
        ; i = 0..15, done as four 8-byte (4-word) MMX operations. pmullw keeps
        ; only the low 16 bits of each product.
        ;   arg(0) sq - source words (read)
        ;   arg(1) dq - destination words (written)
        ;   arg(2) q  - per-element multipliers (read)
        ; Uses rax and mm1 as scratch; rsi/rdi are saved and restored.
        ; sym(), arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS come from
        ; x86_abi_support.asm, which is not visible in this file.
        ; NOTE(review): no emms is issued before ret - presumably the caller
        ; is responsible for clearing MMX state; confirm.
     16 global sym(vp8_dequantize_b_impl_mmx)
     17 sym(vp8_dequantize_b_impl_mmx):
     18     push        rbp
     19     mov         rbp, rsp
     20     SHADOW_ARGS_TO_STACK 3
     21     push        rsi
     22     push        rdi
     23     ; end prolog
     24 
     25         mov       rsi, arg(0) ;sq
     26         mov       rdi, arg(1) ;dq
     27         mov       rax, arg(2) ;q
     28 
     29         movq      mm1, [rsi]
     30         pmullw    mm1, [rax+0]            ; mm1 = sq[0..3] * q[0..3]
     31         movq      [rdi], mm1
     32 
     33         movq      mm1, [rsi+8]
     34         pmullw    mm1, [rax+8]            ; mm1 = sq[4..7] * q[4..7]
     35         movq      [rdi+8], mm1
     36 
     37         movq      mm1, [rsi+16]
     38         pmullw    mm1, [rax+16]            ; mm1 = sq[8..11] * q[8..11]
     39         movq      [rdi+16], mm1
     40 
     41         movq      mm1, [rsi+24]
     42         pmullw    mm1, [rax+24]            ; mm1 = sq[12..15] * q[12..15]
     43         movq      [rdi+24], mm1
     44 
     45     ; begin epilog
     46     pop rdi
     47     pop rsi
     48     UNSHADOW_ARGS
     49     pop         rbp
     50     ret
     51 
     52 
     53 ;void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
        ;
        ; Dequantizes 16 coefficients (input[i] * dq[i]), zeroes the input
        ; buffer, runs two butterfly passes of the 4x4 inverse transform with a
        ; 4x4 word transpose after each pass, then adds the result to four
        ; 4-byte rows of pred and stores the saturated bytes to dest.
        ;   arg(0) input  - 16 coefficients; read, then overwritten with zeros
        ;   arg(1) dq     - 16 multipliers (read)
        ;   arg(2) pred   - prediction bytes, rows 'pitch' bytes apart (read)
        ;   arg(3) dest   - output bytes, rows 'stride' bytes apart (written)
        ;   arg(4) pitch  - sign-extended from 32 bits
        ;   arg(5) stride - sign-extended from 32 bits
        ; Scratch: rax, rdx, mm0-mm7; rsi/rdi are saved and restored.
        ; GET_GOT rbx / RESTORE_GOT and the GLOBAL operand suffix are macros
        ; from x86_abi_support.asm (not visible here); presumably they provide
        ; position-independent access to the constants - confirm.
        ; NOTE(review): no emms is issued before ret - presumably the caller
        ; is responsible for clearing MMX state; confirm.
     54 global sym(vp8_dequant_idct_add_mmx)
     55 sym(vp8_dequant_idct_add_mmx):
     56     push        rbp
     57     mov         rbp, rsp
     58     SHADOW_ARGS_TO_STACK 6
     59     GET_GOT     rbx
     60     push        rsi
     61     push        rdi
     62     ; end prolog
     63 
     64         mov         rax,    arg(0) ;input
     65         mov         rdx,    arg(1) ;dq
     66 
     67 
        ; dequantize: mm0..mm3 = input[0..3], [4..7], [8..11], [12..15],
        ; each multiplied word-wise by the matching dq words (low 16 bits kept)
     68         movq        mm0,    [rax   ]
     69         pmullw      mm0,    [rdx]
     70 
     71         movq        mm1,    [rax +8]
     72         pmullw      mm1,    [rdx +8]
     73 
     74         movq        mm2,    [rax+16]
     75         pmullw      mm2,    [rdx+16]
     76 
     77         movq        mm3,    [rax+24]
     78         pmullw      mm3,    [rdx+24]
     79 
     80         mov         rdx,    arg(3) ;dest
     81         mov         rsi,    arg(2) ;pred
     82         pxor        mm7,    mm7
     83 
     84 
        ; clear all 16 input coefficients (32 bytes) now that they are loaded
     85         movq        [rax],   mm7
     86         movq        [rax+8], mm7
     87 
     88         movq        [rax+16],mm7
     89         movq        [rax+24],mm7
     90 
     91 
        ; rax is reused from here on: rax = pred row step, rdi = dest row step
     92         movsxd      rax,            dword ptr arg(4) ;pitch
     93         movsxd      rdi,            dword ptr arg(5) ;stride
     94 
        ; ---- first pass: ip0..ip3 = mm0..mm3, four word lanes in parallel ----
     95         psubw       mm0,            mm2             ; b1= 0-2
     96         paddw       mm2,            mm2             ; 2*ip2, so the next add yields ip0+ip2
     97 
     98         movq        mm5,            mm1
     99         paddw       mm2,            mm0             ; a1 =0+2
    100 
        ; 0x8A8C is negative as a signed word, so pmulhw gives
        ; ip1*(k-65536)/65536; adding ip1 back leaves ip1*k/65536
    101         pmulhw      mm5,            [x_s1sqr2 GLOBAL];
    102         paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
    103 
    104         movq        mm7,            mm3             ;
    105         pmulhw      mm7,            [x_c1sqr2less1 GLOBAL]; ip3 * (cos(pi/8)*sqrt(2) - 1)
    106 
    107         paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
    108         psubw       mm7,            mm5             ; c1 = ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)
    109 
    110         movq        mm5,            mm1
    111         movq        mm4,            mm3
    112 
    113         pmulhw      mm5,            [x_c1sqr2less1 GLOBAL]
    114         paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
    115 
    116         pmulhw      mm3,            [x_s1sqr2 GLOBAL]
    117         paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
    118 
    119         paddw       mm3,            mm5             ; d1
    120         movq        mm6,            mm2             ; a1
    121 
        ; outputs: mm2 = a1+d1, mm4 = b1+c1, mm0 = b1-c1, mm6 = a1-d1.
        ; The transpose below consumes them in row order mm2, mm0, mm4, mm6.
    122         movq        mm4,            mm0             ; b1
    123         paddw       mm2,            mm3             ; a1 + d1 -> row 0x
    124 
    125         paddw       mm4,            mm7             ; b1 + c1 -> used as row 2x below
    126         psubw       mm0,            mm7             ; b1 - c1 -> used as row 1x below
    127 
    128         psubw       mm6,            mm3             ; a1 - d1 -> row 3x
    129 
        ; ---- 4x4 word transpose (comments list words high..low) ----
    130         movq        mm1,            mm2             ; 03 02 01 00
    131         movq        mm3,            mm4             ; 23 22 21 20
    132 
    133         punpcklwd   mm1,            mm0             ; 11 01 10 00
    134         punpckhwd   mm2,            mm0             ; 13 03 12 02
    135 
    136         punpcklwd   mm3,            mm6             ; 31 21 30 20
    137         punpckhwd   mm4,            mm6             ; 33 23 32 22
    138 
    139         movq        mm0,            mm1             ; 11 01 10 00
    140         movq        mm5,            mm2             ; 13 03 12 02
    141 
    142         punpckldq   mm0,            mm3             ; 30 20 10 00
    143         punpckhdq   mm1,            mm3             ; 31 21 11 01
    144 
    145         punpckldq   mm2,            mm4             ; 32 22 12 02
    146         punpckhdq   mm5,            mm4             ; 33 23 13 03
    147 
    148         movq        mm3,            mm5             ; 33 23 13 03
    149 
        ; ---- second pass: same butterfly on the transposed data, plus a
        ;      +4 bias and arithmetic >> 3 on every output ----
    150         psubw       mm0,            mm2             ; b1= 0-2
    151         paddw       mm2,            mm2             ; 2*ip2, so the next add yields ip0+ip2
    152 
    153         movq        mm5,            mm1
    154         paddw       mm2,            mm0             ; a1 =0+2
    155 
    156         pmulhw      mm5,            [x_s1sqr2 GLOBAL];
    157         paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
    158 
    159         movq        mm7,            mm3             ;
    160         pmulhw      mm7,            [x_c1sqr2less1 GLOBAL];
    161 
    162         paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
    163         psubw       mm7,            mm5             ; c1 = ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)
    164 
    165         movq        mm5,            mm1
    166         movq        mm4,            mm3
    167 
    168         pmulhw      mm5,            [x_c1sqr2less1 GLOBAL]
    169         paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
    170 
    171         pmulhw      mm3,            [x_s1sqr2 GLOBAL]
    172         paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
    173 
    174         paddw       mm3,            mm5             ; d1
        ; biasing a1 and b1 by 4 puts the rounding term into all four outputs
    175         paddw       mm0,            [fours GLOBAL]
    176 
    177         paddw       mm2,            [fours GLOBAL]
    178         movq        mm6,            mm2             ; a1
    179 
    180         movq        mm4,            mm0             ; b1
    181         paddw       mm2,            mm3             ; a1 + d1 -> row 0x
    182 
    183         paddw       mm4,            mm7             ; b1 + c1 -> used as row 2x below
    184         psubw       mm0,            mm7             ; b1 - c1 -> used as row 1x below
    185 
    186         psubw       mm6,            mm3             ; a1 - d1 -> row 3x
    187         psraw       mm2,            3
    188 
    189         psraw       mm0,            3
    190         psraw       mm4,            3
    191 
    192         psraw       mm6,            3
    193 
        ; ---- transpose back; afterwards mm0, mm1, mm2, mm5 hold the four
        ;      residual rows in the order they are added to pred below ----
    194         movq        mm1,            mm2             ; 03 02 01 00
    195         movq        mm3,            mm4             ; 23 22 21 20
    196 
    197         punpcklwd   mm1,            mm0             ; 11 01 10 00
    198         punpckhwd   mm2,            mm0             ; 13 03 12 02
    199 
    200         punpcklwd   mm3,            mm6             ; 31 21 30 20
    201         punpckhwd   mm4,            mm6             ; 33 23 32 22
    202 
    203         movq        mm0,            mm1             ; 11 01 10 00
    204         movq        mm5,            mm2             ; 13 03 12 02
    205 
    206         punpckldq   mm0,            mm3             ; 30 20 10 00
    207         punpckhdq   mm1,            mm3             ; 31 21 11 01
    208 
    209         punpckldq   mm2,            mm4             ; 32 22 12 02
    210         punpckhdq   mm5,            mm4             ; 33 23 13 03
    211 
        ; ---- reconstruct: per row, load 4 pred bytes, widen to words, add
        ;      the residual (signed saturation), pack to unsigned bytes with
        ;      saturation, store 4 bytes to dest ----
    212         pxor        mm7,            mm7
    213 
    214         movd        mm4,            [rsi]
    215         punpcklbw   mm4,            mm7
    216         paddsw      mm0,            mm4
    217         packuswb    mm0,            mm7
    218         movd        [rdx],          mm0
    219 
    220         movd        mm4,            [rsi+rax]
    221         punpcklbw   mm4,            mm7
    222         paddsw      mm1,            mm4
    223         packuswb    mm1,            mm7
    224         movd        [rdx+rdi],      mm1
    225 
    226         movd        mm4,            [rsi+2*rax]
    227         punpcklbw   mm4,            mm7
    228         paddsw      mm2,            mm4
    229         packuswb    mm2,            mm7
    230         movd        [rdx+rdi*2],    mm2
    231 
        ; advance both pointers one row so base+2*step addresses row 3
    232         add         rdx,            rdi
    233         add         rsi,            rax
    234 
    235         movd        mm4,            [rsi+2*rax]
    236         punpcklbw   mm4,            mm7
    237         paddsw      mm5,            mm4
    238         packuswb    mm5,            mm7
    239         movd        [rdx+rdi*2],    mm5
    240 
    241     ; begin epilog
    242     pop rdi
    243     pop rsi
    244     RESTORE_GOT
    245     UNSHADOW_ARGS
    246     pop         rbp
    247     ret
    248 
    249 
    250 ;void vp8_dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
        ;
        ; Same sequence as vp8_dequant_idct_add_mmx, except that after
        ; dequantization the lowest word of the first coefficient group
        ; (coefficient 0) is replaced by the low 16 bits of Dc; the product
        ; input[0]*dq[0] is discarded.
        ;   arg(0) input  - 16 coefficients; read, then overwritten with zeros
        ;   arg(1) dq     - 16 multipliers (read)
        ;   arg(2) pred   - prediction bytes, rows 'pitch' bytes apart (read)
        ;   arg(3) dest   - output bytes, rows 'stride' bytes apart (written)
        ;   arg(4) pitch  - sign-extended from 32 bits
        ;   arg(5) stride - sign-extended from 32 bits
        ;   arg(6) Dc     - only the low 16 bits are used
        ; Scratch: rax, rcx, rdx, mm0-mm7; rsi/rdi are saved and restored.
        ; GET_GOT rbx / RESTORE_GOT and the GLOBAL operand suffix are macros
        ; from x86_abi_support.asm (not visible here); presumably they provide
        ; position-independent access to the constants - confirm.
        ; NOTE(review): no emms is issued before ret - presumably the caller
        ; is responsible for clearing MMX state; confirm.
    251 global sym(vp8_dequant_dc_idct_add_mmx)
    252 sym(vp8_dequant_dc_idct_add_mmx):
    253     push        rbp
    254     mov         rbp, rsp
    255     SHADOW_ARGS_TO_STACK 7
    256     GET_GOT     rbx
    257     push        rsi
    258     push        rdi
    259     ; end prolog
    260 
    261         mov         rax,    arg(0) ;input
    262         mov         rdx,    arg(1) ;dq
    263 
        ; dequantize: mm0..mm3 = input[0..3], [4..7], [8..11], [12..15],
        ; each multiplied word-wise by the matching dq words (low 16 bits kept)
    264         movq        mm0,    [rax   ]
    265         pmullw      mm0,    [rdx]
    266 
    267         movq        mm1,    [rax +8]
    268         pmullw      mm1,    [rdx +8]
    269 
    270         movq        mm2,    [rax+16]
    271         pmullw      mm2,    [rdx+16]
    272 
    273         movq        mm3,    [rax+24]
    274         pmullw      mm3,    [rdx+24]
    275 
    276         mov         rdx,    arg(3) ;dest
    277         mov         rsi,    arg(2) ;pred
    278         pxor        mm7,    mm7
    279 
    280 
        ; clear all 16 input coefficients (32 bytes) now that they are loaded
    281         movq        [rax],   mm7
    282         movq        [rax+8], mm7
    283 
    284         movq        [rax+16],mm7
    285         movq        [rax+24],mm7
    286 
    287         ; move lower word of Dc to lower word of mm0
        ; (shift right then left by 16 clears word 0; the zero-extended Dc
        ;  word is then OR-ed into the cleared slot)
    288         psrlq       mm0,    16
    289         movzx       rcx,    word ptr arg(6) ;Dc
    290         psllq       mm0,    16
    291         movd        mm7,    rcx
    292         por         mm0,    mm7
    293 
        ; rax is reused from here on: rax = pred row step, rdi = dest row step
    294         movsxd      rax,            dword ptr arg(4) ;pitch
    295         movsxd      rdi,            dword ptr arg(5) ;stride
    296 
        ; ---- first pass: ip0..ip3 = mm0..mm3, four word lanes in parallel ----
    297         psubw       mm0,            mm2             ; b1= 0-2
    298         paddw       mm2,            mm2             ; 2*ip2, so the next add yields ip0+ip2
    299 
    300         movq        mm5,            mm1
    301         paddw       mm2,            mm0             ; a1 =0+2
    302 
        ; 0x8A8C is negative as a signed word, so pmulhw gives
        ; ip1*(k-65536)/65536; adding ip1 back leaves ip1*k/65536
    303         pmulhw      mm5,            [x_s1sqr2 GLOBAL];
    304         paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
    305 
    306         movq        mm7,            mm3             ;
    307         pmulhw      mm7,            [x_c1sqr2less1 GLOBAL]; ip3 * (cos(pi/8)*sqrt(2) - 1)
    308 
    309         paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
    310         psubw       mm7,            mm5             ; c1 = ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)
    311 
    312         movq        mm5,            mm1
    313         movq        mm4,            mm3
    314 
    315         pmulhw      mm5,            [x_c1sqr2less1 GLOBAL]
    316         paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
    317 
    318         pmulhw      mm3,            [x_s1sqr2 GLOBAL]
    319         paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
    320 
    321         paddw       mm3,            mm5             ; d1
    322         movq        mm6,            mm2             ; a1
    323 
        ; outputs: mm2 = a1+d1, mm4 = b1+c1, mm0 = b1-c1, mm6 = a1-d1.
        ; The transpose below consumes them in row order mm2, mm0, mm4, mm6.
    324         movq        mm4,            mm0             ; b1
    325         paddw       mm2,            mm3             ; a1 + d1 -> row 0x
    326 
    327         paddw       mm4,            mm7             ; b1 + c1 -> used as row 2x below
    328         psubw       mm0,            mm7             ; b1 - c1 -> used as row 1x below
    329 
    330         psubw       mm6,            mm3             ; a1 - d1 -> row 3x
    331 
        ; ---- 4x4 word transpose (comments list words high..low) ----
    332         movq        mm1,            mm2             ; 03 02 01 00
    333         movq        mm3,            mm4             ; 23 22 21 20
    334 
    335         punpcklwd   mm1,            mm0             ; 11 01 10 00
    336         punpckhwd   mm2,            mm0             ; 13 03 12 02
    337 
    338         punpcklwd   mm3,            mm6             ; 31 21 30 20
    339         punpckhwd   mm4,            mm6             ; 33 23 32 22
    340 
    341         movq        mm0,            mm1             ; 11 01 10 00
    342         movq        mm5,            mm2             ; 13 03 12 02
    343 
    344         punpckldq   mm0,            mm3             ; 30 20 10 00
    345         punpckhdq   mm1,            mm3             ; 31 21 11 01
    346 
    347         punpckldq   mm2,            mm4             ; 32 22 12 02
    348         punpckhdq   mm5,            mm4             ; 33 23 13 03
    349 
    350         movq        mm3,            mm5             ; 33 23 13 03
    351 
        ; ---- second pass: same butterfly on the transposed data, plus a
        ;      +4 bias and arithmetic >> 3 on every output ----
    352         psubw       mm0,            mm2             ; b1= 0-2
    353         paddw       mm2,            mm2             ; 2*ip2, so the next add yields ip0+ip2
    354 
    355         movq        mm5,            mm1
    356         paddw       mm2,            mm0             ; a1 =0+2
    357 
    358         pmulhw      mm5,            [x_s1sqr2 GLOBAL];
    359         paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
    360 
    361         movq        mm7,            mm3             ;
    362         pmulhw      mm7,            [x_c1sqr2less1 GLOBAL];
    363 
    364         paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
    365         psubw       mm7,            mm5             ; c1 = ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)
    366 
    367         movq        mm5,            mm1
    368         movq        mm4,            mm3
    369 
    370         pmulhw      mm5,            [x_c1sqr2less1 GLOBAL]
    371         paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
    372 
    373         pmulhw      mm3,            [x_s1sqr2 GLOBAL]
    374         paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
    375 
    376         paddw       mm3,            mm5             ; d1
        ; biasing a1 and b1 by 4 puts the rounding term into all four outputs
    377         paddw       mm0,            [fours GLOBAL]
    378 
    379         paddw       mm2,            [fours GLOBAL]
    380         movq        mm6,            mm2             ; a1
    381 
    382         movq        mm4,            mm0             ; b1
    383         paddw       mm2,            mm3             ; a1 + d1 -> row 0x
    384 
    385         paddw       mm4,            mm7             ; b1 + c1 -> used as row 2x below
    386         psubw       mm0,            mm7             ; b1 - c1 -> used as row 1x below
    387 
    388         psubw       mm6,            mm3             ; a1 - d1 -> row 3x
    389         psraw       mm2,            3
    390 
    391         psraw       mm0,            3
    392         psraw       mm4,            3
    393 
    394         psraw       mm6,            3
    395 
        ; ---- transpose back; afterwards mm0, mm1, mm2, mm5 hold the four
        ;      residual rows in the order they are added to pred below ----
    396         movq        mm1,            mm2             ; 03 02 01 00
    397         movq        mm3,            mm4             ; 23 22 21 20
    398 
    399         punpcklwd   mm1,            mm0             ; 11 01 10 00
    400         punpckhwd   mm2,            mm0             ; 13 03 12 02
    401 
    402         punpcklwd   mm3,            mm6             ; 31 21 30 20
    403         punpckhwd   mm4,            mm6             ; 33 23 32 22
    404 
    405         movq        mm0,            mm1             ; 11 01 10 00
    406         movq        mm5,            mm2             ; 13 03 12 02
    407 
    408         punpckldq   mm0,            mm3             ; 30 20 10 00
    409         punpckhdq   mm1,            mm3             ; 31 21 11 01
    410 
    411         punpckldq   mm2,            mm4             ; 32 22 12 02
    412         punpckhdq   mm5,            mm4             ; 33 23 13 03
    413 
        ; ---- reconstruct: per row, load 4 pred bytes, widen to words, add
        ;      the residual (signed saturation), pack to unsigned bytes with
        ;      saturation, store 4 bytes to dest ----
    414         pxor        mm7,            mm7
    415 
    416         movd        mm4,            [rsi]
    417         punpcklbw   mm4,            mm7
    418         paddsw      mm0,            mm4
    419         packuswb    mm0,            mm7
    420         movd        [rdx],          mm0
    421 
    422         movd        mm4,            [rsi+rax]
    423         punpcklbw   mm4,            mm7
    424         paddsw      mm1,            mm4
    425         packuswb    mm1,            mm7
    426         movd        [rdx+rdi],      mm1
    427 
    428         movd        mm4,            [rsi+2*rax]
    429         punpcklbw   mm4,            mm7
    430         paddsw      mm2,            mm4
    431         packuswb    mm2,            mm7
    432         movd        [rdx+rdi*2],    mm2
    433 
        ; advance both pointers one row so base+2*step addresses row 3
    434         add         rdx,            rdi
    435         add         rsi,            rax
    436 
    437         movd        mm4,            [rsi+2*rax]
    438         punpcklbw   mm4,            mm7
    439         paddsw      mm5,            mm4
    440         packuswb    mm5,            mm7
    441         movd        [rdx+rdi*2],    mm5
    442 
    443     ; begin epilog
    444     pop rdi
    445     pop rsi
    446     RESTORE_GOT
    447     UNSHADOW_ARGS
    448     pop         rbp
    449     ret
    450 
    451 
    452 SECTION_RODATA
        ; Constants for the inverse-transform routines in this file, each
        ; replicated into all four 16-bit lanes of an MMX operand.
    453 align 16
        ; 0x8A8C = 35468 ~= sin(pi/8)*sqrt(2) * 65536. pmulhw is a signed
        ; multiply, so this word acts as 35468 - 65536; the code adds the
        ; multiplicand back after each pmulhw to compensate.
    454 x_s1sqr2:
    455     times 4 dw 0x8A8C
    456 align 16
        ; 0x4E7B = 20091 ~= (cos(pi/8)*sqrt(2) - 1) * 65536. The code adds the
        ; multiplicand back after each pmulhw to restore the "+1".
    457 x_c1sqr2less1:
    458     times 4 dw 0x4E7B
    459 align 16
        ; Rounding bias added before the arithmetic right shift by 3 in the
        ; second transform pass.
    460 fours:
    461     times 4 dw 0x0004
    462