;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 ;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
     15 global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE
     16 sym(vp8_short_inv_walsh4x4_mmx):
     17     push        rbp
     18     mov         rbp, rsp
     19     SHADOW_ARGS_TO_STACK 2
     20     ; end prolog
     21 
     22     mov         rdx, arg(0)
     23     mov         rax, 30003h
     24 
     25     movq        mm0, [rdx + 0]    ;ip[0]
     26     movq        mm1, [rdx + 8]    ;ip[4]
     27     movq        mm7, rax
     28 
     29     movq        mm2, [rdx + 16]   ;ip[8]
     30     movq        mm3, [rdx + 24]   ;ip[12]
     31     punpcklwd   mm7, mm7          ;0003000300030003h
     32     mov         rdx, arg(1)
     33 
     34     movq        mm4, mm0
     35     movq        mm5, mm1
     36 
     37     paddw       mm4, mm3          ;ip[0] + ip[12] aka al
     38     paddw       mm5, mm2          ;ip[4] + ip[8] aka bl
     39 
     40     movq        mm6, mm4          ;temp al
     41     paddw       mm4, mm5          ;al + bl
     42     psubw       mm6, mm5          ;al - bl
     43 
     44     psubw       mm0, mm3          ;ip[0] - ip[12] aka d1
     45     psubw       mm1, mm2          ;ip[4] - ip[8] aka c1
     46 
     47     movq        mm5, mm0          ;temp dl
     48     paddw       mm0, mm1          ;dl + cl
     49     psubw       mm5, mm1          ;dl - cl
     50 
     51     ; 03 02 01 00
     52     ; 13 12 11 10
     53     ; 23 22 21 20
     54     ; 33 32 31 30
     55 
     56     movq        mm3, mm4          ; 03 02 01 00
     57     punpcklwd   mm4, mm0          ; 11 01 10 00
     58     punpckhwd   mm3, mm0          ; 13 03 12 02
     59 
     60     movq        mm1, mm6          ; 23 22 21 20
     61     punpcklwd   mm6, mm5          ; 31 21 30 20
     62     punpckhwd   mm1, mm5          ; 33 23 32 22
     63 
     64     movq        mm0, mm4          ; 11 01 10 00
     65     movq        mm2, mm3          ; 13 03 12 02
     66 
     67     punpckldq   mm0, mm6          ; 30 20 10 00 aka ip[0]
     68     punpckhdq   mm4, mm6          ; 31 21 11 01 aka ip[4]
     69 
     70     punpckldq   mm2, mm1          ; 32 22 12 02 aka ip[8]
     71     punpckhdq   mm3, mm1          ; 33 23 13 03 aka ip[12]
     72 ;~~~~~~~~~~~~~~~~~~~~~
     73     movq        mm1, mm0
     74     movq        mm5, mm4
     75     paddw       mm1, mm3          ;ip[0] + ip[12] aka al
     76     paddw       mm5, mm2          ;ip[4] + ip[8] aka bl
     77 
     78     movq        mm6, mm1          ;temp al
     79     paddw       mm1, mm5          ;al + bl
     80     psubw       mm6, mm5          ;al - bl
     81     paddw       mm1, mm7
     82     paddw       mm6, mm7
     83     psraw       mm1, 3
     84     psraw       mm6, 3
     85 
     86     psubw       mm0, mm3          ;ip[0] - ip[12] aka d1
     87     psubw       mm4, mm2          ;ip[4] - ip[8] aka c1
     88 
     89     movq        mm5, mm0          ;temp dl
     90     paddw       mm0, mm4          ;dl + cl
     91     psubw       mm5, mm4          ;dl - cl
     92     paddw       mm0, mm7
     93     paddw       mm5, mm7
     94     psraw       mm0, 3
     95     psraw       mm5, 3
     96 ;~~~~~~~~~~~~~~~~~~~~~
     97 
     98     movd        eax, mm1
     99     movd        ecx, mm0
    100     psrlq       mm0, 32
    101     psrlq       mm1, 32
    102     mov         word ptr[rdx+32*0], ax
    103     mov         word ptr[rdx+32*1], cx
    104     shr         eax, 16
    105     shr         ecx, 16
    106     mov         word ptr[rdx+32*4], ax
    107     mov         word ptr[rdx+32*5], cx
    108     movd        eax, mm1
    109     movd        ecx, mm0
    110     mov         word ptr[rdx+32*8], ax
    111     mov         word ptr[rdx+32*9], cx
    112     shr         eax, 16
    113     shr         ecx, 16
    114     mov         word ptr[rdx+32*12], ax
    115     mov         word ptr[rdx+32*13], cx
    116 
    117     movd        eax, mm6
    118     movd        ecx, mm5
    119     psrlq       mm5, 32
    120     psrlq       mm6, 32
    121     mov         word ptr[rdx+32*2], ax
    122     mov         word ptr[rdx+32*3], cx
    123     shr         eax, 16
    124     shr         ecx, 16
    125     mov         word ptr[rdx+32*6], ax
    126     mov         word ptr[rdx+32*7], cx
    127     movd        eax, mm6
    128     movd        ecx, mm5
    129     mov         word ptr[rdx+32*10], ax
    130     mov         word ptr[rdx+32*11], cx
    131     shr         eax, 16
    132     shr         ecx, 16
    133     mov         word ptr[rdx+32*14], ax
    134     mov         word ptr[rdx+32*15], cx
    135 
    136     ; begin epilog
    137     UNSHADOW_ARGS
    138     pop         rbp
    139     ret
    140 
    141