Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     ;-----------------------------------------------------------------------
     ; void vp8_short_inv_walsh4x4_1_mmx(short *input, short *output)
     ;
     ; DC-only variant of the 4x4 inverse Walsh transform: computes
     ; (input[0] + 3) >> 3 in a 16-bit lane and stores that value into all
     ; sixteen 16-bit entries output[0..15].
     ;
     ; In:    arg(0) = input   (only input[0] is read)
     ;        arg(1) = output  (32 bytes are written)
     ; Uses:  rax/eax, mm0, flags. rsi and rdi are saved and restored here.
     ; Note:  no emms is executed, so the MMX state is still live on return;
     ;        presumably the caller is expected to clear it -- TODO confirm.
     ;-----------------------------------------------------------------------
     global sym(vp8_short_inv_walsh4x4_1_mmx)
     sym(vp8_short_inv_walsh4x4_1_mmx):
         push        rbp
         mov         rbp, rsp
         SHADOW_ARGS_TO_STACK 2
         push        rsi
         push        rdi
         ; end prolog

         mov         rsi, arg(0)             ; rsi = input
         mov         rdi, arg(1)             ; rdi = output

         ; Load exactly the one 16-bit coefficient that is needed. A
         ; register-wide load would also read the neighbouring coefficients,
         ; which this routine never uses.
         movsx       eax, word [rsi]         ; eax = input[0]
         add         eax, 3                  ; rounding bias; only the low word is kept

         movd        mm0, eax
         punpcklwd   mm0, mm0                ; low word copied into words 0 and 1
         punpckldq   mm0, mm0                ; low dword copied up: all four words equal
         psraw       mm0, 3                  ; arithmetic shift per word: (input[0] + 3) >> 3

         ; Four identical 8-byte stores fill the whole 4x4 output block.
         movq        [rdi +  0], mm0
         movq        [rdi +  8], mm0
         movq        [rdi + 16], mm0
         movq        [rdi + 24], mm0

         ; begin epilog
         pop         rdi
         pop         rsi
         UNSHADOW_ARGS
         pop         rbp
         ret
     49 
     ;-----------------------------------------------------------------------
     ; void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
     ;
     ; Full 4x4 inverse Walsh transform on 16-bit coefficients.
     ; Each MMX register holds one row of four 16-bit values. The same
     ; butterfly is applied twice with a 4x4 transpose after each pass:
     ;
     ;     a1 = x0 + x3        b1 = x1 + x2
     ;     d1 = x0 - x3        c1 = x1 - x2
     ;     y0 = a1 + b1        y1 = d1 + c1
     ;     y2 = a1 - b1        y3 = d1 - c1
     ;
     ; Afterwards every element is rounded: (y + 3) >> 3 (arithmetic shift).
     ; All arithmetic is 16-bit and wraps (paddw/psubw).
     ;
     ; In:    arg(0) = input   (16 shorts, 32 bytes read)
     ;        arg(1) = output  (16 shorts, 32 bytes written)
     ; Uses:  eax, mm0-mm7. rsi and rdi are saved and restored here.
     ; Note:  no emms is executed, so the MMX state is still live on return;
     ;        presumably the caller is expected to clear it -- TODO confirm.
     ;-----------------------------------------------------------------------
     global sym(vp8_short_inv_walsh4x4_mmx)
     sym(vp8_short_inv_walsh4x4_mmx):
         push        rbp
         mov         rbp, rsp
         SHADOW_ARGS_TO_STACK 2
         push        rsi
         push        rdi
         ; end prolog

         mov         rsi, arg(0)             ; rsi = input
         mov         rdi, arg(1)             ; rdi = output

         ; Rounding constant: 3 in every 16-bit lane of mm7.
         ; A 32-bit immediate plus movd (the same GPR->MMX move the _1 variant
         ; uses) needs no 64-bit register, so it is valid for any target width.
         mov         eax, 0x00030003
         movd        mm7, eax                ; mm7 = 0000000000030003h
         punpckldq   mm7, mm7                ; mm7 = 0003000300030003h

         movq        mm0, [rsi +  0]         ; row 0: ip[0..3]
         movq        mm1, [rsi +  8]         ; row 1: ip[4..7]
         movq        mm2, [rsi + 16]         ; row 2: ip[8..11]
         movq        mm3, [rsi + 24]         ; row 3: ip[12..15]

         ;---- pass 1: butterfly across the four rows ------------------------
         movq        mm4, mm0
         movq        mm5, mm1

         paddw       mm4, mm3                ; a1 = row0 + row3
         paddw       mm5, mm2                ; b1 = row1 + row2

         movq        mm6, mm4                ; keep a copy of a1

         paddw       mm4, mm5                ; mm4 = a1 + b1
         psubw       mm6, mm5                ; mm6 = a1 - b1

         psubw       mm0, mm3                ; d1 = row0 - row3
         psubw       mm1, mm2                ; c1 = row1 - row2

         movq        mm5, mm0                ; keep a copy of d1

         paddw       mm0, mm1                ; mm0 = d1 + c1
         psubw       mm5, mm1                ; mm5 = d1 - c1

         ;---- transpose (lane labels are "row,column", high word first) -----
         ; mm4 = 03 02 01 00
         ; mm0 = 13 12 11 10
         ; mm6 = 23 22 21 20
         ; mm5 = 33 32 31 30

         movq        mm3, mm4                ; 03 02 01 00
         punpcklwd   mm4, mm0                ; 11 01 10 00
         punpckhwd   mm3, mm0                ; 13 03 12 02

         movq        mm1, mm6                ; 23 22 21 20
         punpcklwd   mm6, mm5                ; 31 21 30 20
         punpckhwd   mm1, mm5                ; 33 23 32 22

         movq        mm0, mm4                ; 11 01 10 00
         movq        mm2, mm3                ; 13 03 12 02

         punpckldq   mm0, mm6                ; 30 20 10 00  (column 0)
         punpckhdq   mm4, mm6                ; 31 21 11 01  (column 1)

         punpckldq   mm2, mm1                ; 32 22 12 02  (column 2)
         punpckhdq   mm3, mm1                ; 33 23 13 03  (column 3)

         ;---- pass 2: same butterfly on the transposed data -----------------
         ; x0..x3 are now mm0, mm4, mm2, mm3.
         movq        mm1, mm0
         movq        mm5, mm4

         paddw       mm1, mm3                ; a1 = x0 + x3
         paddw       mm5, mm2                ; b1 = x1 + x2

         movq        mm6, mm1                ; keep a copy of a1

         paddw       mm1, mm5                ; mm1 = a1 + b1
         psubw       mm6, mm5                ; mm6 = a1 - b1

         psubw       mm0, mm3                ; d1 = x0 - x3
         psubw       mm4, mm2                ; c1 = x1 - x2

         movq        mm5, mm0                ; keep a copy of d1

         paddw       mm0, mm4                ; mm0 = d1 + c1
         psubw       mm5, mm4                ; mm5 = d1 - c1

         ;---- transpose back so each register is an output row --------------
         ; Inputs are mm1, mm0, mm6, mm5 (in that order); same shuffle as above.
         movq        mm3, mm1                ; 03 02 01 00
         punpcklwd   mm1, mm0                ; 11 01 10 00
         punpckhwd   mm3, mm0                ; 13 03 12 02

         movq        mm4, mm6                ; 23 22 21 20
         punpcklwd   mm6, mm5                ; 31 21 30 20
         punpckhwd   mm4, mm5                ; 33 23 32 22

         movq        mm0, mm1                ; 11 01 10 00
         movq        mm2, mm3                ; 13 03 12 02

         punpckldq   mm0, mm6                ; 30 20 10 00  -> stored at output + 0
         punpckhdq   mm1, mm6                ; 31 21 11 01  -> stored at output + 8

         punpckldq   mm2, mm4                ; 32 22 12 02  -> stored at output + 16
         punpckhdq   mm3, mm4                ; 33 23 13 03  -> stored at output + 24

         ;---- round: (x + 3) >> 3 in every lane -----------------------------
         paddw       mm0, mm7
         paddw       mm1, mm7
         paddw       mm2, mm7
         paddw       mm3, mm7

         psraw       mm0, 3
         psraw       mm1, 3
         psraw       mm2, 3
         psraw       mm3, 3

         movq        [rdi +  0], mm0
         movq        [rdi +  8], mm1
         movq        [rdi + 16], mm2
         movq        [rdi + 24], mm3

         ; begin epilog
         pop         rdi
         pop         rsi
         UNSHADOW_ARGS
         pop         rbp
         ret
    173 
    174