Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 SECTION .text
     15 
     16 ;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
        ;
        ;  SSE2 implementation of the 4x4 Walsh transform named above. It reads
        ;  four rows of four 16-bit values and writes sixteen 16-bit results.
        ;
        ;    input  - address of the first row; the other rows are read from
        ;             input + pitch, input + 2*pitch and input + 3*pitch.
        ;    output - written with two 16-byte movdqa stores, so the address
        ;             has to be 16-byte aligned.
        ;    pitch  - added directly to the row address, i.e. used as a byte
        ;             stride; it is sign-extended from 32 bits.
        ;
        ;  arg(), GLOBAL(), sym() and the prolog/epilog macros are defined in
        ;  vpx_ports/x86_abi_support.asm, which is not visible here; presumably
        ;  they hide the per-target ABI/PIC details -- confirm in that file.
        ;
        ;  Lane comments on the right list the most significant lanes first.
     17 global sym(vp8_short_walsh4x4_sse2) PRIVATE
     18 sym(vp8_short_walsh4x4_sse2):
     19     push        rbp
     20     mov         rbp, rsp
     21     SHADOW_ARGS_TO_STACK 3
     22     SAVE_XMM 7
     23     GET_GOT     rbx
     24     push        rsi
     25     push        rdi
     26     ; end prolog
     27 
     28     mov     rsi, arg(0)           ; input
     29     mov     rdi, arg(1)           ; output
     30     movsxd  rdx, dword ptr arg(2) ; pitch
     31 
     32     ; first for loop
            ; All four rows are processed at once. xmm0..xmm3 each receive the
            ; four 16-bit values of one row in their low 64 bits.
     33     movq    xmm0, MMWORD PTR [rsi]           ; load input
     34     movq    xmm1, MMWORD PTR [rsi + rdx]
     35     lea     rsi,  [rsi + rdx*2]
     36     movq    xmm2, MMWORD PTR [rsi]
     37     movq    xmm3, MMWORD PTR [rsi + rdx]
     38 
            ; Transpose: after the word and dword interleaves each 64-bit half
            ; holds one ip[k] position for rows 0..3.
     39     punpcklwd xmm0,  xmm1
     40     punpcklwd xmm2,  xmm3
     41 
     42     movdqa    xmm1, xmm0
     43     punpckldq xmm0, xmm2           ; ip[1] ip[0]
     44     punpckhdq xmm1, xmm2           ; ip[3] ip[2]
     45 
            ; xmm0 = ip[1]+ip[3] | ip[0]+ip[2],  xmm2 = ip[1]-ip[3] | ip[0]-ip[2]
     46     movdqa    xmm2, xmm0
     47     paddw     xmm0, xmm1
     48     psubw     xmm2, xmm1
     49 
            ; scale the sums and differences by 4
     50     psllw     xmm0, 2              ; d1  a1
     51     psllw     xmm2, 2              ; c1  b1
     52 
            ; regroup so a1 pairs with d1 and b1 pairs with c1
     53     movdqa    xmm1, xmm0
     54     punpcklqdq xmm0, xmm2          ; b1  a1
     55     punpckhqdq xmm1, xmm2          ; c1  d1
     56 
            ; Build the (a1 != 0) correction in xmm7:
            ;   xmm6 = a1 in the low half, zero in the high half
            ;   pcmpeqw gives 0xffff (-1) in every word where xmm6 == 0
            ;   adding the all-ones-words constant c1 turns that into 0, and
            ;   leaves 1 in the words where a1 != 0 (the high half becomes 0).
            ; NOTE(review): movq xmm, xmm already clears the upper 64 bits of
            ; the destination, so the pxor on xmm6 looks redundant.
     57     pxor      xmm6, xmm6
     58     movq      xmm6, xmm0
     59     pxor      xmm7, xmm7
     60     pcmpeqw   xmm7, xmm6
     61     paddw     xmm7, [GLOBAL(c1)]
     62 
     63     movdqa    xmm2, xmm0
     64     paddw     xmm0, xmm1           ; b1+c1  a1+d1
     65     psubw     xmm2, xmm1           ; b1-c1  a1-d1
     66     paddw     xmm0, xmm7           ; b1+c1  a1+d1+(a1!=0)
     67 
     68     ; second for loop
            ; The numbers below are word positions (row*4 + column) of the
            ; first-pass result, most significant lane first.
     69     ; input: 13  9  5  1 12  8  4  0 (xmm0)
     70     ;        14 10  6  2 15 11  7  3 (xmm2)
     71     ; after shuffle:
     72     ;        13  5  9  1 12  4  8  0 (xmm0)
     73     ;        14  6 10  2 15  7 11  3 (xmm1)
            ; 0xd8 swaps the two middle words of each half so that the words
            ; to be combined by pmaddwd sit next to each other.
     74     pshuflw   xmm3, xmm0, 0xd8
     75     pshufhw   xmm0, xmm3, 0xd8
     76     pshuflw   xmm3, xmm2, 0xd8
     77     pshufhw   xmm1, xmm3, 0xd8
     78 
            ; pmaddwd with c1 (+1,+1) yields the 32-bit sum of each word pair;
            ; with cn1 (+1,-1) it yields the 32-bit difference (low - high).
            ; From here on the arithmetic is done on 32-bit lanes.
     79     movdqa    xmm2, xmm0
     80     pmaddwd   xmm0, [GLOBAL(c1)]    ; d11 a11 d10 a10
     81     pmaddwd   xmm2, [GLOBAL(cn1)]   ; c11 b11 c10 b10
     82     movdqa    xmm3, xmm1
     83     pmaddwd   xmm1, [GLOBAL(c1)]    ; d12 a12 d13 a13
     84     pmaddwd   xmm3, [GLOBAL(cn1)]   ; c12 b12 c13 b13
     85 
            ; reorder the dwords so equal letters are adjacent and in index
            ; order (0x72 additionally undoes the 3/2 ordering of xmm1, xmm3)
     86     pshufd    xmm4, xmm0, 0xd8      ; d11 d10 a11 a10
     87     pshufd    xmm5, xmm2, 0xd8      ; c11 c10 b11 b10
     88     pshufd    xmm6, xmm1, 0x72      ; d13 d12 a13 a12
     89     pshufd    xmm7, xmm3, 0x72      ; c13 c12 b13 b12
     90 
     91     movdqa    xmm0, xmm4
     92     punpcklqdq xmm0, xmm5           ; b11 b10 a11 a10
     93     punpckhqdq xmm4, xmm5           ; c11 c10 d11 d10
     94     movdqa    xmm1, xmm6
     95     punpcklqdq xmm1, xmm7           ; b13 b12 a13 a12
     96     punpckhqdq xmm6, xmm7           ; c13 c12 d13 d12
     97 
            ; second butterfly: a2 = a1+d1, b2 = b1+c1, c2 = b1-c1, d2 = a1-d1
     98     movdqa    xmm2, xmm0
     99     paddd     xmm0, xmm4            ; b21 b20 a21 a20
    100     psubd     xmm2, xmm4            ; c21 c20 d21 d20
    101     movdqa    xmm3, xmm1
    102     paddd     xmm1, xmm6            ; b23 b22 a23 a22
    103     psubd     xmm3, xmm6            ; c23 c22 d23 d22
    104 
            ; xmm4..xmm7 = 1 in every dword lane whose value is negative
            ; (0 > x gives an all-ones mask, which is reduced to 1 by cd1)
    105     pxor      xmm4, xmm4
    106     movdqa    xmm5, xmm4
    107     pcmpgtd   xmm4, xmm0
    108     pcmpgtd   xmm5, xmm2
    109     pand      xmm4, [GLOBAL(cd1)]
    110     pand      xmm5, [GLOBAL(cd1)]
    111 
    112     pxor      xmm6, xmm6
    113     movdqa    xmm7, xmm6
    114     pcmpgtd   xmm6, xmm1
    115     pcmpgtd   xmm7, xmm3
    116     pand      xmm6, [GLOBAL(cd1)]
    117     pand      xmm7, [GLOBAL(cd1)]
    118 
            ; x = x + (x < 0) + 3
    119     paddd     xmm0, xmm4
    120     paddd     xmm2, xmm5
    121     paddd     xmm0, [GLOBAL(cd3)]
    122     paddd     xmm2, [GLOBAL(cd3)]
    123     paddd     xmm1, xmm6
    124     paddd     xmm3, xmm7
    125     paddd     xmm1, [GLOBAL(cd3)]
    126     paddd     xmm3, [GLOBAL(cd3)]
    127 
            ; x >>= 3 (arithmetic), then gather each letter into one register
    128     psrad     xmm0, 3
    129     psrad     xmm1, 3
    130     psrad     xmm2, 3
    131     psrad     xmm3, 3
    132     movdqa    xmm4, xmm0
    133     punpcklqdq xmm0, xmm1           ; a23 a22 a21 a20
    134     punpckhqdq xmm4, xmm1           ; b23 b22 b21 b20
    135     movdqa    xmm5, xmm2
    136     punpckhqdq xmm2, xmm3           ; c23 c22 c21 c20
    137     punpcklqdq xmm5, xmm3           ; d23 d22 d21 d20
    138 
            ; narrow the 32-bit lanes back to 16 bits with signed saturation
    139     packssdw  xmm0, xmm4            ; b23 b22 b21 b20 a23 a22 a21 a20
    140     packssdw  xmm2, xmm5            ; d23 d22 d21 d20 c23 c22 c21 c20
    141 
            ; output words 0..7 = a2, b2 and words 8..15 = c2, d2; movdqa
            ; requires a 16-byte aligned address
    142     movdqa  XMMWORD PTR [rdi], xmm0
    143     movdqa  XMMWORD PTR [rdi + 16], xmm2
    144 
    145     ; begin epilog
            ; undoes the prolog steps in reverse order
    146     pop rdi
    147     pop rsi
    148     RESTORE_GOT
    149     RESTORE_XMM
    150     UNSHADOW_ARGS
    151     pop         rbp
    152     ret
    153 
    154 SECTION_RODATA                  ; 16-byte aligned constants used through GLOBAL()
    155 align 16
    156 c1:                             ; eight words of +1
    157     times 8 dw 0x0001
    158 align 16
    159 cn1:                            ; four (+1, -1) word pairs
    160     times 4 dw 0x0001, 0xffff
    161 align 16
    162 cd1:                            ; four dwords of 1
    163     times 4 dd 0x00000001
    164 align 16
    165 cd3:                            ; four dwords of 3
    166     times 4 dd 0x00000003
    167