      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 ;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
        ;
        ; Two-pass 4x4 forward transform of 16-bit samples using SSE2.
        ;
        ;   arg(0) input  - pointer to the first of four rows; each row is read
        ;                   as four 16-bit words (one 8-byte movq per row).
        ;   arg(1) output - receives 16 words through two movdqa stores, so it
        ;                   has to be 16-byte aligned.
        ;   arg(2) pitch  - sign-extended and used as the BYTE offset between
        ;                   consecutive input rows.
        ;
        ; For each group of four values x0..x3 both passes form
        ;   a1 = x0 + x3,  b1 = x1 + x2,  c1 = x1 - x2,  d1 = x0 - x3
        ; Pass 1 (along each input row; a1..d1 are first shifted left by 3):
        ;   op[0] = a1 + b1                  op[2] = a1 - b1
        ;   op[1] = (c1*2217 + d1*5352 + 14500) >> 12
        ;   op[3] = (d1*2217 - c1*5352 +  7500) >> 12
        ; Pass 2 (down each column of the pass-1 result):
        ;   op[0]  = (a1 + b1 + 7) >> 4      op[8] = (a1 - b1 + 7) >> 4
        ;   op[4]  = ((c1*2217 + d1*5352 + 12000) >> 16) + (d1 != 0)
        ;   op[12] = (d1*2217 - c1*5352 + 51000) >> 16
        ; 32-bit intermediates are narrowed with packssdw (signed saturation).
        ;
        ; Lane comments list the eight words from most- to least-significant.
        ; "rc" means row r, element c unless a comment says otherwise.
        ; Only xmm0-xmm5, rax, rsi and rdi are written by the instructions
        ; below; rsi and rdi are pushed in the prolog and popped in the epilog.
     15 global sym(vp8_short_fdct4x4_sse2)
     16 sym(vp8_short_fdct4x4_sse2):
     17     push        rbp
     18     mov         rbp, rsp
            ; SHADOW_ARGS_TO_STACK / GET_GOT / arg() / GLOBAL() are macros from
            ; x86_abi_support.asm, not visible here. Presumably they make the
            ; three arguments readable through arg(n) and set up rbx for
            ; GLOBAL() in PIC builds -- TODO confirm against that file.
     19     SHADOW_ARGS_TO_STACK 3
            ; SAVE_XMM is left disabled; note that no register above xmm5 is
            ; touched in this routine.
     20 ;;    SAVE_XMM
     21     GET_GOT     rbx
     22     push        rsi
     23     push        rdi
     24     ; end prolog
     25 
     26     mov         rsi, arg(0)
     27     movsxd      rax, DWORD PTR arg(2)
            ; rdi = input + 2*pitch, so [rdi + rax] below is row 3
     28     lea         rdi, [rsi + rax*2]
     29 
            ; load the four input rows, four words each, into the low qwords
     30     movq        xmm0, MMWORD PTR[rsi   ]        ;03 02 01 00
     31     movq        xmm2, MMWORD PTR[rsi + rax]     ;13 12 11 10
     32     movq        xmm1, MMWORD PTR[rsi + rax*2]   ;23 22 21 20
     33     movq        xmm3, MMWORD PTR[rdi + rax]     ;33 32 31 30
     34 
     35     punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
     36     punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20
     37 
            ; input has been fully read; rdi is reused as the output pointer
     38     mov         rdi, arg(1)
     39 
            ; Rearrange so that xmm0 holds elements (0,1) of every row and xmm1
            ; holds elements (3,2) of every row; one paddw/psubw then yields
            ; the a1/b1 and d1/c1 pairs for all four rows.
     40     movdqa      xmm2, xmm0
     41     punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
     42     punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
     43     movdqa      xmm1, xmm0
     44     punpckldq   xmm0, xmm2                      ;31 30 21 20 11 10 01 00
            ; 0b1h is the hex immediate B1h (h suffix), not a binary literal:
            ; it swaps the two words inside each dword of the high half.
     45     pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
     46     pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx
     47 
     48     punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
     49     movdqa      xmm3, xmm0
     50     paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
     51     psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
     52     psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
     53     psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3
            ; pmaddwd multiplies word pairs and adds them into one dword per
            ; row, so the +1/+1 and +1/-1 tables give a1+b1 and a1-b1.
     54     movdqa      xmm1, xmm0
     55     pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
     56     pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
     57     movdqa      xmm4, xmm3
     58     pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ;c1*2217 + d1*5352
     59     pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
     60 
     61     paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
     62     paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
     63     psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
     64     psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12
     65 
            ; narrow to words: low qword = first operand's four rows,
            ; high qword = second operand's four rows
     66     packssdw    xmm0, xmm1                      ;op[2] op[0]
     67     packssdw    xmm3, xmm4                      ;op[3] op[1]
     68     ; xmm0: 23 22 21 20 03 02 01 00   (in the lane comments from here to
     69     ;                                  the punpckhqdq, "cr" = pass-1
     70     ; xmm3: 33 32 31 30 13 12 11 10    op[c] of row r)
     71     ;
     72     movdqa      xmm2, xmm0
     73     punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
     74     punpckhqdq  xmm2, xmm3                      ;33 32 31 30 23 22 21 20
     75 
            ; Two rounds of word interleaving put the pass-1 result back in
            ; row order; lane comments are "rc" (row r, op[c]) again.
     76     movdqa      xmm3, xmm0
     77     punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
     78     punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
     79     movdqa      xmm2, xmm0
     80     punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
     81     punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20
     82 
            ; Pass 2. xmm5 = rounding constant 7 in each dword.
     83     movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
            ; 04eh swaps the two qwords: xmm2 = row2 (high) | row3 (low), so
            ; row0 pairs with row3 and row1 pairs with row2 below.
     84     pshufd      xmm2, xmm2, 04eh
     85     movdqa      xmm3, xmm0
     86     paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
     87     psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1
     88 
            ; The 0d8h shuffles interleave the values so that each dword holds
            ; one column's (a1, b1) or (d1, c1) pair, ready for pmaddwd.
     89     pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
     90     movdqa      xmm2, xmm3                      ;save d1 for compare
     91     pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
     92     pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
     93     pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
     94     pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
     95     pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
     96     movdqa      xmm1, xmm0
     97     pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
     98     pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
     99 
    100     pxor        xmm4, xmm4                      ;zero out for compare
    101     paddd       xmm0, xmm5
    102     paddd       xmm1, xmm5
            ; xmm2 words become 0FFFFh where the saved value is zero ...
    103     pcmpeqw     xmm2, xmm4
    104     psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
    105     psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
            ; ... and pandn inverts that, leaving 1 in each low word whose
            ; d1 is non-zero and 0 in the four high (c1) words.
    106     pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
    107                                                      ;and keep bit 0 of lower
    108 
    109     movdqa      xmm4, xmm3
    110     pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
    111     pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
    112     paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
    113     paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
    114     packssdw    xmm0, xmm1                      ;op[8] op[0]
    115     psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
    116     psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16
    117 
    118     packssdw    xmm3, xmm4                      ;op[12] op[4]
    119     movdqa      xmm1, xmm0
    120     paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
            ; regroup the four output rows into memory order
    121     punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
    122     punpckhqdq  xmm1, xmm3                      ;op[12] op[8]
    123 
            ; aligned stores: output words 0-7, then words 8-15
    124     movdqa      XMMWORD PTR[rdi + 0], xmm0
    125     movdqa      XMMWORD PTR[rdi + 16], xmm1
    126 
    127     ; begin epilog
    128     pop rdi
    129     pop rsi
    130     RESTORE_GOT
    131 ;;    RESTORE_XMM
    132     UNSHADOW_ARGS
    133     pop         rbp
    134     ret
    135 
    136 SECTION_RODATA
        ; Constant operands for vp8_short_fdct4x4_sse2. Every table below is
        ; exactly 16 bytes and 16-byte aligned because the routine reads it as
        ; a full XMMWORD memory operand (pmaddwd / paddd / pandn / movdqa).
        ;
        ; pmaddwd weights for the (d1, c1) word pairs: d1*5352 + c1*2217.
    137 align 16
    138 _5352_2217:
    139     dw 5352
    140     dw 2217
    141     dw 5352
    142     dw 2217
    143     dw 5352
    144     dw 2217
    145     dw 5352
    146     dw 2217
        ; pmaddwd weights for the (d1, c1) word pairs: d1*2217 - c1*5352.
    147 align 16
    148 _2217_neg5352:
    149     dw 2217
    150     dw -5352
    151     dw 2217
    152     dw -5352
    153     dw 2217
    154     dw -5352
    155     dw 2217
    156     dw -5352
        ; pmaddwd weights of +1/+1: sum of each word pair (a1 + b1).
    157 align 16
    158 _mult_add:
    159     times 8 dw 1
        ; pandn mask: 1 in the four low words, 0 in the four high words.
        ; Used to build the per-column (d1 != 0) increment for op[4].
    160 align 16
    161 _cmp_mask:
    162     times 4 dw 1
    163     times 4 dw 0
    164 
        ; pmaddwd weights of +1/-1: low word minus high word (a1 - b1).
    165 align 16
    166 _mult_sub:
    167     dw 1
    168     dw -1
    169     dw 1
    170     dw -1
    171     dw 1
    172     dw -1
    173     dw 1
    174     dw -1
        ; Dword rounding terms, added before the arithmetic right shifts:
        ;   _7     -> pass 2, before >> 4
    175 align 16
    176 _7:
    177     times 4 dd 7
        ;   _14500 -> pass 1, c1*2217 + d1*5352 term, before >> 12
    178 align 16
    179 _14500:
    180     times 4 dd 14500
        ;   _7500  -> pass 1, d1*2217 - c1*5352 term, before >> 12
    181 align 16
    182 _7500:
    183     times 4 dd 7500
        ;   _12000 -> pass 2, c1*2217 + d1*5352 term, before >> 16
    184 align 16
    185 _12000:
    186     times 4 dd 12000
        ;   _51000 -> pass 2, d1*2217 - c1*5352 term, before >> 16
    187 align 16
    188 _51000:
    189     times 4 dd 51000
    190