;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

     14 %macro STACK_FRAME_CREATE 0
     15 %if ABI_IS_32BIT
     16   %define       input       rsi
     17   %define       output      rdi
     18   %define       pitch       rax
     19     push        rbp
     20     mov         rbp, rsp
     21     GET_GOT     rbx
     22     push        rsi
     23     push        rdi
     24     ; end prolog
     25 
     26     mov         rsi, arg(0)
     27     mov         rdi, arg(1)
     28 
     29     movsxd      rax, dword ptr arg(2)
     30     lea         rcx, [rsi + rax*2]
     31 %else
     32   %ifidn __OUTPUT_FORMAT__,x64
     33     %define     input       rcx
     34     %define     output      rdx
     35     %define     pitch       r8
     36     SAVE_XMM 7, u
     37   %else
     38     %define     input       rdi
     39     %define     output      rsi
     40     %define     pitch       rdx
     41   %endif
     42 %endif
     43 %endmacro
     44 
     45 %macro STACK_FRAME_DESTROY 0
     46   %define     input
     47   %define     output
     48   %define     pitch
     49 
     50 %if ABI_IS_32BIT
     51     pop         rdi
     52     pop         rsi
     53     RESTORE_GOT
     54     pop         rbp
     55 %else
     56   %ifidn __OUTPUT_FORMAT__,x64
     57     RESTORE_XMM
     58   %endif
     59 %endif
     60     ret
     61 %endmacro
     62 
     63 ;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
     64 global sym(vp8_short_fdct4x4_sse2) PRIVATE
     65 sym(vp8_short_fdct4x4_sse2):
     66 
     67     STACK_FRAME_CREATE
     68 
     69     movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
     70     movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
     71     lea         input,          [input+2*pitch]
     72     movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
     73     movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30
     74 
     75     punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
     76     punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20
     77 
     78     movdqa      xmm2, xmm0
     79     punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
     80     punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
     81     movdqa      xmm1, xmm0
     82     punpckldq   xmm0, xmm2                      ;31 21 30 20 11 10 01 00
     83     pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
     84     pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx
     85 
     86     punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
     87     movdqa      xmm3, xmm0
     88     paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
     89     psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
     90     psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
     91     psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3
     92 
     93     movdqa      xmm1, xmm0
     94     pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
     95     pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
     96     movdqa      xmm4, xmm3
     97     pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ;c1*2217 + d1*5352
     98     pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
     99 
    100     paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
    101     paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
    102     psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
    103     psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12
    104 
    105     packssdw    xmm0, xmm1                      ;op[2] op[0]
    106     packssdw    xmm3, xmm4                      ;op[3] op[1]
    107     ; 23 22 21 20 03 02 01 00
    108     ;
    109     ; 33 32 31 30 13 12 11 10
    110     ;
    111     movdqa      xmm2, xmm0
    112     punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    113     punpckhqdq  xmm2, xmm3                      ;23 22 21 20 33 32 31 30
    114 
    115     movdqa      xmm3, xmm0
    116     punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
    117     punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
    118     movdqa      xmm2, xmm0
    119     punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    120     punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20
    121 
    122     movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
    123     pshufd      xmm2, xmm2, 04eh
    124     movdqa      xmm3, xmm0
    125     paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
    126     psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1
    127 
    128     pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
    129     movdqa      xmm2, xmm3                      ;save d1 for compare
    130     pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
    131     pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
    132     pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
    133     pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
    134     pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
    135     movdqa      xmm1, xmm0
    136     pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
    137     pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
    138 
    139     pxor        xmm4, xmm4                      ;zero out for compare
    140     paddd       xmm0, xmm5
    141     paddd       xmm1, xmm5
    142     pcmpeqw     xmm2, xmm4
    143     psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
    144     psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
    145     pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
    146                                                      ;and keep bit 0 of lower
    147 
    148     movdqa      xmm4, xmm3
    149     pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
    150     pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
    151     paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
    152     paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
    153     packssdw    xmm0, xmm1                      ;op[8] op[0]
    154     psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
    155     psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16
    156 
    157     packssdw    xmm3, xmm4                      ;op[12] op[4]
    158     movdqa      xmm1, xmm0
    159     paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
    160     punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
    161     punpckhqdq  xmm1, xmm3                      ;op[12] op[8]
    162 
    163     movdqa      XMMWORD PTR[output +  0], xmm0
    164     movdqa      XMMWORD PTR[output + 16], xmm1
    165 
    166     STACK_FRAME_DESTROY
    167 
    168 ;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
    169 global sym(vp8_short_fdct8x4_sse2) PRIVATE
    170 sym(vp8_short_fdct8x4_sse2):
    171 
    172     STACK_FRAME_CREATE
    173 
    174         ; read the input data
    175         movdqa      xmm0,       [input        ]
    176         movdqa      xmm2,       [input+  pitch]
    177         lea         input,      [input+2*pitch]
    178         movdqa      xmm4,       [input        ]
    179         movdqa      xmm3,       [input+  pitch]
    180 
    181         ; transpose for the first stage
    182         movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
    183         movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27
    184 
    185         punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
    186         punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17
    187 
    188         punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
    189         punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37
    190 
    191         movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
    192         punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31
    193 
    194         punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33
    195 
    196         movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
    197         punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35
    198 
    199         punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
    200         movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33
    201 
    202         punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
    203         punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36
    204 
    205         movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
    206         punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34
    207 
    208         punpckhqdq  xmm1,       xmm4        ; 01 11 21 32 05 15 25 35
    209 
    210         ; xmm0 0
    211         ; xmm1 1
    212         ; xmm2 2
    213         ; xmm3 3
    214 
    215         ; first stage
    216         movdqa      xmm5,       xmm0
    217         movdqa      xmm4,       xmm1
    218 
    219         paddw       xmm0,       xmm3        ; a1 = 0 + 3
    220         paddw       xmm1,       xmm2        ; b1 = 1 + 2
    221 
    222         psubw       xmm4,       xmm2        ; c1 = 1 - 2
    223         psubw       xmm5,       xmm3        ; d1 = 0 - 3
    224 
    225         psllw       xmm5,        3
    226         psllw       xmm4,        3
    227 
    228         psllw       xmm0,        3
    229         psllw       xmm1,        3
    230 
    231         ; output 0 and 2
    232         movdqa      xmm2,       xmm0        ; a1
    233 
    234         paddw       xmm0,       xmm1        ; op[0] = a1 + b1
    235         psubw       xmm2,       xmm1        ; op[2] = a1 - b1
    236 
    237         ; output 1 and 3
    238         ; interleave c1, d1
    239         movdqa      xmm1,       xmm5        ; d1
    240         punpcklwd   xmm1,       xmm4        ; c1 d1
    241         punpckhwd   xmm5,       xmm4        ; c1 d1
    242 
    243         movdqa      xmm3,       xmm1
    244         movdqa      xmm4,       xmm5
    245 
    246         pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
    247         pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
    248 
    249         pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
    250         pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
    251 
    252         paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
    253         paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
    254         paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
    255         paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]
    256 
    257         psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
    258         psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
    259         psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
    260         psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
    261 
    262         packssdw    xmm1,       xmm4        ; op[1]
    263         packssdw    xmm3,       xmm5        ; op[3]
    264 
    265         ; done with vertical
    266         ; transpose for the second stage
    267         movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
    268         movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36
    269 
    270         punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
    271         punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
    272 
    273         punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
    274         punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
    275 
    276         movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
    277         punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13
    278 
    279         punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33
    280 
    281         movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
    282         punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17
    283 
    284         punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
    285         movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33
    286 
    287         punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
    288         punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27
    289 
    290         movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
    291         punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07
    292 
    293         punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17
    294 
    295         ; xmm0 0
    296         ; xmm1 4
    297         ; xmm2 1
    298         ; xmm3 3
    299 
    300         movdqa      xmm5,       xmm0
    301         movdqa      xmm2,       xmm1
    302 
    303         paddw       xmm0,       xmm3        ; a1 = 0 + 3
    304         paddw       xmm1,       xmm4        ; b1 = 1 + 2
    305 
    306         psubw       xmm4,       xmm2        ; c1 = 1 - 2
    307         psubw       xmm5,       xmm3        ; d1 = 0 - 3
    308 
    309         pxor        xmm6,       xmm6        ; zero out for compare
    310 
    311         pcmpeqw     xmm6,       xmm5        ; d1 != 0
    312 
    313         pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; clear upper,
    314                                                                     ; and keep bit 0 of lower
    315 
    316         ; output 0 and 2
    317         movdqa      xmm2,       xmm0        ; a1
    318 
    319         paddw       xmm0,       xmm1        ; a1 + b1
    320         psubw       xmm2,       xmm1        ; a1 - b1
    321 
    322         paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
    323         paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]
    324 
    325         psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
    326         psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4
    327 
    328         ; output 1 and 3
    329         ; interleave c1, d1
    330         movdqa      xmm1,       xmm5        ; d1
    331         punpcklwd   xmm1,       xmm4        ; c1 d1
    332         punpckhwd   xmm5,       xmm4        ; c1 d1
    333 
    334         movdqa      xmm3,       xmm1
    335         movdqa      xmm4,       xmm5
    336 
    337         pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
    338         pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
    339 
    340         pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
    341         pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
    342 
    343         paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
    344         paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
    345         paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
    346         paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]
    347 
    348         psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  14500)>>16
    349         psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  14500)>>16
    350         psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +   7500)>>16
    351         psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +   7500)>>16
    352 
    353         packssdw    xmm1,       xmm4        ; op[4]
    354         packssdw    xmm3,       xmm5        ; op[12]
    355 
    356         paddw       xmm1,       xmm6        ; op[4] += (d1!=0)
    357 
    358         movdqa      xmm4,       xmm0
    359         movdqa      xmm5,       xmm2
    360 
    361         punpcklqdq  xmm0,       xmm1
    362         punpckhqdq  xmm4,       xmm1
    363 
    364         punpcklqdq  xmm2,       xmm3
    365         punpckhqdq  xmm5,       xmm3
    366 
    367         movdqa      XMMWORD PTR[output + 0 ],  xmm0
    368         movdqa      XMMWORD PTR[output + 16],  xmm2
    369         movdqa      XMMWORD PTR[output + 32],  xmm4
    370         movdqa      XMMWORD PTR[output + 48],  xmm5
    371 
    372     STACK_FRAME_DESTROY
    373 
    374 SECTION_RODATA
    375 align 16
    376 _5352_2217:
    377     dw 5352
    378     dw 2217
    379     dw 5352
    380     dw 2217
    381     dw 5352
    382     dw 2217
    383     dw 5352
    384     dw 2217
    385 align 16
    386 _2217_neg5352:
    387     dw 2217
    388     dw -5352
    389     dw 2217
    390     dw -5352
    391     dw 2217
    392     dw -5352
    393     dw 2217
    394     dw -5352
    395 align 16
    396 _mult_add:
    397     times 8 dw 1
    398 align 16
    399 _cmp_mask:
    400     times 4 dw 1
    401     times 4 dw 0
    402 align 16
    403 _cmp_mask8x4:
    404     times 8 dw 1
    405 align 16
    406 _mult_sub:
    407     dw 1
    408     dw -1
    409     dw 1
    410     dw -1
    411     dw 1
    412     dw -1
    413     dw 1
    414     dw -1
    415 align 16
    416 _7:
    417     times 4 dd 7
    418 align 16
    419 _7w:
    420     times 8 dw 7
    421 align 16
    422 _14500:
    423     times 4 dd 14500
    424 align 16
    425 _7500:
    426     times 4 dd 7500
    427 align 16
    428 _12000:
    429     times 4 dd 12000
    430 align 16
    431 _51000:
    432     times 4 dd 51000
    433