;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

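; STACK_FRAME_CREATE / STACK_FRAME_DESTROY map the three C arguments
; (input, output, pitch) onto registers for each supported ABI (x86-32,
; Win64 and SysV x86-64) and emit the matching prolog/epilog.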
%macro STACK_FRAME_CREATE 0
%if ABI_IS_32BIT
  %define       input       rsi
  %define       output      rdi
  %define       pitch       rax
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)
    mov         rdi, arg(1)

    movsxd      rax, dword ptr arg(2)
    lea         rcx, [rsi + rax*2]
%else
  %if LIBVPX_YASM_WIN64
    %define     input       rcx
    %define     output      rdx
    %define     pitch       r8
    SAVE_XMM 7, u
  %else
    %define     input       rdi
    %define     output      rsi
    %define     pitch       rdx
  %endif
%endif
%endmacro

%macro STACK_FRAME_DESTROY 0
  %define     input
  %define     output
  %define     pitch

%if ABI_IS_32BIT
    pop         rdi
    pop         rsi
    RESTORE_GOT
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro

SECTION .text

;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
global sym(vp8_short_fdct4x4_sse2) PRIVATE
sym(vp8_short_fdct4x4_sse2):

    STACK_FRAME_CREATE

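    ; Outline: the four 4-sample rows are shuffled so both passes of the
    ; 4x4 FDCT run on packed words.  The first pass forms a1/b1/c1/d1 per
    ; row, scales by 8 (<<3) and rounds the odd outputs with
    ; (+14500/+7500)>>12; after a transpose the second pass rounds the even
    ; outputs with (+7)>>4, the odd outputs with (+12000/+51000)>>16, and
    ; adds the (d1 != 0) correction to op[4].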
    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
    lea         input,          [input+2*pitch]
    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30

    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20

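    ; Interleave the rows so one register holds the (x0, x1) pair and the
    ; other the (x3, x2) pair of every row; a single paddw/psubw below then
    ; yields a1/b1 and d1/c1 for all four rows at once.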
    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2                      ;31 30 21 20 11 10 01 00
    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx

    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3

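    ; First-pass outputs: pmaddwd against (1, 1) and (1, -1) gives a1+b1 and
    ; a1-b1 (op[0]/op[2]); pmaddwd against the (5352, 2217) and (2217, -5352)
    ; pairs gives the rotated odd outputs (op[1]/op[3]), rounded below with
    ; +14500 / +7500 and shifted down by 12.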
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352

    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12

    packssdw    xmm0, xmm1                      ;op[2] op[0]
    packssdw    xmm3, xmm4                      ;op[3] op[1]
    ; 23 22 21 20 03 02 01 00
    ;
    ; 33 32 31 30 13 12 11 10
    ;
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhqdq  xmm2, xmm3                      ;23 22 21 20 33 32 31 30

    movdqa      xmm3, xmm0
    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20

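    ; Second pass (down the columns): even outputs are rounded with (+7)>>4,
    ; odd outputs with (+12000/+51000)>>16, and op[4] additionally gets the
    ; "+ (d1 != 0)" term built by the compare/mask below.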
    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
    pshufd      xmm2, xmm2, 04eh
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1

    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
    movdqa      xmm2, xmm3                      ;save d1 for compare
    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1

    pxor        xmm4, xmm4                      ;zero out for compare
    paddd       xmm0, xmm5
    paddd       xmm1, xmm5
    pcmpeqw     xmm2, xmm4
    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
                                                     ;and keep bit 0 of lower
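    ; pcmpeqw left all-ones only in lanes where d1 == 0, so after the pandn
    ; xmm2 holds 1 exactly in the low (op[4]) lanes where d1 != 0; adding it
    ; to op[4] below implements the "+ (d1 != 0)" term.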

    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
    packssdw    xmm0, xmm1                      ;op[8] op[0]
    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16

    packssdw    xmm3, xmm4                      ;op[12] op[4]
    movdqa      xmm1, xmm0
    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]

    movdqa      XMMWORD PTR[output +  0], xmm0
    movdqa      XMMWORD PTR[output + 16], xmm1

    STACK_FRAME_DESTROY

;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
global sym(vp8_short_fdct8x4_sse2) PRIVATE
sym(vp8_short_fdct8x4_sse2):

    STACK_FRAME_CREATE

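        ; Same transform as vp8_short_fdct4x4_sse2, but a full 8-short row is
        ; loaded per line, so it covers an 8x4 area (two 4x4 blocks side by
        ; side) in one pass and writes 32 coefficients to output.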
        ; read the input data
        movdqa      xmm0,       [input        ]
        movdqa      xmm2,       [input+  pitch]
        lea         input,      [input+2*pitch]
        movdqa      xmm4,       [input        ]
        movdqa      xmm3,       [input+  pitch]

        ; transpose for the first stage
        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27

        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17

        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37

        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31

        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33

        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35

        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33

        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36

        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34

        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35

        ; xmm0 0
        ; xmm1 1
        ; xmm2 2
        ; xmm3 3

        ; first stage
        movdqa      xmm5,       xmm0
        movdqa      xmm4,       xmm1

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm2        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        psllw       xmm5,        3
        psllw       xmm4,        3

        psllw       xmm0,        3
        psllw       xmm1,        3

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
        psubw       xmm2,       xmm1        ; op[2] = a1 - b1

        ; output 1 and 3
        ; interleave c1, d1
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; c1 d1
        punpckhwd   xmm5,       xmm4        ; c1 d1

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]

        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12

        packssdw    xmm1,       xmm4        ; op[1]
        packssdw    xmm3,       xmm5        ; op[3]

        ; done with vertical
        ; transpose for the second stage
        movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
        movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36

        punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
        punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35

        punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37

        movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
        punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13

        punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33

        movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
        punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17

        punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
        movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33

        punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
        punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27

        movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
        punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07

        punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17

        ; xmm0 0
        ; xmm1 2
        ; xmm3 3
        ; xmm4 1

        movdqa      xmm5,       xmm0
        movdqa      xmm2,       xmm1

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm4        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        pxor        xmm6,       xmm6        ; zero out for compare

        pcmpeqw     xmm6,       xmm5        ; d1 != 0

        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; clear upper,
                                                                    ; and keep bit 0 of lower
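        ; pcmpeqw left all-ones where d1 == 0, so xmm6 now holds 1 in every
        ; lane where d1 != 0; it is added to op[4] below to implement the
        ; "+ (d1 != 0)" term.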

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; a1 + b1
        psubw       xmm2,       xmm1        ; a1 - b1

        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]

        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4

        ; output 1 and 3
        ; interleave c1, d1
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; c1 d1
        punpckhwd   xmm5,       xmm4        ; c1 d1

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]
        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16

        packssdw    xmm1,       xmm4        ; op[4]
        packssdw    xmm3,       xmm5        ; op[12]

        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)

        movdqa      xmm4,       xmm0
        movdqa      xmm5,       xmm2

        punpcklqdq  xmm0,       xmm1
        punpckhqdq  xmm4,       xmm1

        punpcklqdq  xmm2,       xmm3
        punpckhqdq  xmm5,       xmm3

        movdqa      XMMWORD PTR[output + 0 ],  xmm0
        movdqa      XMMWORD PTR[output + 16],  xmm2
        movdqa      XMMWORD PTR[output + 32],  xmm4
        movdqa      XMMWORD PTR[output + 48],  xmm5

    STACK_FRAME_DESTROY

SECTION_RODATA
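; Constant tables for both functions.  5352 and 2217 appear to be the usual
; VP8 FDCT rotation constants (roughly sqrt(2)*cos(pi/8)*4096 and
; sqrt(2)*sin(pi/8)*4096); 14500/7500 are the rounding offsets used before
; the first-pass >>12 shifts and 12000/51000 the offsets used before the
; second-pass >>16 shifts.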
align 16
_5352_2217:
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
align 16
_2217_neg5352:
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
align 16
_mult_add:
    times 8 dw 1
align 16
_cmp_mask:
    times 4 dw 1
    times 4 dw 0
align 16
_cmp_mask8x4:
    times 8 dw 1
align 16
_mult_sub:
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
align 16
_7:
    times 4 dd 7
align 16
_7w:
    times 8 dw 7
align 16
_14500:
    times 4 dd 14500
align 16
_7500:
    times 4 dd 7500
align 16
_12000:
    times 4 dd 12000
align 16
_51000:
    times 4 dd 51000
    435