;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

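; STACK_FRAME_CREATE / STACK_FRAME_DESTROY hide the calling-convention
; differences between the supported ABIs: on 32-bit targets the arguments
; are read from the stack into rsi/rdi/rax, on Win64 they arrive in
; rcx/rdx/r8, and on the SysV AMD64 ABI in rdi/rsi/rdx.  The macros map
; them onto the common names input/output/pitch used below, and DESTROY
; undoes the prolog and returns.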
%macro STACK_FRAME_CREATE 0
%if ABI_IS_32BIT
  %define       input       rsi
  %define       output      rdi
  %define       pitch       rax
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)
    mov         rdi, arg(1)

    movsxd      rax, dword ptr arg(2)
    lea         rcx, [rsi + rax*2]
%else
  %ifidn __OUTPUT_FORMAT__,x64
    %define     input       rcx
    %define     output      rdx
    %define     pitch       r8
  %else
    %define     input       rdi
    %define     output      rsi
    %define     pitch       rdx
  %endif
%endif
%endmacro

%macro STACK_FRAME_DESTROY 0
  %define     input
  %define     output
  %define     pitch

%if ABI_IS_32BIT
    pop         rdi
    pop         rsi
    RESTORE_GOT
    pop         rbp
%else
  %ifidn __OUTPUT_FORMAT__,x64
  %endif
%endif
    ret
%endmacro

;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
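;
; For reference, a scalar sketch of the transform this routine vectorizes.
; It is reconstructed from the shifts and rounding constants used below
; (illustrative only, the name is made up; the project's C implementation
; lives elsewhere in the tree and may differ in detail):
;
;   void fdct4x4_ref(short *input, short *output, int pitch)
;   {
;       short *ip = input, *op = output;
;       int i, a1, b1, c1, d1;
;       for (i = 0; i < 4; i++)            /* rows; pitch is in bytes */
;       {
;           a1 = (ip[0] + ip[3]) * 8;
;           b1 = (ip[1] + ip[2]) * 8;
;           c1 = (ip[1] - ip[2]) * 8;
;           d1 = (ip[0] - ip[3]) * 8;
;           op[0] = a1 + b1;
;           op[2] = a1 - b1;
;           op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
;           op[3] = (d1 * 2217 - c1 * 5352 +  7500) >> 12;
;           ip += pitch / 2;
;           op += 4;
;       }
;       ip = op = output;
;       for (i = 0; i < 4; i++)            /* columns */
;       {
;           a1 = ip[0] + ip[12];
;           b1 = ip[4] + ip[8];
;           c1 = ip[4] - ip[8];
;           d1 = ip[0] - ip[12];
;           op[0]  = (a1 + b1 + 7) >> 4;
;           op[8]  = (a1 - b1 + 7) >> 4;
;           op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
;           op[12] =  (d1 * 2217 - c1 * 5352 + 51000) >> 16;
;           ip++;
;           op++;
;       }
;   }
;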
global sym(vp8_short_fdct4x4_sse2)
sym(vp8_short_fdct4x4_sse2):

    STACK_FRAME_CREATE

    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
    lea         input,          [input+2*pitch]
    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30

    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20

    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2                      ;31 21 30 20 11 10 01 00
    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx

    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3

    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352

    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12

    packssdw    xmm0, xmm1                      ;op[2] op[0]
    packssdw    xmm3, xmm4                      ;op[3] op[1]
    ; 23 22 21 20 03 02 01 00
    ;
    ; 33 32 31 30 13 12 11 10
    ;
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhqdq  xmm2, xmm3                      ;23 22 21 20 33 32 31 30

    movdqa      xmm3, xmm0
    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20

    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
    pshufd      xmm2, xmm2, 04eh
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1

    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
    movdqa      xmm2, xmm3                      ;save d1 for compare
    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1

    pxor        xmm4, xmm4                      ;zero out for compare
    paddd       xmm0, xmm5
    paddd       xmm1, xmm5
    pcmpeqw     xmm2, xmm4
    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
                                                     ;and keep bit 0 of lower
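    ;xmm2 now has 1 in each low lane where d1 != 0, i.e. the (d1 != 0)
    ;term that is added into op[4] below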

    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
    packssdw    xmm0, xmm1                      ;op[8] op[0]
    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16

    packssdw    xmm3, xmm4                      ;op[12] op[4]
    movdqa      xmm1, xmm0
    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]

    movdqa      XMMWORD PTR[output +  0], xmm0
    movdqa      XMMWORD PTR[output + 16], xmm1

    STACK_FRAME_DESTROY

;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
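;
; Same transform applied to a pair of horizontally adjacent 4x4 blocks
; (8 input columns at once).  The first block's 16 coefficients are written
; to output[0..15] and the second block's to output[16..31], so the effect
; is roughly that of running the 4x4 routine on input and input + 4 with
; output and output + 16.
;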
global sym(vp8_short_fdct8x4_sse2)
sym(vp8_short_fdct8x4_sse2):

    STACK_FRAME_CREATE

        ; read the input data
        movdqa      xmm0,       [input        ]
        movdqa      xmm2,       [input+  pitch]
        lea         input,      [input+2*pitch]
        movdqa      xmm4,       [input        ]
        movdqa      xmm3,       [input+  pitch]

        ; transpose for the first stage
        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27

        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17

        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37

        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31

        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33

        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35

        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33

        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36

        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34

        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35

        ; xmm0 0
        ; xmm1 1
        ; xmm2 2
        ; xmm3 3

        ; first stage
        movdqa      xmm5,       xmm0
        movdqa      xmm4,       xmm1

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm2        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        psllw       xmm5,        3
        psllw       xmm4,        3

        psllw       xmm0,        3
        psllw       xmm1,        3

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
        psubw       xmm2,       xmm1        ; op[2] = a1 - b1

        ; output 1 and 3
        ; interleave c1, d1
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; c1 d1
        punpckhwd   xmm5,       xmm4        ; c1 d1

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]

        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12

        packssdw    xmm1,       xmm4        ; op[1]
        packssdw    xmm3,       xmm5        ; op[3]

        ; done with vertical
        ; transpose for the second stage
        movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
        movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36

        punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
        punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35

        punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37

        movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
        punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13

        punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33

        movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
        punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17

        punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
        movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33

        punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
        punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27

        movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
        punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07

        punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17

        ; xmm0 0
        ; xmm4 1
        ; xmm1 2
        ; xmm3 3

        movdqa      xmm5,       xmm0
        movdqa      xmm2,       xmm1

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm4        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        pxor        xmm6,       xmm6        ; zero out for compare

        pcmpeqw     xmm6,       xmm5        ; d1 != 0

        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; clear upper,
                                                                    ; and keep bit 0 of lower
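        ; xmm6 now has 1 in each lane where d1 != 0, i.e. the (d1 != 0)
        ; term that is added into op[4] below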

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; a1 + b1
        psubw       xmm2,       xmm1        ; a1 - b1

        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]

        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4

        ; output 1 and 3
        ; interleave c1, d1
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; c1 d1
        punpckhwd   xmm5,       xmm4        ; c1 d1

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]

        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16

        packssdw    xmm1,       xmm4        ; op[4]
        packssdw    xmm3,       xmm5        ; op[12]

        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)

        movdqa      xmm4,       xmm0
        movdqa      xmm5,       xmm2

        punpcklqdq  xmm0,       xmm1
        punpckhqdq  xmm4,       xmm1

        punpcklqdq  xmm2,       xmm3
        punpckhqdq  xmm5,       xmm3

        movdqa      XMMWORD PTR[output + 0 ],  xmm0
        movdqa      XMMWORD PTR[output + 16],  xmm2
        movdqa      XMMWORD PTR[output + 32],  xmm4
        movdqa      XMMWORD PTR[output + 48],  xmm5

    STACK_FRAME_DESTROY

SECTION_RODATA
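; The multiplier pairs below are approximately sqrt(2)*cos(pi/8) and
; sqrt(2)*sin(pi/8) in Q12 fixed point (5352/4096 ~= 1.3066,
; 2217/4096 ~= 0.5412); 14500/7500 are the first-pass rounding offsets
; (used with >>12) and 12000/51000 the second-pass offsets (used with >>16).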
align 16
_5352_2217:
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
align 16
_2217_neg5352:
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
align 16
_mult_add:
    times 8 dw 1
align 16
_cmp_mask:
    times 4 dw 1
    times 4 dw 0
align 16
_cmp_mask8x4:
    times 8 dw 1
align 16
_mult_sub:
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
align 16
_7:
    times 4 dd 7
align 16
_7w:
    times 8 dw 7
align 16
_14500:
    times 4 dd 14500
align 16
_7500:
    times 4 dd 7500
align 16
_12000:
    times 4 dd 12000
align 16
_51000:
    times 4 dd 51000