;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

section .text
    global sym(vp8_short_fdct4x4_mmx)
    global sym(vp8_short_fdct8x4_wmt)


%define         DCTCONSTANTSBITS         (16)
%define         DCTROUNDINGVALUE         (1<< (DCTCONSTANTSBITS-1))
%define         x_c1                      (60547)          ; cos(pi  /8) * (1<<16)
%define         x_c2                      (46341)          ; cos(pi*2/8) * (1<<16)
%define         x_c3                      (25080)          ; cos(pi*3/8) * (1<<16)


;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
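; Forward 4x4 DCT, MMX version: reads a 4x4 block of shorts whose rows are
; "pitch" bytes apart and writes the 16 output coefficients contiguously.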
sym(vp8_short_fdct4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog
        mov     rsi,    arg(0) ;input
        mov     rdi,    arg(1) ;output

        lea     rdx,    [GLOBAL(dct_const_mmx)]
        movsxd  rax,    dword ptr arg(2) ;pitch

        lea     rcx,    [rsi + rax*2]
        ; read the input data
        movq    mm0,    [rsi]
        movq    mm1,    [rsi + rax    ]

        movq    mm2,    [rcx]
        movq    mm3,    [rcx + rax]
        ; shift left by 3 bits for precision; the final stage shifts
        ; right by 3 to compensate
        psllw   mm0,    3
        psllw   mm1,    3

        psllw   mm2,    3
        psllw   mm3,    3

        ; transpose the input block
        movq    mm4,    mm0         ; 00 01 02 03
        movq    mm5,    mm2         ; 20 21 22 23

        punpcklwd   mm0,    mm1     ; 00 10 01 11
        punpckhwd   mm4,    mm1     ; 02 12 03 13

        punpcklwd   mm2,    mm3     ; 20 30 21 31
        punpckhwd   mm5,    mm3     ; 22 32 23 33


        movq        mm1,    mm0     ; 00 10 01 11
        punpckldq   mm0,    mm2     ; 00 10 20 30

        punpckhdq   mm1,    mm2     ; 01 11 21 31

        movq        mm2,    mm4     ; 02 12 03 13
        punpckldq   mm2,    mm5     ; 02 12 22 32

        punpckhdq   mm4,    mm5     ; 03 13 23 33
        movq        mm3,    mm4
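        ; mm0..mm3 now hold the transposed block: register n carries
        ; column n of the input, one element per word lane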


        ; first stage
        movq    mm5,    mm0
        movq    mm4,    mm1

        paddw   mm0,    mm3         ; a = 0 + 3
        paddw   mm1,    mm2         ; b = 1 + 2

        psubw   mm4,    mm2         ; c = 1 - 2
        psubw   mm5,    mm3         ; d = 0 - 3


        ; output 0 and 2
        movq    mm6,    [rdx +  16] ; c2
        movq    mm2,    mm0         ; a

        paddw   mm0,    mm1         ; a + b
        psubw   mm2,    mm1         ; a - b

        movq    mm1,    mm0         ; a + b
        pmulhw  mm0,    mm6         ; 00 01 02 03

        paddw   mm0,    mm1         ; output 00 01 02 03
        pmulhw  mm6,    mm2         ; 20 21 22 23

        paddw   mm2,    mm6         ; output 20 21 22 23
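        ; pmulhw keeps the high 16 bits of the signed product, so
        ; x + pmulhw(x, c) is x * (0x10000 + c) >> 16, i.e. a fixed-point
        ; multiply by (1 + c/65536)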

        ; output 1 and 3
        movq    mm6,    [rdx +  8]  ; c1
        movq    mm7,    [rdx + 24]  ; c3

        movq    mm1,    mm4         ; c
        movq    mm3,    mm5         ; d

        pmulhw  mm1,    mm7         ; c * c3
        pmulhw  mm3,    mm6         ; d * c1

        paddw   mm3,    mm5         ; d * c1 rounded
        paddw   mm1,    mm3         ; output 10 11 12 13

        movq    mm3,    mm4         ; c
        pmulhw  mm5,    mm7         ; d * c3

        pmulhw  mm4,    mm6         ; c * c1
        paddw   mm3,    mm4         ; c * c1 rounded

        psubw   mm5,    mm3         ; output 30 31 32 33
        movq    mm3,    mm5


        ; done with vertical
        ; transpose for the second stage
        movq    mm4,    mm0         ; 00 01 02 03
        movq    mm5,    mm2         ; 20 21 22 23

        punpcklwd   mm0,    mm1     ; 00 10 01 11
        punpckhwd   mm4,    mm1     ; 02 12 03 13

        punpcklwd   mm2,    mm3     ; 20 30 21 31
        punpckhwd   mm5,    mm3     ; 22 32 23 33


        movq        mm1,    mm0     ; 00 10 01 11
        punpckldq   mm0,    mm2     ; 00 10 20 30

        punpckhdq   mm1,    mm2     ; 01 11 21 31

        movq        mm2,    mm4     ; 02 12 03 13
        punpckldq   mm2,    mm5     ; 02 12 22 32

        punpckhdq   mm4,    mm5     ; 03 13 23 33
        movq        mm3,    mm4


        ; first stage
        movq    mm5,    mm0
        movq    mm4,    mm1

        paddw   mm0,    mm3         ; a = 0 + 3
        paddw   mm1,    mm2         ; b = 1 + 2

        psubw   mm4,    mm2         ; c = 1 - 2
        psubw   mm5,    mm3         ; d = 0 - 3


        ; output 0 and 2
        movq    mm6,    [rdx +  16] ; c2
        movq    mm2,    mm0         ; a
        paddw   mm0,    mm1         ; a + b

        psubw   mm2,    mm1         ; a - b

        movq    mm1,    mm0         ; a + b
        pmulhw  mm0,    mm6         ; 00 01 02 03

        paddw   mm0,    mm1         ; output 00 01 02 03
        pmulhw  mm6,    mm2         ; 20 21 22 23

        paddw   mm2,    mm6         ; output 20 21 22 23


        ; output 1 and 3
        movq    mm6,    [rdx +  8]  ; c1
        movq    mm7,    [rdx + 24]  ; c3

        movq    mm1,    mm4         ; c
        movq    mm3,    mm5         ; d

        pmulhw  mm1,    mm7         ; c * c3
        pmulhw  mm3,    mm6         ; d * c1

        paddw   mm3,    mm5         ; d * c1 rounded
        paddw   mm1,    mm3         ; output 10 11 12 13

        movq    mm3,    mm4         ; c
        pmulhw  mm5,    mm7         ; d * c3

        pmulhw  mm4,    mm6         ; c * c1
        paddw   mm3,    mm4         ; c * c1 rounded

        psubw   mm5,    mm3         ; output 30 31 32 33
        movq    mm3,    mm5
        ; done with vertical

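        ; build a rounding bias of 4 in every word lane: pcmpeqw sets all
        ; bits, >> 15 leaves 1, << 2 gives 4; adding it before the
        ; arithmetic >> 3 rounds the final coefficients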
        pcmpeqw mm4,    mm4
        pcmpeqw mm5,    mm5
        psrlw   mm4,    15
        psrlw   mm5,    15

        psllw   mm4,    2
        psllw   mm5,    2

        paddw   mm0,    mm4
        paddw   mm1,    mm5
        paddw   mm2,    mm4
        paddw   mm3,    mm5

        psraw   mm0, 3
        psraw   mm1, 3
        psraw   mm2, 3
        psraw   mm3, 3

        movq        [rdi   ],   mm0
        movq        [rdi+ 8],   mm1
        movq        [rdi+16],   mm2
        movq        [rdi+24],   mm3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
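; Forward 8x4 DCT, SSE2 version: two 4x4 blocks side by side. Each 16-byte
; load brings in one row of both blocks; the low halves of the xmm registers
; carry the left block and the high halves the right block. The left block's
; 16 coefficients are written first, then the right block's.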
sym(vp8_short_fdct8x4_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog
        mov         rsi,    arg(0) ;input
        mov         rdi,    arg(1) ;output

        lea         rdx,    [GLOBAL(dct_const_xmm)]
        movsxd      rax,    dword ptr arg(2) ;pitch

        lea         rcx,    [rsi + rax*2]
        ; read the input data
        movdqa      xmm0,       [rsi]
        movdqa      xmm2,       [rsi + rax]

        movdqa      xmm4,       [rcx]
        movdqa      xmm3,       [rcx + rax]
        ; shift left by 3 bits for precision; the final stage shifts
        ; right by 3 to compensate
        psllw       xmm0,        3
        psllw       xmm2,        3

        psllw       xmm4,        3
        psllw       xmm3,        3

        ; transpose the input blocks
        movdqa      xmm1,       xmm0         ; 00 01 02 03 04 05 06 07
        movdqa      xmm5,       xmm4         ; 20 21 22 23 24 25 26 27

        punpcklwd   xmm0,       xmm2         ; 00 10 01 11 02 12 03 13
        punpckhwd   xmm1,       xmm2         ; 04 14 05 15 06 16 07 17

        punpcklwd   xmm4,       xmm3         ; 20 30 21 31 22 32 23 33
        punpckhwd   xmm5,       xmm3         ; 24 34 25 35 26 36 27 37

        movdqa      xmm2,       xmm0         ; 00 10 01 11 02 12 03 13
        punpckldq   xmm0,       xmm4         ; 00 10 20 30 01 11 21 31

        punpckhdq   xmm2,       xmm4         ; 02 12 22 32 03 13 23 33


        movdqa      xmm4,       xmm1         ; 04 14 05 15 06 16 07 17
        punpckldq   xmm4,       xmm5         ; 04 14 24 34 05 15 25 35

        punpckhdq   xmm1,       xmm5         ; 06 16 26 36 07 17 27 37
        movdqa      xmm3,       xmm2         ; 02 12 22 32 03 13 23 33

        punpckhqdq  xmm3,       xmm1         ; 03 13 23 33 07 17 27 37
        punpcklqdq  xmm2,       xmm1         ; 02 12 22 32 06 16 26 36

        movdqa      xmm1,       xmm0         ; 00 10 20 30 01 11 21 31
        punpcklqdq  xmm0,       xmm4         ; 00 10 20 30 04 14 24 34

        punpckhqdq  xmm1,       xmm4         ; 01 11 21 31 05 15 25 35

        ; xmm0 0
        ; xmm1 1
        ; xmm2 2
        ; xmm3 3
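        ; each register now holds one column pair: the low qword is column n
        ; of the left 4x4 block, the high qword column n of the right block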

        ; first stage
        movdqa      xmm5,       xmm0
        movdqa      xmm4,       xmm1

        paddw       xmm0,       xmm3         ; a = 0 + 3
        paddw       xmm1,       xmm2         ; b = 1 + 2

        psubw       xmm4,       xmm2         ; c = 1 - 2
        psubw       xmm5,       xmm3         ; d = 0 - 3


        ; output 0 and 2
        movdqa      xmm6,       [rdx +  32] ; c2
        movdqa      xmm2,       xmm0         ; a

        paddw       xmm0,       xmm1         ; a + b
        psubw       xmm2,       xmm1         ; a - b

        movdqa      xmm1,       xmm0         ; a + b
        pmulhw      xmm0,       xmm6         ; 00 01 02 03

        paddw       xmm0,       xmm1         ; output 00 01 02 03
        pmulhw      xmm6,       xmm2         ; 20 21 22 23

        paddw       xmm2,       xmm6         ; output 20 21 22 23

        ; output 1 and 3
        movdqa      xmm6,       [rdx + 16]  ; c1
        movdqa      xmm7,       [rdx + 48]  ; c3

        movdqa      xmm1,       xmm4         ; c
        movdqa      xmm3,       xmm5         ; d

        pmulhw      xmm1,       xmm7         ; c * c3
        pmulhw      xmm3,       xmm6         ; d * c1

        paddw       xmm3,       xmm5         ; d * c1 rounded
        paddw       xmm1,       xmm3         ; output 10 11 12 13

        movdqa      xmm3,       xmm4         ; c
        pmulhw      xmm5,       xmm7         ; d * c3

        pmulhw      xmm4,       xmm6         ; c * c1
        paddw       xmm3,       xmm4         ; c * c1 rounded

        psubw       xmm5,       xmm3         ; output 30 31 32 33
        movdqa      xmm3,       xmm5


        ; done with vertical
        ; transpose for the second stage
        movdqa      xmm4,       xmm2         ; 02 12 22 32 06 16 26 36
        movdqa      xmm2,       xmm1         ; 01 11 21 31 05 15 25 35

        movdqa      xmm1,       xmm0         ; 00 10 20 30 04 14 24 34
        movdqa      xmm5,       xmm4         ; 02 12 22 32 06 16 26 36

        punpcklwd   xmm0,       xmm2         ; 00 01 10 11 20 21 30 31
        punpckhwd   xmm1,       xmm2         ; 04 05 14 15 24 25 34 35

        punpcklwd   xmm4,       xmm3         ; 02 03 12 13 22 23 32 33
        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37

        movdqa      xmm2,       xmm0         ; 00 01 10 11 20 21 30 31
        punpckldq   xmm0,       xmm4         ; 00 01 02 03 10 11 12 13

        punpckhdq   xmm2,       xmm4         ; 20 21 22 23 30 31 32 33


        movdqa      xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
        punpckldq   xmm4,       xmm5         ; 04 05 06 07 14 15 16 17

        punpckhdq   xmm1,       xmm5         ; 24 25 26 27 34 35 36 37
        movdqa      xmm3,       xmm2         ; 20 21 22 23 30 31 32 33

        punpckhqdq  xmm3,       xmm1         ; 30 31 32 33 34 35 36 37
        punpcklqdq  xmm2,       xmm1         ; 20 21 22 23 24 25 26 27

        movdqa      xmm1,       xmm0         ; 00 01 02 03 10 11 12 13
        punpcklqdq  xmm0,       xmm4         ; 00 01 02 03 04 05 06 07

        punpckhqdq  xmm1,       xmm4         ; 10 11 12 13 14 15 16 17

        ; first stage
        movdqa      xmm5,       xmm0
        movdqa      xmm4,       xmm1

        paddw       xmm0,       xmm3         ; a = 0 + 3
        paddw       xmm1,       xmm2         ; b = 1 + 2

        psubw       xmm4,       xmm2         ; c = 1 - 2
        psubw       xmm5,       xmm3         ; d = 0 - 3


        ; output 0 and 2
        movdqa      xmm6,       [rdx +  32] ; c2
        movdqa      xmm2,       xmm0         ; a

        paddw       xmm0,       xmm1         ; a + b
        psubw       xmm2,       xmm1         ; a - b

        movdqa      xmm1,       xmm0         ; a + b
        pmulhw      xmm0,       xmm6         ; 00 01 02 03

        paddw       xmm0,       xmm1         ; output 00 01 02 03
        pmulhw      xmm6,       xmm2         ; 20 21 22 23

        paddw       xmm2,       xmm6         ; output 20 21 22 23

        ; output 1 and 3
        movdqa      xmm6,       [rdx + 16]  ; c1
        movdqa      xmm7,       [rdx + 48]  ; c3

        movdqa      xmm1,       xmm4         ; c
        movdqa      xmm3,       xmm5         ; d

        pmulhw      xmm1,       xmm7         ; c * c3
        pmulhw      xmm3,       xmm6         ; d * c1

        paddw       xmm3,       xmm5         ; d * c1 rounded
        paddw       xmm1,       xmm3         ; output 10 11 12 13

        movdqa      xmm3,       xmm4         ; c
        pmulhw      xmm5,       xmm7         ; d * c3

        pmulhw      xmm4,       xmm6         ; c * c1
        paddw       xmm3,       xmm4         ; c * c1 rounded

        psubw       xmm5,       xmm3         ; output 30 31 32 33
        movdqa      xmm3,       xmm5
        ; done with vertical


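        ; same rounding trick as the MMX path: build 4 in every word lane,
        ; add it, then shift right arithmetically by 3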
        pcmpeqw     xmm4,       xmm4
        pcmpeqw     xmm5,       xmm5
        psrlw       xmm4,       15
        psrlw       xmm5,       15

        psllw       xmm4,       2
        psllw       xmm5,       2

        paddw       xmm0,       xmm4
        paddw       xmm1,       xmm5
        paddw       xmm2,       xmm4
        paddw       xmm3,       xmm5

        psraw       xmm0,       3
        psraw       xmm1,       3
        psraw       xmm2,       3
        psraw       xmm3,       3

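        ; low qwords hold the left block's coefficient rows, high qwords the
        ; right block's: store the left block, then shift the high qwords
        ; down and store the right block at output + 32 bytes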
        movq        QWORD PTR[rdi   ],   xmm0
        movq        QWORD PTR[rdi+ 8],   xmm1
        movq        QWORD PTR[rdi+16],   xmm2
        movq        QWORD PTR[rdi+24],   xmm3

        psrldq      xmm0,       8
        psrldq      xmm1,       8
        psrldq      xmm2,       8
        psrldq      xmm3,       8

        movq        QWORD PTR[rdi+32],   xmm0
        movq        QWORD PTR[rdi+40],   xmm1
        movq        QWORD PTR[rdi+48],   xmm2
        movq        QWORD PTR[rdi+56],   xmm3
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
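; Note: dct1st_stage_rounding_mmx, dct2nd_stage_rounding_mmx and dct_matrix
; below do not appear to be referenced by the routines above; only
; dct_const_mmx and dct_const_xmm are loaded via GLOBAL().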
;static const unsigned int dct1st_stage_rounding_mmx[2] =
align 16
dct1st_stage_rounding_mmx:
    times 2 dd 8192


;static const unsigned int dct2nd_stage_rounding_mmx[2] =
align 16
dct2nd_stage_rounding_mmx:
    times 2 dd 32768


;static const short dct_matrix[4][4]=
align 16
dct_matrix:
    times 4 dw 23170

    dw  30274
    dw  12540
    dw -12540
    dw -30274

    dw 23170
    times 2 dw -23170
    dw 23170

    dw  12540
    dw -30274
    dw  30274
    dw -12540


;static const unsigned short dct_const_mmx[4 * 4]=
align 16
dct_const_mmx:
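    ; 8-byte rows: zeros (padding, never loaded), then c1, c2, c3, matching
    ; the [rdx + 8], [rdx + 16] and [rdx + 24] loads in vp8_short_fdct4x4_mmx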
    times 4 dw 0
    times 4 dw 60547
    times 4 dw 46341
    times 4 dw 25080


;static const unsigned short dct_const_xmm[8 * 4]=
align 16
dct_const_xmm:
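    ; 16-byte rows: zeros (padding, never loaded), then c1, c2, c3, matching
    ; the [rdx + 16], [rdx + 32] and [rdx + 48] loads in vp8_short_fdct8x4_wmt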
    times 8 dw 0
    times 8 dw 60547
    times 8 dw 46341
    times 8 dw 25080