Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 ;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
        ;
        ; Two-pass 4x4 integer transform of 16-bit samples using MMX
        ; (a forward DCT, going by the function name).
        ;
        ;   input  - four rows of four words are read; consecutive rows are
        ;            `pitch` BYTES apart (pitch is added unscaled to the pointer)
        ;   output - 16 words (32 bytes) are written contiguously, one row of
        ;            four words every 8 bytes
        ;   pitch  - 32-bit signed value, sign-extended before use
        ;
        ; Pass 1 (within each row, elements x0..x3):
        ;   a1 = (x0 + x3) << 3     b1 = (x1 + x2) << 3
        ;   c1 = (x1 - x2) << 3     d1 = (x0 - x3) << 3
        ;   y0 = a1 + b1            y2 = a1 - b1
        ;   y1 = (c1*2217 + d1*5352 + 14500) >> 12
        ;   y3 = (d1*2217 - c1*5352 +  7500) >> 12
        ; Pass 2 (across rows r0..r3 of the pass-1 result):
        ;   a1 = r0 + r3   b1 = r1 + r2   c1 = r1 - r2   d1 = r0 - r3
        ;   out row 0 = (a1 + b1 + 7) >> 4
        ;   out row 2 = (a1 - b1 + 7) >> 4
        ;   out row 1 = ((c1*2217 + d1*5352 + 12000) >> 16) + (d1 != 0)
        ;   out row 3 =  (d1*2217 - c1*5352 + 51000) >> 16
        ;
        ; Registers written: rax, rcx, rsi, rdi, mm0-mm6. rsi and rdi are
        ; pushed/popped here. No emms is executed in this routine, so the MMX
        ; state is still live on return -- presumably the caller clears it;
        ; TODO confirm.
        ; sym(), arg(), SHADOW_ARGS_TO_STACK, GET_GOT/RESTORE_GOT,
        ; UNSHADOW_ARGS and GLOBAL() are macros from x86_abi_support.asm
        ; (not visible in this file).
     15 global sym(vp8_short_fdct4x4_mmx)
     16 sym(vp8_short_fdct4x4_mmx):
     17     push        rbp
     18     mov         rbp,        rsp
     19     SHADOW_ARGS_TO_STACK 3
     20     GET_GOT     rbx
     21     push        rsi
     22     push        rdi
     23     ; end prolog
     24 
     25         mov         rsi,        arg(0)      ; input
     26         mov         rdi,        arg(1)      ; output
     27 
     28         movsxd      rax,        dword ptr arg(2) ; pitch (byte stride), sign-extended
     29 
     30         lea         rcx,        [rsi + rax*2]    ; rcx = input + 2*pitch (start of row 2)
     31         ; read the input data: one 4-word row per register
     32         movq        mm0,        [rsi]            ; row 0
     33         movq        mm1,        [rsi + rax]      ; row 1
     34 
     35         movq        mm2,        [rcx]            ; row 2
     36         movq        mm4,        [rcx + rax]      ; row 3
     37 
     38         ; transpose for the first stage
        ; (the two-digit tags below are <row><column> of the 4x4 block,
        ;  listed from the lowest word of the register to the highest)
     39         movq        mm3,        mm0         ; 00 01 02 03
     40         movq        mm5,        mm2         ; 20 21 22 23
     41 
     42         punpcklwd   mm0,        mm1         ; 00 10 01 11
     43         punpckhwd   mm3,        mm1         ; 02 12 03 13
     44 
     45         punpcklwd   mm2,        mm4         ; 20 30 21 31
     46         punpckhwd   mm5,        mm4         ; 22 32 23 33
     47 
     48         movq        mm1,        mm0         ; 00 10 01 11
     49         punpckldq   mm0,        mm2         ; 00 10 20 30
     50 
     51         punpckhdq   mm1,        mm2         ; 01 11 21 31
     52 
     53         movq        mm2,        mm3         ; 02 12 03 13
     54         punpckldq   mm2,        mm5         ; 02 12 22 32
     55 
     56         punpckhdq   mm3,        mm5         ; 03 13 23 33
     57 
     58         ; mm0 = element 0 of rows 0..3
     59         ; mm1 = element 1 of rows 0..3
     60         ; mm2 = element 2 of rows 0..3
     61         ; mm3 = element 3 of rows 0..3
     62 
     63         ; first stage: butterflies within each row, four rows in parallel
     64         movq        mm5,        mm0
     65         movq        mm4,        mm1
     66 
     67         paddw       mm0,        mm3         ; a1 = 0 + 3
     68         paddw       mm1,        mm2         ; b1 = 1 + 2
     69 
     70         psubw       mm4,        mm2         ; c1 = 1 - 2
     71         psubw       mm5,        mm3         ; d1 = 0 - 3
     72 
     73         psllw       mm5,        3           ; d1 <<= 3
     74         psllw       mm4,        3           ; c1 <<= 3
     75 
     76         psllw       mm0,        3           ; a1 <<= 3
     77         psllw       mm1,        3           ; b1 <<= 3
     78 
     79         ; output 0 and 2
     80         movq        mm2,        mm0         ; a1
     81 
     82         paddw       mm0,        mm1         ; op[0] = a1 + b1
     83         psubw       mm2,        mm1         ; op[2] = a1 - b1
     84 
     85         ; output 1 and 3
     86         ; interleave d1 (even words) with c1 (odd words) so that pmaddwd
        ; can form both products of a pair and sum them into one dword
     87         movq        mm1,        mm5         ; d1
     88         punpcklwd   mm1,        mm4         ; d1 c1 d1 c1 (rows 0,1)
     89         punpckhwd   mm5,        mm4         ; d1 c1 d1 c1 (rows 2,3)
     90 
     91         movq        mm3,        mm1         ; second copy of rows 0,1 pairs
     92         movq        mm4,        mm5         ; second copy of rows 2,3 pairs
     93 
     94         pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
     95         pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
     96 
     97         pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
     98         pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
     99 
    100         paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]         ; + 14500
    101         paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]         ; + 14500
    102         paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]          ; + 7500
    103         paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]          ; + 7500
    104 
    105         psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
    106         psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
    107         psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
    108         psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
    109 
    110         packssdw    mm1,        mm4         ; op[1] for rows 0..3 (dwords -> words, signed saturation)
    111         packssdw    mm3,        mm5         ; op[3] for rows 0..3 (dwords -> words, signed saturation)
    112 
    113         ; first pass done (it combined elements within each row)
    114         ; transpose for the second stage so each register holds one row again
    115         movq        mm4,        mm0         ; 00 10 20 30
    116         movq        mm5,        mm2         ; 02 12 22 32
    117 
    118         punpcklwd   mm0,        mm1         ; 00 01 10 11
    119         punpckhwd   mm4,        mm1         ; 20 21 30 31
    120 
    121         punpcklwd   mm2,        mm3         ; 02 03 12 13
    122         punpckhwd   mm5,        mm3         ; 22 23 32 33
    123 
    124         movq        mm1,        mm0         ; 00 01 10 11
    125         punpckldq   mm0,        mm2         ; 00 01 02 03
    126 
    127         punpckhdq   mm1,        mm2         ; 10 11 12 13
    128 
    129         movq        mm2,        mm4         ; 20 21 30 31
    130         punpckldq   mm2,        mm5         ; 20 21 22 23
    131 
    132         punpckhdq   mm4,        mm5         ; 30 31 32 33
    133 
    134         ; mm0 = row 0
    135         ; mm1 = row 1
    136         ; mm2 = row 2
    137         ; mm4 = row 3 (note: mm4, not mm3)
    138 
        ; second stage: butterflies across rows, four columns in parallel
        ; (no <<3 scaling in this pass)
    139         movq        mm5,        mm0
    140         movq        mm3,        mm1
    141 
    142         paddw       mm0,        mm4         ; a1 = row0 + row3
    143         paddw       mm1,        mm2         ; b1 = row1 + row2
    144 
    145         psubw       mm3,        mm2         ; c1 = row1 - row2
    146         psubw       mm5,        mm4         ; d1 = row0 - row3
    147 
    148         pxor        mm6,        mm6         ; zero out for compare
    149 
    150         pcmpeqw     mm6,        mm5         ; mm6 = 0xFFFF in each word where d1 == 0
    151 
    152         pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]   ; mm6 = (~mm6) & 1, i.e.
    153                                                                 ; 1 where d1 != 0, else 0
    154 
    155         ; output 0 and 2
    156         movq        mm2,        mm0         ; a1
    157 
    158         paddw       mm0,        mm1         ; a1 + b1
    159         psubw       mm2,        mm1         ; a1 - b1
    160 
    161         paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]     ; + 7 before the shift
    162         paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]     ; + 7 before the shift
    163 
    164         psraw       mm0,        4           ; op[0] = (a1 + b1 + 7)>>4
    165         psraw       mm2,        4           ; op[8] = (a1 - b1 + 7)>>4
    166 
    167         movq        MMWORD PTR[rdi + 0 ],  mm0      ; output words 0..3
    168         movq        MMWORD PTR[rdi + 16],  mm2      ; output words 8..11
    169 
    170         ; output 1 and 3
    171         ; interleave d1 (even words) with c1 (odd words) for pmaddwd
    172         movq        mm1,        mm5         ; d1
    173         punpcklwd   mm1,        mm3         ; d1 c1 d1 c1 (columns 0,1)
    174         punpckhwd   mm5,        mm3         ; d1 c1 d1 c1 (columns 2,3)
    175 
    176         movq        mm3,        mm1         ; second copy of columns 0,1 pairs
    177         movq        mm4,        mm5         ; second copy of columns 2,3 pairs
    178 
    179         pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
    180         pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
    181 
    182         pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
    183         pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
    184 
    185         paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]         ; + 12000
    186         paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]         ; + 12000
    187         paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]         ; + 51000
    188         paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]         ; + 51000
    189 
    190         psrad       mm1,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
    191         psrad       mm4,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
    192         psrad       mm3,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
    193         psrad       mm5,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
    194 
    195         packssdw    mm1,        mm4         ; op[4]  (before the d1 != 0 adjustment)
    196         packssdw    mm3,        mm5         ; op[12]
    197 
    198         paddw       mm1,        mm6         ; op[4] += (d1!=0)
    199 
    200         movq        MMWORD PTR[rdi + 8 ],  mm1      ; output words 4..7
    201         movq        MMWORD PTR[rdi + 24],  mm3      ; output words 12..15
    202 
    203      ; begin epilog (reverse order of the prolog)
    204     pop         rdi
    205     pop         rsi
    206     RESTORE_GOT
    207     UNSHADOW_ARGS
    208     pop         rbp
    209     ret
    210 
    211 SECTION_RODATA
        ; Constant tables for vp8_short_fdct4x4_mmx. Each table is 8 bytes
        ; (one MMX register) and is used directly as a memory operand.
    212 align 8
        ; Word pairs (5352, 2217): pmaddwd against interleaved (d1, c1) words
        ; gives d1*5352 + c1*2217 in each dword.
    213 _5352_2217:
    214     dw 5352
    215     dw 2217
    216     dw 5352
    217     dw 2217
    218 align 8
        ; Word pairs (2217, -5352): pmaddwd against interleaved (d1, c1) words
        ; gives d1*2217 - c1*5352 in each dword.
    219 _2217_neg5352:
    220     dw 2217
    221     dw -5352
    222     dw 2217
    223     dw -5352
    224 align 8
        ; 1 in every word: pandn'ed with the (d1 == 0) compare mask to
        ; produce 1 where d1 != 0 and 0 elsewhere.
    225 _cmp_mask:
    226     times 4 dw 1
    227 align 8
        ; 7 in every word: added before the >>4 in the second pass.
    228 _7w:
    229     times 4 dw 7
    230 align 8
        ; Dword addend applied before the >>12 of the first-pass
        ; c1*2217 + d1*5352 term.
    231 _14500:
    232     times 2 dd 14500
    233 align 8
        ; Dword addend applied before the >>12 of the first-pass
        ; d1*2217 - c1*5352 term.
    234 _7500:
    235     times 2 dd 7500
    236 align 8
        ; Dword addend applied before the >>16 of the second-pass
        ; c1*2217 + d1*5352 term.
    237 _12000:
    238     times 2 dd 12000
    239 align 8
        ; Dword addend applied before the >>16 of the second-pass
        ; d1*2217 - c1*5352 term.
    240 _51000:
    241     times 2 dd 51000
    242