Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 %include "third_party/x86inc/x86inc.asm"
     12 
     13 SECTION_RODATA
     14 
     15 pw_11585x2: times 8 dw 2 * 11585   ; pmulhrsw factor: (x * 23170 + 2^14) >> 15 == round(x * 11585 / 2^14)
     16 pd_8192:    times 4 dd 1 << 13     ; rounding bias added to each dword before "psrad 14"
     17 
     18 %macro TRANSFORM_COEFFS 2
        ; TRANSFORM_COEFFS a, b
        ;   Emits two 16-byte word tables, each a word pair repeated four times:
        ;     pw_<a>_<b>  = { a,  b } x 4
        ;     pw_<b>_m<a> = { b, -a } x 4
        ;   Applied with pmaddwd to interleaved word pairs (x, y) they produce
        ;   a*x + b*y and b*x - a*y as 32-bit lanes.
     19 pw_%1_%2:   times 4 dw  %1,  %2
     20 pw_%2_m%1:  times 4 dw  %2, -%1
     21 %endmacro
     22 
     23 TRANSFORM_COEFFS 11585,  11585   ; cos(pi/4) twice, scaled by 2^14 -> pw_11585_11585, pw_11585_m11585
     24 TRANSFORM_COEFFS 15137,   6270   ; cos(pi/8), sin(pi/8), scaled by 2^14 -> pw_15137_6270, pw_6270_m15137
     25 TRANSFORM_COEFFS 16069,   3196   ; cos(pi/16), sin(pi/16), scaled by 2^14 -> pw_16069_3196, pw_3196_m16069
     26 TRANSFORM_COEFFS  9102,  13623   ; sin(3*pi/16), cos(3*pi/16), scaled by 2^14 -> pw_9102_13623, pw_13623_m9102
     27 
     28 SECTION .text
     29 
     30 %if ARCH_X86_64
        ;-----------------------------------------------------------------------
        ; fdct8x8(input, output, stride) -- SSSE3, x86-64 only (needs m8-m12)
        ;
        ; Two-pass forward 8x8 DCT on signed 16-bit samples.
        ;   In:   inputq  = first of 8 rows of 8 words; consecutive rows are
        ;                   2 * stride bytes apart and are read with aligned loads
        ;         outputq = destination for 8 rows of 8 words (128 contiguous
        ;                   bytes), written with aligned stores
        ;         stride  = input row pitch in 16-bit elements
        ;   GPRs: r3 = 2 * stride, r4 = 4 * stride (byte offsets); inputq is
        ;         advanced while loading
        ;   XMM:  m0-m7, m9-m11 hold working rows; m8 = pd_8192 and
        ;         m12 = pw_11585x2 until the final rounding, where m8 becomes
        ;         scratch
        ;   Flow: samples << 2, 1-D DCT, transpose, 1-D DCT, transpose,
        ;         halve with rounding toward zero, store.
        ;
        ; Every "pmaddwd / paddd m8 / psrad 14 / packssdw" group multiplies by
        ; coefficients scaled by 2^14, rounds to nearest, and packs back to
        ; words with signed saturation.
        ;-----------------------------------------------------------------------
     31 INIT_XMM ssse3
     32 cglobal fdct8x8, 3, 5, 13, input, output, stride
     33 

          ; stride is passed as a 32-bit int by the C caller (libvpx prototype);
          ; the calling conventions leave the upper 32 bits of its register
          ; undefined, so sign-extend before it is used in 64-bit addressing.
          movsxdifnidn       strideq, strided

     34   mova               m8, [GLOBAL(pd_8192)]
     35   mova              m12, [GLOBAL(pw_11585x2)]
     36 
          ; load the eight input rows into m0..m7 (in0..in7)
     37   lea                r3, [2 * strideq]          ; r3 = bytes per row
     38   lea                r4, [4 * strideq]          ; r4 = bytes per two rows
     39   mova               m0, [inputq]
     40   mova               m1, [inputq + r3]
     41   lea                inputq, [inputq + r4]
     42   mova               m2, [inputq]
     43   mova               m3, [inputq + r3]
     44   lea                inputq, [inputq + r4]
     45   mova               m4, [inputq]
     46   mova               m5, [inputq + r3]
     47   lea                inputq, [inputq + r4]
     48   mova               m6, [inputq]
     49   mova               m7, [inputq + r3]
     50 
     51   ; left shift by 2 to increase forward transformation precision
     52   psllw              m0, 2
     53   psllw              m1, 2
     54   psllw              m2, 2
     55   psllw              m3, 2
     56   psllw              m4, 2
     57   psllw              m5, 2
     58   psllw              m6, 2
     59   psllw              m7, 2
     60 
     61   ; column transform
          ; (each register is one row, so register-to-register arithmetic
          ;  works down all eight columns at once)
     62   ; stage 1
     63   paddw m10, m0, m7                             ; a0 = in0 + in7
     64   psubw m0, m7                                  ; a7 = in0 - in7
     65 
     66   paddw m9, m1, m6                              ; a1 = in1 + in6
     67   psubw m1, m6                                  ; a6 = in1 - in6
     68 
     69   paddw m7, m2, m5                              ; a2 = in2 + in5
     70   psubw m2, m5                                  ; a5 = in2 - in5
     71 
     72   paddw m6, m3, m4                              ; a3 = in3 + in4
     73   psubw m3, m4                                  ; a4 = in3 - in4
     74 
     75   ; stage 2
     76   paddw m5, m9, m7                              ; m5  = a1 + a2
     77   psubw m9, m7                                  ; m9  = a1 - a2
     78 
     79   paddw m4, m10, m6                             ; m4  = a0 + a3
     80   psubw m10, m6                                 ; m10 = a0 - a3
     81 
     82   paddw m7, m1, m2                              ; m7  = a6 + a5
     83   psubw m1, m2                                  ; m1  = a6 - a5
     84 
     85   ; stage 3
     86   paddw m6, m4, m5                              ; m6 = (a0 + a3) + (a1 + a2)
     87   psubw m4, m5                                  ; m4 = (a0 + a3) - (a1 + a2)
     88 
          ; pmulhrsw by 2 * 11585 == multiply by cos(pi / 4) with rounding
     89   pmulhrsw m1, m12                              ; p = (a6 - a5) * cos(pi / 4)
     90   pmulhrsw m7, m12                              ; q = (a6 + a5) * cos(pi / 4)
     91 
     92   ; sin(pi / 8), cos(pi / 8)
          ; word pairs (a0 - a3, a1 - a2):
          ;   m5 = 15137 * (a0 - a3) +  6270 * (a1 - a2)   -> output row 2
          ;   m2 =  6270 * (a0 - a3) - 15137 * (a1 - a2)   -> output row 6
     93   punpcklwd m2, m10, m9
     94   punpckhwd m10, m9
     95   pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
     96   pmaddwd m2, [GLOBAL(pw_6270_m15137)]
     97   pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
     98   pmaddwd m10, [GLOBAL(pw_6270_m15137)]
     99   paddd m5, m8
    100   paddd m2, m8
    101   paddd m9, m8
    102   paddd m10, m8
    103   psrad m5, 14
    104   psrad m2, 14
    105   psrad m9, 14
    106   psrad m10, 14
    107   packssdw m5, m9
    108   packssdw m2, m10
    109 
    110   pmulhrsw m6, m12                              ; output row 0
    111   pmulhrsw m4, m12                              ; output row 4
    112 
    113   paddw m9, m3, m1                              ; m9  = a4 + p
    114   psubw m3, m1                                  ; m3  = a4 - p
    115 
    116   paddw m10, m0, m7                             ; m10 = a7 + q
    117   psubw m0, m7                                  ; m0  = a7 - q
    118 
    119   ; stage 4
    120   ; sin(pi / 16), cos(pi / 16)
          ; word pairs (a7 + q, a4 + p):
          ;   m7 = 16069 * (a7 + q) +  3196 * (a4 + p)     -> output row 1
          ;   m1 =  3196 * (a7 + q) - 16069 * (a4 + p)     -> output row 7
    121   punpcklwd m1, m10, m9
    122   punpckhwd m10, m9
    123   pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
    124   pmaddwd m1, [GLOBAL(pw_3196_m16069)]
    125   pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
    126   pmaddwd m10, [GLOBAL(pw_3196_m16069)]
    127   paddd m7, m8
    128   paddd m1, m8
    129   paddd m9, m8
    130   paddd m10, m8
    131   psrad m7, 14
    132   psrad m1, 14
    133   psrad m9, 14
    134   psrad m10, 14
    135   packssdw m7, m9
    136   packssdw m1, m10
    137 
    138   ; sin(3 * pi / 16), cos(3 * pi / 16)
          ; word pairs (a7 - q, a4 - p):
          ;   m9  =  9102 * (a7 - q) + 13623 * (a4 - p)    -> output row 5
          ;   m11 = 13623 * (a7 - q) -  9102 * (a4 - p)    -> output row 3
    139   punpcklwd m11, m0, m3
    140   punpckhwd m0, m3
    141   pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
    142   pmaddwd m11, [GLOBAL(pw_13623_m9102)]
    143   pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
    144   pmaddwd m0, [GLOBAL(pw_13623_m9102)]
    145   paddd m9, m8
    146   paddd m11, m8
    147   paddd m3, m8
    148   paddd m0, m8
    149   psrad m9, 14
    150   psrad m11, 14
    151   psrad m3, 14
    152   psrad m0, 14
    153   packssdw m9, m3
    154   packssdw m11, m0
    155 
    156   ; transpose
          ; rows 0..7 enter in m6, m7, m5, m11, m4, m9, m2, m1
    157   ; stage 1
    158   punpcklwd m0, m6, m7
    159   punpcklwd m3, m5, m11
    160   punpckhwd m6, m7
    161   punpckhwd m5, m11
    162   punpcklwd m7, m4, m9
    163   punpcklwd m10, m2, m1
    164   punpckhwd m4, m9
    165   punpckhwd m2, m1
    166 
    167   ; stage 2
    168   punpckldq m9, m0, m3
    169   punpckldq m1, m6, m5
    170   punpckhdq m0, m3
    171   punpckhdq m6, m5
    172   punpckldq m3, m7, m10
    173   punpckldq m5, m4, m2
    174   punpckhdq m7, m10
    175   punpckhdq m4, m2
    176 
    177   ; stage 3
    178   punpcklqdq m10, m9, m3
    179   punpckhqdq m9, m3
    180   punpcklqdq m2, m0, m7
    181   punpckhqdq m0, m7
    182   punpcklqdq m3, m1, m5
    183   punpckhqdq m1, m5
    184   punpcklqdq m7, m6, m4
    185   punpckhqdq m6, m4
          ; transposed rows t0..t7 now sit in m10, m9, m2, m0, m3, m1, m7, m6
    186 
    187   ; row transform
    188   ; stage 1
    189   paddw m5, m10, m6                             ; b0 = t0 + t7
    190   psubw m10, m6                                 ; b7 = t0 - t7
    191 
    192   paddw m4, m9, m7                              ; b1 = t1 + t6
    193   psubw m9, m7                                  ; b6 = t1 - t6
    194 
    195   paddw m6, m2, m1                              ; b2 = t2 + t5
    196   psubw m2, m1                                  ; b5 = t2 - t5
    197 
    198   paddw m7, m0, m3                              ; b3 = t3 + t4
    199   psubw m0, m3                                  ; b4 = t3 - t4
    200 
    201   ;stage 2
    202   paddw m1, m5, m7                              ; m1 = b0 + b3
    203   psubw m5, m7                                  ; m5 = b0 - b3
    204 
    205   paddw m3, m4, m6                              ; m3 = b1 + b2
    206   psubw m4, m6                                  ; m4 = b1 - b2
    207 
    208   paddw m7, m9, m2                              ; m7 = b6 + b5
    209   psubw m9, m2                                  ; m9 = b6 - b5
    210 
    211   ; stage 3
          ; word pairs (b0 + b3, b1 + b2); unlike the column pass, the sum and
          ; difference are formed in 32 bits by pmaddwd rather than in 16 bits:
          ;   m2 = 11585 * ((b0 + b3) + (b1 + b2))         -> output row 0
          ;   m6 = 11585 * ((b0 + b3) - (b1 + b2))         -> output row 4
    212   punpcklwd m6, m1, m3
    213   punpckhwd m1, m3
    214   pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
    215   pmaddwd m6, [GLOBAL(pw_11585_m11585)]
    216   pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
    217   pmaddwd m1, [GLOBAL(pw_11585_m11585)]
    218   paddd m2, m8
    219   paddd m6, m8
    220   paddd m3, m8
    221   paddd m1, m8
    222   psrad m2, 14
    223   psrad m6, 14
    224   psrad m3, 14
    225   psrad m1, 14
    226   packssdw m2, m3
    227   packssdw m6, m1
    228 
    229   pmulhrsw m7, m12                              ; q = (b6 + b5) * cos(pi / 4)
    230   pmulhrsw m9, m12                              ; p = (b6 - b5) * cos(pi / 4)
    231 
          ; word pairs (b0 - b3, b1 - b2):
          ;   m1 = 15137 * (b0 - b3) +  6270 * (b1 - b2)   -> output row 2
          ;   m3 =  6270 * (b0 - b3) - 15137 * (b1 - b2)   -> output row 6
    232   punpcklwd m3, m5, m4
    233   punpckhwd m5, m4
    234   pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
    235   pmaddwd m3, [GLOBAL(pw_6270_m15137)]
    236   pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
    237   pmaddwd m5, [GLOBAL(pw_6270_m15137)]
    238   paddd m1, m8
    239   paddd m3, m8
    240   paddd m4, m8
    241   paddd m5, m8
    242   psrad m1, 14
    243   psrad m3, 14
    244   psrad m4, 14
    245   psrad m5, 14
    246   packssdw m1, m4
    247   packssdw m3, m5
    248 
    249   paddw m4, m0, m9                              ; m4  = b4 + p
    250   psubw m0, m9                                  ; m0  = b4 - p
    251 
    252   paddw m5, m10, m7                             ; m5  = b7 + q
    253   psubw m10, m7                                 ; m10 = b7 - q
    254 
    255   ; stage 4
          ; word pairs (b7 + q, b4 + p):
          ;   m7 = 16069 * (b7 + q) +  3196 * (b4 + p)     -> output row 1
          ;   m9 =  3196 * (b7 + q) - 16069 * (b4 + p)     -> output row 7
    256   punpcklwd m9, m5, m4
    257   punpckhwd m5, m4
    258   pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
    259   pmaddwd m9, [GLOBAL(pw_3196_m16069)]
    260   pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
    261   pmaddwd m5, [GLOBAL(pw_3196_m16069)]
    262   paddd m7, m8
    263   paddd m9, m8
    264   paddd m4, m8
    265   paddd m5, m8
    266   psrad m7, 14
    267   psrad m9, 14
    268   psrad m4, 14
    269   psrad m5, 14
    270   packssdw m7, m4
    271   packssdw m9, m5
    272 
          ; word pairs (b7 - q, b4 - p):
          ;   m5 =  9102 * (b7 - q) + 13623 * (b4 - p)     -> output row 5
          ;   m4 = 13623 * (b7 - q) -  9102 * (b4 - p)     -> output row 3
    273   punpcklwd m4, m10, m0
    274   punpckhwd m10, m0
    275   pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
    276   pmaddwd m4, [GLOBAL(pw_13623_m9102)]
    277   pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
    278   pmaddwd m10, [GLOBAL(pw_13623_m9102)]
    279   paddd m5, m8
    280   paddd m4, m8
    281   paddd m0, m8
    282   paddd m10, m8
    283   psrad m5, 14
    284   psrad m4, 14
    285   psrad m0, 14
    286   psrad m10, 14
    287   packssdw m5, m0
    288   packssdw m4, m10
    289 
    290   ; transpose
          ; rows 0..7 enter in m2, m7, m1, m4, m6, m5, m3, m9
    291   ; stage 1
    292   punpcklwd m0, m2, m7
    293   punpcklwd m10, m1, m4
    294   punpckhwd m2, m7
    295   punpckhwd m1, m4
    296   punpcklwd m7, m6, m5
    297   punpcklwd m4, m3, m9
    298   punpckhwd m6, m5
    299   punpckhwd m3, m9
    300 
    301   ; stage 2
    302   punpckldq m5, m0, m10
    303   punpckldq m9, m2, m1
    304   punpckhdq m0, m10
    305   punpckhdq m2, m1
    306   punpckldq m10, m7, m4
    307   punpckldq m1, m6, m3
    308   punpckhdq m7, m4
    309   punpckhdq m6, m3
    310 
    311   ; stage 3
    312   punpcklqdq m4, m5, m10
    313   punpckhqdq m5, m10
    314   punpcklqdq m3, m0, m7
    315   punpckhqdq m0, m7
    316   punpcklqdq m10, m9, m1
    317   punpckhqdq m9, m1
    318   punpcklqdq m7, m2, m6
    319   punpckhqdq m2, m6
          ; final rows 0..7 now sit in m4, m5, m3, m0, m10, m9, m7, m2
    320 
          ; Halve every coefficient, rounding toward zero:
          ;   sign = x >> 15 (0 or -1); x - sign adds 1 to negative values;
          ;   the arithmetic >> 1 then equals x / 2 truncated toward zero.
          ; m8 (pd_8192) is reused as scratch: no rounding additions remain.
    321   psraw m1, m4, 15
    322   psraw m6, m5, 15
    323   psraw m8, m3, 15
    324   psraw m11, m0, 15
    325 
    326   psubw m4, m1
    327   psubw m5, m6
    328   psubw m3, m8
    329   psubw m0, m11
    330 
    331   psraw m4, 1
    332   psraw m5, 1
    333   psraw m3, 1
    334   psraw m0, 1
    335 
    336   psraw m1, m10, 15
    337   psraw m6, m9, 15
    338   psraw m8, m7, 15
    339   psraw m11, m2, 15
    340 
    341   psubw m10, m1
    342   psubw m9, m6
    343   psubw m7, m8
    344   psubw m2, m11
    345 
    346   psraw m10, 1
    347   psraw m9, 1
    348   psraw m7, 1
    349   psraw m2, 1
    350 
          ; store the eight output rows, 16 bytes apart
    351   mova              [outputq +   0], m4
    352   mova              [outputq +  16], m5
    353   mova              [outputq +  32], m3
    354   mova              [outputq +  48], m0
    355   mova              [outputq +  64], m10
    356   mova              [outputq +  80], m9
    357   mova              [outputq +  96], m7
    358   mova              [outputq + 112], m2
    359 
    360   RET
    361 %endif
    362