;# vp8 short forward DCT (4x4 and 8x4), PowerPC AltiVec implementation (libvpx, ppc)
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
    .globl vp8_short_fdct4x4_ppc
    .globl vp8_short_fdct8x4_ppc

;# load_c: load one 16-byte vector from a labelled constant table.
;#   V     destination vector register
;#   LABEL symbol of the table (address built with the @ha/@l pair)
;#   OFF   byte-offset register (or literal 0) added to the table base by lvx
;#   R0    scratch GPR, clobbered with LABEL@ha
;#   R1    GPR left holding the full address of LABEL (callers rely on this)
.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm
     20 
     21 ;# Forward and inverse DCTs are nearly identical; only differences are
     22 ;#   in normalization (fwd is twice unitary, inv is half unitary)
     23 ;#   and that they are of course transposes of each other.
     24 ;#
     25 ;#   The following three accomplish most of implementation and
     26 ;#   are used only by ppc_idct.c and ppc_fdct.c.
;# Shared entry sequence for both fdct routines.
;#   Saves VRSAVE (in r11, restored by epilogue) and marks v0..v13 as live,
;#   opens a 32-byte stack scratch area (used by two_rows_h as a 16-byte
;#   aligned store/reload buffer), and preloads the constant tables:
;#     v0..v3 = dct_tab (four rows of horizontal/vertical DCT coefficients)
;#     v4,v5  = ppc_dctperm_tab (permute controls for the horizontal pass)
;#     v6     = round_tab[0], the horizontal-pass rounding constant
;#     r6     = 16, reused throughout as the "second vector" byte offset
;#     r9     = &round_tab, kept so Vround can be loaded later
;#   Clobbers r9, r10, r12.
.macro prologue
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xfffc    ;# top 16 bits 0xfffc -> flag v0..v13 in use
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    li      r6, 16

    load_c v0, dct_tab, 0, r9, r10
    lvx     v1,   r6, r10       ;# dct_tab row 1 (r10 still -> dct_tab)
    addi    r10, r10, 32
    lvx     v2,    0, r10       ;# dct_tab row 2
    lvx     v3,   r6, r10       ;# dct_tab row 3

    load_c v4, ppc_dctperm_tab,  0, r9, r10
    load_c v5, ppc_dctperm_tab, r6, r9, r10

    load_c v6, round_tab, 0, r10, r9   ;# v6 = Hround; r9 kept -> round_tab
.endm
     47 
;# Shared exit sequence: undo the prologue's stack frame and VRSAVE change.
;# Expects r11 to still hold the VRSAVE value saved by prologue.
.macro epilogue
    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE
.endm
     53 
     54 ;# Do horiz xf on two rows of coeffs  v8 = a0 a1 a2 a3  b0 b1 b2 b3.
     55 ;#   a/A are the even rows 0,2   b/B are the odd rows 1,3
     56 ;#   For fwd transform, indices are horizontal positions, then frequencies.
     57 ;#   For inverse transform, frequencies then positions.
     58 ;#   The two resulting  A0..A3  B0..B3  are later combined
     59 ;#   and vertically transformed.
     60 
;# Horizontal transform of the two rows packed in v8 (see header comment
;# above).  Inputs: v8 = a0..a3 b0..b3; v0..v3 = coefficients; v4,v5 =
;# permutes; v6 = rounding constant; v7 = right-shift amount.
;# Result lands in \Dst.  Clobbers v9, v10, v11.
.macro two_rows_horiz Dst
    vperm   v9, v8, v8, v4      ;# v9 = a2 a3 a0 a1  b2 b3 b0 b1

    ;# multiply-sum the even frequencies: round + sum(coeff * sample)
    vmsumshm v10, v0, v8, v6
    vmsumshm v10, v1, v9, v10
    vsraw   v10, v10, v7        ;# v10 = A0 A1  B0 B1

    ;# same for the odd frequencies
    vmsumshm v11, v2, v8, v6
    vmsumshm v11, v3, v9, v11
    vsraw   v11, v11, v7        ;# v11 = A2 A3  B2 B3

    vpkuwum v10, v10, v11       ;# v10  = A0 A1  B0 B1  A2 A3  B2 B3
    vperm   \Dst, v10, v10, v5  ;# Dest = A0 B0  A1 B1  A2 B2  A3 B3
.endm
     75 
     76 ;# Vertical xf on two rows. DCT values in comments are for inverse transform;
     77 ;#   forward transform uses transpose.
     78 
;# Vertical transform producing two output rows in v8.
;# Inputs: v12, v13 = horizontally-transformed rows (from two_rows_h);
;# \Ceven, \Codd = coefficient vectors whose word 0/1 are splatted;
;# v6 = rounding constant (Vround); v7 = right-shift amount.
;# Clobbers v9, v10; result packed into v8.
.macro two_rows_vert Ceven, Codd
    vspltw  v8, \Ceven, 0       ;# v8 = c00 c10  or  c02 c12 four times
    vspltw  v9, \Codd,  0       ;# v9 = c20 c30  or  c22 c32 ""
    vmsumshm v8, v8, v12, v6    ;# round + coeffs * row data
    vmsumshm v8, v9, v13, v8
    vsraw   v10, v8, v7         ;# first result row (kept in v10)

    vspltw  v8, \Codd,  1       ;# v8 = c01 c11  or  c03 c13
    vspltw  v9, \Ceven, 1       ;# v9 = c21 c31  or  c23 c33
    vmsumshm v8, v8, v12, v6
    vmsumshm v8, v9, v13, v8
    vsraw   v8, v8, v7          ;# second result row

    vpkuwum v8, v10, v8         ;# v8 = rows 0,1  or 2,3
.endm
     94 
;# Gather two 8-byte input rows into the aligned stack buffer at r8,
;# reload them as one vector (v8), and horizontally transform into \Dest.
;# PRECONDITION: caller has already loaded r0 with the first word of the
;# current row at r3 (lwz/lwzux before invoking this macro).
;# Side effects: advances r3 by r5 (pitch) via lwzux; clobbers r0, v8-v11.
.macro two_rows_h Dest
    stw     r0,  0(r8)          ;# row n, words 0-1 -> buffer
    lwz     r0,  4(r3)
    stw     r0,  4(r8)
    lwzux   r0, r3,r5           ;# r3 += pitch; fetch row n+1, word 0
    stw     r0,  8(r8)
    lwz     r0,  4(r3)
    stw     r0, 12(r8)
    lvx     v8,  0,r8           ;# v8 = both rows as one 16-byte vector
    two_rows_horiz \Dest
.endm
    106 
    .align 2
;# void vp8_short_fdct4x4_ppc(short *input, short *output, int pitch)
;# r3 short *input   (4x4 block; rows are 'pitch' bytes apart)
;# r4 short *output  (16 coefficients, stored as two 16-byte vectors)
;# r5 int pitch
;# Forward 4x4 DCT: horizontal pass over rows 0-3 (two rows per macro
;# call), then vertical pass, storing rows 0,1 and 2,3.
vp8_short_fdct4x4_ppc:

    prologue

    vspltisw v7, 14             ;# horizontal shift; == 14, fits in 5 signed bits
    addi    r8, r1, 0           ;# r8 -> 16-byte-aligned stack scratch buffer


    lwz     r0, 0(r3)           ;# preload row 0, word 0 for two_rows_h
    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13

    lwzux   r0, r3, r5          ;# advance to row 2; preload its word 0
    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33

    lvx     v6, r6, r9          ;# v6 = Vround
    vspltisw v7, -16            ;# vertical shift; == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1
    stvx    v8, 0, r4           ;# store output rows 0,1
    two_rows_vert v2, v3
    stvx    v8, r6, r4          ;# store output rows 2,3

    epilogue

    blr
    136 
    .align 2
;# void vp8_short_fdct8x4_ppc(short *input, short *output, int pitch)
;# r3 short *input   (8x4 block; rows are 'pitch' bytes apart)
;# r4 short *output  (two 4x4 coefficient blocks, 32 bytes each)
;# r5 int pitch
;# Runs the 4x4 forward DCT twice: once on the left 4 columns, then on
;# the right 4 columns (input + 8 bytes, output + 32 bytes).
vp8_short_fdct8x4_ppc:
    prologue

    vspltisw v7, 14             ;# horizontal shift; == 14, fits in 5 signed bits
    addi    r8,  r1, 0          ;# r8 -> aligned stack scratch buffer
    addi    r10, r3, 0          ;# remember input base for the second block

    lwz     r0, 0(r3)           ;# preload row 0, word 0
    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13

    lwzux   r0, r3, r5          ;# advance to row 2; preload its word 0
    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33

    lvx     v6, r6, r9          ;# v6 = Vround
    vspltisw v7, -16            ;# vertical shift; == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1
    stvx    v8, 0, r4           ;# left block, rows 0,1
    two_rows_vert v2, v3
    stvx    v8, r6, r4          ;# left block, rows 2,3

    ;# Next block: right 4 columns of the same 4 rows
    addi    r3, r10, 8          ;# input base + 8 bytes (4 shorts)
    addi    r4, r4, 32          ;# second 4x4 output block
    lvx     v6, 0, r9           ;# v6 = Hround (restore horizontal rounding)

    vspltisw v7, 14             ;# back to the horizontal shift amount
    addi    r8, r1, 0

    lwz     r0, 0(r3)           ;# preload row 0, word 0 of the right block
    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13

    lwzux   r0, r3, r5
    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33

    lvx     v6, r6, r9          ;# v6 = Vround
    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1
    stvx    v8, 0, r4           ;# right block, rows 0,1
    two_rows_vert v2, v3
    stvx    v8, r6, r4          ;# right block, rows 2,3

    epilogue

    blr
    187 
    .data
    .align 4
;# Permute controls (byte indices) for the horizontal pass:
;#   vector 0: swap the 16-bit pairs within each 8-byte half
;#             (a0 a1 a2 a3 -> a2 a3 a0 a1, likewise for b)
;#   vector 1: interleave the even/odd results
;#             (A0 A1 B0 B1 A2 A3 B2 B3 -> A0 B0 A1 B1 A2 B2 A3 B3)
ppc_dctperm_tab:
    .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
    .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15

    .align 4
;# DCT cosine coefficients in Q15:
;#   23170 = round(cos(pi/4)   * 2^15)
;#   30274 = round(cos(pi/8)   * 2^15)
;#   12540 = round(cos(3*pi/8) * 2^15)
;# First two vectors feed the even-frequency sums, last two the odd.
dct_tab:
    .short  23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
    .short  23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540

    .short  23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
    .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274

    .align 4
;# Rounding constants: Hround = 1<<13 (paired with >>14 horizontal shift),
;# Vround = 1<<15 (paired with >>16 vertical shift).
round_tab:
    .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
    .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
    206