; (code-browser navigation header: Home | History | Annotate | Download | only in ppc)
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
     10 
     11 
    .globl short_idct4x4llm_ppc

;# load_c: load the 16-byte vector constant at LABEL+OFF into vector
;# register V.  R0 and R1 are clobbered as address scratch (R0 takes
;# the @ha part of LABEL, R1 the full effective address); OFF is the
;# index register fed to lvx (0 here for a plain load).
.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm
     19 
;# short_idct4x4llm_ppc(short *input, short *output, int pitch)
;#
;# 4x4 inverse DCT: a first (row) pass, a word transpose, a second
;# (column) pass, then a final (x + 4) >> 3 rounding before the four
;# result rows are stored 'pitch' bytes apart.
;#
;# r3 short *input
;# r4 short *output
;# r5 int pitch
    .align 2
short_idct4x4llm_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xfff8    ;# mark v0-v12 as live in VRSAVE
    mtspr   256, r12            ;# set VRSAVE

    ;# load the vector constants defined at the bottom of this file
    load_c v8, sinpi8sqrt2, 0, r9, r10
    load_c v9, cospi8sqrt2minus1, 0, r9, r10
    load_c v10, hi_hi, 0, r9, r10
    load_c v11, lo_lo, 0, r9, r10
    load_c v12, shift_16, 0, r9, r10

    li      r10,  16
    lvx     v0,   0, r3         ;# input ip[0], ip[ 4]
    lvx     v1, r10, r3         ;# input ip[8], ip[12]

    ;# Throughout, "x * C >> 16" is built from vmulosh (signed 16x16
    ;# multiply of the odd halfwords, which hold the low 16 bits of the
    ;# sign-extended coefficients) followed by vsraw by shift_16.
    ;# sinpi8sqrt2 (35468) does not fit in a signed short, so the
    ;# multiply actually uses 35468 - 65536; the vaddsws of x afterwards
    ;# restores x*35468 >> 16.  cospi8sqrt2minus1 stores (C - 1) in
    ;# Q16, so its vaddsws of x likewise yields x * cospi8sqrt2 >> 16.

    ;# first pass
    vupkhsh v2, v0              ;# ip[0..3] -> signed words
    vupkhsh v3, v1              ;# ip[8..11] -> signed words
    vaddsws v6, v2, v3          ;# a1 = ip[0]+ip[8]
    vsubsws v7, v2, v3          ;# b1 = ip[0]-ip[8]

    vupklsh v0, v0              ;# ip[4..7] -> signed words
    vmulosh v4, v0, v8
    vsraw   v4, v4, v12
    vaddsws v4, v4, v0          ;# ip[ 4] * sin(pi/8) * sqrt(2)

    vupklsh v1, v1              ;# ip[12..15] -> signed words
    vmulosh v5, v1, v9
    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
    vaddsws v5, v5, v1

    vsubsws v4, v4, v5          ;# c1

    vmulosh v3, v1, v8
    vsraw   v3, v3, v12
    vaddsws v3, v3, v1          ;# ip[12] * sin(pi/8) * sqrt(2)

    vmulosh v5, v0, v9
    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
    vaddsws v5, v5, v0

    vaddsws v3, v3, v5          ;# d1

    vaddsws v0, v6, v3          ;# a1 + d1
    vsubsws v3, v6, v3          ;# a1 - d1

    vaddsws v1, v7, v4          ;# b1 + c1
    vsubsws v2, v7, v4          ;# b1 - c1

    ;# transpose input (4x4 words) via merge + byte permute
    vmrghw  v4, v0, v1          ;# a0 b0 a1 b1
    vmrghw  v5, v2, v3          ;# c0 d0 c1 d1

    vmrglw  v6, v0, v1          ;# a2 b2 a3 b3
    vmrglw  v7, v2, v3          ;# c2 d2 c3 d3

    vperm   v0, v4, v5, v10     ;# a0 b0 c0 d0
    vperm   v1, v4, v5, v11     ;# a1 b1 c1 d1

    vperm   v2, v6, v7, v10     ;# a2 b2 c2 d2
    vperm   v3, v6, v7, v11     ;# a3 b3 c3 d3

    ;# second pass: same butterfly applied to the transposed data
    vaddsws v6, v0, v2          ;# a1 = ip[0]+ip[8]
    vsubsws v7, v0, v2          ;# b1 = ip[0]-ip[8]

    vmulosh v4, v1, v8
    vsraw   v4, v4, v12
    vaddsws v4, v4, v1          ;# ip[ 4] * sin(pi/8) * sqrt(2)

    vmulosh v5, v3, v9
    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
    vaddsws v5, v5, v3

    vsubsws v4, v4, v5          ;# c1

    vmulosh v2, v3, v8
    vsraw   v2, v2, v12
    vaddsws v2, v2, v3          ;# ip[12] * sin(pi/8) * sqrt(2)

    vmulosh v5, v1, v9
    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
    vaddsws v5, v5, v1

    vaddsws v3, v2, v5          ;# d1

    vaddsws v0, v6, v3          ;# a1 + d1
    vsubsws v3, v6, v3          ;# a1 - d1

    vaddsws v1, v7, v4          ;# b1 + c1
    vsubsws v2, v7, v4          ;# b1 - c1

    vspltish v6, 4              ;# rounding bias
    vspltish v7, 3              ;# final scale: out = (x + 4) >> 3

    vpkswss v0, v0, v1          ;# saturating pack words back to shorts
    vpkswss v1, v2, v3

    vaddshs v0, v0, v6
    vaddshs v1, v1, v6

    vsrah   v0, v0, v7
    vsrah   v1, v1, v7

    ;# transpose output (halfwords) back to row order
    vmrghh  v2, v0, v1          ;# a0 c0 a1 c1 a2 c2 a3 c3
    vmrglh  v3, v0, v1          ;# b0 d0 b1 d1 b2 d2 b3 d3

    vmrghh  v0, v2, v3          ;# a0 b0 c0 d0 a1 b1 c1 d1
    vmrglh  v1, v2, v3          ;# a2 b2 c2 d2 a3 b3 c3 d3

    ;# Spill each result vector to the stack, then copy it out one row
    ;# (two words = four shorts) at a time, stepping r4 by pitch bytes.
    ;# NOTE(review): stvx requires a 16-byte aligned address, so r1 is
    ;# assumed 16-byte aligned here; the stvx also overwrites the back
    ;# chain stwu saved at 0(r1) -- harmless only because the frame is
    ;# popped with addi below rather than reloaded via the back chain.
    stwu    r1,-416(r1)         ;# create space on the stack

    stvx    v0,  0, r1
    lwz     r6, 0(r1)
    stw     r6, 0(r4)           ;# row 0, shorts 0-1
    lwz     r6, 4(r1)
    stw     r6, 4(r4)           ;# row 0, shorts 2-3

    add     r4, r4, r5          ;# advance output by pitch bytes

    lwz     r6,  8(r1)
    stw     r6,  0(r4)          ;# row 1
    lwz     r6, 12(r1)
    stw     r6,  4(r4)

    add     r4, r4, r5

    stvx    v1,  0, r1
    lwz     r6, 0(r1)
    stw     r6, 0(r4)           ;# row 2
    lwz     r6, 4(r1)
    stw     r6, 4(r4)

    add     r4, r4, r5

    lwz     r6,  8(r1)
    stw     r6,  0(r4)          ;# row 3
    lwz     r6, 12(r1)
    stw     r6,  4(r4)

    addi    r1, r1, 416         ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr
    170 
    .align 4
sinpi8sqrt2:
    ;# sin(pi/8) * sqrt(2) in Q16 (0.5411961 * 65536 ~= 35468).
    ;# 35468 does not fit in a signed short; the IDCT above compensates
    ;# by adding the input back once after the vmulosh/vsraw pair.
    .short  35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468

    .align 4
cospi8sqrt2minus1:
    ;# (cos(pi/8) * sqrt(2) - 1) in Q16 (0.3065630 * 65536 ~= 20091)
    .short  20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091

    .align 4
shift_16:
    ;# per-word shift counts for vsraw: keep the high half of each
    ;# 32-bit product, i.e. >> 16
    .long      16,    16,    16,    16

    .align 4
hi_hi:
    ;# vperm control: high 8 bytes of source A, then high 8 bytes of B
    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23

    .align 4
lo_lo:
    ;# vperm control: low 8 bytes of source A, then low 8 bytes of B
    .byte     8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
    190