Home | History | Annotate | Download | only in neon
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     EXPORT  |vp8_short_fdct4x4_neon|
     13     EXPORT  |vp8_short_fdct8x4_neon|
     14     ARM
     15     REQUIRE8
     16     PRESERVE8
     17 
     18 
     19     AREA ||.text||, CODE, READONLY, ALIGN=2
     20 
     21 ; r0    short *input
     22 ; r1    short *output
     23 ; r2    int pitch
     24 ; Input has a pitch, output is contiguous
     25 |vp8_short_fdct4x4_neon| PROC
     26     ldr             r12, _dct_matrix_
     27     vld1.16         d0, [r0], r2
     28     vld1.16         d1, [r0], r2
     29     vld1.16         d2, [r0], r2
     30     vld1.16         d3, [r0]
     31     vld1.16         {q2, q3}, [r12]
     32 
     33 ;first stage
     34     vmull.s16       q11, d4, d0[0]              ;i=0
     35     vmull.s16       q12, d4, d1[0]              ;i=1
     36     vmull.s16       q13, d4, d2[0]              ;i=2
     37     vmull.s16       q14, d4, d3[0]              ;i=3
     38 
     39     vmlal.s16       q11, d5, d0[1]
     40     vmlal.s16       q12, d5, d1[1]
     41     vmlal.s16       q13, d5, d2[1]
     42     vmlal.s16       q14, d5, d3[1]
     43 
     44     vmlal.s16       q11, d6, d0[2]
     45     vmlal.s16       q12, d6, d1[2]
     46     vmlal.s16       q13, d6, d2[2]
     47     vmlal.s16       q14, d6, d3[2]
     48 
     49     vmlal.s16       q11, d7, d0[3]              ;sumtemp for i=0
     50     vmlal.s16       q12, d7, d1[3]              ;sumtemp for i=1
     51     vmlal.s16       q13, d7, d2[3]              ;sumtemp for i=2
     52     vmlal.s16       q14, d7, d3[3]              ;sumtemp for i=3
     53 
     54     ; rounding
     55     vrshrn.i32      d22, q11, #14
     56     vrshrn.i32      d24, q12, #14
     57     vrshrn.i32      d26, q13, #14
     58     vrshrn.i32      d28, q14, #14
     59 
     60 ;second stage
     61     vmull.s16       q4, d22, d4[0]              ;i=0
     62     vmull.s16       q5, d22, d4[1]              ;i=1
     63     vmull.s16       q6, d22, d4[2]              ;i=2
     64     vmull.s16       q7, d22, d4[3]              ;i=3
     65 
     66     vmlal.s16       q4, d24, d5[0]
     67     vmlal.s16       q5, d24, d5[1]
     68     vmlal.s16       q6, d24, d5[2]
     69     vmlal.s16       q7, d24, d5[3]
     70 
     71     vmlal.s16       q4, d26, d6[0]
     72     vmlal.s16       q5, d26, d6[1]
     73     vmlal.s16       q6, d26, d6[2]
     74     vmlal.s16       q7, d26, d6[3]
     75 
     76     vmlal.s16       q4, d28, d7[0]              ;sumtemp for i=0
     77     vmlal.s16       q5, d28, d7[1]              ;sumtemp for i=1
     78     vmlal.s16       q6, d28, d7[2]              ;sumtemp for i=2
     79     vmlal.s16       q7, d28, d7[3]              ;sumtemp for i=3
     80 
     81     vrshr.s32       q0, q4, #16
     82     vrshr.s32       q1, q5, #16
     83     vrshr.s32       q2, q6, #16
     84     vrshr.s32       q3, q7, #16
     85 
     86     vmovn.i32       d0, q0
     87     vmovn.i32       d1, q1
     88     vmovn.i32       d2, q2
     89     vmovn.i32       d3, q3
     90 
     91     vst1.16         {q0, q1}, [r1]
     92 
     93     bx              lr
     94 
     95     ENDP
     96 
     97 ; r0    short *input
     98 ; r1    short *output
     99 ; r2    int pitch
    100 |vp8_short_fdct8x4_neon| PROC
    101     ; Store link register and input before calling
    102     ;  first 4x4 fdct.  Do not need to worry about
    103     ;  output or pitch because those pointers are not
    104     ;  touched in the 4x4 fdct function
    105     stmdb           sp!, {r0, lr}
    106 
    107     bl              vp8_short_fdct4x4_neon
    108 
    109     ldmia           sp!, {r0, lr}
    110 
    111     ; Move to the next block of data.
    112     add             r0, r0, #8
    113     add             r1, r1, #32
    114 
    115     ; Second time through do not store off the
    116     ;  link register, just return from the 4x4 fdtc
    117     b               vp8_short_fdct4x4_neon
    118 
    119     ; Should never get to this.
    120     bx              lr
    121 
    122     ENDP
    123 
    124 ;-----------------
    125 
    126 _dct_matrix_
    127     DCD     dct_matrix
    128 dct_matrix
    129 ;   DCW     23170,  30274,  23170, 12540
    130 ;   DCW     23170,  12540, -23170,-30274
    131 ;   DCW     23170, -12540, -23170, 30274
    132 ;   DCW     23170, -30274,  23170,-12540
    133 ; 23170 =  0x5a82
    134 ; -23170 =  0xa57e
    135 ; 30274 =  0x7642
    136 ; -30274 =  0x89be
    137 ; 12540 =  0x30fc
    138 ; -12540 = 0xcf04
    139     DCD     0x76425a82, 0x30fc5a82
    140     DCD     0x30fc5a82, 0x89bea57e
    141     DCD     0xcf045a82, 0x7642a57e
    142     DCD     0x89be5a82, 0xcf045a82
    143 
    144     END
    145