;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_short_fdct4x4_neon|
    EXPORT  |vp8_short_fdct8x4_neon|
    ARM
    REQUIRE8
    PRESERVE8


    AREA ||.text||, CODE, READONLY, ALIGN=2

;-----------------------------------------------------------------------
; vp8_short_fdct4x4_neon
;
; Two-pass 4x4 transform of 16-bit samples against dct_matrix.
;
; In:    r0 = short *input   (four rows of four 16-bit values)
;        r1 = short *output  (16 contiguous 16-bit values)
;        r2 = int pitch      (byte distance between input rows; it is
;                             added directly to the row address)
; Out:   16 halfwords written at [r1]
; Clobb: r0 (advanced by 3 * pitch), r12, q0-q3, q8-q15
;        r1 and r2 are left untouched; vp8_short_fdct8x4_neon relies
;        on that.
;
; Only caller-saved NEON registers are used. q4-q7 (d8-d15) are
; callee-saved under the AAPCS and are deliberately avoided so that no
; save/restore on the stack is required.
;-----------------------------------------------------------------------
|vp8_short_fdct4x4_neon| PROC
    ldr         r12, _dct_matrix_       ; r12 = address of dct_matrix
    vld1.16     d0, [r0], r2            ; input row 0
    vld1.16     d1, [r0], r2            ; input row 1
    vld1.16     d2, [r0], r2            ; input row 2
    vld1.16     d3, [r0]                ; input row 3
    vld1.16     {q2, q3}, [r12]         ; d4..d7 = matrix rows 0..3

    ;first stage
    ; q(11+i)[j] = sum over k of matrix[k][j] * input[i][k]
    vmull.s16   q11, d4, d0[0]          ;i=0
    vmull.s16   q12, d4, d1[0]          ;i=1
    vmull.s16   q13, d4, d2[0]          ;i=2
    vmull.s16   q14, d4, d3[0]          ;i=3

    vmlal.s16   q11, d5, d0[1]
    vmlal.s16   q12, d5, d1[1]
    vmlal.s16   q13, d5, d2[1]
    vmlal.s16   q14, d5, d3[1]

    vmlal.s16   q11, d6, d0[2]
    vmlal.s16   q12, d6, d1[2]
    vmlal.s16   q13, d6, d2[2]
    vmlal.s16   q14, d6, d3[2]

    vmlal.s16   q11, d7, d0[3]          ;sumtemp for i=0
    vmlal.s16   q12, d7, d1[3]          ;sumtemp for i=1
    vmlal.s16   q13, d7, d2[3]          ;sumtemp for i=2
    vmlal.s16   q14, d7, d3[3]          ;sumtemp for i=3

    ; rounding: (sum + (1 << 13)) >> 14, narrowed back to 16 bits.
    ; Each result lands in the low half of its own accumulator.
    vrshrn.i32  d22, q11, #14
    vrshrn.i32  d24, q12, #14
    vrshrn.i32  d26, q13, #14
    vrshrn.i32  d28, q14, #14

    ;second stage
    ; out_i[j] = sum over k of stage1[k][j] * matrix[k][i]
    ; Accumulators are q8, q9, q10 and q15: caller-saved, and disjoint
    ; from the live stage-one rows (d22/d24/d26/d28) and the matrix
    ; (d4-d7).
    vmull.s16   q8,  d22, d4[0]         ;i=0
    vmull.s16   q9,  d22, d4[1]         ;i=1
    vmull.s16   q10, d22, d4[2]         ;i=2
    vmull.s16   q15, d22, d4[3]         ;i=3

    vmlal.s16   q8,  d24, d5[0]
    vmlal.s16   q9,  d24, d5[1]
    vmlal.s16   q10, d24, d5[2]
    vmlal.s16   q15, d24, d5[3]

    vmlal.s16   q8,  d26, d6[0]
    vmlal.s16   q9,  d26, d6[1]
    vmlal.s16   q10, d26, d6[2]
    vmlal.s16   q15, d26, d6[3]

    vmlal.s16   q8,  d28, d7[0]         ;sumtemp for i=0
    vmlal.s16   q9,  d28, d7[1]         ;sumtemp for i=1
    vmlal.s16   q10, d28, d7[2]         ;sumtemp for i=2
    vmlal.s16   q15, d28, d7[3]         ;sumtemp for i=3

    ; rounding: (sum + (1 << 15)) >> 16, narrowed to 16 bits in a single
    ; step. The input rows in d0-d3 are dead by now, so the results are
    ; packed there for one contiguous store.
    vrshrn.i32  d0, q8,  #16
    vrshrn.i32  d1, q9,  #16
    vrshrn.i32  d2, q10, #16
    vrshrn.i32  d3, q15, #16

    vst1.16     {q0, q1}, [r1]

    bx          lr

    ENDP

;-----------------------------------------------------------------------
; vp8_short_fdct8x4_neon
;
; Runs vp8_short_fdct4x4_neon twice: once on the block at input, and
; once on the block starting 8 bytes (four 16-bit samples) further
; along each row, writing the second result 32 bytes after the first.
;
; In:    r0 = short *input
;        r1 = short *output
;        r2 = int pitch
; Clobb: everything vp8_short_fdct4x4_neon clobbers, plus r1
;        (advanced by 32)
;-----------------------------------------------------------------------
|vp8_short_fdct8x4_neon| PROC
    ; Store link register and input before calling
    ; first 4x4 fdct.  Do not need to worry about
    ; output or pitch because those pointers are not
    ; touched in the 4x4 fdct function
    stmdb       sp!, {r0, lr}

    bl          vp8_short_fdct4x4_neon

    ldmia       sp!, {r0, lr}

    ; Move to the next block of data.
    add         r0, r0, #8
    add         r1, r1, #32

    ; Second time through do not store off the
    ; link register, just return from the 4x4 fdct.
    ; This is a tail call: the 4x4 routine's "bx lr"
    ; returns straight to our caller.
    b           vp8_short_fdct4x4_neon

    ENDP

;-----------------

; Literal word holding the address of dct_matrix, loaded PC-relative
; by vp8_short_fdct4x4_neon.
_dct_matrix_
    DCD     dct_matrix
; 4x4 matrix of signed 16-bit coefficients, one row per pair of words.
; Each DCD packs two halfwords, low halfword first in memory.
dct_matrix
;   DCW     23170,  30274,  23170, 12540
;   DCW     23170,  12540, -23170,-30274
;   DCW     23170, -12540, -23170, 30274
;   DCW     23170, -30274,  23170,-12540
;  23170 = 0x5a82
; -23170 = 0xa57e
;  30274 = 0x7642
; -30274 = 0x89be
;  12540 = 0x30fc
; -12540 = 0xcf04
    DCD     0x76425a82, 0x30fc5a82
    DCD     0x30fc5a82, 0x89bea57e
    DCD     0xcf045a82, 0x7642a57e
    DCD     0x89be5a82, 0xcf045a82

    END