;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT  |vp8_fast_fdct4x4_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA    ||.text||, CODE, READONLY, ALIGN=2

;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
;NOTE:
;The input is *src_diff, which is calculated as
;    diff_ptr[c] = src_ptr[c] - pred_ptr[c];    (in the Subtract* functions)
;where *src_ptr and *pred_ptr are both unsigned char, so *src_diff is in the
;range [-255, 255].
;CAUTION:
;The input values of the 25th block are set in vp8_build_dcblock and fall
;outside [-255, 255].  However, the VP8 encoder only ever uses
;vp8_short_fdct4x4_c for the 25th block, never vp8_fast_fdct4x4_c, so the
;[-255, 255] assumption on *input is safe here, though it would not be safe
;in vp8_short_fdct4x4_c.
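;
;A rough scalar C model of the first pass, reconstructed from the inline
;comments below (a reference sketch only -- not claimed to be the exact
;upstream vp8_fast_fdct4x4_c source).  vqdmulh.s16 reads its coefficient
;lanes as signed, so 0xEC83 and 0xB505 act as -4989 and -19195; the explicit
;"+ d1" / "+ temp1" terms then make the effective Q16 multipliers
;60547/65536 (~cos(pi/8)) and 46341/65536 (~sqrt(2)/2).
;
;    static const short x_c1 = (short)0xEC83;  /* cos(pi/8) - 1 in Q16 */
;    static const short x_c2 = (short)0xB505;  /* sqrt(2)/2 - 1 in Q16 */
;    static const short x_c3 = 0x61F8;         /* sin(pi/8)     in Q16 */
;
;    /* first pass over the four rows; pitch is in bytes */
;    for (int r = 0; r < 4; r++, ip += pitch / 2, op += 4)
;    {
;        int a1 = (ip[0] + ip[3]) << 1;
;        int b1 = (ip[1] + ip[2]) << 1;
;        int c1 = (ip[1] - ip[2]) << 1;
;        int d1 = (ip[0] - ip[3]) << 1;
;
;        int temp1 = a1 + b1;
;        int temp2 = a1 - b1;
;
;        op[0] = (short)(((temp1 * x_c2) >> 16) + temp1);
;        op[2] = (short)(((temp2 * x_c2) >> 16) + temp2);
;        op[1] = (short)(((c1 * x_c3) >> 16) + (((d1 * x_c1) >> 16) + d1));
;        op[3] = (short)(((d1 * x_c3) >> 16) - (((c1 * x_c1) >> 16) + c1));
;    }
;
;The second pass applies the same butterfly down the columns of the first
;pass result (without the <<1 pre-scale), then halves each coefficient,
;rounding toward zero:  out = (t + (t < 0)) >> 1;  which is what the
;vclt/vsub/vshr sequence at the end implements.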
|vp8_fast_fdct4x4_neon| PROC
    vld1.16         {d2}, [r0], r2          ;load input (r2 = pitch in bytes)
    ldr             r12, _ffdct_coeff_      ;r12 = address of coefficient table
    vld1.16         {d3}, [r0], r2
    vld1.16         {d4}, [r0], r2
    vld1.16         {d0}, [r12]             ;d0 = {x_c1, x_c2, x_c3, 0}
    vld1.16         {d5}, [r0], r2

;First for-loop
;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3]
    vtrn.32         d2, d4
    vtrn.32         d3, d5
    vtrn.16         d2, d3
    vtrn.16         d4, d5

    vadd.s16        d6, d2, d5              ;ip[0]+ip[3]
    vadd.s16        d7, d3, d4              ;ip[1]+ip[2]
    vsub.s16        d8, d3, d4              ;ip[1]-ip[2]
    vsub.s16        d9, d2, d5              ;ip[0]-ip[3]
    vshl.i16        q3, q3, #1              ;a1, b1
    vshl.i16        q4, q4, #1              ;c1, d1

    vadd.s16        d10, d6, d7             ;temp1 = a1 + b1
    vsub.s16        d11, d6, d7             ;temp2 = a1 - b1

    vqdmulh.s16     q6, q5, d0[1]
    vqdmulh.s16     q8, q4, d0[0]
    vqdmulh.s16     q7, q4, d0[2]

    vshr.s16        q6, q6, #1
    vshr.s16        q8, q8, #1
    vshr.s16        q7, q7, #1              ;d14:temp1 = (c1 * x_c3)>>16;  d15:temp1 = (d1 * x_c3)>>16
    vadd.s16        q8, q4, q8              ;d16:temp2 = ((c1 * x_c1)>>16) + c1;  d17:temp2 = ((d1 * x_c1)>>16) + d1

    vadd.s16        d2, d10, d12            ;op[0] = ((temp1 * x_c2)>>16) + temp1
    vadd.s16        d4, d11, d13            ;op[2] = ((temp2 * x_c2)>>16) + temp2
    vadd.s16        d3, d14, d17            ;op[1] = temp1 + temp2 (the saturating 'q' ops are not
                                            ;needed for this input range -- just protection)
    vsub.s16        d5, d15, d16            ;op[3] = temp1 - temp2

;Second for-loop
;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12]
    vtrn.32         d2, d4
    vtrn.32         d3, d5
    vtrn.16         d2, d3
    vtrn.16         d4, d5

    vadd.s16        d6, d2, d5              ;a1 = ip[0]+ip[12]
    vadd.s16        d7, d3, d4              ;b1 = ip[4]+ip[8]
    vsub.s16        d8, d3, d4              ;c1 = ip[4]-ip[8]
    vsub.s16        d9, d2, d5              ;d1 = ip[0]-ip[12]

    vadd.s16        d10, d6, d7             ;temp1 = a1 + b1
    vsub.s16        d11, d6, d7             ;temp2 = a1 - b1

    vqdmulh.s16     q6, q5, d0[1]
    vqdmulh.s16     q8, q4, d0[0]
    vqdmulh.s16     q7, q4, d0[2]

    vshr.s16        q6, q6, #1
    vshr.s16        q8, q8, #1
    vshr.s16        q7, q7, #1              ;d14:temp1 = (c1 * x_c3)>>16;  d15:temp1 = (d1 * x_c3)>>16
    vadd.s16        q8, q4, q8              ;d16:temp2 = ((c1 * x_c1)>>16) + c1;  d17:temp2 = ((d1 * x_c1)>>16) + d1

    vadd.s16        d2, d10, d12            ;a2 = ((temp1 * x_c2)>>16) + temp1
    vadd.s16        d4, d11, d13            ;c2 = ((temp2 * x_c2)>>16) + temp2
    vadd.s16        d3, d14, d17            ;b2 = temp1 + temp2 (same note on saturation as above)
    vsub.s16        d5, d15, d16            ;d2 = temp1 - temp2

    vclt.s16        q3, q1, #0              ;mask = all ones where a2/b2/c2/d2 < 0
    vclt.s16        q4, q2, #0
    vsub.s16        q1, q1, q3              ;add 1 to each negative value ...
    vsub.s16        q2, q2, q4
    vshr.s16        q1, q1, #1              ;... then >>1: divide by 2, rounding toward zero
    vshr.s16        q2, q2, #1

    vst1.16         {q1, q2}, [r1]

    bx              lr

    ENDP

;-----------------

_ffdct_coeff_
    DCD     ffdct_coeff
ffdct_coeff
; 60547 = 0xEC83 (~ cos(pi/8) * 65536;  -4989 when read as a signed s16 lane)
; 46341 = 0xB505 (~ sqrt(2)/2 * 65536;  -19195 when read as a signed s16 lane)
; 25080 = 0x61F8 (~ sin(pi/8) * 65536)
    DCD     0xB505EC83, 0x000061F8

    END
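;-----------------
;Usage sketch (illustrative only; everything except vp8_fast_fdct4x4_neon is
;hypothetical).  The function takes the same arguments as the C prototype
;near the top of this file, with pitch given in bytes, since each vld1.16
;post-increments the input pointer by r2:
;
;    extern void vp8_fast_fdct4x4_neon(short *input, short *output, int pitch);
;
;    short diff[4 * 4];                        /* tightly packed 4x4 diff block */
;    short coeff[16];
;    vp8_fast_fdct4x4_neon(diff, coeff, 8);    /* 4 shorts per row = 8-byte pitch */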