;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_mse16x16_neon|
    EXPORT  |vp8_get16x16pred_error_neon|
    EXPORT  |vp8_get4x4sse_cs_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;============================
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
;
;note: in this function the sum is never used, so that part of the
;calculation can be removed from vp8_variance().

|vp8_mse16x16_neon| PROC
    vmov.i8     q7, #0              ;q7, q8, q9, q10 - sse
    vmov.i8     q8, #0
    vmov.i8     q9, #0
    vmov.i8     q10, #0

    mov         r12, #8             ;8 iterations of 2 rows = 16 rows

mse16x16_neon_loop
    vld1.8      {q0}, [r0], r1      ;Load up source and reference
    vld1.8      {q2}, [r2], r3
    vld1.8      {q1}, [r0], r1
    vld1.8      {q3}, [r2], r3

    vsubl.u8    q11, d0, d4         ;16-bit differences, src - ref
    vsubl.u8    q12, d1, d5
    vsubl.u8    q13, d2, d6
    vsubl.u8    q14, d3, d7

    vmlal.s16   q7, d22, d22        ;accumulate squared differences
    vmlal.s16   q8, d23, d23

    subs        r12, r12, #1

    vmlal.s16   q9, d24, d24
    vmlal.s16   q10, d25, d25
    vmlal.s16   q7, d26, d26
    vmlal.s16   q8, d27, d27
    vmlal.s16   q9, d28, d28
    vmlal.s16   q10, d29, d29

    bne         mse16x16_neon_loop

    vadd.u32    q7, q7, q8          ;reduce the four accumulators
    vadd.u32    q9, q9, q10

    ldr         r12, [sp]           ;load *sse from stack

    vadd.u32    q10, q7, q9
    vpaddl.u32  q1, q10
    vadd.u64    d0, d2, d3

    vst1.32     {d0[0]}, [r12]      ;store sse
    vmov.32     r0, d0[0]           ;return sse in r0 as well

    bx          lr

    ENDP

;============================
; r0    unsigned char *src_ptr
; r1    int src_stride
; r2    unsigned char *ref_ptr
; r3    int ref_stride

|vp8_get16x16pred_error_neon| PROC
    vmov.i8     q8, #0              ;q8 - sum
    vmov.i8     q9, #0              ;q9, q10 - pred_error
    vmov.i8     q10, #0

    mov         r12, #8             ;8 iterations of 2 rows = 16 rows

get16x16pred_error_neon_loop
    vld1.8      {q0}, [r0], r1      ;Load up source and reference
    vld1.8      {q2}, [r2], r3
    vld1.8      {q1}, [r0], r1
    vld1.8      {q3}, [r2], r3

    vsubl.u8    q11, d0, d4         ;16-bit differences, src - ref
    vsubl.u8    q12, d1, d5
    vsubl.u8    q13, d2, d6
    vsubl.u8    q14, d3, d7

    vpadal.s16  q8, q11             ;accumulate sum of differences
    vmlal.s16   q9, d22, d22        ;accumulate squared differences
    vmlal.s16   q10, d23, d23

    subs        r12, r12, #1

    vpadal.s16  q8, q12
    vmlal.s16   q9, d24, d24
    vmlal.s16   q10, d25, d25
    vpadal.s16  q8, q13
    vmlal.s16   q9, d26, d26
    vmlal.s16   q10, d27, d27
    vpadal.s16  q8, q14
    vmlal.s16   q9, d28, d28
    vmlal.s16   q10, d29, d29

    bne         get16x16pred_error_neon_loop

    vadd.u32    q10, q9, q10        ;reduce sse accumulators
    vpaddl.s32  q0, q8              ;reduce sum accumulator

    vpaddl.u32  q1, q10
    vadd.s64    d0, d0, d1          ;d0 - total sum
    vadd.u64    d1, d2, d3          ;d1 - total sse

    vmull.s32   q5, d0, d0          ;sum * sum
    vshr.s32    d10, d10, #8        ;sum * sum / 256 (16x16 pixels)
    vsub.s32    d0, d1, d10         ;pred_error = sse - sum * sum / 256

    vmov.32     r0, d0[0]
    bx          lr

    ENDP
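
;============================
;note: the tail above computes pred_error = sse - sum * sum / 256, where 256
;is the pixel count of a 16x16 block. As a guide to the two routines above,
;here is a minimal C reference sketch (illustrative only, not assembled; the
;*_ref names are hypothetical, and a 64-bit intermediate is used for
;sum * sum to avoid 32-bit overflow):
;
;   #include <stdint.h>
;
;   /* hypothetical C reference for vp8_mse16x16_neon; not part of the build */
;   static unsigned int vp8_mse16x16_ref(const uint8_t *src, int src_stride,
;                                        const uint8_t *ref, int ref_stride,
;                                        unsigned int *sse) {
;       unsigned int s = 0;
;       for (int i = 0; i < 16; i++, src += src_stride, ref += ref_stride)
;           for (int j = 0; j < 16; j++) {
;               int d = src[j] - ref[j];
;               s += d * d;
;           }
;       *sse = s;       /* the NEON code stores to *sse and returns in r0 */
;       return s;
;   }
;
;   /* hypothetical C reference for vp8_get16x16pred_error_neon */
;   static unsigned int vp8_get16x16pred_error_ref(const uint8_t *src,
;                                                  int src_stride,
;                                                  const uint8_t *ref,
;                                                  int ref_stride) {
;       int64_t sum = 0;
;       unsigned int sse = 0;
;       for (int i = 0; i < 16; i++, src += src_stride, ref += ref_stride)
;           for (int j = 0; j < 16; j++) {
;               int d = src[j] - ref[j];
;               sum += d;
;               sse += d * d;
;           }
;       return sse - (unsigned int)((sum * sum) >> 8);
;   }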

;=============================
; r0    unsigned char *src_ptr,
; r1    int source_stride,
; r2    unsigned char *ref_ptr,
; r3    int recon_stride

|vp8_get4x4sse_cs_neon| PROC
    vld1.8      {d0}, [r0], r1      ;Load up source and reference
    vld1.8      {d4}, [r2], r3
    vld1.8      {d1}, [r0], r1
    vld1.8      {d5}, [r2], r3
    vld1.8      {d2}, [r0], r1
    vld1.8      {d6}, [r2], r3
    vld1.8      {d3}, [r0], r1
    vld1.8      {d7}, [r2], r3

    vsubl.u8    q11, d0, d4         ;16-bit differences, src - ref
    vsubl.u8    q12, d1, d5
    vsubl.u8    q13, d2, d6
    vsubl.u8    q14, d3, d7

    vmull.s16   q7, d22, d22        ;square only the low four lanes (4x4 block)
    vmull.s16   q8, d24, d24
    vmull.s16   q9, d26, d26
    vmull.s16   q10, d28, d28

    vadd.u32    q7, q7, q8          ;reduce to a single sse value
    vadd.u32    q9, q9, q10
    vadd.u32    q9, q7, q9

    vpaddl.u32  q1, q9
    vadd.u64    d0, d2, d3

    vmov.32     r0, d0[0]           ;return sse in r0
    bx          lr

    ENDP

    END
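
;=============================
;note: vp8_get4x4sse_cs_neon loads eight bytes per row but squares only the
;low half of each widened difference (d22, d24, d26, d28), so it computes
;the sse of a 4x4 block. A minimal C reference sketch (illustrative only,
;not assembled; the *_ref name is hypothetical):
;
;   #include <stdint.h>
;
;   /* hypothetical C reference for vp8_get4x4sse_cs_neon; not built */
;   static unsigned int vp8_get4x4sse_cs_ref(const uint8_t *src, int src_stride,
;                                            const uint8_t *ref, int ref_stride) {
;       unsigned int sse = 0;
;       for (int i = 0; i < 4; i++, src += src_stride, ref += ref_stride)
;           for (int j = 0; j < 4; j++) {
;               int d = src[j] - ref[j];
;               sse += d * d;
;           }
;       return sse;
;   }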