;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_loop_filter_simple_horizontal_edge_armv6|
    EXPORT  |vp8_loop_filter_simple_vertical_edge_armv6|

    AREA    |.text|, CODE, READONLY         ; name this block of code

;-----------------------------------------------------------------------
; TRANSPOSE_MATRIX row0, row1, row2, row3, col0, col1, col2, col3
;
; Transposes a 4x4 matrix of bytes held in four registers.
;
; In:     row0..row3  one matrix row per register. Written "rc"
;                     (row, column), most significant byte first:
;                         row0: 03 02 01 00
;                         row1: 13 12 11 10
;                         row2: 23 22 21 20
;                         row3: 33 32 31 30
; Out:    col0..col3  one matrix column per register, with the row 0
;                     element in the least significant byte:
;                         col0: 30 20 10 00
;                         col1: 31 21 11 01
;                         col2: 32 22 12 02
;                         col3: 33 23 13 03
; Clobb:  row0..row3 are overwritten with intermediate values.
; Note:   the outputs are written while the inputs are still being read,
;         so all eight arguments must be different registers.
;-----------------------------------------------------------------------
    MACRO
    TRANSPOSE_MATRIX $row0, $row1, $row2, $row3, $col0, $col1, $col2, $col3
    ; Bytes 0 and 2 of every row (matrix columns 0 and 2).
    uxtb16      $col1, $row1                    ; 00 12 00 10
    uxtb16      $col3, $row3                    ; 00 32 00 30
    uxtb16      $col0, $row0                    ; 00 02 00 00
    uxtb16      $col2, $row2                    ; 00 22 00 20
    orr         $col1, $col0, $col1, lsl #8     ; 12 02 10 00
    orr         $col3, $col2, $col3, lsl #8     ; 32 22 30 20

    ; Bytes 1 and 3 of every row (matrix columns 1 and 3), built in place
    ; in the input registers.
    uxtb16      $row3, $row3, ror #8            ; 00 33 00 31
    uxtb16      $row1, $row1, ror #8            ; 00 13 00 11
    uxtb16      $row2, $row2, ror #8            ; 00 23 00 21
    uxtb16      $row0, $row0, ror #8            ; 00 03 00 01
    orr         $row0, $row0, $row1, lsl #8     ; 13 03 11 01
    orr         $row2, $row2, $row3, lsl #8     ; 33 23 31 21

    ; Combine halfwords into whole columns. The even columns are packed
    ; first because the odd-column packs overwrite col1 and col3, which
    ; still hold the intermediate even-column data.
    pkhbt       $col0, $col1, $col3, lsl #16    ; 30 20 10 00 -- column 0
    pkhtb       $col2, $col3, $col1, asr #16    ; 32 22 12 02 -- column 2

    pkhbt       $col1, $row0, $row2, lsl #16    ; 31 21 11 01 -- column 1
    pkhtb       $col3, $row2, $row0, asr #16    ; 33 23 13 03 -- column 3
    MEND


src         RN  r0
pstep       RN  r1

; Arguments shared by both loop filter entry points:
;r0     unsigned char *src_ptr,
;r1     int src_pixel_step,
;r2     const char *flimit,
;r3     const char *limit,
;stack  const char *thresh,
;stack  int  count

; All 16 elements in flimit are equal. So, in the code, only one load is needed
; for flimit. Same applies to limit.
; thresh is not used in the simple loop filter.

;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
; vp8_loop_filter_simple_horizontal_edge_armv6
;
; Simple loop filter across a horizontal edge. Each pass handles four
; adjacent bytes: one 32-bit word is read from each of the four rows
; around the edge (src - 2*pstep, src - pstep, src, src + pstep).
;
; In:     r0 (src)    pointer to the q0 row, i.e. the first row below the edge
;         r1 (pstep)  byte offset between rows
;         r2          pointer to flimit (one word is read)
;         r3          pointer to limit  (one word is read)
;         [sp, #4]    count at entry; the loop runs 2 * count passes
; Out:    the p0 and q0 rows are rewritten in place (whole words are stored)
; Clobb:  r0, r2, r3, r12, flags. r4-r11 and lr are saved and restored.
;
; Register roles inside the loop:
;         r3 = p1, r4 = p0, r5 = q0, r6 = q1
;         r2 = 0x80808080, r12 = flimit * 2 + limit, lr = 0
;         r9 = passes remaining, r10 = filter mask, r7/r8/r11 = scratch
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
    stmdb       sp!, {r4 - r11, lr}         ; 9 registers = 36 bytes pushed

    ; Read the thresholds and the count first; r2 and r3 are recycled below.
    ldr         r7, [r2]                    ; flimit
    ldr         r12, [r3]                   ; limit
    ldr         r9, [sp, #40]               ; count (36 bytes of saves + 4)
    ldr         r2, c0x80808080             ; per-byte sign bias

    ldr         r3, [src, -pstep, lsl #1]   ; p1
    ldr         r4, [src, -pstep]           ; p0
    ldr         r5, [src]                   ; q0
    ldr         r6, [src, pstep]            ; q1

    uadd8       r7, r7, r7                  ; flimit * 2
    mov         lr, #0                      ; zero operand for halving adds / sel
    uadd8       r12, r7, r12                ; flimit * 2 + limit
    mov         r9, r9, lsl #1              ; 4 bytes per pass -> twice the count

|simple_h_loop|
    ; ---- vp8_simple_filter_mask() ----
    ; A byte lane is filtered when
    ;     abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <=  flimit * 2 + limit
    uqsub8      r10, r4, r5                 ; p0 - q0 (saturates at 0)
    uqsub8      r11, r5, r4                 ; q0 - p0 (saturates at 0)
    uqsub8      r7, r3, r6                  ; p1 - q1 (saturates at 0)
    uqsub8      r8, r6, r3                  ; q1 - p1 (saturates at 0)
    orr         r10, r10, r11               ; abs(p0 - q0)
    orr         r8, r8, r7                  ; abs(p1 - q1)
    uhadd8      r8, r8, lr                  ; abs(p1 - q1) >> 1
    uqadd8      r10, r10, r10               ; abs(p0 - q0) * 2
    uqadd8      r10, r10, r8                ; abs(p0 - q0)*2 + abs(p1 - q1)/2
    mvn         r8, #0                      ; 0xFF in every lane
    usub8       r10, r12, r10               ; only the per-lane GE flags matter
    sel         r10, r8, lr                 ; mask lane = GE ? 0xFF : 0x00
    cmp         r10, #0
    beq         simple_h_advance            ; no lane passes: leave pixels alone

    ; ---- vp8_simple_filter() ----
    ; XOR with 0x80 turns the unsigned pixels into signed values.
    eor         r4, r4, r2                  ; p0
    eor         r5, r5, r2                  ; q0
    eor         r3, r3, r2                  ; p1
    eor         r6, r6, r2                  ; q1

    qsub8       r3, r3, r6                  ; vp8_filter = p1 - q1
    qsub8       r6, r5, r4                  ; q0 - p0
    qadd8       r3, r3, r6                  ; += q0 - p0
    ldr         r8, c0x03030303
    qadd8       r3, r3, r6                  ; += q0 - p0
    ldr         r7, c0x04040404
    qadd8       r3, r3, r6                  ; vp8_filter = p1-q1 + 3*(q0-p0)
    and         r3, r3, r10                 ; vp8_filter &= mask

    qadd8       r8, r3, r8                  ; Filter2 = vp8_filter + 3
    qadd8       r7, r3, r7                  ; Filter1 = vp8_filter + 4

    ; A signed halving add with zero is a per-lane arithmetic shift right
    ; by one; three of them give >> 3.
    shadd8      r8, r8, lr
    shadd8      r7, r7, lr
    shadd8      r8, r8, lr
    shadd8      r7, r7, lr
    shadd8      r8, r8, lr                  ; Filter2 >>= 3
    shadd8      r7, r7, lr                  ; Filter1 >>= 3

    qadd8       r4, r4, r8                  ; u = p0 + Filter2
    qsub8       r5, r5, r7                  ; u = q0 - Filter1
    eor         r5, r5, r2                  ; *oq0 = u ^ 0x80
    eor         r4, r4, r2                  ; *op0 = u ^ 0x80
    str         r5, [src]                   ; store oq0 result
    str         r4, [src, -pstep]           ; store op0 result

|simple_h_advance|
    subs        r9, r9, #1
    addne       src, src, #4                ; next 4 bytes along the edge

    ; Fetch the next group only when another pass follows.
    ldrne       r3, [src, -pstep, lsl #1]   ; p1
    ldrne       r4, [src, -pstep]           ; p0
    ldrne       r5, [src]                   ; q0
    ldrne       r6, [src, pstep]            ; q1

    bne         simple_h_loop

    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_loop_filter_simple_horizontal_edge_armv6|


;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
; vp8_loop_filter_simple_vertical_edge_armv6
;
; Simple loop filter across a vertical edge. Each pass handles four rows:
; the four bytes around the edge (src - 2 .. src + 1) are read from each
; row as two halfwords, and the 4x4 block is transposed so that p1, p0,
; q0 and q1 each occupy one register.
;
; In:     r0 (src)    pointer to q0 in the first row, i.e. the first byte
;                     to the right of the edge
;         r1 (pstep)  byte offset between rows
;         r2          pointer to flimit (one word is read)
;         r3          pointer to limit  (one word is read)
;         [sp, #4]    count at entry; the loop runs 2 * count passes
; Out:    p0 (src - 1) and q0 (src) of each filtered row are rewritten
; Clobb:  r0, r2, r3, r12, flags. r4-r11 and lr are saved and restored.
;
; Register roles inside the loop (after the transpose):
;         r3 = p1, r4 = p0, r5 = q0, r6 = q1
;         r2 = 0x80808080, r12 = flimit * 2 + limit, r8 = 0
;         r11 = passes remaining, lr = filter mask, r7/r9/r10 = scratch
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_simple_vertical_edge_armv6| PROC
    stmdb       sp!, {r4 - r11, lr}         ; 9 registers = 36 bytes pushed

    ; Read the thresholds and the count first; r2 and r3 are recycled below.
    ldr         r12, [r2]                   ; flimit
    ldr         r7, [r3]                    ; limit
    ldr         r11, [sp, #40]              ; count (36 bytes of saves + 4)
    ldr         r2, c0x80808080             ; per-byte sign bias

    ; Load source rows 0..3. Each row arrives as two halfwords; the second
    ; load of every pair also steps src to the next row.
    ldrh        r3, [src, #-2]              ; row 0: p1 p0
    ldrh        r4, [src], pstep            ; row 0: q0 q1
    uadd8       r12, r12, r12               ; flimit * 2

    ldrh        r5, [src, #-2]              ; row 1: p1 p0
    ldrh        r6, [src], pstep            ; row 1: q0 q1
    uadd8       r12, r12, r7                ; flimit * 2 + limit
    mov         r11, r11, lsl #1            ; 4 rows per pass -> twice the count

    pkhbt       r7, r3, r4, lsl #16         ; r7 = row 0

    ldrh        r3, [src, #-2]              ; row 2: p1 p0
    ldrh        r4, [src], pstep            ; row 2: q0 q1

    pkhbt       r8, r5, r6, lsl #16         ; r8 = row 1

    ldrh        r5, [src, #-2]              ; row 3: p1 p0
    ldrh        r6, [src], pstep            ; row 3: q0 q1

|simple_v_loop|
    pkhbt       r10, r5, r6, lsl #16        ; r10 = row 3
    pkhbt       r9, r3, r4, lsl #16         ; r9  = row 2

    ; Rows r7, r8, r9, r10  ->  columns r3 (p1), r4 (p0), r5 (q0), r6 (q1)
    TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6

    ; ---- vp8_simple_filter_mask() ----
    ; A byte lane is filtered when
    ;     abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <=  flimit * 2 + limit
    uqsub8      r9, r4, r5                  ; p0 - q0 (saturates at 0)
    uqsub8      r10, r5, r4                 ; q0 - p0 (saturates at 0)
    uqsub8      r7, r3, r6                  ; p1 - q1 (saturates at 0)
    uqsub8      r8, r6, r3                  ; q1 - p1 (saturates at 0)
    orr         r9, r9, r10                 ; abs(p0 - q0)
    orr         r7, r7, r8                  ; abs(p1 - q1)
    mov         r8, #0                      ; zero operand for halving adds / sel
    mvn         r10, #0                     ; 0xFF in every lane
    uqadd8      r9, r9, r9                  ; abs(p0 - q0) * 2
    uhadd8      r7, r7, r8                  ; abs(p1 - q1) / 2
    uqadd8      r7, r7, r9                  ; abs(p0 - q0)*2 + abs(p1 - q1)/2

    usub8       r7, r12, r7                 ; only the per-lane GE flags matter
    sel         lr, r10, r8                 ; mask lane = GE ? 0xFF : 0x00

    cmp         lr, #0
    beq         simple_v_advance            ; no lane passes: leave pixels alone

    ; ---- vp8_simple_filter() ----
    ; XOR with 0x80 turns the unsigned pixels into signed values.
    eor         r4, r4, r2                  ; p0
    eor         r5, r5, r2                  ; q0
    eor         r3, r3, r2                  ; p1
    eor         r6, r6, r2                  ; q1

    qsub8       r3, r3, r6                  ; vp8_filter = p1 - q1
    qsub8       r6, r5, r4                  ; q0 - p0

    qadd8       r3, r3, r6                  ; vp8_filter += q0 - p0
    ldr         r7, c0x04040404

    qadd8       r3, r3, r6                  ; vp8_filter += q0 - p0
    ldr         r9, c0x03030303

    qadd8       r3, r3, r6                  ; vp8_filter = p1-q1 + 3*(q0-p0)
    and         r3, r3, lr                  ; vp8_filter &= mask

    ; Filter2 must be formed first: the next add overwrites vp8_filter.
    qadd8       r9, r3, r9                  ; Filter2 = vp8_filter + 3
    qadd8       r3, r3, r7                  ; Filter1 = vp8_filter + 4

    ; A signed halving add with zero is a per-lane arithmetic shift right
    ; by one; three of them give >> 3.
    shadd8      r3, r3, r8
    shadd8      r9, r9, r8
    shadd8      r3, r3, r8
    shadd8      r9, r9, r8
    shadd8      r3, r3, r8                  ; Filter1 >>= 3
    shadd8      r9, r9, r8                  ; Filter2 >>= 3

    ; The loads left src four rows further on; rewind to row 0 for the stores.
    sub         src, src, pstep, lsl #2

    qsub8       r5, r5, r3                  ; u = q0 - Filter1
    qadd8       r4, r4, r9                  ; u = p0 + Filter2
    eor         r5, r5, r2                  ; *oq0 = u ^ 0x80
    eor         r4, r4, r2                  ; *op0 = u ^ 0x80

    ; Write back one byte of p0 and q0 per row; byte 0 belongs to row 0.
    strb        r4, [src, #-1]              ; row 0: op0
    strb        r5, [src], pstep            ; row 0: oq0, then step to row 1
    mov         r4, r4, lsr #8
    mov         r5, r5, lsr #8

    strb        r4, [src, #-1]              ; row 1
    strb        r5, [src], pstep
    mov         r4, r4, lsr #8
    mov         r5, r5, lsr #8

    strb        r4, [src, #-1]              ; row 2
    strb        r5, [src], pstep
    mov         r4, r4, lsr #8
    mov         r5, r5, lsr #8

    strb        r4, [src, #-1]              ; row 3
    strb        r5, [src], pstep            ; src ends four rows further on

|simple_v_advance|
    subs        r11, r11, #1

    ; Load the next four source rows only when another pass follows. The
    ; two packs run unconditionally; on the final pass their results are
    ; never used.
    ldrneh      r3, [src, #-2]              ; row 0: p1 p0
    ldrneh      r4, [src], pstep            ; row 0: q0 q1

    ldrneh      r5, [src, #-2]              ; row 1: p1 p0
    ldrneh      r6, [src], pstep            ; row 1: q0 q1

    pkhbt       r7, r3, r4, lsl #16         ; r7 = row 0

    ldrneh      r3, [src, #-2]              ; row 2: p1 p0
    ldrneh      r4, [src], pstep            ; row 2: q0 q1

    pkhbt       r8, r5, r6, lsl #16         ; r8 = row 1

    ldrneh      r5, [src, #-2]              ; row 3: p1 p0
    ldrneh      r6, [src], pstep            ; row 3: q0 q1

    bne         simple_v_loop

    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_loop_filter_simple_vertical_edge_armv6|

; Constant Pool
c0x80808080 DCD     0x80808080              ; per-byte unsigned <-> signed bias
c0x03030303 DCD     0x03030303              ; per-byte +3 (Filter2 rounding)
c0x04040404 DCD     0x04040404              ; per-byte +4 (Filter1 rounding)

    END