;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_sixtap_predict8x4_armv6|

    AREA    |.text|, CODE, READONLY  ; name this block of code
;-----------------------------------------------------------------------
; vp8_sixtap_predict8x4_armv6
;
; Six-tap sub-pixel prediction of a block 8 pixels wide and 4 pixels
; high, done as a horizontal pass followed by a vertical pass.
;
; Arguments:
;   r0      unsigned char *src_ptr
;   r1      int  src_pixels_per_line
;   r2      int  xoffset    (selects the horizontal filter; 0 = copy)
;   r3      int  yoffset    (selects the vertical filter;   0 = copy)
;   stack   unsigned char *dst_ptr
;   stack   int  dst_pitch
;
; xoffset and yoffset are used unchecked as indices into filter8_coeff
; (8 entries of 16 bytes); callers are assumed to pass 0..7.
;
; Registers: r4-r11 and lr are saved on entry and restored on exit;
; r0-r3, r12 and the flags are clobbered.
;
; Stack frame (184 bytes below the saved registers):
;   [sp, #0]            yoffset, saved across the first pass
;   [sp, #4 .. #163]    intermediate buffer: 8 lines of 20 bytes, one
;                       line per output column. Each line holds 9
;                       halfwords (source rows -2..+6 of that column)
;                       plus 2 bytes of padding to keep lines 4-byte
;                       aligned.
;   remaining bytes     not written by this routine
;
; The first pass stores its result transposed (8 lines x 9 columns) so
; that the second pass can read each column with word loads; the second
; pass transposes back while storing to dst.
;
; Filter taps are packed two signed halfwords per word:
;   word0 = f1:f0, word1 = f3:f2, word2 = f5:f4   (high:low)
;
; Every filtered sum is rounded and clamped as
;   usat8((sum + 0x40) >> 7)
;-----------------------------------------------------------------------
|vp8_sixtap_predict8x4_armv6| PROC
    stmdb       sp!, {r4 - r11, lr}
    str         r3, [sp, #-184]!            ; reserve temporary storage and save yoffset at [sp]

    cmp         r2, #0                      ; skip first-pass filter if xoffset == 0
    add         lr, sp, #4                  ; lr -> start of intermediate buffer
    beq         skip_firstpass_filter

;first-pass (horizontal) filter
    adr         r12, filter8_coeff          ; PC-relative table address: no pointer word,
                                            ; so no absolute relocation in the code area
    sub         r0, r0, r1, lsl #1          ; start two rows above the block

    add         r2, r12, r2, lsl #4         ; filter entry = table + xoffset * 16
    add         r0, r0, #3                  ; bias src so the loads below use small offsets

    ldr         r3, [r2]                    ; r3 = f1:f0
    ldr         r4, [r2, #4]                ; r4 = f3:f2
    ldr         r5, [r2, #8]                ; r5 = f5:f4

    mov         r2, #0x90000                ; bits 16+ : rows remaining (9)

    sub         r1, r1, #8                  ; stride minus the 8 bytes consumed per row

|first_pass_hloop_v6|
    ldrb        r6, [r0, #-5]               ; src[-2]
    ldrb        r7, [r0, #-4]               ; src[-1]
    ldrb        r8, [r0, #-3]               ; src[0]
    ldrb        r9, [r0, #-2]               ; src[1]
    ldrb        r10, [r0, #-1]              ; src[2]

    orr         r2, r2, #0x4                ; low byte: 4 iterations x 2 pixels = width 8

    pkhbt       r6, r6, r7, lsl #16         ; r7 | r6
    pkhbt       r7, r7, r8, lsl #16         ; r8 | r7

    pkhbt       r8, r8, r9, lsl #16         ; r9 | r8
    pkhbt       r9, r9, r10, lsl #16        ; r10 | r9

|first_pass_wloop_v6|
    smuad       r11, r6, r3                 ; vp8_filter[0], vp8_filter[1]
    smuad       r12, r7, r3

    ldrb        r6, [r0], #1                ; next source pixel

    smlad       r11, r8, r4, r11            ; vp8_filter[2], vp8_filter[3]
    ldrb        r7, [r0], #1                ; and the one after it
    smlad       r12, r9, r4, r12

    pkhbt       r10, r10, r6, lsl #16       ; r6 | r10
    pkhbt       r6, r6, r7, lsl #16         ; r7 | r6
    smlad       r11, r10, r5, r11           ; vp8_filter[4], vp8_filter[5]
    smlad       r12, r6, r5, r12

    sub         r2, r2, #1

    add         r11, r11, #0x40             ; round_shift_and_clamp
    tst         r2, #0xff                   ; test loop counter (flags consumed by movne/bne)
    usat        r11, #8, r11, asr #7
    add         r12, r12, #0x40
    strh        r11, [lr], #20              ; store transposed: each pixel goes to the next line
    usat        r12, #8, r12, asr #7

    strh        r12, [lr], #20

    ; Slide the packed pixel window two pixels to the right for the
    ; next iteration (r11/r12 are used as temporaries).
    movne       r11, r6
    movne       r12, r7

    movne       r6, r8
    movne       r7, r9
    movne       r8, r10
    movne       r9, r11
    movne       r10, r12

    bne         first_pass_wloop_v6

    ;;add       r9, ppl, #30                ; attempt to load 2 adjacent cache lines
    ;;IF ARCHITECTURE=6
    ;pld        [src, ppl]
    ;;pld       [src, r9]
    ;;ENDIF

    subs        r2, r2, #0x10000            ; one source row done

    sub         lr, lr, #158                ; back 8 lines (160), forward one halfword (2)

    add         r0, r0, r1                  ; move to next input line

    bne         first_pass_hloop_v6

;second-pass (vertical) filter
secondpass_filter
    ldr         r3, [sp], #4                ; load back yoffset; sp now points at the buffer
    ldr         r0, [sp, #216]              ; load dst address from stack 180+36
    ldr         r1, [sp, #220]              ; load dst stride from stack 180+40

    cmp         r3, #0
    beq         skip_secondpass_filter

    adr         r12, filter8_coeff          ; PC-relative table address (see first pass)
    add         lr, r12, r3, lsl #4         ; filter entry = table + yoffset * 16

    mov         r2, #0x00080000             ; bits 16+ : columns remaining (8)

    ldr         r3, [lr]                    ; r3 = f1:f0
    ldr         r4, [lr, #4]                ; r4 = f3:f2
    ldr         r5, [lr, #8]                ; r5 = f5:f4

    pkhbt       r12, r4, r3                 ; r12 = f1:f2, for the odd (one-row-down) output
    pkhbt       r11, r5, r4                 ; r11 = f3:f4

    ; From here on sp is used as a read cursor through the buffer; every
    ; load uses a non-negative offset from the current sp.
second_pass_hloop_v6
    ldr         r6, [sp]                    ; first two rows of this column
    ldr         r7, [sp, #4]                ; next two rows

    orr         r2, r2, #2                  ; low byte: 2 iterations x 2 rows = height 4

second_pass_wloop_v6
    smuad       lr, r3, r6                  ; even row: f0*p0 + f1*p1
    smulbt      r10, r3, r6                 ; odd row:  f0*p1

    ldr         r8, [sp, #8]

    smlad       lr, r4, r7, lr              ; + f2*p2 + f3*p3
    smladx      r10, r12, r7, r10           ; + f1*p2 + f2*p3

    ldrh        r9, [sp, #12]

    smlad       lr, r5, r8, lr              ; + f4*p4 + f5*p5
    smladx      r10, r11, r8, r10           ; + f3*p4 + f4*p5

    add         sp, sp, #4                  ; advance two rows within the column
    smlatb      r10, r5, r9, r10            ; + f5*p6

    sub         r2, r2, #1

    add         lr, lr, #0x40               ; round_shift_and_clamp
    tst         r2, #0xff
    usat        lr, #8, lr, asr #7
    add         r10, r10, #0x40
    strb        lr, [r0], r1                ; the result is transposed back and stored
    usat        r10, #8, r10, asr #7

    strb        r10, [r0], r1

    movne       r6, r7                      ; reuse the rows already loaded
    movne       r7, r8

    bne         second_pass_wloop_v6

    subs        r2, r2, #0x10000            ; one column done
    add         sp, sp, #12                 ; update src for next loop (20-8)
    sub         r0, r0, r1, lsl #2          ; back up the 4 rows just written
    add         r0, r0, #1                  ; next destination column

    bne         second_pass_hloop_v6

    add         sp, sp, #20                 ; release the rest of the frame: 180 - 8*20
    ldmia       sp!, {r4 - r11, pc}

;--------------------
; xoffset == 0: copy 9 rows of 8 source bytes into the buffer,
; transposed and widened to halfwords, then run the second pass.
skip_firstpass_filter
    sub         r0, r0, r1, lsl #1          ; start two rows above the block
    sub         r1, r1, #8                  ; stride minus the 8 bytes consumed per row
    mov         r2, #9                      ; rows to copy

skip_firstpass_hloop
    ldrb        r4, [r0], #1                ; load data
    subs        r2, r2, #1                  ; flags survive to the bne below
    ldrb        r5, [r0], #1
    strh        r4, [lr], #20               ; store it to the intermediate buffer
    ldrb        r6, [r0], #1                ; load data
    strh        r5, [lr], #20
    ldrb        r7, [r0], #1
    strh        r6, [lr], #20
    ldrb        r8, [r0], #1
    strh        r7, [lr], #20
    ldrb        r9, [r0], #1
    strh        r8, [lr], #20
    ldrb        r10, [r0], #1
    strh        r9, [lr], #20
    ldrb        r11, [r0], #1
    strh        r10, [lr], #20
    add         r0, r0, r1                  ; move to next input line
    strh        r11, [lr], #20

    sub         lr, lr, #158                ; move over to next column
    bne         skip_firstpass_hloop

    b           secondpass_filter

;--------------------
; yoffset == 0: narrow rows 0..3 of each buffered column back to bytes
; and store them to dst.
skip_secondpass_filter
    mov         r2, #8                      ; columns to copy
    add         sp, sp, #4                  ; start from src[0] instead of src[-2]

skip_secondpass_hloop
    ldr         r6, [sp], #4                ; rows 0 and 1
    subs        r2, r2, #1                  ; flags survive to the bne below
    ldr         r8, [sp], #4                ; rows 2 and 3

    mov         r7, r6, lsr #16             ; unpack
    strb        r6, [r0], r1
    mov         r9, r8, lsr #16
    strb        r7, [r0], r1
    add         sp, sp, #12                 ; 20-8
    strb        r8, [r0], r1
    strb        r9, [r0], r1

    sub         r0, r0, r1, lsl #2          ; back up the 4 rows just written
    add         r0, r0, #1                  ; next destination column

    bne         skip_secondpass_hloop

    add         sp, sp, #16                 ; 180 - (160 + 4)

    ldmia       sp!, {r4 - r11, pc}

    ENDP

;-----------------
; Six-tap filter table, indexed by xoffset / yoffset.
; Each entry is four words (16 bytes): f1:f0, f3:f2, f5:f4, and one
; padding word; taps are signed halfwords. The table is addressed
; PC-relatively with ADR, so it must stay in this code area, close to
; the function above.
filter8_coeff
    DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000
    DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000
    DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000
    DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000
    DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000
    DCD     0xfffa0000,     0x005d0032,     0x0000fff7,     0x00000000
    DCD     0xfff80001,     0x006c0024,     0x0002fff5,     0x00000000
    DCD     0xffff0000,     0x007b000c,     0x0000fffa,     0x00000000

    ;DCD    0,  0,  128,    0,   0,  0
    ;DCD    0, -6,  123,   12,  -1,  0
    ;DCD    2, -11, 108,   36,  -8,  1
    ;DCD    0, -9,   93,   50,  -6,  0
    ;DCD    3, -16,  77,   77, -16,  3
    ;DCD    0, -6,   50,   93,  -9,  0
    ;DCD    1, -8,   36,  108, -11,  2
    ;DCD    0, -1,   12,  123,  -6,  0

    END