Home | History | Annotate | Download | only in armv6
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     EXPORT  |vp8_filter_block2d_first_pass_armv6|
     13     EXPORT  |vp8_filter_block2d_second_pass_armv6|
     14     EXPORT  |vp8_filter4_block2d_second_pass_armv6|
     15     EXPORT  |vp8_filter_block2d_first_pass_only_armv6|
     16     EXPORT  |vp8_filter_block2d_second_pass_only_armv6|
     17 
     18     AREA    |.text|, CODE, READONLY  ; name this block of code
     19 ;-------------------------------------
     20 ; r0    unsigned char *src_ptr
     21 ; r1    short         *output_ptr
     22 ; r2    unsigned int src_pixels_per_line
     23 ; r3    unsigned int output_width
     24 ; stack unsigned int output_height
     25 ; stack const short *vp8_filter
     26 ;-------------------------------------
     27 ; Filter the input and put the result in the output array.  Apply the 6 tap FIR filter with
     28 ; the output being a 2 byte value and the input being a 1 byte value.
        ;
        ; Each pass of the inner loop produces two adjacent output samples.  Every
        ; sample is rounded (+0x40), shifted right by 7 and saturated to 0..255
        ; before it is stored as a halfword.
        ;
        ; Successive samples of one input row are stored (output_width * 2 + 16)
        ; bytes apart; each new input row starts 2 bytes further into the output.
        ;
        ; Loop counter r7: upper halfword = rows remaining, low byte = inner-loop
        ; iterations remaining for the current row.
     29 |vp8_filter_block2d_first_pass_armv6| PROC
     30     stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes, so stack args start at [sp, #36]
     31 
     32     ldr     r11, [sp, #40]                  ; vp8_filter address
     33     ldr     r7, [sp, #36]                   ; output height
     34 
     35     sub     r2, r2, r3                      ; inside loop increments input array,
     36                                             ; so the height loop only needs to add
     37                                             ; r2 - width to the input pointer
     38 
     39     mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
     40     add     r12, r3, #16                    ; r12 = byte step between successive stores (width * 2 + 16)
     41     sub     sp, sp, #4                      ; reserve one word to hold the destination pointer
     42 
     43     ;;IF ARCHITECTURE=6
     44     ;pld        [r0, #-2]
     45     ;;pld       [r0, #30]
     46     ;;ENDIF
     47 
     48     ldr     r4, [r11]                       ; load up packed filter coefficients (three words,
     49     ldr     r5, [r11, #4]                   ; each used as a pair of 16-bit multiplicands
     50     ldr     r6, [r11, #8]                   ; by smuad/smlad below)
     51 
     52     str     r1, [sp]                        ; save destination pointer in the reserved stack slot
     53     mov     r7, r7, lsl #16                 ; height is top part of counter
     54 
     55 ; six tap filter
     56 |height_loop_1st_6|
     57     ldrb    r8, [r0, #-2]                   ; load source data: bytes at -2, -1 and 0,
     58     ldrb    r9, [r0, #-1]
     59     ldrb    r10, [r0], #2                   ; then advance the source pointer by 2
     60     orr     r7, r7, r3, lsr #2              ; construct loop counter: low part = (width * 2) / 4 iterations
     61 
     62 |width_loop_1st_6|
     63     ldrb    r11, [r0, #-1]                  ; next source byte (r0 has already been advanced by 2)
     64 
     65     pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
     66     pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
     67 
     68     ldrb    r9, [r0]
     69 
     70     smuad   lr, lr, r4                      ; apply the filter: lr accumulates the first output,
     71     pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
     72     smuad   r8, r8, r4                      ; r8 accumulates the second output
     73     pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
     74 
     75     smlad   lr, r10, r5, lr
     76     ldrb    r10, [r0, #1]
     77     smlad   r8, r11, r5, r8
     78     ldrb    r11, [r0, #2]
     79 
     80     sub     r7, r7, #1                      ; one inner iteration done (flags untouched here)
     81 
     82     pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
     83     pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
     84 
     85     smlad   lr, r9, r6, lr
     86     smlad   r11, r10, r6, r8                ; second output total ends up in r11
     87 
     88     ands    r10, r7, #0xff                  ; test loop counter; Z set when the row is finished
     89 
     90     add     lr, lr, #0x40                   ; round_shift_and_clamp
     91     ldrneb  r8, [r0, #-2]                   ; load data for next loop (only if the row continues)
     92     usat    lr, #8, lr, asr #7
     93     add     r11, r11, #0x40
     94     ldrneb  r9, [r0, #-1]
     95     usat    r11, #8, r11, asr #7
     96 
     97     strh    lr, [r1], r12                   ; result is transposed and stored, which
     98                                             ; will make second pass filtering easier.
     99     ldrneb  r10, [r0], #2
    100     strh    r11, [r1], r12
    101 
    102     bne     width_loop_1st_6                ; flags still come from the ands above
    103 
    104     ;;add       r9, r2, #30                 ; attempt to load 2 adjacent cache lines
    105     ;;IF ARCHITECTURE=6
    106     ;pld        [r0, r2]
    107     ;;pld       [r0, r9]
    108     ;;ENDIF
    109 
    110     ldr     r1, [sp]                        ; load and update dst address
    111     subs    r7, r7, #0x10000                ; one row done; Z set when no rows remain
    112     add     r0, r0, r2                      ; move to next input line
    113     add     r1, r1, #2                      ; move over to next column
    114     str     r1, [sp]
    115 
    116     bne     height_loop_1st_6
    117 
    118     add     sp, sp, #4                      ; release the destination pointer slot
    119     ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
    120 
    121     ENDP
    122 
    123 ;---------------------------------
    124 ; r0    short         *src_ptr,
    125 ; r1    unsigned char *output_ptr,
    126 ; r2    unsigned int output_pitch,
    127 ; r3    unsigned int cnt,
    128 ; stack const short *vp8_filter
    129 ;---------------------------------
        ; Six tap filter over 16-bit input samples, producing byte output.
        ;
        ; Each pass of the inner loop produces two outputs from seven consecutive
        ; input shorts and stores them output_pitch bytes apart.  Every output is
        ; rounded (+0x40), shifted right by 7 and saturated to 0..255.
        ;
        ; cnt drives both loops: the outer loop runs cnt times and the inner loop
        ; runs cnt / 2 times.
        ;
        ; Loop counter r7: upper halfword = outer iterations remaining, low byte =
        ; inner iterations remaining.
    130 |vp8_filter_block2d_second_pass_armv6| PROC
    131     stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes, so the stack arg is at [sp, #36]
    132 
    133     ldr     r11, [sp, #36]                  ; vp8_filter address
    134     sub     sp, sp, #4                      ; reserve one word to hold the destination pointer
    135     mov     r7, r3, lsl #16                 ; height is top part of counter
    136     str     r1, [sp]                        ; save destination pointer in the reserved stack slot
    137 
    138     ldr     r4, [r11]                       ; load up packed filter coefficients
    139     ldr     r5, [r11, #4]
    140     ldr     r6, [r11, #8]
    141 
    142     pkhbt   r12, r5, r4                     ; pack the filter differently: r12 = top(r4) | bottom(r5)
    143     pkhbt   r11, r6, r5                     ; r11 = top(r5) | bottom(r6); used for the second output
    144 
    145     sub     r0, r0, #4                      ; offset input buffer (back up by two shorts)
    146 
    147 |height_loop_2nd|
    148     ldr     r8, [r0]                        ; load the data (two shorts per word)
    149     ldr     r9, [r0, #4]
    150     orr     r7, r7, r3, lsr #1              ; loop counter: cnt / 2 inner iterations
    151 
    152 |width_loop_2nd|
    153     smuad   lr, r4, r8                      ; apply filter: lr accumulates the first output
    154     sub     r7, r7, #1                      ; one inner iteration done (flags untouched here)
    155     smulbt  r8, r4, r8                      ; bottom(r4) * top(r8): start of the second output
    156 
    157     ldr     r10, [r0, #8]
    158 
    159     smlad   lr, r5, r9, lr
    160     smladx  r8, r12, r9, r8                 ; cross multiply with the repacked coefficients
    161 
    162     ldrh    r9, [r0, #12]                   ; seventh short, needed only by the second output
    163 
    164     smlad   lr, r6, r10, lr
    165     smladx  r8, r11, r10, r8
    166 
    167     add     r0, r0, #4                      ; advance the source by two shorts
    168     smlatb  r10, r6, r9, r8                 ; top(r6) * bottom(r9) completes the second output
    169 
    170     add     lr, lr, #0x40                   ; round_shift_and_clamp
    171     ands    r8, r7, #0xff                   ; test loop counter; Z set when the inner loop is finished
    172     usat    lr, #8, lr, asr #7
    173     add     r10, r10, #0x40
    174     strb    lr, [r1], r2                    ; the result is transposed back and stored
    175     usat    r10, #8, r10, asr #7
    176 
    177     ldrne   r8, [r0]                        ; load data for next loop
    178     ldrne   r9, [r0, #4]
    179     strb    r10, [r1], r2
    180 
    181     bne     width_loop_2nd                  ; flags still come from the ands above
    182 
    183     ldr     r1, [sp]                        ; update dst for next loop
    184     subs    r7, r7, #0x10000                ; one outer iteration done; Z set when none remain
    185     add     r0, r0, #16                     ; update src for next loop
    186     add     r1, r1, #1                      ; next output starts one byte further on
    187     str     r1, [sp]
    188 
    189     bne     height_loop_2nd
    190 
    191     add     sp, sp, #4                      ; release the destination pointer slot
    192     ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
    193 
    194     ENDP
    195 
    196 ;---------------------------------
    197 ; r0    short         *src_ptr,
    198 ; r1    unsigned char *output_ptr,
    199 ; r2    unsigned int output_pitch,
    200 ; r3    unsigned int cnt,
    201 ; stack const short *vp8_filter
    202 ;---------------------------------
        ; Variant of the second pass that multiplies only against the two
        ; repacked coefficient words (r12 and r11), i.e. four coefficients.
        ;
        ; Each pass of the inner loop produces two outputs and stores them
        ; output_pitch bytes apart.  The rounding constant 0x40 is supplied as the
        ; initial accumulator; results are shifted right by 7 and saturated to 0..255.
        ;
        ; Loop counter r7: upper halfword = outer iterations remaining, low byte =
        ; inner iterations remaining (cnt / 2).
    203 |vp8_filter4_block2d_second_pass_armv6| PROC
    204     stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes, so the stack arg is at [sp, #36]
    205 
    206     ldr     r11, [sp, #36]                  ; vp8_filter address
    207     mov     r7, r3, lsl #16                 ; height is top part of counter
    208 
    209     ldr     r4, [r11]                       ; load up packed filter coefficients
    210     add     lr, r1, r3                      ; save final destination pointer (output_ptr + cnt)
    211     ldr     r5, [r11, #4]
    212     ldr     r6, [r11, #8]
    213 
    214     pkhbt   r12, r5, r4                     ; pack the filter differently: r12 = top(r4) | bottom(r5)
    215     pkhbt   r11, r6, r5                     ; r11 = top(r5) | bottom(r6)
    216     mov     r4, #0x40                       ; rounding factor (for smlad{x}); r4/r5/r6 are free for reuse now
    217 
    218 |height_loop_2nd_4|
    219     ldrd    r8, [r0, #-4]                   ; load the data: r8 = [r0 - 4], r9 = [r0]
    220     orr     r7, r7, r3, lsr #1              ; loop counter: cnt / 2 inner iterations
    221 
    222 |width_loop_2nd_4|
    223     ldr     r10, [r0, #4]!                  ; advance the source by 4 bytes and load the next word
    224     smladx  r6, r9, r12, r4                 ; apply filter: r6 accumulates the second output
    225     pkhbt   r8, r9, r8                      ; r8 = top(r8) | bottom(r9)
    226     smlad   r5, r8, r12, r4                 ; r5 accumulates the first output
    227     pkhbt   r8, r10, r9                     ; r8 = top(r9) | bottom(r10)
    228     smladx  r6, r10, r11, r6
    229     sub     r7, r7, #1                      ; one inner iteration done (flags untouched here)
    230     smlad   r5, r8, r11, r5
    231 
    232     mov     r8, r9                          ; shift the data for the next loop
    233     mov     r9, r10
    234 
    235     usat    r6, #8, r6, asr #7              ; shift and clamp
    236     usat    r5, #8, r5, asr #7
    237 
    238     strb    r5, [r1], r2                    ; the result is transposed back and stored
    239     tst     r7, #0xff                       ; Z set when the inner loop is finished
    240     strb    r6, [r1], r2
    241 
    242     bne     width_loop_2nd_4
    243 
    244     subs    r7, r7, #0x10000                ; one outer iteration done; Z set when none remain
    245     add     r0, r0, #16                     ; update src for next loop
    246     sub     r1, lr, r7, lsr #16             ; update dst for next loop: final pointer minus iterations left
    247 
    248     bne     height_loop_2nd_4
    249 
    250     ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
    251 
    252     ENDP
    253 
    254 ;------------------------------------
    255 ; r0    unsigned char *src_ptr
    256 ; r1    unsigned char *output_ptr,
    257 ; r2    unsigned int src_pixels_per_line
    258 ; r3    unsigned int cnt,
    259 ; stack unsigned int output_pitch,
    260 ; stack const short *vp8_filter
    261 ;------------------------------------
        ; Six tap filter along each source row, byte input to byte output, with no
        ; second pass: results are stored to consecutive bytes of the output row.
        ;
        ; cnt drives both loops: the outer loop runs cnt times and the inner loop
        ; runs cnt / 2 times, producing two outputs per iteration.
        ;
        ; The rounding constant 0x40 is kept in r2 and supplied as the initial
        ; accumulator; results are shifted right by 7 and saturated to 0..255.
        ;
        ; Stack locals: [sp] = output_pitch - cnt, [sp, #4] = src_pixels_per_line - cnt.
    262 |vp8_filter_block2d_first_pass_only_armv6| PROC
    263     stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes, so stack args start at [sp, #36]
    264 
    265     ldr     r4, [sp, #36]                   ; output pitch
    266     ldr     r11, [sp, #40]                  ; HFilter address
    267     sub     sp, sp, #8                      ; reserve two words for the row adjustments
    268 
    269     mov     r7, r3                          ; r7 = outer loop counter
    270     sub     r2, r2, r3                      ; inside loop increments input array,
    271                                             ; so the height loop only needs to add
    272                                             ; r2 - width to the input pointer
    273 
    274     sub     r4, r4, r3                      ; same adjustment for the output pointer
    275     str     r4, [sp]                        ; save modified output pitch
    276     str     r2, [sp, #4]                    ; save modified source stride
    277 
    278     mov     r2, #0x40                       ; r2 now holds the rounding constant
    279 
    280     ldr     r4, [r11]                       ; load up packed filter coefficients
    281     ldr     r5, [r11, #4]
    282     ldr     r6, [r11, #8]
    283 
    284 ; six tap filter
    285 |height_loop_1st_only_6|
    286     ldrb    r8, [r0, #-2]                   ; load data: bytes at -2, -1 and 0,
    287     ldrb    r9, [r0, #-1]
    288     ldrb    r10, [r0], #2                   ; then advance the source pointer by 2
    289 
    290     mov     r12, r3, lsr #1                 ; loop counter: cnt / 2 inner iterations
    291 
    292 |width_loop_1st_only_6|
    293     ldrb    r11, [r0, #-1]                  ; next source byte (r0 has already been advanced by 2)
    294 
    295     pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
    296     pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
    297 
    298     ldrb    r9, [r0]
    299 
    300 ;;  smuad   lr, lr, r4
    301     smlad   lr, lr, r4, r2                  ; first output, rounding constant folded in
    302     pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
    303 ;;  smuad   r8, r8, r4
    304     smlad   r8, r8, r4, r2                  ; second output, rounding constant folded in
    305     pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
    306 
    307     smlad   lr, r10, r5, lr
    308     ldrb    r10, [r0, #1]
    309     smlad   r8, r11, r5, r8
    310     ldrb    r11, [r0, #2]
    311 
    312     subs    r12, r12, #1                    ; Z set when the row is finished
    313 
    314     pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
    315     pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
    316 
    317     smlad   lr, r9, r6, lr
    318     smlad   r10, r10, r6, r8                ; second output total ends up in r10
    319 
    320 ;;  add     lr, lr, #0x40                   ; round_shift_and_clamp
    321     ldrneb  r8, [r0, #-2]                   ; load data for next loop
    322     usat    lr, #8, lr, asr #7
    323 ;;  add     r10, r10, #0x40
    324     strb    lr, [r1], #1                    ; store the result
    325     usat    r10, #8, r10, asr #7
    326 
    327     ldrneb  r9, [r0, #-1]
    328     strb    r10, [r1], #1
    329     ldrneb  r10, [r0], #2
    330 
    331     bne     width_loop_1st_only_6           ; flags still come from the subs above
    332 
    333     ;;add       r9, r2, #30                 ; attempt to load 2 adjacent cache lines
    334     ;;IF ARCHITECTURE=6
    335     ;pld        [r0, r2]
    336     ;;pld       [r0, r9]
    337     ;;ENDIF
    338 
    339     ldr     lr, [sp]                        ; load back modified output pitch
    340     ldr     r12, [sp, #4]                   ; load back modified source stride
    341     subs    r7, r7, #1                      ; one row done; Z set when no rows remain
    342     add     r0, r0, r12                     ; update src for next loop
    343     add     r1, r1, lr                      ; update dst for next loop
    344 
    345     bne     height_loop_1st_only_6
    346 
    347     add     sp, sp, #8                      ; release the two stack locals
    348     ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
    349     ENDP  ; |vp8_filter_block2d_first_pass_only_armv6|
    350 
    351 
    352 ;------------------------------------
    353 ; r0    unsigned char *src_ptr,
    354 ; r1    unsigned char *output_ptr,
    355 ; r2    unsigned int src_pixels_per_line
    356 ; r3    unsigned int cnt,
    357 ; stack unsigned int output_pitch,
    358 ; stack const short *vp8_filter
    359 ;------------------------------------
        ; Six tap filter applied down each column, byte input to byte output, with
        ; no first pass.  The source is walked in steps of src_pixels_per_line and
        ; the results are stored in steps of output_pitch.
        ;
        ; The outer loop visits one column per iteration (cnt columns); the inner
        ; loop produces two outputs per iteration.  Every output is rounded
        ; (+0x40), shifted right by 7 and saturated to 0..255.
        ;
        ; Loop counter r7: upper halfword = columns remaining, low byte = outputs
        ; remaining in the current column (decremented by 2 per iteration).
        ;
        ; Stack locals: [sp] = source pointer for the current column,
        ; [sp, #4] = destination pointer for the current column.
    360 |vp8_filter_block2d_second_pass_only_armv6| PROC
    361     stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes, so stack args start at [sp, #36]
    362 
    363     ldr     r11, [sp, #40]                  ; VFilter address
    364     ldr     r12, [sp, #36]                  ; output pitch
    365 
    366     mov     r7, r3, lsl #16                 ; height is top part of counter
    367     sub     r0, r0, r2, lsl #1              ; need 6 elements for filtering, 2 before, 3 after
    368 
    369     sub     sp, sp, #8                      ; reserve two words for the column pointers
    370 
    371     ldr     r4, [r11]                       ; load up packed filter coefficients
    372     ldr     r5, [r11, #4]
    373     ldr     r6, [r11, #8]
    374 
    375     str     r0, [sp]                        ; save r0 to stack
    376     str     r1, [sp, #4]                    ; save dst to stack
    377 
    378 ; six tap filter
    379 |width_loop_2nd_only_6|
    380     ldrb    r8, [r0], r2                    ; load data: first three bytes of the column
    381     orr     r7, r7, r3                      ; loop counter: cnt outputs in this column
    382     ldrb    r9, [r0], r2
    383     ldrb    r10, [r0], r2
    384 
    385 |height_loop_2nd_only_6|
    386     ; filter the first column in this inner loop, then move to the next column.
    387     ldrb    r11, [r0], r2
    388 
    389     pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
    390     pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
    391 
    392     ldrb    r9, [r0], r2
    393 
    394     smuad   lr, lr, r4                      ; lr accumulates the first output
    395     pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
    396     smuad   r8, r8, r4                      ; r8 accumulates the second output
    397     pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
    398 
    399     smlad   lr, r10, r5, lr
    400     ldrb    r10, [r0], r2
    401     smlad   r8, r11, r5, r8
    402     ldrb    r11, [r0]                       ; seventh byte; no pointer update on this load
    403 
    404     sub     r7, r7, #2                      ; two outputs done (flags untouched here)
    405     sub     r0, r0, r2, lsl #2              ; rewind four rows so the next pass starts two rows down
    406 
    407     pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
    408     pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
    409 
    410     smlad   lr, r9, r6, lr
    411     smlad   r10, r10, r6, r8                ; second output total ends up in r10
    412 
    413     ands    r9, r7, #0xff                   ; test loop counter; Z set when the column is finished
    414 
    415     add     lr, lr, #0x40                   ; round_shift_and_clamp
    416     ldrneb  r8, [r0], r2                    ; load data for next loop
    417     usat    lr, #8, lr, asr #7
    418     add     r10, r10, #0x40
    419     strb    lr, [r1], r12                   ; store the result for the column
    420     usat    r10, #8, r10, asr #7
    421 
    422     ldrneb  r9, [r0], r2
    423     strb    r10, [r1], r12
    424     ldrneb  r10, [r0], r2
    425 
    426     bne     height_loop_2nd_only_6          ; flags still come from the ands above
    427 
    428     ldr     r0, [sp]                        ; reload the column's starting src and dst pointers
    429     ldr     r1, [sp, #4]
    430     subs    r7, r7, #0x10000                ; one column done; Z set when no columns remain
    431     add     r0, r0, #1                      ; move to filter next column
    432     str     r0, [sp]
    433     add     r1, r1, #1
    434     str     r1, [sp, #4]
    435 
    436     bne     width_loop_2nd_only_6
    437 
    438     add     sp, sp, #8                      ; release the two stack locals
    439 
    440     ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
    441     ENDP  ; |vp8_filter_block2d_second_pass_only_armv6|
    442 
    443     END
    444