Home | History | Annotate | Download | only in armv6
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     EXPORT  |vp8_sixtap_predict8x4_armv6|    ; make the routine's symbol visible outside this file
     13 
     14     AREA    |.text|, CODE, READONLY  ; name this block of code; it is a read-only code area
     15 ;-------------------------------------
     16 ; r0    unsigned char *src_ptr,          top-left pixel of the 8x4 block; rows -2..+6 are read, and columns -2..+10 when xoffset != 0
     17 ; r1    int  src_pixels_per_line,        source stride in bytes
     18 ; r2    int  xoffset,                    horizontal filter index into filter8_coeff (0 = no horizontal filtering)
     19 ; r3    int  yoffset,                    vertical filter index into filter8_coeff (0 = no vertical filtering)
     20 ; stack unsigned char *dst_ptr,          destination of the 8x4 result
     21 ; stack int  dst_pitch                   destination stride in bytes
     22 ;-------------------------------------
     23 ;note: The first pass stores its 16-bit results transposed on the stack (8 lines x 9 columns). 184 bytes are reserved: 4 for yoffset, 180 for the buffer.
     24 ;Each buffer line is 20 bytes: 9 halfwords plus 2 bytes of padding to keep lines 4-byte aligned. The second pass loads the data from the stack,
     25 ;filters along each line, and stores the result transposed back, so it lands in dst in normal row order.
     26 |vp8_sixtap_predict8x4_armv6| PROC
     27     stmdb       sp!, {r4 - r11, lr}         ;save r4-r11 and lr (9 words = 36 bytes)
     28     str         r3, [sp, #-184]!            ;reserve space on stack for temporary storage, store yoffset at the new sp
     29 
     30     cmp         r2, #0                      ;skip first_pass filter if xoffset=0
     31     add         lr, sp, #4                  ;point to temporary buffer (just above the saved yoffset)
     32     beq         skip_firstpass_filter
     33 
     34 ;first-pass filter: horizontal 6-tap filter over 9 source rows, 8 pixels per row
     35     adr         r12, filter8_coeff          ;r12 = base of the coefficient table
     36     sub         r0, r0, r1, lsl #1          ;start two rows above the block
     37 
     38     add         r3, r1, #10                 ; preload next row
     39     pld         [r0, r3]
     40 
     41     add         r2, r12, r2, lsl #4         ;calculate filter location (16 bytes per table entry)
     42     add         r0, r0, #3                  ;adjust src only for loading convenience
     43 
     44     ldr         r3, [r2]                    ; load up packed filter coefficients: r3 = taps 0,1
     45     ldr         r4, [r2, #4]                ; r4 = taps 2,3
     46     ldr         r5, [r2, #8]                ; r5 = taps 4,5
     47 
     48     mov         r2, #0x90000                ; height=9 is top part of counter (bits 16 and up)
     49 
     50     sub         r1, r1, #8                  ; r0 advances 8 bytes per row, so r1 = stride - 8 steps to the next row
     51 
     52 |first_pass_hloop_v6|
     53     ldrb        r6, [r0, #-5]               ; load source data: r6..r10 = row pixels at columns -2..+2
     54     ldrb        r7, [r0, #-4]
     55     ldrb        r8, [r0, #-3]
     56     ldrb        r9, [r0, #-2]
     57     ldrb        r10, [r0, #-1]
     58 
     59     orr         r2, r2, #0x4                ; construct loop counter. width=8=4x2 (4 iterations, 2 pixels each)
     60 
     61     pkhbt       r6, r6, r7, lsl #16         ; r7 | r6   (high halfword | low halfword)
     62     pkhbt       r7, r7, r8, lsl #16         ; r8 | r7
     63 
     64     pkhbt       r8, r8, r9, lsl #16         ; r9 | r8
     65     pkhbt       r9, r9, r10, lsl #16        ; r10 | r9
     66 
     67 |first_pass_wloop_v6|
     68     smuad       r11, r6, r3                 ; vp8_filter[0], vp8_filter[1]; r11 accumulates the even output pixel
     69     smuad       r12, r7, r3                 ; same taps for the odd output pixel, accumulated in r12
     70 
     71     ldrb        r6, [r0], #1                ; next source byte (3 to the right of the even pixel)
     72 
     73     smlad       r11, r8, r4, r11            ; vp8_filter[2], vp8_filter[3]
     74     ldrb        r7, [r0], #1                ; next source byte (4 to the right of the even pixel)
     75     smlad       r12, r9, r4, r12
     76 
     77     pkhbt       r10, r10, r6, lsl #16       ; new r6 | r10  (pixels +3 | +2)
     78     pkhbt       r6, r6, r7, lsl #16         ; new r7 | new r6  (pixels +4 | +3)
     79     smlad       r11, r10, r5, r11           ; vp8_filter[4], vp8_filter[5]
     80     smlad       r12, r6, r5, r12
     81 
     82     sub         r2, r2, #1                  ; count this iteration (flags are not updated here)
     83 
     84     add         r11, r11, #0x40             ; round_shift_and_clamp
     85     tst         r2, #0xff                   ; test loop counter; these flags feed the movne/bne below
     86     usat        r11, #8, r11, asr #7        ; shift right by 7 and saturate to 0..255
     87     add         r12, r12, #0x40
     88     strh        r11, [lr], #20              ; result is transposed and stored: each pixel goes to the next 20-byte buffer line
     89     usat        r12, #8, r12, asr #7
     90 
     91     strh        r12, [lr], #20
     92 
     93     movne       r11, r6                     ; more pixels in this row: slide the source window along by two pixels,
     94     movne       r12, r7                     ; using r11/r12 as temporaries
     95 
     96     movne       r6, r8
     97     movne       r7, r9
     98     movne       r8, r10
     99     movne       r9, r11
    100     movne       r10, r12
    101 
    102     bne         first_pass_wloop_v6
    103 
    104     ;;add       r9, ppl, #30                ; attempt to load 2 adjacent cache lines
    105     ;;IF ARCHITECTURE=6
    106     ;pld        [src, ppl]
    107     ;;pld       [src, r9]
    108     ;;ENDIF
    109 
    110     subs        r2, r2, #0x10000            ; one row done; sets the flags for the bne at the end of the loop
    111 
    112     sub         lr, lr, #158                ; back 8 buffer lines (160) and forward one halfword (2): next column
    113 
    114     add         r0, r0, r1                  ; move to next input line
    115 
    116     add         r11, r1, #18                ; preload next row. adding back block width(=8), which is subtracted earlier
    117     pld         [r0, r11]
    118 
    119     bne         first_pass_hloop_v6
    120 
    121 ;second pass filter: 6-tap filter along each buffer line, 4 outputs per line
    122 secondpass_filter
    123     ldr         r3, [sp], #4                ; load back yoffset; sp now points at the temporary buffer
    124     ldr         r0, [sp, #216]              ; load dst address from stack 180+36
    125     ldr         r1, [sp, #220]              ; load dst stride from stack 180+40
    126 
    127     cmp         r3, #0
    128     beq         skip_secondpass_filter      ; yoffset=0: no vertical filtering
    129 
    130     adr         r12, filter8_coeff
    131     add         lr, r12, r3, lsl #4         ;calculate filter location (16 bytes per table entry)
    132 
    133     mov         r2, #0x00080000             ; 8 buffer lines to process, counted in the top part of r2
    134 
    135     ldr         r3, [lr]                    ; load up packed filter coefficients
    136     ldr         r4, [lr, #4]
    137     ldr         r5, [lr, #8]
    138 
    139     pkhbt       r12, r4, r3                 ; pack the filter differently: r12 = tap1 | tap2 (high | low)
    140     pkhbt       r11, r5, r4                 ; r11 = tap3 | tap4 (high | low)
    141 
    142 second_pass_hloop_v6
    143     ldr         r6, [sp]                    ; load the data: first two halfwords of the buffer line
    144     ldr         r7, [sp, #4]                ; next two halfwords
    145 
    146     orr         r2, r2, #2                  ; loop counter: 2 iterations, 2 outputs each
    147 
    148 second_pass_wloop_v6
    149     smuad       lr, r3, r6                  ; apply filter: lr accumulates the first output (taps 0,1)
    150     smulbt      r10, r3, r6                 ; r10 accumulates the second output, one halfword further on (tap 0)
    151 
    152     ldr         r8, [sp, #8]                ; fifth and sixth halfwords of the current window
    153 
    154     smlad       lr, r4, r7, lr              ; first output: taps 2,3
    155     smladx      r10, r12, r7, r10           ; second output: taps 1,2 (crossed multiply)
    156 
    157     ldrh        r9, [sp, #12]               ; seventh halfword, used only by the second output
    158 
    159     smlad       lr, r5, r8, lr              ; first output: taps 4,5
    160     smladx      r10, r11, r8, r10           ; second output: taps 3,4 (crossed multiply)
    161 
    162     add         sp, sp, #4                  ; step two halfwords along the buffer line
    163     smlatb      r10, r5, r9, r10            ; second output: tap 5
    164 
    165     sub         r2, r2, #1                  ; count this iteration (flags are not updated here)
    166 
    167     add         lr, lr, #0x40               ; round_shift_and_clamp
    168     tst         r2, #0xff                   ; test loop counter; these flags feed the movne/bne below
    169     usat        lr, #8, lr, asr #7          ; shift right by 7 and saturate to 0..255
    170     add         r10, r10, #0x40
    171     strb        lr, [r0], r1                ; the result is transposed back and stored (r0 steps down one dst row)
    172     usat        r10, #8, r10, asr #7
    173 
    174     strb        r10, [r0],r1
    175 
    176     movne       r6, r7                      ; reuse the words already loaded for the next two outputs
    177     movne       r7, r8
    178 
    179     bne         second_pass_wloop_v6
    180 
    181     subs        r2, r2, #0x10000            ; one buffer line done; sets the flags for the bne below
    182     add         sp, sp, #12                 ; update src for next loop (20-8)
    183     sub         r0, r0, r1, lsl #2          ; back up the four dst rows just written
    184     add         r0, r0, #1                  ; move to the next dst column
    185 
    186     bne         second_pass_hloop_v6
    187 
    188     add         sp, sp, #20                 ; release the rest of the temporary buffer (180-160)
    189     ldmia       sp!, {r4 - r11, pc}         ; restore saved registers and return
    190 
    191 ;-------------------- xoffset=0: copy 9 source rows, widened to halfwords, into the transposed buffer unfiltered
    192 skip_firstpass_filter
    193     sub         r0, r0, r1, lsl #1          ; start two rows above the block
    194     sub         r1, r1, #8                  ; r0 advances 8 bytes per row, so r1 = stride - 8 steps to the next row
    195     mov         r2, #9                      ; 9 rows to copy
    196 
    197 skip_firstpass_hloop
    198     ldrb        r4, [r0], #1                ; load data
    199     subs        r2, r2, #1                  ; sets the flags for the bne at the end of the loop
    200     ldrb        r5, [r0], #1
    201     strh        r4, [lr], #20               ; store it to intermediate buffer, one 20-byte line per pixel
    202     ldrb        r6, [r0], #1                ; load data
    203     strh        r5, [lr], #20
    204     ldrb        r7, [r0], #1
    205     strh        r6, [lr], #20
    206     ldrb        r8, [r0], #1
    207     strh        r7, [lr], #20
    208     ldrb        r9, [r0], #1
    209     strh        r8, [lr], #20
    210     ldrb        r10, [r0], #1
    211     strh        r9, [lr], #20
    212     ldrb        r11, [r0], #1
    213     strh        r10, [lr], #20
    214     add         r0, r0, r1                  ; move to next input line
    215     strh        r11, [lr], #20
    216 
    217     sub         lr, lr, #158                ; move over to next column (back 160, forward 2)
    218     bne         skip_firstpass_hloop
    219 
    220     b           secondpass_filter
    221 
    222 ;-------------------- yoffset=0: write the buffer back to dst as bytes, unfiltered
    223 skip_secondpass_filter
    224     mov         r2, #8                      ; 8 buffer lines, one per dst column
    225     add         sp, sp, #4                  ;start from src[0] instead of src[-2]
    226 
    227 skip_secondpass_hloop
    228     ldr         r6, [sp], #4                ; two packed halfwords
    229     subs        r2, r2, #1                  ; sets the flags for the bne at the end of the loop
    230     ldr         r8, [sp], #4                ; next two packed halfwords
    231 
    232     mov         r7, r6, lsr #16             ; unpack
    233     strb        r6, [r0], r1                ; strb writes only the low byte; r0 steps down one dst row
    234     mov         r9, r8, lsr #16
    235     strb        r7, [r0], r1
    236     add         sp, sp, #12                 ; 20-8
    237     strb        r8, [r0], r1
    238     strb        r9, [r0], r1
    239 
    240     sub         r0, r0, r1, lsl #2          ; back up the four dst rows just written
    241     add         r0, r0, #1                  ; move to the next dst column
    242 
    243     bne         skip_secondpass_hloop
    244 
    245     add         sp, sp, #16                 ; 180 - (160 +4)
    246 
    247     ldmia       sp!, {r4 - r11, pc}         ; restore saved registers and return
    248 
    249     ENDP
    250 
    251 ;-----------------
    252 ;Packed six-tap filter coefficients. Each filter occupies four words (16 bytes): three words holding two signed 16-bit taps each, then one zero word.
    253 ;Data address: filter8_coeff + 16*offset; the pairs (tap0,tap1), (tap2,tap3), (tap4,tap5) are at +0, +4, +8, lower-numbered tap in the low halfword.
    254 filter8_coeff
    255     DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000
    256     DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000
    257     DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000
    258     DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000
    259     DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000
    260     DCD     0xfffa0000,     0x005d0032,     0x0000fff7,     0x00000000
    261     DCD     0xfff80001,     0x006c0024,     0x0002fff5,     0x00000000
    262     DCD     0xffff0000,     0x007b000c,     0x0000fffa,     0x00000000
    263 ;The same taps written out unpacked (tap0..tap5, one filter per line), kept for reference:
    264     ;DCD        0,  0,  128,    0,   0,  0
    265     ;DCD        0, -6,  123,   12,  -1,  0
    266     ;DCD        2, -11, 108,   36,  -8,  1
    267     ;DCD        0, -9,   93,   50,  -6,  0
    268     ;DCD        3, -16,  77,   77, -16,  3
    269     ;DCD        0, -6,   50,   93,  -9,  0
    270     ;DCD        1, -8,   36,  108, -11,  2
    271     ;DCD        0, -1,   12,  123,  -6,  0
    272 
    273     END
    274