Home | History | Annotate | Download | only in armv6
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     EXPORT  |vp8_sixtap_predict8x4_armv6|
     13 
     14     AREA    |.text|, CODE, READONLY  ; name this block of code
     15 ;-------------------------------------
     16 ; r0    unsigned char *src_ptr,          (source pixels; lines from 2 above src_ptr are read)
     17 ; r1    int  src_pixels_per_line,        (byte step between source lines)
     18 ; r2    int  xoffset,                    (filter8_coeff row for the first pass; 0 skips the first-pass filter)
     19 ; r3    int  yoffset,                    (filter8_coeff row for the second pass; 0 skips the second-pass filter)
     20 ; stack unsigned char *dst_ptr,          (destination, 8 columns x 4 lines are written)
     21 ; stack int  dst_pitch                   (byte step between destination lines)
     22 ;-------------------------------------
     23 ;Note: the first pass stores its 16-bit results transposed on the stack as 8 lines x 9 entries (one line per
     24 ;pixel column, one entry per source line). Line width is 20 bytes: 9 halfwords plus 2 bytes of padding to keep
     25 ;lines 4-byte aligned. 184 bytes are reserved; the word at the lowest address holds yoffset. The second pass reads this buffer through sp and stores the result transposed back into dst.
     26 |vp8_sixtap_predict8x4_armv6| PROC
     27     stmdb       sp!, {r4 - r11, lr}         ; save r4-r11 and lr (9 words, 36 bytes)
     28     str         r3, [sp, #-184]!            ;reserve 184 bytes of stack for temporary storage, store yoffset at the new sp
     29 
     30     cmp         r2, #0                      ;skip first_pass filter if xoffset=0
     31     add         lr, sp, #4                  ;point to temporary buffer (just above the saved yoffset); add leaves the cmp flags intact
     32     beq         skip_firstpass_filter
     33 
     34 ;first-pass filter: 6-tap filter along each of 9 source lines, results stored transposed
     35     ldr         r12, _filter8_coeff_        ; r12 = address of filter8_coeff
     36     sub         r0, r0, r1, lsl #1          ; start two source lines above src_ptr
     37 
     38     add         r2, r12, r2, lsl #4         ;calculate filter location (16 bytes per xoffset step)
     39     add         r0, r0, #3                  ;adjust src only for loading convenience
     40 
     41     ldr         r3, [r2]                    ; load up packed filter coefficients (two 16-bit taps per word)
     42     ldr         r4, [r2, #4]
     43     ldr         r5, [r2, #8]
     44 
     45     mov         r2, #0x90000                ; height=9 is top part of counter
     46 
     47     sub         r1, r1, #8                  ; r0 advances 8 bytes per line below, so r1 = remaining step to the next line
     48 
     49 |first_pass_hloop_v6|
     50     ldrb        r6, [r0, #-5]               ; load source data: the 5 bytes at r0-5 .. r0-1
     51     ldrb        r7, [r0, #-4]
     52     ldrb        r8, [r0, #-3]
     53     ldrb        r9, [r0, #-2]
     54     ldrb        r10, [r0, #-1]
     55 
     56     orr         r2, r2, #0x4                ; construct loop counter. width=8=4x2 (low byte = 4 iterations, 2 pixels each)
     57 
     58     pkhbt       r6, r6, r7, lsl #16         ; r7 | r6  (high halfword | low halfword)
     59     pkhbt       r7, r7, r8, lsl #16         ; r8 | r7
     60 
     61     pkhbt       r8, r8, r9, lsl #16         ; r9 | r8
     62     pkhbt       r9, r9, r10, lsl #16        ; r10 | r9
     63 
     64 |first_pass_wloop_v6|
     65     smuad       r11, r6, r3                 ; vp8_filter[0], vp8_filter[1]  (r11 accumulates the first pixel of the pair)
     66     smuad       r12, r7, r3                 ; r12 accumulates the second pixel, using inputs one byte further on
     67 
     68     ldrb        r6, [r0], #1                ; next source byte, r0 post-incremented
     69 
     70     smlad       r11, r8, r4, r11            ; vp8_filter[2], vp8_filter[3]
     71     ldrb        r7, [r0], #1
     72     smlad       r12, r9, r4, r12
     73 
     74     pkhbt       r10, r10, r6, lsl #16       ; r6 | r10
     75     pkhbt       r6, r6, r7, lsl #16         ; r7 | r6
     76     smlad       r11, r10, r5, r11           ; vp8_filter[4], vp8_filter[5]
     77     smlad       r12, r6, r5, r12
     78 
     79     sub         r2, r2, #1                  ; one pixel pair done
     80 
     81     add         r11, r11, #0x40             ; round_shift_and_clamp: add 0x40, then >>7 and saturate to 8 bits below
     82     tst         r2, #0xff                   ; test loop counter; these flags are consumed by the movne/bne below
     83     usat        r11, #8, r11, asr #7
     84     add         r12, r12, #0x40
     85     strh        r11, [lr], #20              ; result is transposed and stored: successive pixels of a source line are 20 bytes apart
     86     usat        r12, #8, r12, asr #7
     87 
     88     strh        r12, [lr], #20
     89 
     90     movne       r11, r6                     ; not the last pair: keep the two newest inputs while the window slides
     91     movne       r12, r7
     92 
     93     movne       r6, r8                      ; slide the input window on by two pixels
     94     movne       r7, r9
     95     movne       r8, r10
     96     movne       r9, r11
     97     movne       r10, r12
     98 
     99     bne         first_pass_wloop_v6
    100 
    101     ;;add       r9, ppl, #30                ; attempt to load 2 adjacent cache lines
    102     ;;IF ARCHITECTURE=6
    103     ;pld        [src, ppl]
    104     ;;pld       [src, r9]
    105     ;;ENDIF
    106 
    107     subs        r2, r2, #0x10000            ; one source line done; Z is set when the line count reaches 0
    108 
    109     sub         lr, lr, #158                ; rewind the 8 stores (8*20 = 160 bytes) and step 2 bytes to the next halfword slot
    110 
    111     add         r0, r0, r1                  ; move to next input line
    112 
    113     bne         first_pass_hloop_v6         ; uses the flags from the subs above (sub/add in between do not set flags)
    114 
    115 ;second pass filter: 6-tap filter along each 20-byte line of the temporary buffer
    116 secondpass_filter
    117     ldr         r3, [sp], #4                ; load back yoffset; sp now points at the temporary buffer and is used as the read pointer
    118     ldr         r0, [sp, #216]              ; load dst address from stack 180+36 (180 bytes still reserved + 36 bytes of saved registers)
    119     ldr         r1, [sp, #220]              ; load dst stride from stack 180+40
    120 
    121     cmp         r3, #0
    122     beq         skip_secondpass_filter      ; yoffset=0: copy the buffer to dst without filtering
    123 
    124     ldr         r12, _filter8_coeff_
    125     add         lr, r12, r3, lsl #4         ;calculate filter location
    126 
    127     mov         r2, #0x00080000             ; 8 buffer lines to process, count kept in the top part of r2
    128 
    129     ldr         r3, [lr]                    ; load up packed filter coefficients
    130     ldr         r4, [lr, #4]
    131     ldr         r5, [lr, #8]
    132 
    133     pkhbt       r12, r4, r3                 ; pack the filter differently: r12 = high half of r3 | low half of r4
    134     pkhbt       r11, r5, r4                 ; r11 = high half of r4 | low half of r5
    135 
    136 second_pass_hloop_v6
    137     ldr         r6, [sp]                    ; load the data: first two halfword entries of this buffer line
    138     ldr         r7, [sp, #4]
    139 
    140     orr         r2, r2, #2                  ; loop counter: 2 iterations, 2 outputs each
    141 
    142 second_pass_wloop_v6
    143     smuad       lr, r3, r6                  ; apply filter (lr accumulates the first output of the pair)
    144     smulbt      r10, r3, r6                 ; r10 accumulates the second output, starting one entry later (low half of r3 x high half of r6)
    145 
    146     ldr         r8, [sp, #8]
    147 
    148     smlad       lr, r4, r7, lr
    149     smladx      r10, r12, r7, r10           ; crossed multiply with the re-packed taps in r12
    150 
    151     ldrh        r9, [sp, #12]               ; one extra entry, needed only by the second output
    152 
    153     smlad       lr, r5, r8, lr
    154     smladx      r10, r11, r8, r10
    155 
    156     add         sp, sp, #4                  ; advance the read pointer by two entries
    157     smlatb      r10, r5, r9, r10            ; high half of r5 x low half of r9
    158 
    159     sub         r2, r2, #1
    160 
    161     add         lr, lr, #0x40               ; round_shift_and_clamp
    162     tst         r2, #0xff                   ; flags consumed by the movne/bne below
    163     usat        lr, #8, lr, asr #7
    164     add         r10, r10, #0x40
    165     strb        lr, [r0], r1                ; the result is transposed back and stored (each store steps one dst line down)
    166     usat        r10, #8, r10, asr #7
    167 
    168     strb        r10, [r0],r1
    169 
    170     movne       r6, r7                      ; reuse the entries already loaded for the next pair
    171     movne       r7, r8
    172 
    173     bne         second_pass_wloop_v6
    174 
    175     subs        r2, r2, #0x10000            ; one buffer line done; Z is set when the count reaches 0
    176     add         sp, sp, #12                 ; update src for next loop (20-8)
    177     sub         r0, r0, r1, lsl #2          ; back up the 4 dst lines just written
    178     add         r0, r0, #1                  ; move to the next dst column
    179 
    180     bne         second_pass_hloop_v6        ; uses the flags from the subs above
    181 
    182     add         sp, sp, #20                 ; release the rest of the reservation: 184 - 4 - 8*20
    183     ldmia       sp!, {r4 - r11, pc}         ; restore registers and return
    184 
    185 ;--------------------
    186 skip_firstpass_filter
    187     sub         r0, r0, r1, lsl #1          ; start two source lines above src_ptr
    188     sub         r1, r1, #8                  ; r0 advances 8 bytes per line below
    189     mov         r2, #9                      ; 9 source lines to copy
    190 
    191 skip_firstpass_hloop
    192     ldrb        r4, [r0], #1                ; load data
    193     subs        r2, r2, #1                  ; flags consumed by the bne at the end of the loop
    194     ldrb        r5, [r0], #1
    195     strh        r4, [lr], #20               ; store it to immediate buffer (as a halfword, transposed)
    196     ldrb        r6, [r0], #1                ; load data
    197     strh        r5, [lr], #20
    198     ldrb        r7, [r0], #1
    199     strh        r6, [lr], #20
    200     ldrb        r8, [r0], #1
    201     strh        r7, [lr], #20
    202     ldrb        r9, [r0], #1
    203     strh        r8, [lr], #20
    204     ldrb        r10, [r0], #1
    205     strh        r9, [lr], #20
    206     ldrb        r11, [r0], #1
    207     strh        r10, [lr], #20
    208     add         r0, r0, r1                  ; move to next input line
    209     strh        r11, [lr], #20
    210 
    211     sub         lr, lr, #158                ; move over to next column (rewind 160 bytes, step 2)
    212     bne         skip_firstpass_hloop
    213 
    214     b           secondpass_filter
    215 
    216 ;--------------------
    217 skip_secondpass_filter
    218     mov         r2, #8                      ; 8 buffer lines to copy
    219     add         sp, sp, #4                  ;start from src[0] instead of src[-2]
    220 
    221 skip_secondpass_hloop
    222     ldr         r6, [sp], #4                ; two halfword entries per load
    223     subs        r2, r2, #1                  ; flags consumed by the bne at the end of the loop
    224     ldr         r8, [sp], #4
    225 
    226     mov         r7, r6, lsr #16             ; unpack
    227     strb        r6, [r0], r1
    228     mov         r9, r8, lsr #16
    229     strb        r7, [r0], r1
    230     add         sp, sp, #12                 ; 20-8
    231     strb        r8, [r0], r1
    232     strb        r9, [r0], r1
    233 
    234     sub         r0, r0, r1, lsl #2          ; back up the 4 dst lines just written
    235     add         r0, r0, #1                  ; move to the next dst column
    236 
    237     bne         skip_secondpass_hloop
    238 
    239     add         sp, sp, #16                 ; 180 - (160 +4)
    240 
    241     ldmia       sp!, {r4 - r11, pc}         ; restore registers and return
    242 
    243     ENDP
    244 
    245 ;-----------------
    246 ;Packed filter table. _filter8_coeff_ is one word holding the address of filter8_coeff. Each table row is 4 words (16 bytes):
    247 ;three words carry six signed 16-bit taps, two per word (lower-index tap in the low halfword), and the fourth word is zero.
    248 _filter8_coeff_
    249     DCD     filter8_coeff                   ; address of the table below
    250 filter8_coeff
    251     DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000    ; 0,   0, 128,   0,   0, 0
    252     DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000    ; 0,  -6, 123,  12,  -1, 0
    253     DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000    ; 2, -11, 108,  36,  -8, 1
    254     DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000    ; 0,  -9,  93,  50,  -6, 0
    255     DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000    ; 3, -16,  77,  77, -16, 3
    256     DCD     0xfffa0000,     0x005d0032,     0x0000fff7,     0x00000000    ; 0,  -6,  50,  93,  -9, 0
    257     DCD     0xfff80001,     0x006c0024,     0x0002fff5,     0x00000000    ; 1,  -8,  36, 108, -11, 2
    258     DCD     0xffff0000,     0x007b000c,     0x0000fffa,     0x00000000    ; 0,  -1,  12, 123,  -6, 0
    259 ; The same taps as plain signed values, one row per table row above (kept for reference, not assembled):
    260     ;DCD        0,  0,  128,    0,   0,  0
    261     ;DCD        0, -6,  123,   12,  -1,  0
    262     ;DCD        2, -11, 108,   36,  -8,  1
    263     ;DCD        0, -9,   93,   50,  -6,  0
    264     ;DCD        3, -16,  77,   77, -16,  3
    265     ;DCD        0, -6,   50,   93,  -9,  0
    266     ;DCD        1, -8,   36,  108, -11,  2
    267     ;DCD        0, -1,   12,  123,  -6,  0
    268 
    269     END
    270