Home | History | Annotate | Download | only in neon
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     EXPORT |vp8cx_pack_tokens_into_partitions_armv7|
     13 
     14     INCLUDE vpx_vp8_enc_asm_offsets.asm
     15 
     16     ARM
     17     REQUIRE8
     18     PRESERVE8
     19 
     20     AREA    |.text|, CODE, READONLY
     21 
     22 ; r0 VP8_COMP *cpi
     23 ; r1 unsigned char *cx_data
     24 ; r2 int num_part
     25 ; r3 *size
     26 ; s0 vp8_coef_encodings
     27 ; s1 vp8_extra_bits,
     28 ; s2 const vp8_tree_index *,
     29 
     30 |vp8cx_pack_tokens_into_partitions_armv7| PROC
     31     push    {r4-r11, lr}
     32     sub     sp, sp, #44
     33 
     34     ; Compute address of cpi->common.mb_rows
     35     ldr     r4, _VP8_COMP_common_
     36     ldr     r6, _VP8_COMMON_MBrows_
     37     add     r4, r0, r4
     38 
     39     ldr     r5, [r4, r6]                ; load up mb_rows
     40 
     41     str     r5, [sp, #36]               ; save mb_rows
     42     str     r1, [sp, #24]               ; save cx_data
     43     str     r2, [sp, #20]               ; save num_part
     44     str     r3, [sp, #8]                ; save *size
     45 
     46     ; *size = 3*(num_part -1 );
     47     sub     r2, r2, #1                  ; num_part - 1
     48     add     r2, r2, r2, lsl #1          ; 3*(num_part - 1)
     49     str     r2, [r3]
     50 
     51     add     r2, r2, r1                  ; cx_data + *size
     52     str     r2, [sp, #40]               ; ptr
     53 
     54     ldr     r4, _VP8_COMP_tplist_
     55     add     r4, r0, r4
     56     ldr     r7, [r4, #0]                ; dereference cpi->tp_list
     57     str     r7, [sp, #32]               ; store start of cpi->tp_list
     58 
     59     ldr     r11, _VP8_COMP_bc2_         ; load up vp8_writer out of cpi
     60     add     r0, r0, r11
     61 
     62     mov     r11, #0
     63     str     r11, [sp, #28]              ; i
     64 
     65 numparts_loop
     66     ldr     r10, [sp, #40]              ; ptr
     67     ldr     r5,  [sp, #36]              ; move mb_rows to the counting section
     68     str     r5,  [sp, #12]
     69 
     70     ; Reset all of the VP8 Writer data for each partition that
     71     ; is processed.
     72     ; start_encode
     73     mov     r2, #0                      ; vp8_writer_lowvalue
     74     mov     r5, #255                    ; vp8_writer_range
     75     mvn     r3, #23                     ; vp8_writer_count
     76 
     77     str     r2,  [r0, #vp8_writer_value]
     78     str     r2,  [r0, #vp8_writer_pos]
     79     str     r10, [r0, #vp8_writer_buffer]
     80 
     81 mb_row_loop
     82 
     83     ldr     r1, [r7, #tokenlist_start]
     84     ldr     r9, [r7, #tokenlist_stop]
     85     str     r9, [sp, #0]                ; save stop for later comparison
     86     str     r7, [sp, #16]               ; tokenlist address for next time
     87 
     88     b       check_p_lt_stop
     89 
     90     ; actual work gets done here!
     91 
     92 while_p_lt_stop
     93     ldr     r6, [r1, #tokenextra_token] ; t
     94     ldr     r4, [sp, #80]               ; vp8_coef_encodings
     95     mov     lr, #0
     96     add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
     97     ldr     r9, [r1, #tokenextra_context_tree]   ; pp
     98 
     99     ldr     r7, [r1, #tokenextra_skip_eob_node]
    100 
    101     ldr     r6, [r4, #vp8_token_value]  ; v
    102     ldr     r8, [r4, #vp8_token_len]    ; n
    103 
    104     ; vp8 specific skip_eob_node
    105     cmp     r7, #0
    106     movne   lr, #2                      ; i = 2
    107     subne   r8, r8, #1                  ; --n
    108 
    109     ; reverse the stream of bits to be packed.  Normally
    110     ; the most significant bit is peeled off and compared
    111     ; in the form of (v >> --n) & 1.  ARM architecture has
    112     ; the ability to set a flag based on the value of the
    113     ; bit shifted off the bottom of the register.  To make
    114     ; that happen the bitstream is reversed.
    115     rbit    r12, r6
    116     rsb     r4, r8, #32                 ; 32-n
    117     ldr     r10, [sp, #88]              ; vp8_coef_tree
    118 
    119     ; v is kept in r12 during the token pack loop
    120     lsr     r12, r12, r4                ; v >>= 32 - n
    121 
    122 ; loop start
    123 token_loop
    124     ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
    125     sub     r7, r5, #1                  ; range-1
    126 
    127     ; Decisions are made based on the bit value shifted
    128     ; off of v, so set a flag here based on this.
    129     ; This value is refered to as "bb"
    130     lsrs    r12, r12, #1                ; bb = v >> n
    131     mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))
    132 
    133     ; bb can only be 0 or 1.  So only execute this statement
    134     ; if bb == 1, otherwise it will act like i + 0
    135     addcs   lr, lr, #1                  ; i + bb
    136 
    137     mov     r7, #1
    138     ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
    139     add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
    140 
    141     addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
    142     subcs   r4, r5, r4                  ; if  (bb) range = range-split
    143 
    144     ; Counting the leading zeros is used to normalize range.
    145     clz     r6, r4
    146     sub     r6, r6, #24                 ; shift
    147 
    148     ; Flag is set on the sum of count.  This flag is used later
    149     ; to determine if count >= 0
    150     adds    r3, r3, r6                  ; count += shift
    151     lsl     r5, r4, r6                  ; range <<= shift
    152     bmi     token_count_lt_zero         ; if(count >= 0)
    153 
    154     sub     r6, r6, r3                  ; offset = shift - count
    155     sub     r4, r6, #1                  ; offset-1
    156     lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
    157     bpl     token_high_bit_not_set
    158 
    159     ldr     r4, [r0, #vp8_writer_pos]   ; x
    160     sub     r4, r4, #1                  ; x = w->pos-1
    161     b       token_zero_while_start
    162 token_zero_while_loop
    163     mov     r10, #0
    164     strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
    165     sub     r4, r4, #1                  ; x--
    166 token_zero_while_start
    167     cmp     r4, #0
    168     ldrge   r7, [r0, #vp8_writer_buffer]
    169     ldrb    r11, [r7, r4]
    170     cmpge   r11, #0xff
    171     beq     token_zero_while_loop
    172 
    173     ldr     r7, [r0, #vp8_writer_buffer]
    174     ldrb    r10, [r7, r4]               ; w->buffer[x]
    175     add     r10, r10, #1
    176     strb    r10, [r7, r4]               ; w->buffer[x] + 1
    177 token_high_bit_not_set
    178     rsb     r4, r6, #24                 ; 24-offset
    179     ldr     r10, [r0, #vp8_writer_buffer]
    180     lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
    181     ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
    182     lsl     r2, r2, r6                  ; lowvalue <<= offset
    183     mov     r6, r3                      ; shift = count
    184     add     r11, r4, #1                 ; w->pos++
    185     bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
    186     str     r11, [r0, #vp8_writer_pos]
    187     sub     r3, r3, #8                  ; count -= 8
    188     strb    r7, [r10, r4]               ; w->buffer[w->pos++]
    189 
    190     ; r10 is used earlier in the loop, but r10 is used as
    191     ; temp variable here.  So after r10 is used, reload
    192     ; vp8_coef_tree_dcd into r10
    193     ldr     r10, [sp, #88]              ; vp8_coef_tree
    194 
    195 token_count_lt_zero
    196     lsl     r2, r2, r6                  ; lowvalue <<= shift
    197 
    198     subs    r8, r8, #1                  ; --n
    199     bne     token_loop
    200 
    201     ldr     r6, [r1, #tokenextra_token] ; t
    202     ldr     r7, [sp, #84]                ; vp8_extra_bits
    203     ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
    204     ;  element.  Here vp8_extra_bit_struct == 20
    205     add     r6, r6, r6, lsl #2          ; b = vp8_extra_bits + t
    206     add     r12, r7, r6, lsl #2         ; b = vp8_extra_bits + t
    207 
    208     ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
    209     cmp     r4, #0
    210     beq     skip_extra_bits
    211 
    212 ;   if( b->base_val)
    213     ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
    214     ldr     lr, [r1, #tokenextra_extra] ; e = p->Extra
    215     cmp     r8, #0                      ; if( L)
    216     beq     no_extra_bits
    217 
    218     ldr     r9, [r12, #vp8_extra_bit_struct_prob]
    219     asr     r7, lr, #1                  ; v=e>>1
    220 
    221     ldr     r10, [r12, #vp8_extra_bit_struct_tree]
    222     str     r10, [sp, #4]               ; b->tree
    223 
    224     rbit    r12, r7                     ; reverse v
    225     rsb     r4, r8, #32
    226     lsr     r12, r12, r4
    227 
    228     mov     lr, #0                      ; i = 0
    229 
    230 extra_bits_loop
    231     ldrb    r4, [r9, lr, asr #1]        ; pp[i>>1]
    232     sub     r7, r5, #1                  ; range-1
    233     lsrs    r12, r12, #1                ; v >> n
    234     mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
    235     addcs   lr, lr, #1                  ; i + bb
    236 
    237     mov     r7, #1
    238     ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
    239     add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
    240 
    241     addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
    242     subcs   r4, r5, r4                  ; if  (bb) range = range-split
    243 
    244     clz     r6, r4
    245     sub     r6, r6, #24
    246 
    247     adds    r3, r3, r6                  ; count += shift
    248     lsl     r5, r4, r6                  ; range <<= shift
    249     bmi     extra_count_lt_zero         ; if(count >= 0)
    250 
    251     sub     r6, r6, r3                  ; offset= shift - count
    252     sub     r4, r6, #1                  ; offset-1
    253     lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
    254     bpl     extra_high_bit_not_set
    255 
    256     ldr     r4, [r0, #vp8_writer_pos]   ; x
    257     sub     r4, r4, #1                  ; x = w->pos - 1
    258     b       extra_zero_while_start
    259 extra_zero_while_loop
    260     mov     r10, #0
    261     strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
    262     sub     r4, r4, #1                  ; x--
    263 extra_zero_while_start
    264     cmp     r4, #0
    265     ldrge   r7, [r0, #vp8_writer_buffer]
    266     ldrb    r11, [r7, r4]
    267     cmpge   r11, #0xff
    268     beq     extra_zero_while_loop
    269 
    270     ldr     r7, [r0, #vp8_writer_buffer]
    271     ldrb    r10, [r7, r4]
    272     add     r10, r10, #1
    273     strb    r10, [r7, r4]
    274 extra_high_bit_not_set
    275     rsb     r4, r6, #24                 ; 24-offset
    276     ldr     r10, [r0, #vp8_writer_buffer]
    277     lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
    278     ldr     r4, [r0, #vp8_writer_pos]
    279     lsl     r2, r2, r6                  ; lowvalue <<= offset
    280     mov     r6, r3                      ; shift = count
    281     add     r11, r4, #1                 ; w->pos++
    282     bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
    283     str     r11, [r0, #vp8_writer_pos]
    284     sub     r3, r3, #8                  ; count -= 8
    285     strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
    286     ldr     r10, [sp, #4]               ; b->tree
    287 extra_count_lt_zero
    288     lsl     r2, r2, r6
    289 
    290     subs    r8, r8, #1                  ; --n
    291     bne     extra_bits_loop             ; while (n)
    292 
    293 no_extra_bits
    294     ldr     lr, [r1, #4]                ; e = p->Extra
    295     add     r4, r5, #1                  ; range + 1
    296     tst     lr, #1
    297     lsr     r4, r4, #1                  ; split = (range + 1) >> 1
    298     addne   r2, r2, r4                  ; lowvalue += split
    299     subne   r4, r5, r4                  ; range = range-split
    300     tst     r2, #0x80000000             ; lowvalue & 0x80000000
    301     lsl     r5, r4, #1                  ; range <<= 1
    302     beq     end_high_bit_not_set
    303 
    304     ldr     r4, [r0, #vp8_writer_pos]
    305     mov     r7, #0
    306     sub     r4, r4, #1
    307     b       end_zero_while_start
    308 end_zero_while_loop
    309     strb    r7, [r6, r4]
    310     sub     r4, r4, #1                  ; x--
    311 end_zero_while_start
    312     cmp     r4, #0
    313     ldrge   r6, [r0, #vp8_writer_buffer]
    314     ldrb    r12, [r6, r4]
    315     cmpge   r12, #0xff
    316     beq     end_zero_while_loop
    317 
    318     ldr     r6, [r0, #vp8_writer_buffer]
    319     ldrb    r7, [r6, r4]
    320     add     r7, r7, #1
    321     strb    r7, [r6, r4]
    322 end_high_bit_not_set
    323     adds    r3, r3, #1                  ; ++count
    324     lsl     r2, r2, #1                  ; lowvalue  <<= 1
    325     bne     end_count_zero
    326 
    327     ldr     r4, [r0, #vp8_writer_pos]
    328     mvn     r3, #7
    329     ldr     r7, [r0, #vp8_writer_buffer]
    330     lsr     r6, r2, #24                 ; lowvalue >> 24
    331     add     r12, r4, #1                 ; w->pos++
    332     bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
    333     str     r12, [r0, #0x10]
    334     strb    r6, [r7, r4]
    335 end_count_zero
    336 skip_extra_bits
    337     add     r1, r1, #TOKENEXTRA_SZ      ; ++p
    338 check_p_lt_stop
    339     ldr     r4, [sp, #0]                ; stop
    340     cmp     r1, r4                      ; while( p < stop)
    341     bcc     while_p_lt_stop
    342 
    343     ldr     r10, [sp, #20]              ; num_parts
    344     mov     r1, #TOKENLIST_SZ
    345     mul     r1, r10, r1
    346 
    347     ldr     r6, [sp, #12]               ; mb_rows
    348     ldr     r7, [sp, #16]               ; tokenlist address
    349     subs    r6, r6, r10
    350     add     r7, r7, r1                  ; next element in the array
    351     str     r6, [sp, #12]
    352     bgt     mb_row_loop
    353 
    354     mov     r12, #32
    355 
    356 stop_encode_loop
    357     sub     r7, r5, #1                  ; range-1
    358 
    359     mov     r4, r7, lsl #7              ; ((range-1) * 128)
    360 
    361     mov     r7, #1
    362     add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
    363 
    364     ; Counting the leading zeros is used to normalize range.
    365     clz     r6, r4
    366     sub     r6, r6, #24                 ; shift
    367 
    368     ; Flag is set on the sum of count.  This flag is used later
    369     ; to determine if count >= 0
    370     adds    r3, r3, r6                  ; count += shift
    371     lsl     r5, r4, r6                  ; range <<= shift
    372     bmi     token_count_lt_zero_se      ; if(count >= 0)
    373 
    374     sub     r6, r6, r3                  ; offset = shift - count
    375     sub     r4, r6, #1                  ; offset-1
    376     lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
    377     bpl     token_high_bit_not_set_se
    378 
    379     ldr     r4, [r0, #vp8_writer_pos]   ; x
    380     sub     r4, r4, #1                  ; x = w->pos-1
    381     b       token_zero_while_start_se
    382 token_zero_while_loop_se
    383     mov     r10, #0
    384     strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
    385     sub     r4, r4, #1                  ; x--
    386 token_zero_while_start_se
    387     cmp     r4, #0
    388     ldrge   r7, [r0, #vp8_writer_buffer]
    389     ldrb    r11, [r7, r4]
    390     cmpge   r11, #0xff
    391     beq     token_zero_while_loop_se
    392 
    393     ldr     r7, [r0, #vp8_writer_buffer]
    394     ldrb    r10, [r7, r4]               ; w->buffer[x]
    395     add     r10, r10, #1
    396     strb    r10, [r7, r4]               ; w->buffer[x] + 1
    397 token_high_bit_not_set_se
    398     rsb     r4, r6, #24                 ; 24-offset
    399     ldr     r10, [r0, #vp8_writer_buffer]
    400     lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
    401     ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
    402     lsl     r2, r2, r6                  ; lowvalue <<= offset
    403     mov     r6, r3                      ; shift = count
    404     add     r11, r4, #1                 ; w->pos++
    405     bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
    406     str     r11, [r0, #vp8_writer_pos]
    407     sub     r3, r3, #8                  ; count -= 8
    408     strb    r7, [r10, r4]               ; w->buffer[w->pos++]
    409 
    410 token_count_lt_zero_se
    411     lsl     r2, r2, r6                  ; lowvalue <<= shift
    412 
    413     subs    r12, r12, #1
    414     bne     stop_encode_loop
    415 
    416     ldr     r10, [sp, #8]               ; *size
    417     ldr     r11, [r10]
    418     ldr     r4,  [r0, #vp8_writer_pos]  ; w->pos
    419     add     r11, r11, r4                ; *size += w->pos
    420     str     r11, [r10]
    421 
    422     ldr     r9, [sp, #20]               ; num_parts
    423     sub     r9, r9, #1
    424     ldr     r10, [sp, #28]              ; i
    425     cmp     r10, r9                     ; if(i<(num_part - 1))
    426     bge     skip_write_partition
    427 
    428     ldr     r12, [sp, #40]              ; ptr
    429     add     r12, r12, r4                ; ptr += w->pos
    430     str     r12, [sp, #40]
    431 
    432     ldr     r9, [sp, #24]               ; cx_data
    433     mov     r8, r4, asr #8
    434     strb    r4, [r9, #0]
    435     strb    r8, [r9, #1]
    436     mov     r4, r4, asr #16
    437     strb    r4, [r9, #2]
    438 
    439     add     r9, r9, #3                  ; cx_data += 3
    440     str     r9, [sp, #24]
    441 
    442 skip_write_partition
    443 
    444     ldr     r11, [sp, #28]              ; i
    445     ldr     r10, [sp, #20]              ; num_parts
    446 
    447     add     r11, r11, #1                ; i++
    448     str     r11, [sp, #28]
    449 
    450     ldr     r7, [sp, #32]               ; cpi->tp_list[i]
    451     mov     r1, #TOKENLIST_SZ
    452     add     r7, r7, r1                  ; next element in cpi->tp_list
    453     str     r7, [sp, #32]               ; cpi->tp_list[i+1]
    454 
    455     cmp     r10, r11
    456     bgt     numparts_loop
    457 
    458 
    459     add     sp, sp, #44
    460     pop     {r4-r11, pc}
    461     ENDP
    462 
    463 _VP8_COMP_common_
    464     DCD     vp8_comp_common
    465 _VP8_COMMON_MBrows_
    466     DCD     vp8_common_mb_rows
    467 _VP8_COMP_tplist_
    468     DCD     vp8_comp_tplist
    469 _VP8_COMP_bc2_
    470     DCD     vp8_comp_bc2
    471 
    472     END
    473